evalscope 0.6.0rc0__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (114) hide show
  1. evalscope/backend/opencompass/tasks/eval_datasets.py +1 -1
  2. evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +230 -0
  3. evalscope/backend/rag_eval/clip_benchmark/utils/webdatasets.txt +43 -0
  4. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +87 -0
  5. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +36 -0
  6. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +26 -0
  7. evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +41 -0
  8. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +60 -0
  9. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +36 -0
  10. evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +22 -0
  11. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +35 -0
  12. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +7 -0
  13. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +39 -0
  14. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +7 -0
  15. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +39 -0
  16. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +34 -0
  17. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +36 -0
  18. evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +25 -0
  19. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +7 -0
  20. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +39 -0
  21. evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +16 -0
  22. evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +24 -0
  23. evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +18 -0
  24. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +120 -100
  25. evalscope/backend/rag_eval/utils/clip.py +149 -0
  26. evalscope/backend/rag_eval/utils/embedding.py +183 -0
  27. evalscope/backend/rag_eval/utils/llm.py +72 -0
  28. evalscope/backend/rag_eval/utils/tools.py +63 -0
  29. evalscope/backend/vlm_eval_kit/backend_manager.py +23 -21
  30. evalscope/benchmarks/ceval/samples.jsonl +1 -0
  31. evalscope/benchmarks/cmmlu/samples.jsonl +5 -0
  32. evalscope/benchmarks/mmlu/samples.jsonl +5 -0
  33. evalscope/benchmarks/race/samples.jsonl +5 -0
  34. evalscope/benchmarks/trivia_qa/samples.jsonl +5 -0
  35. evalscope/cli/start_perf.py +8 -11
  36. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
  37. evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +58485 -0
  38. evalscope/metrics/resources/gpt2-zhcn3-v4.json +1 -0
  39. evalscope/metrics/rouge_metric.py +30 -15
  40. evalscope/perf/arguments.py +179 -0
  41. evalscope/perf/benchmark.py +245 -0
  42. evalscope/perf/http_client.py +127 -711
  43. evalscope/perf/main.py +35 -0
  44. evalscope/perf/plugin/__init__.py +2 -0
  45. evalscope/perf/plugin/api/__init__.py +3 -0
  46. evalscope/perf/{api_plugin_base.py → plugin/api/base.py} +17 -18
  47. evalscope/perf/{custom_api.py → plugin/api/custom_api.py} +25 -19
  48. evalscope/perf/{dashscope_api.py → plugin/api/dashscope_api.py} +28 -14
  49. evalscope/perf/{openai_api.py → plugin/api/openai_api.py} +51 -27
  50. evalscope/perf/plugin/datasets/__init__.py +6 -0
  51. evalscope/perf/{dataset_plugin_base.py → plugin/datasets/base.py} +13 -10
  52. evalscope/perf/plugin/datasets/custom.py +21 -0
  53. evalscope/perf/plugin/datasets/flickr8k.py +51 -0
  54. evalscope/perf/{datasets → plugin/datasets}/line_by_line.py +9 -5
  55. evalscope/perf/plugin/datasets/longalpaca.py +28 -0
  56. evalscope/perf/plugin/datasets/openqa.py +38 -0
  57. evalscope/perf/plugin/datasets/speed_benchmark.py +50 -0
  58. evalscope/perf/plugin/registry.py +54 -0
  59. evalscope/perf/{how_to_analysis_result.py → utils/analysis_result.py} +11 -5
  60. evalscope/perf/utils/benchmark_util.py +135 -0
  61. evalscope/perf/utils/chat_service.py +252 -0
  62. evalscope/perf/utils/db_util.py +200 -0
  63. evalscope/perf/utils/handler.py +46 -0
  64. evalscope/perf/utils/local_server.py +139 -0
  65. evalscope/registry/config/cfg_arena.yaml +77 -0
  66. evalscope/registry/config/cfg_arena_zhihu.yaml +63 -0
  67. evalscope/registry/config/cfg_pairwise_baseline.yaml +83 -0
  68. evalscope/registry/config/cfg_single.yaml +78 -0
  69. evalscope/registry/data/prompt_template/lmsys_v2.jsonl +8 -0
  70. evalscope/registry/data/prompt_template/prompt_templates.jsonl +8 -0
  71. evalscope/registry/data/qa_browser/battle.jsonl +634 -0
  72. evalscope/registry/data/qa_browser/category_mapping.yaml +10 -0
  73. evalscope/registry/data/question.jsonl +80 -0
  74. evalscope/third_party/longbench_write/README.md +118 -0
  75. evalscope/third_party/longbench_write/default_task.json +27 -0
  76. evalscope/third_party/longbench_write/default_task.yaml +24 -0
  77. evalscope/third_party/toolbench_static/README.md +118 -0
  78. evalscope/third_party/toolbench_static/config_default.json +15 -0
  79. evalscope/third_party/toolbench_static/config_default.yaml +12 -0
  80. evalscope/third_party/toolbench_static/requirements.txt +2 -0
  81. evalscope/utils/logger.py +18 -20
  82. evalscope/utils/utils.py +41 -42
  83. evalscope/version.py +2 -2
  84. evalscope-0.7.0.dist-info/LICENSE +203 -0
  85. {evalscope-0.6.0rc0.dist-info → evalscope-0.7.0.dist-info}/METADATA +162 -103
  86. {evalscope-0.6.0rc0.dist-info → evalscope-0.7.0.dist-info}/RECORD +107 -32
  87. {evalscope-0.6.0rc0.dist-info → evalscope-0.7.0.dist-info}/WHEEL +1 -1
  88. {evalscope-0.6.0rc0.dist-info → evalscope-0.7.0.dist-info}/top_level.txt +1 -0
  89. tests/cli/__init__.py +1 -0
  90. tests/cli/test_run.py +76 -0
  91. tests/perf/__init__.py +1 -0
  92. tests/perf/test_perf.py +96 -0
  93. tests/rag/__init__.py +0 -0
  94. tests/rag/test_clip_benchmark.py +85 -0
  95. tests/rag/test_mteb.py +136 -0
  96. tests/rag/test_ragas.py +120 -0
  97. tests/swift/__init__.py +1 -0
  98. tests/swift/test_run_swift_eval.py +146 -0
  99. tests/swift/test_run_swift_vlm_eval.py +128 -0
  100. tests/swift/test_run_swift_vlm_jugde_eval.py +157 -0
  101. tests/test_run_all.py +12 -0
  102. tests/vlm/__init__.py +1 -0
  103. tests/vlm/test_vlmeval.py +59 -0
  104. evalscope/perf/_logging.py +0 -32
  105. evalscope/perf/datasets/longalpaca_12k.py +0 -20
  106. evalscope/perf/datasets/openqa.py +0 -22
  107. evalscope/perf/plugin_registry.py +0 -35
  108. evalscope/perf/query_parameters.py +0 -42
  109. evalscope/perf/server_sent_event.py +0 -43
  110. evalscope/preprocess/tokenizers/gpt2_tokenizer.py +0 -221
  111. /evalscope/{perf/datasets → backend/rag_eval/utils}/__init__.py +0 -0
  112. /evalscope/{preprocess/tokenizers → perf/utils}/__init__.py +0 -0
  113. {evalscope-0.6.0rc0.dist-info → evalscope-0.7.0.dist-info}/entry_points.txt +0 -0
  114. {evalscope/preprocess → tests}/__init__.py +0 -0
@@ -0,0 +1,39 @@
1
+ {
2
+ "ragas_version": "0.2.5",
3
+ "original_hash": 2334929353739018813,
4
+ "language": "chinese",
5
+ "instruction": "给定一个主题和角色列表,根据角色描述将每个角色与相关主题关联起来。",
6
+ "examples": [
7
+ {
8
+ "input": {
9
+ "themes": [
10
+ "同理心",
11
+ "包容性",
12
+ "远程工作"
13
+ ],
14
+ "personas": [
15
+ {
16
+ "name": "人力资源经理",
17
+ "role_description": "专注于包容性和员工支持。"
18
+ },
19
+ {
20
+ "name": "远程团队负责人",
21
+ "role_description": "管理远程团队沟通。"
22
+ }
23
+ ]
24
+ },
25
+ "output": {
26
+ "mapping": {
27
+ "人力资源经理": [
28
+ "包容性",
29
+ "同理心"
30
+ ],
31
+ "远程团队负责人": [
32
+ "远程工作",
33
+ "同理心"
34
+ ]
35
+ }
36
+ }
37
+ }
38
+ ]
39
+ }
@@ -0,0 +1,7 @@
1
+ {
2
+ "ragas_version": "0.2.5",
3
+ "original_hash": -1698100170803872933,
4
+ "language": "chinese",
5
+ "instruction": "根据指定的条件(角色、主题、风格、长度)和提供的上下文生成查询和答案。确保答案完全忠实于上下文,仅使用直接来自提供节点的信息。### 指令:\n1. **生成查询**:根据上下文、角色、主题、风格和长度,创建一个与角色视角一致并反映主题的问题。\n2. **生成答案**:仅使用提供的上下文内容,创建一个忠实且详细的答案。不要包含任何不在或无法从给定上下文中推断的信息。\n### 示例输出:\n\n",
6
+ "examples": []
7
+ }
@@ -0,0 +1,39 @@
1
+ {
2
+ "ragas_version": "0.2.5",
3
+ "original_hash": 2334929353739018813,
4
+ "language": "chinese",
5
+ "instruction": "给定一个主题和角色列表,根据角色描述将每个角色与相关主题关联起来。",
6
+ "examples": [
7
+ {
8
+ "input": {
9
+ "themes": [
10
+ "同理心",
11
+ "包容性",
12
+ "远程工作"
13
+ ],
14
+ "personas": [
15
+ {
16
+ "name": "人力资源经理",
17
+ "role_description": "专注于包容性和员工支持。"
18
+ },
19
+ {
20
+ "name": "远程团队负责人",
21
+ "role_description": "管理远程团队沟通。"
22
+ }
23
+ ]
24
+ },
25
+ "output": {
26
+ "mapping": {
27
+ "人力资源经理": [
28
+ "包容性",
29
+ "同理心"
30
+ ],
31
+ "远程团队负责人": [
32
+ "远程工作",
33
+ "同理心"
34
+ ]
35
+ }
36
+ }
37
+ }
38
+ ]
39
+ }
@@ -0,0 +1,34 @@
1
+ {
2
+ "ragas_version": "0.2.5",
3
+ "original_hash": -2189588237940965149,
4
+ "language": "chinese",
5
+ "instruction": "请说明给定的信息是否得到视觉和文本上下文信息的支持。您需要回答“是”或“否”。如果任何图像和文本上下文支持该信息,请回答“是”。",
6
+ "examples": [
7
+ {
8
+ "input": {
9
+ "response": "苹果派通常是双层皮的。",
10
+ "retrieved_contexts": [
11
+ "苹果派是一种水果派,其主要馅料成分是苹果。",
12
+ "苹果派通常与奶油、冰淇淋(“苹果派 à la mode”)、蛋奶沙司或切达干酪一起食用。",
13
+ "它通常是双层皮的,上下都有糕点;上层皮可以是实心的或格状的(交叉条纹编织而成)。"
14
+ ]
15
+ },
16
+ "output": {
17
+ "faithful": true
18
+ }
19
+ },
20
+ {
21
+ "input": {
22
+ "response": "苹果派味道不好。",
23
+ "retrieved_contexts": [
24
+ "苹果派是一种水果派,其主要馅料成分是苹果。",
25
+ "苹果派通常与奶油、冰淇淋(“苹果派 à la mode”)、蛋奶沙司或切达干酪一起食用。",
26
+ "它通常是双层皮的,上下都有糕点;上层皮可以是实心的或格状的(交叉条纹编织而成)。"
27
+ ]
28
+ },
29
+ "output": {
30
+ "faithful": false
31
+ }
32
+ }
33
+ ]
34
+ }
@@ -0,0 +1,36 @@
1
+ {
2
+ "ragas_version": "0.2.5",
3
+ "original_hash": -7302860412443151372,
4
+ "language": "chinese",
5
+ "instruction": "\n您的任务是评估查询的响应是否与提供的图像和文本上下文信息一致。\n您有两个选项可以回答。要么是 True / False。\n如果查询的响应与上下文信息一致,则回答 - True,否则为 False。\n",
6
+ "examples": [
7
+ {
8
+ "input": {
9
+ "user_input": "传统玛格丽塔披萨的主要成分是什么?",
10
+ "response": "玛格丽塔披萨的主要成分是番茄、马苏里拉奶酪和新鲜罗勒。",
11
+ "retrieved_contexts": [
12
+ "传统的玛格丽塔披萨由薄薄的饼皮组成。",
13
+ "主要的配料包括番茄、马苏里拉奶酪、新鲜罗勒、盐和橄榄油。",
14
+ "它是最简单和最经典的披萨类型之一。"
15
+ ]
16
+ },
17
+ "output": {
18
+ "relevance": true
19
+ }
20
+ },
21
+ {
22
+ "input": {
23
+ "user_input": "谁在2021年奥斯卡颁奖典礼上获得了最佳男演员奖?",
24
+ "response": "2021年的最佳男演员奖由莱昂纳多·迪卡普里奥获得。",
25
+ "retrieved_contexts": [
26
+ "第93届奥斯卡颁奖典礼于2021年举行。",
27
+ "安东尼·霍普金斯凭借在《困在时间里的父亲》中的角色赢得了最佳男演员奖。",
28
+ "由于COVID-19的限制,这次活动具有独特性。"
29
+ ]
30
+ },
31
+ "output": {
32
+ "relevance": false
33
+ }
34
+ }
35
+ ]
36
+ }
@@ -0,0 +1,25 @@
1
+ {
2
+ "ragas_version": "0.2.5",
3
+ "original_hash": -7036736759899743798,
4
+ "language": "chinese",
5
+ "instruction": "从给定文本中提取命名实体,限制输出为最重要的实体。确保实体数量不超过指定的最大值。",
6
+ "examples": [
7
+ {
8
+ "input": {
9
+ "text": "特斯拉和SpaceX的首席执行官埃隆·马斯克宣布计划将业务扩展到欧洲和亚洲的新地点。\n 此次扩展预计将创造数千个就业机会,特别是在柏林和上海等城市。",
10
+ "max_num": 10
11
+ },
12
+ "output": {
13
+ "entities": [
14
+ "埃隆·马斯克",
15
+ "特斯拉",
16
+ "SpaceX",
17
+ "欧洲",
18
+ "亚洲",
19
+ "柏林",
20
+ "上海"
21
+ ]
22
+ }
23
+ }
24
+ ]
25
+ }
@@ -0,0 +1,7 @@
1
+ {
2
+ "ragas_version": "0.2.5",
3
+ "original_hash": -1422723613754983378,
4
+ "language": "chinese",
5
+ "instruction": "根据指定的条件(角色、术语、风格、长度)和提供的上下文生成查询和答案。确保答案完全忠实于上下文,仅使用直接来自提供上下文的信息。### 指令:\n1. **生成查询**:根据上下文、角色、术语、风格和长度,创建一个与角色视角一致并包含术语的问题。\n2. **生成答案**:仅使用提供的上下文中的内容,构建对查询的详细答案。不要添加上下文中未包含或无法推断的信息。\n### 示例输出:\n\n",
6
+ "examples": []
7
+ }
@@ -0,0 +1,39 @@
1
+ {
2
+ "ragas_version": "0.2.5",
3
+ "original_hash": 2334929353739018813,
4
+ "language": "chinese",
5
+ "instruction": "给定一个主题和角色列表,根据角色描述将每个角色与相关主题关联起来。",
6
+ "examples": [
7
+ {
8
+ "input": {
9
+ "themes": [
10
+ "同理心",
11
+ "包容性",
12
+ "远程工作"
13
+ ],
14
+ "personas": [
15
+ {
16
+ "name": "人力资源经理",
17
+ "role_description": "专注于包容性和员工支持。"
18
+ },
19
+ {
20
+ "name": "远程团队领导",
21
+ "role_description": "管理远程团队沟通。"
22
+ }
23
+ ]
24
+ },
25
+ "output": {
26
+ "mapping": {
27
+ "人力资源经理": [
28
+ "包容性",
29
+ "同理心"
30
+ ],
31
+ "远程团队领导": [
32
+ "远程工作",
33
+ "同理心"
34
+ ]
35
+ }
36
+ }
37
+ }
38
+ ]
39
+ }
@@ -0,0 +1,16 @@
1
+ {
2
+ "ragas_version": "0.2.5",
3
+ "original_hash": -5467318232123540806,
4
+ "language": "chinese",
5
+ "instruction": "将给定文本总结为少于10个句子。",
6
+ "examples": [
7
+ {
8
+ "input": {
9
+ "text": "人工智能\n\n人工智能正在通过自动化以前需要人类智能的任务来改变各个行业。从医疗到金融,人工智能正在被用来快速准确地分析大量数据。这项技术还推动了自动驾驶汽车和个性化推荐等领域的创新。"
10
+ },
11
+ "output": {
12
+ "text": "人工智能通过自动化任务、分析数据和推动自动驾驶汽车和个性化推荐等创新,正在革新各个行业。"
13
+ }
14
+ }
15
+ ]
16
+ }
@@ -0,0 +1,24 @@
1
+ {
2
+ "ragas_version": "0.2.5",
3
+ "original_hash": 2452110859551524285,
4
+ "language": "chinese",
5
+ "instruction": "从给定的文本中提取主要主题和概念。",
6
+ "examples": [
7
+ {
8
+ "input": {
9
+ "text": "人工智能通过自动化需要人类智能的任务来改变行业。人工智能快速准确地分析大量数据,推动了自动驾驶汽车和个性化推荐等创新。",
10
+ "max_num": 10
11
+ },
12
+ "output": {
13
+ "output": [
14
+ "人工智能",
15
+ "自动化",
16
+ "数据分析",
17
+ "创新",
18
+ "自动驾驶汽车",
19
+ "个性化推荐"
20
+ ]
21
+ }
22
+ }
23
+ ]
24
+ }
@@ -0,0 +1,18 @@
1
+ import typing as t
2
+
3
+ from pydantic import BaseModel
4
+ from ragas.prompt import PydanticPrompt, StringIO
5
+ from ragas.testset.persona import Persona
6
+
7
+
class PersonaGenerationPromptZH(PydanticPrompt[StringIO, Persona]):
    """Persona generation prompt with a Chinese instruction and a Chinese few-shot example.

    The instruction asks the model to use the provided summary to generate a persona that
    might interact with or benefit from the content, including a unique name and a concise
    role description. Input is a ``StringIO`` and output is a ``Persona``.
    """

    # Instruction text (Chinese) for the prompt.
    instruction: str = '使用提供的摘要,生成一个可能会与内容互动或从中受益的角色。包括一个独特的名字和一个简洁的角色描述。'

    # Models describing the prompt's input and output.
    input_model: t.Type[StringIO] = StringIO
    output_model: t.Type[Persona] = Persona

    # One few-shot example: a summary of a digital-marketing guide paired with a
    # "digital marketing expert" persona.
    examples: t.List[t.Tuple[StringIO, Persona]] = [
        (
            StringIO(text='《数字营销指南》解释了在各种在线平台上吸引受众的策略。'),
            Persona(name='数字营销专家', role_description='专注于吸引受众并在线上提升品牌。'),
        ),
    ]
@@ -1,15 +1,15 @@
1
- import os
2
1
  import asyncio
2
+ import os
3
+
3
4
  import pandas as pd
4
- from tqdm import tqdm
5
- from ragas.llms import LangchainLLMWrapper
6
5
  from ragas.embeddings import LangchainEmbeddingsWrapper
7
- from .translate_prompt import translate_prompts
8
- from evalscope.utils.logger import get_logger
9
- from evalscope.backend.rag_eval.ragas.arguments import TestsetGenerationArguments
10
- from evalscope.backend.rag_eval import EmbeddingModel, LLM, ChatOpenAI
6
+ from ragas.llms import LangchainLLMWrapper
7
+ from tqdm import tqdm
11
8
 
12
- os.environ['DO_NOT_TRACK'] = 'true'
9
+ from evalscope.backend.rag_eval import LLM, ChatOpenAI, EmbeddingModel
10
+ from evalscope.backend.rag_eval.ragas.arguments import TestsetGenerationArguments
11
+ from evalscope.utils.logger import get_logger
12
+ from .translate_prompt import translate_prompts
13
13
 
14
14
  logger = get_logger()
15
15
 
@@ -17,116 +17,110 @@ logger = get_logger()
def get_transform(llm, embedding, language):
    """
    Creates and returns a default set of transforms for processing a knowledge graph.

    The extractor prompts (summary, themes, NER, headlines) are first adapted to
    ``language`` via ``translate_prompts``. The returned list is ordered as follows:

    1. Headline extraction
    2. Headline-based splitting (``min_tokens=500``)
    3. Summary extraction for document nodes whose ``page_content`` exceeds 500 tokens
    4. ``CustomNodeFilter`` applied to chunk nodes
    5. A ``Parallel`` group: summary embedding, theme extraction, NER extraction
    6. A ``Parallel`` group: cosine similarity over ``summary_embedding`` and
       overlap-score relationships for chunk nodes

    Args:
        llm: LLM handed to the LLM-based extractors, the node filter and ``translate_prompts``.
        embedding: Embedding model handed to ``EmbeddingExtractor``.
        language: Target language passed to ``translate_prompts``.

    Returns:
        list: The transform steps, in the order listed above.
    """
    from ragas.testset.graph import NodeType
    from ragas.testset.transforms.engine import Parallel
    from ragas.testset.transforms.extractors import (
        EmbeddingExtractor,
        HeadlinesExtractor,
        SummaryExtractor,
    )
    from ragas.testset.transforms.extractors.llm_based import NERExtractor, ThemesExtractor
    from ragas.testset.transforms.filters import CustomNodeFilter
    from ragas.testset.transforms.relationship_builders import (
        CosineSimilarityBuilder,
        OverlapScoreBuilder,
    )
    from ragas.testset.transforms.splitters import HeadlineSplitter
    from ragas.utils import num_tokens_from_string

    def summary_filter(node):
        # Document nodes whose page content is longer than 500 tokens.
        return (node.type == NodeType.DOCUMENT and num_tokens_from_string(node.properties['page_content']) > 500)

    def chunk_filter(node):
        # Chunk nodes only.
        return node.type == NodeType.CHUNK

    # The predicates are passed directly; wrapping them in a lambda adds nothing.
    summary_extractor = SummaryExtractor(llm=llm, filter_nodes=summary_filter)
    ner_extractor = NERExtractor(llm=llm, filter_nodes=chunk_filter)
    theme_extractor = ThemesExtractor(llm=llm)
    headline_extractor = HeadlinesExtractor(llm=llm)

    # Adapt the LLM-based extractor prompts to the requested language.
    asyncio.run(
        translate_prompts(
            prompts=[
                summary_extractor,
                theme_extractor,
                ner_extractor,
                headline_extractor,
            ],
            target_lang=language,
            llm=llm,
            adapt_instruction=True,
        ))

    splitter = HeadlineSplitter(min_tokens=500)

    # Embeds the 'summary' property into 'summary_embedding' for the summarized documents.
    summary_emb_extractor = EmbeddingExtractor(
        embedding_model=embedding,
        property_name='summary_embedding',
        embed_property_name='summary',
        filter_nodes=summary_filter,
    )

    # Relationship built from 'summary_embedding', stored as 'summary_similarity'.
    cosine_sim_builder = CosineSimilarityBuilder(
        property_name='summary_embedding',
        new_property_name='summary_similarity',
        threshold=0.7,
        filter_nodes=summary_filter,
    )

    ner_overlap_sim = OverlapScoreBuilder(threshold=0.01, filter_nodes=chunk_filter)

    node_filter = CustomNodeFilter(llm=llm, filter_nodes=chunk_filter)

    return [
        headline_extractor,
        splitter,
        summary_extractor,
        node_filter,
        Parallel(summary_emb_extractor, theme_extractor, ner_extractor),
        Parallel(cosine_sim_builder, ner_overlap_sim),
    ]
97
88
 
98
89
 
def get_distribution(llm, distribution, language):
    """
    Build the query distribution as a list of ``(synthesizer, weight)`` pairs.

    The three synthesizers have their prompts adapted to ``language`` via
    ``translate_prompts``. Supported keys of ``distribution`` map as follows:

    - ``'simple'``        -> ``SingleHopSpecificQuerySynthesizer``
    - ``'multi_context'`` -> ``MultiHopAbstractQuerySynthesizer``
    - ``'reasoning'``     -> ``MultiHopSpecificQuerySynthesizer``

    Args:
        llm: LLM handed to each synthesizer and to ``translate_prompts``.
        distribution: Mapping from the keys above to their weights. Supported keys that
            are absent are skipped; unsupported keys are ignored with a warning.
        language: Target language passed to ``translate_prompts``.

    Returns:
        list: ``(synthesizer, weight)`` tuples for every supported key present in
        ``distribution``, in the order simple, multi_context, reasoning.
    """
    from ragas.testset.synthesizers.multi_hop import (
        MultiHopAbstractQuerySynthesizer,
        MultiHopSpecificQuerySynthesizer,
    )
    from ragas.testset.synthesizers.single_hop.specific import SingleHopSpecificQuerySynthesizer

    single_hop = SingleHopSpecificQuerySynthesizer(llm=llm)
    multi_hop_abs = MultiHopAbstractQuerySynthesizer(llm=llm)
    multi_hop_spec = MultiHopSpecificQuerySynthesizer(llm=llm)

    # Adapt the synthesizer prompts to the requested language.
    asyncio.run(
        translate_prompts(
            prompts=[
                single_hop,
                multi_hop_abs,
                multi_hop_spec,
            ],
            target_lang=language,
            llm=llm,
            adapt_instruction=True,
        ))

    mapping = {
        'simple': single_hop,
        'multi_context': multi_hop_abs,
        'reasoning': multi_hop_spec,
    }

    # Surface keys that match no synthesizer instead of dropping them silently.
    unknown_keys = [key for key in distribution if key not in mapping]
    if unknown_keys:
        logger.warning('Ignoring unsupported query distribution keys: %s', unknown_keys)

    return [(mapping[key], distribution[key]) for key in mapping if key in distribution]
127
121
 
128
122
 
129
- def get_knowledge_graph(documents, transforms, local_file):
123
+ def get_knowledge_graph(documents, transforms, local_file, run_config):
130
124
  from ragas.testset.graph import KnowledgeGraph, Node, NodeType
131
125
  from ragas.testset.transforms import apply_transforms
132
126
 
@@ -148,7 +142,7 @@ def get_knowledge_graph(documents, transforms, local_file):
148
142
  kg = KnowledgeGraph(nodes=nodes)
149
143
 
150
144
  # apply transforms and update the knowledge graph
151
- apply_transforms(kg, transforms)
145
+ apply_transforms(kg, transforms, run_config=run_config)
152
146
 
153
147
  # save the knowledge graph
154
148
  output_path = os.path.dirname(local_file)
@@ -158,6 +152,39 @@ def get_knowledge_graph(documents, transforms, local_file):
158
152
  return kg
159
153
 
160
154
 
def get_persona(llm, kg, language, num_personas=3):
    """
    Generate personas from the summarized document nodes of a knowledge graph.

    Args:
        llm: LLM passed through to ``generate_personas_from_kg``.
        kg: Knowledge graph passed through to ``generate_personas_from_kg``.
        language: ``'chinese'`` selects the bundled ``PersonaGenerationPromptZH``; any other
            value uses ragas' default ``PersonaGenerationPrompt``.
        num_personas: Number of personas to request. Defaults to 3.

    Returns:
        The result of ``generate_personas_from_kg``.
    """
    from ragas.testset.graph import Node
    from ragas.testset.persona import PersonaGenerationPrompt, generate_personas_from_kg

    from evalscope.backend.rag_eval.ragas.prompts.persona_prompt import PersonaGenerationPromptZH

    def has_summary_embedding(node: Node) -> bool:
        # Only document nodes that already carry a summary embedding are eligible.
        return node.type.name == 'DOCUMENT' and node.properties.get('summary_embedding') is not None

    # NOTE: the persona prompt can't be run through translate_prompts yet, so Chinese uses a
    # hand-written prompt and every other language falls back to the default prompt.
    if language == 'chinese':
        persona_prompt = PersonaGenerationPromptZH()
    else:
        persona_prompt = PersonaGenerationPrompt()

    return generate_personas_from_kg(
        llm=llm,
        kg=kg,
        num_personas=num_personas,
        persona_generation_prompt=persona_prompt,
        filter_fn=has_summary_embedding,
    )
186
+
187
+
161
188
  def load_data(file_path):
162
189
  from langchain_community.document_loaders import UnstructuredFileLoader
163
190
 
@@ -178,32 +205,31 @@ def generate_testset(args: TestsetGenerationArguments) -> None:
178
205
  generator_llm = LLM.load(**args.generator_llm)
179
206
  embeddings = EmbeddingModel.load(**args.embeddings)
180
207
 
208
+ wrapped_llm = LangchainLLMWrapper(generator_llm)
209
+ wrapped_embeddings = LangchainEmbeddingsWrapper(embeddings)
210
+
181
211
  # Change resulting question type distribution
182
- distributions = get_distribution(
183
- LangchainLLMWrapper(generator_llm), args.distribution, args.language
184
- )
212
+ distributions = get_distribution(wrapped_llm, args.distribution, args.language)
185
213
 
214
+ run_config = RunConfig(timeout=600, max_retries=3, max_wait=120, max_workers=1, log_tenacity=True)
186
215
  # get transforms
187
216
  transforms = get_transform(
188
- LangchainLLMWrapper(generator_llm),
189
- LangchainEmbeddingsWrapper(embeddings),
217
+ wrapped_llm,
218
+ wrapped_embeddings,
190
219
  args.language,
191
220
  )
192
221
 
193
222
  # get knowledge graph
194
- knowledge_graph = get_knowledge_graph(documents, transforms, args.knowledge_graph)
223
+ knowledge_graph = get_knowledge_graph(documents, transforms, args.knowledge_graph, run_config)
195
224
 
196
- generator = TestsetGenerator.from_langchain(
197
- generator_llm, embeddings, knowledge_graph
198
- )
225
+ persona_list = get_persona(llm=wrapped_llm, kg=knowledge_graph, language=args.language)
226
+
227
+ generator = TestsetGenerator(llm=wrapped_llm, knowledge_graph=knowledge_graph, persona_list=persona_list)
199
228
 
200
- runconfig = RunConfig(
201
- timeout=600, max_retries=3, max_wait=120, max_workers=1, log_tenacity=True
202
- )
203
229
  testset = generator.generate(
204
230
  testset_size=args.test_size,
205
231
  query_distribution=distributions,
206
- run_config=runconfig,
232
+ run_config=run_config,
207
233
  with_debugging_logs=True,
208
234
  raise_exceptions=True,
209
235
  )
@@ -212,9 +238,7 @@ def generate_testset(args: TestsetGenerationArguments) -> None:
212
238
  testset_df = testset.to_pandas()
213
239
  output_path = os.path.dirname(args.output_file)
214
240
  os.makedirs(output_path, exist_ok=True)
215
- testset_df.to_json(
216
- args.output_file, indent=4, index=False, orient='records', force_ascii=False
217
- )
241
+ testset_df.to_json(args.output_file, indent=4, index=False, orient='records', force_ascii=False)
218
242
 
219
243
  # get answer
220
244
  testset_with_answer = get_answer(testset_df, generator_llm, args.language)
@@ -243,21 +267,17 @@ Answer:
243
267
  contexts = '\n'.join(row['reference_contexts'])
244
268
 
245
269
  # Combine question and contexts as input for the LLM
246
- input_text = template.format(
247
- language=language, question=question, contexts=contexts
248
- )
270
+ input_text = template.format(language=language, question=question, contexts=contexts)
249
271
 
250
272
  # Generate the answer using the generator LLM
251
273
  answer = generator_llm.invoke(input_text)
252
274
  if isinstance(generator_llm, ChatOpenAI):
253
275
  answer = answer.content
254
- items.append(
255
- {
256
- 'user_input': question,
257
- 'retrieved_contexts': row['reference_contexts'],
258
- 'response': answer,
259
- 'reference': row['reference'],
260
- }
261
- )
276
+ items.append({
277
+ 'user_input': question,
278
+ 'retrieved_contexts': row['reference_contexts'],
279
+ 'response': answer,
280
+ 'reference': row['reference'],
281
+ })
262
282
 
263
283
  return pd.DataFrame.from_dict(items)