evalscope 0.6.0rc0__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/backend/opencompass/tasks/eval_datasets.py +1 -1
- evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +230 -0
- evalscope/backend/rag_eval/clip_benchmark/utils/webdatasets.txt +43 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +87 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +36 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +26 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +41 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +60 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +36 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +22 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +35 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +7 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +39 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +7 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +39 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +34 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +36 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +25 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +7 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +39 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +16 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +24 -0
- evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +18 -0
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +120 -100
- evalscope/backend/rag_eval/utils/clip.py +149 -0
- evalscope/backend/rag_eval/utils/embedding.py +183 -0
- evalscope/backend/rag_eval/utils/llm.py +72 -0
- evalscope/backend/rag_eval/utils/tools.py +63 -0
- evalscope/backend/vlm_eval_kit/backend_manager.py +23 -21
- evalscope/benchmarks/ceval/samples.jsonl +1 -0
- evalscope/benchmarks/cmmlu/samples.jsonl +5 -0
- evalscope/benchmarks/mmlu/samples.jsonl +5 -0
- evalscope/benchmarks/race/samples.jsonl +5 -0
- evalscope/benchmarks/trivia_qa/samples.jsonl +5 -0
- evalscope/cli/start_perf.py +8 -11
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
- evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +58485 -0
- evalscope/metrics/resources/gpt2-zhcn3-v4.json +1 -0
- evalscope/metrics/rouge_metric.py +30 -15
- evalscope/perf/arguments.py +179 -0
- evalscope/perf/benchmark.py +245 -0
- evalscope/perf/http_client.py +127 -711
- evalscope/perf/main.py +35 -0
- evalscope/perf/plugin/__init__.py +2 -0
- evalscope/perf/plugin/api/__init__.py +3 -0
- evalscope/perf/{api_plugin_base.py → plugin/api/base.py} +17 -18
- evalscope/perf/{custom_api.py → plugin/api/custom_api.py} +25 -19
- evalscope/perf/{dashscope_api.py → plugin/api/dashscope_api.py} +28 -14
- evalscope/perf/{openai_api.py → plugin/api/openai_api.py} +51 -27
- evalscope/perf/plugin/datasets/__init__.py +6 -0
- evalscope/perf/{dataset_plugin_base.py → plugin/datasets/base.py} +13 -10
- evalscope/perf/plugin/datasets/custom.py +21 -0
- evalscope/perf/plugin/datasets/flickr8k.py +51 -0
- evalscope/perf/{datasets → plugin/datasets}/line_by_line.py +9 -5
- evalscope/perf/plugin/datasets/longalpaca.py +28 -0
- evalscope/perf/plugin/datasets/openqa.py +38 -0
- evalscope/perf/plugin/datasets/speed_benchmark.py +50 -0
- evalscope/perf/plugin/registry.py +54 -0
- evalscope/perf/{how_to_analysis_result.py → utils/analysis_result.py} +11 -5
- evalscope/perf/utils/benchmark_util.py +135 -0
- evalscope/perf/utils/chat_service.py +252 -0
- evalscope/perf/utils/db_util.py +200 -0
- evalscope/perf/utils/handler.py +46 -0
- evalscope/perf/utils/local_server.py +139 -0
- evalscope/registry/config/cfg_arena.yaml +77 -0
- evalscope/registry/config/cfg_arena_zhihu.yaml +63 -0
- evalscope/registry/config/cfg_pairwise_baseline.yaml +83 -0
- evalscope/registry/config/cfg_single.yaml +78 -0
- evalscope/registry/data/prompt_template/lmsys_v2.jsonl +8 -0
- evalscope/registry/data/prompt_template/prompt_templates.jsonl +8 -0
- evalscope/registry/data/qa_browser/battle.jsonl +634 -0
- evalscope/registry/data/qa_browser/category_mapping.yaml +10 -0
- evalscope/registry/data/question.jsonl +80 -0
- evalscope/third_party/longbench_write/README.md +118 -0
- evalscope/third_party/longbench_write/default_task.json +27 -0
- evalscope/third_party/longbench_write/default_task.yaml +24 -0
- evalscope/third_party/toolbench_static/README.md +118 -0
- evalscope/third_party/toolbench_static/config_default.json +15 -0
- evalscope/third_party/toolbench_static/config_default.yaml +12 -0
- evalscope/third_party/toolbench_static/requirements.txt +2 -0
- evalscope/utils/logger.py +18 -20
- evalscope/utils/utils.py +41 -42
- evalscope/version.py +2 -2
- evalscope-0.7.0.dist-info/LICENSE +203 -0
- {evalscope-0.6.0rc0.dist-info → evalscope-0.7.0.dist-info}/METADATA +162 -103
- {evalscope-0.6.0rc0.dist-info → evalscope-0.7.0.dist-info}/RECORD +107 -32
- {evalscope-0.6.0rc0.dist-info → evalscope-0.7.0.dist-info}/WHEEL +1 -1
- {evalscope-0.6.0rc0.dist-info → evalscope-0.7.0.dist-info}/top_level.txt +1 -0
- tests/cli/__init__.py +1 -0
- tests/cli/test_run.py +76 -0
- tests/perf/__init__.py +1 -0
- tests/perf/test_perf.py +96 -0
- tests/rag/__init__.py +0 -0
- tests/rag/test_clip_benchmark.py +85 -0
- tests/rag/test_mteb.py +136 -0
- tests/rag/test_ragas.py +120 -0
- tests/swift/__init__.py +1 -0
- tests/swift/test_run_swift_eval.py +146 -0
- tests/swift/test_run_swift_vlm_eval.py +128 -0
- tests/swift/test_run_swift_vlm_jugde_eval.py +157 -0
- tests/test_run_all.py +12 -0
- tests/vlm/__init__.py +1 -0
- tests/vlm/test_vlmeval.py +59 -0
- evalscope/perf/_logging.py +0 -32
- evalscope/perf/datasets/longalpaca_12k.py +0 -20
- evalscope/perf/datasets/openqa.py +0 -22
- evalscope/perf/plugin_registry.py +0 -35
- evalscope/perf/query_parameters.py +0 -42
- evalscope/perf/server_sent_event.py +0 -43
- evalscope/preprocess/tokenizers/gpt2_tokenizer.py +0 -221
- /evalscope/{perf/datasets → backend/rag_eval/utils}/__init__.py +0 -0
- /evalscope/{preprocess/tokenizers → perf/utils}/__init__.py +0 -0
- {evalscope-0.6.0rc0.dist-info → evalscope-0.7.0.dist-info}/entry_points.txt +0 -0
- {evalscope/preprocess → tests}/__init__.py +0 -0
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
{
|
|
2
|
+
"ragas_version": "0.2.5",
|
|
3
|
+
"original_hash": 2334929353739018813,
|
|
4
|
+
"language": "chinese",
|
|
5
|
+
"instruction": "给定一个主题和角色列表,根据角色描述将每个角色与相关主题关联起来。",
|
|
6
|
+
"examples": [
|
|
7
|
+
{
|
|
8
|
+
"input": {
|
|
9
|
+
"themes": [
|
|
10
|
+
"同理心",
|
|
11
|
+
"包容性",
|
|
12
|
+
"远程工作"
|
|
13
|
+
],
|
|
14
|
+
"personas": [
|
|
15
|
+
{
|
|
16
|
+
"name": "人力资源经理",
|
|
17
|
+
"role_description": "专注于包容性和员工支持。"
|
|
18
|
+
},
|
|
19
|
+
{
|
|
20
|
+
"name": "远程团队负责人",
|
|
21
|
+
"role_description": "管理远程团队沟通。"
|
|
22
|
+
}
|
|
23
|
+
]
|
|
24
|
+
},
|
|
25
|
+
"output": {
|
|
26
|
+
"mapping": {
|
|
27
|
+
"人力资源经理": [
|
|
28
|
+
"包容性",
|
|
29
|
+
"同理心"
|
|
30
|
+
],
|
|
31
|
+
"远程团队负责人": [
|
|
32
|
+
"远程工作",
|
|
33
|
+
"同理心"
|
|
34
|
+
]
|
|
35
|
+
}
|
|
36
|
+
}
|
|
37
|
+
}
|
|
38
|
+
]
|
|
39
|
+
}
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
{
|
|
2
|
+
"ragas_version": "0.2.5",
|
|
3
|
+
"original_hash": -1698100170803872933,
|
|
4
|
+
"language": "chinese",
|
|
5
|
+
"instruction": "根据指定的条件(角色、主题、风格、长度)和提供的上下文生成查询和答案。确保答案完全忠实于上下文,仅使用直接来自提供节点的信息。### 指令:\n1. **生成查询**:根据上下文、角色、主题、风格和长度,创建一个与角色视角一致并反映主题的问题。\n2. **生成答案**:仅使用提供的上下文内容,创建一个忠实且详细的答案。不要包含任何不在或无法从给定上下文中推断的信息。\n### 示例输出:\n\n",
|
|
6
|
+
"examples": []
|
|
7
|
+
}
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
{
|
|
2
|
+
"ragas_version": "0.2.5",
|
|
3
|
+
"original_hash": 2334929353739018813,
|
|
4
|
+
"language": "chinese",
|
|
5
|
+
"instruction": "给定一个主题和角色列表,根据角色描述将每个角色与相关主题关联起来。",
|
|
6
|
+
"examples": [
|
|
7
|
+
{
|
|
8
|
+
"input": {
|
|
9
|
+
"themes": [
|
|
10
|
+
"同理心",
|
|
11
|
+
"包容性",
|
|
12
|
+
"远程工作"
|
|
13
|
+
],
|
|
14
|
+
"personas": [
|
|
15
|
+
{
|
|
16
|
+
"name": "人力资源经理",
|
|
17
|
+
"role_description": "专注于包容性和员工支持。"
|
|
18
|
+
},
|
|
19
|
+
{
|
|
20
|
+
"name": "远程团队负责人",
|
|
21
|
+
"role_description": "管理远程团队沟通。"
|
|
22
|
+
}
|
|
23
|
+
]
|
|
24
|
+
},
|
|
25
|
+
"output": {
|
|
26
|
+
"mapping": {
|
|
27
|
+
"人力资源经理": [
|
|
28
|
+
"包容性",
|
|
29
|
+
"同理心"
|
|
30
|
+
],
|
|
31
|
+
"远程团队负责人": [
|
|
32
|
+
"远程工作",
|
|
33
|
+
"同理心"
|
|
34
|
+
]
|
|
35
|
+
}
|
|
36
|
+
}
|
|
37
|
+
}
|
|
38
|
+
]
|
|
39
|
+
}
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
{
|
|
2
|
+
"ragas_version": "0.2.5",
|
|
3
|
+
"original_hash": -2189588237940965149,
|
|
4
|
+
"language": "chinese",
|
|
5
|
+
"instruction": "请说明给定的信息是否得到视觉和文本上下文信息的支持。您需要回答“是”或“否”。如果任何图像和文本上下文支持该信息,请回答“是”。",
|
|
6
|
+
"examples": [
|
|
7
|
+
{
|
|
8
|
+
"input": {
|
|
9
|
+
"response": "苹果派通常是双层皮的。",
|
|
10
|
+
"retrieved_contexts": [
|
|
11
|
+
"苹果派是一种水果派,其主要馅料成分是苹果。",
|
|
12
|
+
"苹果派通常与奶油、冰淇淋(“苹果派 à la mode”)、蛋奶沙司或切达干酪一起食用。",
|
|
13
|
+
"它通常是双层皮的,上下都有糕点;上层皮可以是实心的或格状的(交叉条纹编织而成)。"
|
|
14
|
+
]
|
|
15
|
+
},
|
|
16
|
+
"output": {
|
|
17
|
+
"faithful": true
|
|
18
|
+
}
|
|
19
|
+
},
|
|
20
|
+
{
|
|
21
|
+
"input": {
|
|
22
|
+
"response": "苹果派味道不好。",
|
|
23
|
+
"retrieved_contexts": [
|
|
24
|
+
"苹果派是一种水果派,其主要馅料成分是苹果。",
|
|
25
|
+
"苹果派通常与奶油、冰淇淋(“苹果派 à la mode”)、蛋奶沙司或切达干酪一起食用。",
|
|
26
|
+
"它通常是双层皮的,上下都有糕点;上层皮可以是实心的或格状的(交叉条纹编织而成)。"
|
|
27
|
+
]
|
|
28
|
+
},
|
|
29
|
+
"output": {
|
|
30
|
+
"faithful": false
|
|
31
|
+
}
|
|
32
|
+
}
|
|
33
|
+
]
|
|
34
|
+
}
|
evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
{
|
|
2
|
+
"ragas_version": "0.2.5",
|
|
3
|
+
"original_hash": -7302860412443151372,
|
|
4
|
+
"language": "chinese",
|
|
5
|
+
"instruction": "\n您的任务是评估查询的响应是否与提供的图像和文本上下文信息一致。\n您有两个选项可以回答。要么是 True / False。\n如果查询的响应与上下文信息一致,则回答 - True,否则为 False。\n",
|
|
6
|
+
"examples": [
|
|
7
|
+
{
|
|
8
|
+
"input": {
|
|
9
|
+
"user_input": "传统玛格丽塔披萨的主要成分是什么?",
|
|
10
|
+
"response": "玛格丽塔披萨的主要成分是番茄、马苏里拉奶酪和新鲜罗勒。",
|
|
11
|
+
"retrieved_contexts": [
|
|
12
|
+
"传统的玛格丽塔披萨由薄薄的饼皮组成。",
|
|
13
|
+
"主要的配料包括番茄、马苏里拉奶酪、新鲜罗勒、盐和橄榄油。",
|
|
14
|
+
"它是最简单和最经典的披萨类型之一。"
|
|
15
|
+
]
|
|
16
|
+
},
|
|
17
|
+
"output": {
|
|
18
|
+
"relevance": true
|
|
19
|
+
}
|
|
20
|
+
},
|
|
21
|
+
{
|
|
22
|
+
"input": {
|
|
23
|
+
"user_input": "谁在2021年奥斯卡颁奖典礼上获得了最佳男演员奖?",
|
|
24
|
+
"response": "2021年的最佳男演员奖由莱昂纳多·迪卡普里奥获得。",
|
|
25
|
+
"retrieved_contexts": [
|
|
26
|
+
"第93届奥斯卡颁奖典礼于2021年举行。",
|
|
27
|
+
"安东尼·霍普金斯凭借在《困在时间里的父亲》中的角色赢得了最佳男演员奖。",
|
|
28
|
+
"由于COVID-19的限制,这次活动具有独特性。"
|
|
29
|
+
]
|
|
30
|
+
},
|
|
31
|
+
"output": {
|
|
32
|
+
"relevance": false
|
|
33
|
+
}
|
|
34
|
+
}
|
|
35
|
+
]
|
|
36
|
+
}
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
{
|
|
2
|
+
"ragas_version": "0.2.5",
|
|
3
|
+
"original_hash": -7036736759899743798,
|
|
4
|
+
"language": "chinese",
|
|
5
|
+
"instruction": "从给定文本中提取命名实体,限制输出为最重要的实体。确保实体数量不超过指定的最大值。",
|
|
6
|
+
"examples": [
|
|
7
|
+
{
|
|
8
|
+
"input": {
|
|
9
|
+
"text": "特斯拉和SpaceX的首席执行官埃隆·马斯克宣布计划将业务扩展到欧洲和亚洲的新地点。\n 此次扩展预计将创造数千个就业机会,特别是在柏林和上海等城市。",
|
|
10
|
+
"max_num": 10
|
|
11
|
+
},
|
|
12
|
+
"output": {
|
|
13
|
+
"entities": [
|
|
14
|
+
"埃隆·马斯克",
|
|
15
|
+
"特斯拉",
|
|
16
|
+
"SpaceX",
|
|
17
|
+
"欧洲",
|
|
18
|
+
"亚洲",
|
|
19
|
+
"柏林",
|
|
20
|
+
"上海"
|
|
21
|
+
]
|
|
22
|
+
}
|
|
23
|
+
}
|
|
24
|
+
]
|
|
25
|
+
}
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
{
|
|
2
|
+
"ragas_version": "0.2.5",
|
|
3
|
+
"original_hash": -1422723613754983378,
|
|
4
|
+
"language": "chinese",
|
|
5
|
+
"instruction": "根据指定的条件(角色、术语、风格、长度)和提供的上下文生成查询和答案。确保答案完全忠实于上下文,仅使用直接来自提供上下文的信息。### 指令:\n1. **生成查询**:根据上下文、角色、术语、风格和长度,创建一个与角色视角一致并包含术语的问题。\n2. **生成答案**:仅使用提供的上下文中的内容,构建对查询的详细答案。不要添加上下文中未包含或无法推断的信息。\n### 示例输出:\n\n",
|
|
6
|
+
"examples": []
|
|
7
|
+
}
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
{
|
|
2
|
+
"ragas_version": "0.2.5",
|
|
3
|
+
"original_hash": 2334929353739018813,
|
|
4
|
+
"language": "chinese",
|
|
5
|
+
"instruction": "给定一个主题和角色列表,根据角色描述将每个角色与相关主题关联起来。",
|
|
6
|
+
"examples": [
|
|
7
|
+
{
|
|
8
|
+
"input": {
|
|
9
|
+
"themes": [
|
|
10
|
+
"同理心",
|
|
11
|
+
"包容性",
|
|
12
|
+
"远程工作"
|
|
13
|
+
],
|
|
14
|
+
"personas": [
|
|
15
|
+
{
|
|
16
|
+
"name": "人力资源经理",
|
|
17
|
+
"role_description": "专注于包容性和员工支持。"
|
|
18
|
+
},
|
|
19
|
+
{
|
|
20
|
+
"name": "远程团队领导",
|
|
21
|
+
"role_description": "管理远程团队沟通。"
|
|
22
|
+
}
|
|
23
|
+
]
|
|
24
|
+
},
|
|
25
|
+
"output": {
|
|
26
|
+
"mapping": {
|
|
27
|
+
"人力资源经理": [
|
|
28
|
+
"包容性",
|
|
29
|
+
"同理心"
|
|
30
|
+
],
|
|
31
|
+
"远程团队领导": [
|
|
32
|
+
"远程工作",
|
|
33
|
+
"同理心"
|
|
34
|
+
]
|
|
35
|
+
}
|
|
36
|
+
}
|
|
37
|
+
}
|
|
38
|
+
]
|
|
39
|
+
}
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
{
|
|
2
|
+
"ragas_version": "0.2.5",
|
|
3
|
+
"original_hash": -5467318232123540806,
|
|
4
|
+
"language": "chinese",
|
|
5
|
+
"instruction": "将给定文本总结为少于10个句子。",
|
|
6
|
+
"examples": [
|
|
7
|
+
{
|
|
8
|
+
"input": {
|
|
9
|
+
"text": "人工智能\n\n人工智能正在通过自动化以前需要人类智能的任务来改变各个行业。从医疗到金融,人工智能正在被用来快速准确地分析大量数据。这项技术还推动了自动驾驶汽车和个性化推荐等领域的创新。"
|
|
10
|
+
},
|
|
11
|
+
"output": {
|
|
12
|
+
"text": "人工智能通过自动化任务、分析数据和推动自动驾驶汽车和个性化推荐等创新,正在革新各个行业。"
|
|
13
|
+
}
|
|
14
|
+
}
|
|
15
|
+
]
|
|
16
|
+
}
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
{
|
|
2
|
+
"ragas_version": "0.2.5",
|
|
3
|
+
"original_hash": 2452110859551524285,
|
|
4
|
+
"language": "chinese",
|
|
5
|
+
"instruction": "从给定的文本中提取主要主题和概念。",
|
|
6
|
+
"examples": [
|
|
7
|
+
{
|
|
8
|
+
"input": {
|
|
9
|
+
"text": "人工智能通过自动化需要人类智能的任务来改变行业。人工智能快速准确地分析大量数据,推动了自动驾驶汽车和个性化推荐等创新。",
|
|
10
|
+
"max_num": 10
|
|
11
|
+
},
|
|
12
|
+
"output": {
|
|
13
|
+
"output": [
|
|
14
|
+
"人工智能",
|
|
15
|
+
"自动化",
|
|
16
|
+
"数据分析",
|
|
17
|
+
"创新",
|
|
18
|
+
"自动驾驶汽车",
|
|
19
|
+
"个性化推荐"
|
|
20
|
+
]
|
|
21
|
+
}
|
|
22
|
+
}
|
|
23
|
+
]
|
|
24
|
+
}
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
import typing as t
|
|
2
|
+
|
|
3
|
+
from pydantic import BaseModel
|
|
4
|
+
from ragas.prompt import PydanticPrompt, StringIO
|
|
5
|
+
from ragas.testset.persona import Persona
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class PersonaGenerationPromptZH(PydanticPrompt[StringIO, Persona]):
    """Chinese-language persona generation prompt.

    Takes a ``StringIO`` holding a summary and yields a ``Persona`` with a
    name and a role description.
    """

    # Instruction (zh): from the supplied summary, generate a persona that
    # would interact with or benefit from the content, with a unique name and
    # a concise role description.
    instruction: str = '使用提供的摘要,生成一个可能会与内容互动或从中受益的角色。包括一个独特的名字和一个简洁的角色描述。'
    input_model: t.Type[StringIO] = StringIO
    output_model: t.Type[Persona] = Persona
    # Single few-shot example: a digital-marketing guide summary paired with a
    # digital-marketing specialist persona.
    examples: t.List[t.Tuple[StringIO, Persona]] = [
        (
            StringIO(text='《数字营销指南》解释了在各种在线平台上吸引受众的策略。'),
            Persona(name='数字营销专家', role_description='专注于吸引受众并在线上提升品牌。'),
        ),
    ]
|
|
@@ -1,15 +1,15 @@
|
|
|
1
|
-
import os
|
|
2
1
|
import asyncio
|
|
2
|
+
import os
|
|
3
|
+
|
|
3
4
|
import pandas as pd
|
|
4
|
-
from tqdm import tqdm
|
|
5
|
-
from ragas.llms import LangchainLLMWrapper
|
|
6
5
|
from ragas.embeddings import LangchainEmbeddingsWrapper
|
|
7
|
-
from .
|
|
8
|
-
from
|
|
9
|
-
from evalscope.backend.rag_eval.ragas.arguments import TestsetGenerationArguments
|
|
10
|
-
from evalscope.backend.rag_eval import EmbeddingModel, LLM, ChatOpenAI
|
|
6
|
+
from ragas.llms import LangchainLLMWrapper
|
|
7
|
+
from tqdm import tqdm
|
|
11
8
|
|
|
12
|
-
|
|
9
|
+
from evalscope.backend.rag_eval import LLM, ChatOpenAI, EmbeddingModel
|
|
10
|
+
from evalscope.backend.rag_eval.ragas.arguments import TestsetGenerationArguments
|
|
11
|
+
from evalscope.utils.logger import get_logger
|
|
12
|
+
from .translate_prompt import translate_prompts
|
|
13
13
|
|
|
14
14
|
logger = get_logger()
|
|
15
15
|
|
|
@@ -17,116 +17,110 @@ logger = get_logger()
|
|
|
17
17
|
def get_transform(llm, embedding, language):
    """
    Build the ordered list of transforms applied to the knowledge graph.

    The prompts of the LLM-based extractors are translated to ``language``
    through ``translate_prompts`` before the pipeline is assembled.

    Args:
        llm: LLM passed to the LLM-based extractors, to the node filter and
            to the prompt translation step.
        embedding: Embedding model used by the summary ``EmbeddingExtractor``.
        language: Target language for the translated prompts.

    Returns:
        A list with, in order: ``HeadlinesExtractor``, ``HeadlineSplitter``,
        ``SummaryExtractor``, ``CustomNodeFilter``, a ``Parallel`` stage
        (summary embeddings, ``ThemesExtractor``, ``NERExtractor``) and a
        ``Parallel`` stage (``CosineSimilarityBuilder``,
        ``OverlapScoreBuilder``).
    """
    from ragas.testset.graph import NodeType
    from ragas.testset.transforms.engine import Parallel
    from ragas.testset.transforms.extractors import (
        EmbeddingExtractor,
        HeadlinesExtractor,
        SummaryExtractor,
    )
    from ragas.testset.transforms.extractors.llm_based import NERExtractor, ThemesExtractor
    from ragas.testset.transforms.filters import CustomNodeFilter
    from ragas.testset.transforms.relationship_builders import (
        CosineSimilarityBuilder,
        OverlapScoreBuilder,
    )
    from ragas.testset.transforms.splitters import HeadlineSplitter
    from ragas.utils import num_tokens_from_string

    def summary_filter(node):
        # Select DOCUMENT nodes whose page content exceeds 500 tokens.
        if node.type != NodeType.DOCUMENT:
            return False
        return num_tokens_from_string(node.properties['page_content']) > 500

    def chunk_filter(node):
        # Select CHUNK nodes only.
        return node.type == NodeType.CHUNK

    # LLM-backed extractors whose prompts get translated below.
    headline_extractor = HeadlinesExtractor(llm=llm)
    theme_extractor = ThemesExtractor(llm=llm)
    ner_extractor = NERExtractor(llm=llm, filter_nodes=chunk_filter)
    summary_extractor = SummaryExtractor(llm=llm, filter_nodes=summary_filter)

    translatable = [
        summary_extractor,
        theme_extractor,
        ner_extractor,
        headline_extractor,
    ]
    asyncio.run(translate_prompts(
        prompts=translatable,
        target_lang=language,
        llm=llm,
        adapt_instruction=True,
    ))

    splitter = HeadlineSplitter(min_tokens=500)

    # Embeds the 'summary' property into 'summary_embedding' for the nodes
    # accepted by summary_filter.
    summary_emb_extractor = EmbeddingExtractor(
        embedding_model=embedding,
        property_name='summary_embedding',
        embed_property_name='summary',
        filter_nodes=summary_filter,
    )

    cosine_sim_builder = CosineSimilarityBuilder(
        property_name='summary_embedding',
        new_property_name='summary_similarity',
        threshold=0.7,
        filter_nodes=summary_filter,
    )
    ner_overlap_sim = OverlapScoreBuilder(threshold=0.01, filter_nodes=chunk_filter)
    node_filter = CustomNodeFilter(llm=llm, filter_nodes=chunk_filter)

    extraction_stage = Parallel(summary_emb_extractor, theme_extractor, ner_extractor)
    relationship_stage = Parallel(cosine_sim_builder, ner_overlap_sim)

    return [
        headline_extractor,
        splitter,
        summary_extractor,
        node_filter,
        extraction_stage,
        relationship_stage,
    ]
|
|
97
88
|
|
|
98
89
|
|
|
99
90
|
def get_distribution(llm, distribution, language):
    """
    Build the query distribution used for testset generation.

    One synthesizer is created per supported question type, their prompts are
    translated to ``language`` through ``translate_prompts``, and each
    synthesizer is paired with the weight configured for it.

    Args:
        llm: LLM passed to each synthesizer and to the translation step.
        distribution: Mapping from question type to weight. Supported keys
            are ``'simple'``, ``'multi_context'`` and ``'reasoning'``.
        language: Target language for the translated prompts.

    Returns:
        A list of ``(synthesizer, weight)`` tuples, one per supported key
        present in ``distribution``, in the order simple, multi_context,
        reasoning.
    """
    from ragas.testset.synthesizers.multi_hop import (
        MultiHopAbstractQuerySynthesizer,
        MultiHopSpecificQuerySynthesizer,
    )
    from ragas.testset.synthesizers.single_hop.specific import SingleHopSpecificQuerySynthesizer

    synthesizers = {
        'simple': SingleHopSpecificQuerySynthesizer(llm=llm),
        'multi_context': MultiHopAbstractQuerySynthesizer(llm=llm),
        'reasoning': MultiHopSpecificQuerySynthesizer(llm=llm),
    }

    # All three synthesizers are translated, whether or not they end up in
    # the returned distribution.
    asyncio.run(
        translate_prompts(
            prompts=list(synthesizers.values()),
            target_lang=language,
            llm=llm,
            adapt_instruction=True,
        ))

    # Unsupported keys used to be dropped without notice; surface them so a
    # typo or a legacy key does not silently change the question mix.
    unknown_keys = [key for key in distribution if key not in synthesizers]
    if unknown_keys:
        logger.warning('Ignoring unsupported distribution keys: %s (supported: %s)', unknown_keys,
                       list(synthesizers))

    return [(synthesizer, distribution[name]) for name, synthesizer in synthesizers.items() if name in distribution]
|
|
127
121
|
|
|
128
122
|
|
|
129
|
-
def get_knowledge_graph(documents, transforms, local_file):
|
|
123
|
+
def get_knowledge_graph(documents, transforms, local_file, run_config):
|
|
130
124
|
from ragas.testset.graph import KnowledgeGraph, Node, NodeType
|
|
131
125
|
from ragas.testset.transforms import apply_transforms
|
|
132
126
|
|
|
@@ -148,7 +142,7 @@ def get_knowledge_graph(documents, transforms, local_file):
|
|
|
148
142
|
kg = KnowledgeGraph(nodes=nodes)
|
|
149
143
|
|
|
150
144
|
# apply transforms and update the knowledge graph
|
|
151
|
-
apply_transforms(kg, transforms)
|
|
145
|
+
apply_transforms(kg, transforms, run_config=run_config)
|
|
152
146
|
|
|
153
147
|
# save the knowledge graph
|
|
154
148
|
output_path = os.path.dirname(local_file)
|
|
@@ -158,6 +152,39 @@ def get_knowledge_graph(documents, transforms, local_file):
|
|
|
158
152
|
return kg
|
|
159
153
|
|
|
160
154
|
|
|
155
|
+
def get_persona(llm, kg, language, num_personas=3):
    """
    Generate personas from the knowledge graph.

    Args:
        llm: LLM passed to ``generate_personas_from_kg``.
        kg: Knowledge graph the personas are generated from.
        language: Prompt language. ``'chinese'`` selects
            ``PersonaGenerationPromptZH``; any other value uses the default
            ``PersonaGenerationPrompt``.
        num_personas: Number of personas to request (defaults to 3).

    Returns:
        Whatever ``generate_personas_from_kg`` returns for the given inputs.
    """
    from ragas.testset.graph import Node
    from ragas.testset.persona import PersonaGenerationPrompt, generate_personas_from_kg

    from evalscope.backend.rag_eval.ragas.prompts.persona_prompt import PersonaGenerationPromptZH

    def has_summary_embedding(node: Node) -> bool:
        # Keep DOCUMENT nodes that carry a 'summary_embedding' property.
        return node.type.name == 'DOCUMENT' and node.properties.get('summary_embedding') is not None

    # NOTE: the persona prompt can't be passed through translate_prompts yet,
    # so a hand-written Chinese prompt is used and every other language falls
    # back to the default prompt.
    persona_prompt = PersonaGenerationPromptZH() if language == 'chinese' else PersonaGenerationPrompt()

    return generate_personas_from_kg(
        llm=llm,
        kg=kg,
        num_personas=num_personas,
        persona_generation_prompt=persona_prompt,
        filter_fn=has_summary_embedding,
    )
|
|
186
|
+
|
|
187
|
+
|
|
161
188
|
def load_data(file_path):
|
|
162
189
|
from langchain_community.document_loaders import UnstructuredFileLoader
|
|
163
190
|
|
|
@@ -178,32 +205,31 @@ def generate_testset(args: TestsetGenerationArguments) -> None:
|
|
|
178
205
|
generator_llm = LLM.load(**args.generator_llm)
|
|
179
206
|
embeddings = EmbeddingModel.load(**args.embeddings)
|
|
180
207
|
|
|
208
|
+
wrapped_llm = LangchainLLMWrapper(generator_llm)
|
|
209
|
+
wrapped_embeddings = LangchainEmbeddingsWrapper(embeddings)
|
|
210
|
+
|
|
181
211
|
# Change resulting question type distribution
|
|
182
|
-
distributions = get_distribution(
|
|
183
|
-
LangchainLLMWrapper(generator_llm), args.distribution, args.language
|
|
184
|
-
)
|
|
212
|
+
distributions = get_distribution(wrapped_llm, args.distribution, args.language)
|
|
185
213
|
|
|
214
|
+
run_config = RunConfig(timeout=600, max_retries=3, max_wait=120, max_workers=1, log_tenacity=True)
|
|
186
215
|
# get transforms
|
|
187
216
|
transforms = get_transform(
|
|
188
|
-
|
|
189
|
-
|
|
217
|
+
wrapped_llm,
|
|
218
|
+
wrapped_embeddings,
|
|
190
219
|
args.language,
|
|
191
220
|
)
|
|
192
221
|
|
|
193
222
|
# get knowledge graph
|
|
194
|
-
knowledge_graph = get_knowledge_graph(documents, transforms, args.knowledge_graph)
|
|
223
|
+
knowledge_graph = get_knowledge_graph(documents, transforms, args.knowledge_graph, run_config)
|
|
195
224
|
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
)
|
|
225
|
+
persona_list = get_persona(llm=wrapped_llm, kg=knowledge_graph, language=args.language)
|
|
226
|
+
|
|
227
|
+
generator = TestsetGenerator(llm=wrapped_llm, knowledge_graph=knowledge_graph, persona_list=persona_list)
|
|
199
228
|
|
|
200
|
-
runconfig = RunConfig(
|
|
201
|
-
timeout=600, max_retries=3, max_wait=120, max_workers=1, log_tenacity=True
|
|
202
|
-
)
|
|
203
229
|
testset = generator.generate(
|
|
204
230
|
testset_size=args.test_size,
|
|
205
231
|
query_distribution=distributions,
|
|
206
|
-
run_config=
|
|
232
|
+
run_config=run_config,
|
|
207
233
|
with_debugging_logs=True,
|
|
208
234
|
raise_exceptions=True,
|
|
209
235
|
)
|
|
@@ -212,9 +238,7 @@ def generate_testset(args: TestsetGenerationArguments) -> None:
|
|
|
212
238
|
testset_df = testset.to_pandas()
|
|
213
239
|
output_path = os.path.dirname(args.output_file)
|
|
214
240
|
os.makedirs(output_path, exist_ok=True)
|
|
215
|
-
testset_df.to_json(
|
|
216
|
-
args.output_file, indent=4, index=False, orient='records', force_ascii=False
|
|
217
|
-
)
|
|
241
|
+
testset_df.to_json(args.output_file, indent=4, index=False, orient='records', force_ascii=False)
|
|
218
242
|
|
|
219
243
|
# get answer
|
|
220
244
|
testset_with_answer = get_answer(testset_df, generator_llm, args.language)
|
|
@@ -243,21 +267,17 @@ Answer:
|
|
|
243
267
|
contexts = '\n'.join(row['reference_contexts'])
|
|
244
268
|
|
|
245
269
|
# Combine question and contexts as input for the LLM
|
|
246
|
-
input_text = template.format(
|
|
247
|
-
language=language, question=question, contexts=contexts
|
|
248
|
-
)
|
|
270
|
+
input_text = template.format(language=language, question=question, contexts=contexts)
|
|
249
271
|
|
|
250
272
|
# Generate the answer using the generator LLM
|
|
251
273
|
answer = generator_llm.invoke(input_text)
|
|
252
274
|
if isinstance(generator_llm, ChatOpenAI):
|
|
253
275
|
answer = answer.content
|
|
254
|
-
items.append(
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
}
|
|
261
|
-
)
|
|
276
|
+
items.append({
|
|
277
|
+
'user_input': question,
|
|
278
|
+
'retrieved_contexts': row['reference_contexts'],
|
|
279
|
+
'response': answer,
|
|
280
|
+
'reference': row['reference'],
|
|
281
|
+
})
|
|
262
282
|
|
|
263
283
|
return pd.DataFrame.from_dict(items)
|