evalscope 0.7.1__py3-none-any.whl → 0.8.0__py3-none-any.whl
This diff shows the contents of two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their public registry.
Potentially problematic release.
This version of evalscope might be problematic.
- evalscope/__init__.py +1 -1
- evalscope/arguments.py +73 -0
- evalscope/backend/base.py +5 -1
- evalscope/backend/opencompass/api_meta_template.py +8 -14
- evalscope/backend/opencompass/backend_manager.py +24 -15
- evalscope/backend/opencompass/tasks/eval_api.py +1 -6
- evalscope/backend/opencompass/tasks/eval_datasets.py +26 -28
- evalscope/backend/rag_eval/__init__.py +3 -3
- evalscope/backend/rag_eval/backend_manager.py +21 -25
- evalscope/backend/rag_eval/clip_benchmark/__init__.py +1 -1
- evalscope/backend/rag_eval/clip_benchmark/arguments.py +6 -6
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +62 -79
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +29 -43
- evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +20 -22
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +16 -23
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +14 -35
- evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +69 -90
- evalscope/backend/rag_eval/cmteb/__init__.py +3 -3
- evalscope/backend/rag_eval/cmteb/arguments.py +25 -27
- evalscope/backend/rag_eval/cmteb/base.py +22 -23
- evalscope/backend/rag_eval/cmteb/task_template.py +15 -17
- evalscope/backend/rag_eval/cmteb/tasks/Classification.py +98 -79
- evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +17 -22
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +17 -19
- evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +35 -29
- evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +18 -5
- evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +163 -163
- evalscope/backend/rag_eval/cmteb/tasks/STS.py +126 -104
- evalscope/backend/rag_eval/cmteb/tasks/__init__.py +33 -34
- evalscope/backend/rag_eval/ragas/__init__.py +2 -2
- evalscope/backend/rag_eval/ragas/arguments.py +3 -8
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +9 -9
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +7 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +8 -8
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +7 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +21 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +4 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -1
- evalscope/backend/rag_eval/ragas/task_template.py +10 -15
- evalscope/backend/rag_eval/ragas/tasks/__init__.py +1 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +45 -0
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +135 -0
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +17 -133
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +8 -18
- evalscope/backend/rag_eval/utils/clip.py +46 -50
- evalscope/backend/rag_eval/utils/embedding.py +12 -11
- evalscope/backend/rag_eval/utils/llm.py +8 -6
- evalscope/backend/rag_eval/utils/tools.py +12 -11
- evalscope/backend/vlm_eval_kit/__init__.py +1 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +7 -8
- evalscope/benchmarks/arc/__init__.py +3 -2
- evalscope/benchmarks/arc/ai2_arc.py +19 -16
- evalscope/benchmarks/arc/arc_adapter.py +32 -24
- evalscope/benchmarks/bbh/__init__.py +1 -2
- evalscope/benchmarks/bbh/bbh_adapter.py +28 -25
- evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/navigate.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/snarks.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +1 -1
- evalscope/benchmarks/benchmark.py +16 -16
- evalscope/benchmarks/ceval/__init__.py +3 -2
- evalscope/benchmarks/ceval/ceval_adapter.py +80 -69
- evalscope/benchmarks/ceval/ceval_exam.py +18 -31
- evalscope/benchmarks/cmmlu/__init__.py +3 -2
- evalscope/benchmarks/cmmlu/cmmlu.py +87 -92
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +109 -155
- evalscope/benchmarks/cmmlu/samples.jsonl +1 -1
- evalscope/benchmarks/competition_math/__init__.py +3 -2
- evalscope/benchmarks/competition_math/competition_math.py +7 -16
- evalscope/benchmarks/competition_math/competition_math_adapter.py +32 -34
- evalscope/benchmarks/data_adapter.py +24 -24
- evalscope/benchmarks/general_qa/__init__.py +3 -2
- evalscope/benchmarks/general_qa/general_qa_adapter.py +34 -38
- evalscope/benchmarks/gsm8k/__init__.py +1 -1
- evalscope/benchmarks/gsm8k/gsm8k.py +6 -12
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +26 -24
- evalscope/benchmarks/hellaswag/__init__.py +3 -2
- evalscope/benchmarks/hellaswag/hellaswag.py +15 -19
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +27 -23
- evalscope/benchmarks/humaneval/__init__.py +1 -1
- evalscope/benchmarks/humaneval/humaneval.py +15 -18
- evalscope/benchmarks/humaneval/humaneval_adapter.py +0 -1
- evalscope/benchmarks/mmlu/__init__.py +3 -2
- evalscope/benchmarks/mmlu/mmlu.py +15 -29
- evalscope/benchmarks/mmlu/mmlu_adapter.py +85 -77
- evalscope/benchmarks/race/__init__.py +3 -2
- evalscope/benchmarks/race/race.py +21 -35
- evalscope/benchmarks/race/race_adapter.py +32 -29
- evalscope/benchmarks/race/samples.jsonl +1 -1
- evalscope/benchmarks/trivia_qa/__init__.py +3 -2
- evalscope/benchmarks/trivia_qa/samples.jsonl +1 -1
- evalscope/benchmarks/trivia_qa/trivia_qa.py +19 -34
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +27 -22
- evalscope/benchmarks/truthful_qa/__init__.py +3 -2
- evalscope/benchmarks/truthful_qa/truthful_qa.py +25 -29
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +36 -37
- evalscope/cli/cli.py +6 -5
- evalscope/cli/start_eval.py +31 -0
- evalscope/cli/start_perf.py +0 -3
- evalscope/cli/start_server.py +27 -41
- evalscope/config.py +119 -95
- evalscope/constants.py +61 -29
- evalscope/evaluator/__init__.py +1 -0
- evalscope/evaluator/evaluator.py +96 -377
- evalscope/evaluator/humaneval_evaluator.py +158 -0
- evalscope/evaluator/rating_eval.py +12 -33
- evalscope/evaluator/reviewer/auto_reviewer.py +47 -76
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +10 -20
- evalscope/metrics/code_metric.py +3 -9
- evalscope/metrics/math_accuracy.py +3 -6
- evalscope/metrics/metrics.py +21 -21
- evalscope/metrics/rouge_metric.py +11 -25
- evalscope/models/__init__.py +1 -2
- evalscope/models/api/openai_api.py +40 -29
- evalscope/models/custom/__init__.py +0 -1
- evalscope/models/custom/custom_model.py +3 -3
- evalscope/models/dummy_chat_model.py +7 -8
- evalscope/models/model_adapter.py +89 -156
- evalscope/models/openai_model.py +20 -20
- evalscope/perf/arguments.py +15 -3
- evalscope/perf/benchmark.py +7 -9
- evalscope/perf/http_client.py +3 -8
- evalscope/perf/main.py +10 -0
- evalscope/perf/plugin/api/custom_api.py +1 -2
- evalscope/perf/plugin/api/dashscope_api.py +1 -2
- evalscope/perf/plugin/api/openai_api.py +3 -4
- evalscope/perf/plugin/datasets/base.py +1 -2
- evalscope/perf/plugin/datasets/flickr8k.py +1 -2
- evalscope/perf/plugin/datasets/longalpaca.py +1 -2
- evalscope/perf/plugin/datasets/openqa.py +1 -2
- evalscope/perf/utils/analysis_result.py +1 -2
- evalscope/perf/utils/benchmark_util.py +1 -2
- evalscope/perf/utils/db_util.py +11 -8
- evalscope/perf/utils/local_server.py +19 -13
- evalscope/registry/config/cfg_arena_zhihu.yaml +1 -1
- evalscope/registry/tasks/arc.yaml +2 -3
- evalscope/registry/tasks/bbh.yaml +3 -4
- evalscope/registry/tasks/bbh_mini.yaml +3 -4
- evalscope/registry/tasks/ceval.yaml +3 -3
- evalscope/registry/tasks/ceval_mini.yaml +3 -4
- evalscope/registry/tasks/cmmlu.yaml +3 -3
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +1 -1
- evalscope/registry/tasks/general_qa.yaml +1 -1
- evalscope/registry/tasks/gsm8k.yaml +2 -2
- evalscope/registry/tasks/mmlu.yaml +3 -3
- evalscope/registry/tasks/mmlu_mini.yaml +3 -3
- evalscope/run.py +184 -375
- evalscope/run_arena.py +20 -25
- evalscope/summarizer.py +16 -17
- evalscope/third_party/longbench_write/README.md +99 -42
- evalscope/third_party/longbench_write/default_task.json +1 -1
- evalscope/third_party/longbench_write/default_task.yaml +8 -7
- evalscope/third_party/longbench_write/eval.py +29 -28
- evalscope/third_party/longbench_write/infer.py +16 -104
- evalscope/third_party/longbench_write/longbench_write.py +5 -5
- evalscope/third_party/longbench_write/resources/judge.txt +1 -1
- evalscope/third_party/longbench_write/tools/data_etl.py +4 -5
- evalscope/third_party/longbench_write/utils.py +0 -1
- evalscope/third_party/toolbench_static/eval.py +14 -15
- evalscope/third_party/toolbench_static/infer.py +48 -69
- evalscope/third_party/toolbench_static/llm/swift_infer.py +4 -12
- evalscope/third_party/toolbench_static/requirements.txt +1 -1
- evalscope/third_party/toolbench_static/toolbench_static.py +3 -3
- evalscope/tools/combine_reports.py +25 -30
- evalscope/tools/rewrite_eval_results.py +14 -46
- evalscope/utils/__init__.py +0 -1
- evalscope/utils/arena_utils.py +18 -48
- evalscope/{perf/utils → utils}/chat_service.py +3 -4
- evalscope/utils/completion_parsers.py +3 -8
- evalscope/utils/logger.py +9 -7
- evalscope/utils/model_utils.py +11 -0
- evalscope/utils/utils.py +12 -138
- evalscope/version.py +2 -2
- {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/METADATA +125 -120
- evalscope-0.8.0.dist-info/RECORD +285 -0
- tests/cli/test_run.py +54 -15
- tests/perf/test_perf.py +4 -0
- tests/rag/test_clip_benchmark.py +38 -38
- tests/rag/test_mteb.py +3 -2
- tests/rag/test_ragas.py +5 -5
- tests/swift/test_run_swift_eval.py +2 -3
- tests/swift/test_run_swift_vlm_eval.py +2 -3
- tests/swift/test_run_swift_vlm_jugde_eval.py +2 -3
- evalscope/backend/rag_eval/ragas/metrics/__init__.py +0 -2
- evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py +0 -91
- evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py +0 -99
- evalscope/cache.py +0 -98
- evalscope/models/template.py +0 -1446
- evalscope/run_ms.py +0 -140
- evalscope/utils/task_cfg_parser.py +0 -10
- evalscope/utils/task_utils.py +0 -22
- evalscope-0.7.1.dist-info/RECORD +0 -286
- {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/LICENSE +0 -0
- {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/WHEEL +0 -0
- {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/top_level.txt +0 -0
evalscope/backend/rag_eval/cmteb/tasks/STS.py
CHANGED

@@ -1,21 +1,22 @@
 from mteb.abstasks.AbsTaskSTS import AbsTaskSTS
 from mteb.abstasks.TaskMetadata import TaskMetadata
 
+
 class ATEC(AbsTaskSTS):
     metadata = TaskMetadata(
-        name=
+        name='ATEC',
         dataset={
-
-
+            'path': 'C-MTEB/ATEC',
+            'revision': '0f319b1142f28d00e055a6770f3f726ae9b7d865',
         },
-        description=
-        reference=
-        type=
-        category=
-        modalities=[
-        eval_splits=[
-        eval_langs=[
-        main_score=
+        description='A Chinese dataset for textual relatedness',
+        reference='https://aclanthology.org/2021.emnlp-main.357',
+        type='STS',
+        category='s2s',
+        modalities=['text'],
+        eval_splits=['validation', 'test'],
+        eval_langs=['cmn-Hans'],
+        main_score='cosine_spearman',
         date=None,
         domains=None,
         task_subtypes=None,
@@ -43,32 +44,35 @@ class ATEC(AbsTaskSTS):
         pages = "4348--4366",
         abstract = "We propose a novel problem within end-to-end learning of task oriented dialogs (TOD), in which the dialog system mimics a troubleshooting agent who helps a user by diagnosing their problem (e.g., car not starting). Such dialogs are grounded in domain-specific flowcharts, which the agent is supposed to follow during the conversation. Our task exposes novel technical challenges for neural TOD, such as grounding an utterance to the flowchart without explicit annotation, referring to additional manual pages when user asks a clarification question, and ability to follow unseen flowcharts at test time. We release a dataset (FLODIAL) consisting of 2,738 dialogs grounded on 12 different troubleshooting flowcharts. We also design a neural model, FLONET, which uses a retrieval-augmented generation architecture to train the dialog agent. Our experiments find that FLONET can do zero-shot transfer to unseen flowcharts, and sets a strong baseline for future research.",
         }""",
-        descriptive_stats={
+        descriptive_stats={
+            'n_samples': None,
+            'avg_character_length': None
+        },
     )
 
     @property
     def metadata_dict(self) -> dict[str, str]:
         metadata_dict = super().metadata_dict
-        metadata_dict[
-        metadata_dict[
+        metadata_dict['min_score'] = 0
+        metadata_dict['max_score'] = 1
         return metadata_dict
 
 
 class BQ(AbsTaskSTS):
     metadata = TaskMetadata(
-        name=
+        name='BQ',
         dataset={
-
-
+            'path': 'C-MTEB/BQ',
+            'revision': 'e3dda5e115e487b39ec7e618c0c6a29137052a55',
         },
-        description=
-        reference=
-        type=
-        category=
-        modalities=[
-        eval_splits=[
-        eval_langs=[
-        main_score=
+        description='A Chinese dataset for textual relatedness',
+        reference='https://aclanthology.org/2021.emnlp-main.357',
+        type='STS',
+        category='s2s',
+        modalities=['text'],
+        eval_splits=['validation', 'test'],
+        eval_langs=['cmn-Hans'],
+        main_score='cosine_spearman',
         date=None,
         domains=None,
         task_subtypes=None,
@@ -77,40 +81,43 @@ class BQ(AbsTaskSTS):
         dialect=None,
         sample_creation=None,
         bibtex_citation="""@misc{xiao2024cpackpackagedresourcesadvance,
-              title={C-Pack: Packaged Resources To Advance General Chinese Embedding},
+              title={C-Pack: Packaged Resources To Advance General Chinese Embedding},
               author={Shitao Xiao and Zheng Liu and Peitian Zhang and Niklas Muennighoff and Defu Lian and Jian-Yun Nie},
               year={2024},
               eprint={2309.07597},
               archivePrefix={arXiv},
               primaryClass={cs.CL},
-              url={https://arxiv.org/abs/2309.07597},
+              url={https://arxiv.org/abs/2309.07597},
         }""",
-        descriptive_stats={
+        descriptive_stats={
+            'n_samples': None,
+            'avg_character_length': None
+        },
     )
 
     @property
     def metadata_dict(self) -> dict[str, str]:
         metadata_dict = super().metadata_dict
-        metadata_dict[
-        metadata_dict[
+        metadata_dict['min_score'] = 0
+        metadata_dict['max_score'] = 1
         return metadata_dict
 
 
 class LCQMC(AbsTaskSTS):
     metadata = TaskMetadata(
-        name=
+        name='LCQMC',
         dataset={
-
-
+            'path': 'C-MTEB/LCQMC',
+            'revision': '17f9b096f80380fce5ed12a9be8be7784b337daf',
         },
-        description=
-        reference=
-        type=
-        category=
-        modalities=[
-        eval_splits=[
-        eval_langs=[
-        main_score=
+        description='A Chinese dataset for textual relatedness',
+        reference='https://aclanthology.org/2021.emnlp-main.357',
+        type='STS',
+        category='s2s',
+        modalities=['text'],
+        eval_splits=['test'],
+        eval_langs=['cmn-Hans'],
+        main_score='cosine_spearman',
         date=None,
         domains=None,
         task_subtypes=None,
@@ -119,40 +126,43 @@ class LCQMC(AbsTaskSTS):
         dialect=None,
         sample_creation=None,
         bibtex_citation="""@misc{xiao2024cpackpackagedresourcesadvance,
-              title={C-Pack: Packaged Resources To Advance General Chinese Embedding},
+              title={C-Pack: Packaged Resources To Advance General Chinese Embedding},
               author={Shitao Xiao and Zheng Liu and Peitian Zhang and Niklas Muennighoff and Defu Lian and Jian-Yun Nie},
               year={2024},
               eprint={2309.07597},
               archivePrefix={arXiv},
               primaryClass={cs.CL},
-              url={https://arxiv.org/abs/2309.07597},
+              url={https://arxiv.org/abs/2309.07597},
         }""",
-        descriptive_stats={
+        descriptive_stats={
+            'n_samples': None,
+            'avg_character_length': None
+        },
     )
 
     @property
     def metadata_dict(self) -> dict[str, str]:
         metadata_dict = super().metadata_dict
-        metadata_dict[
-        metadata_dict[
+        metadata_dict['min_score'] = 0
+        metadata_dict['max_score'] = 1
         return metadata_dict
 
 
 class PAWSX(AbsTaskSTS):
     metadata = TaskMetadata(
-        name=
+        name='PAWSX',
         dataset={
-
-
+            'path': 'C-MTEB/PAWSX',
+            'revision': '9c6a90e430ac22b5779fb019a23e820b11a8b5e1',
         },
-        description=
-        reference=
-        type=
-        category=
-        modalities=[
-        eval_splits=[
-        eval_langs=[
-        main_score=
+        description='A Chinese dataset for textual relatedness',
+        reference='https://aclanthology.org/2021.emnlp-main.357',
+        type='STS',
+        category='s2s',
+        modalities=['text'],
+        eval_splits=['test'],
+        eval_langs=['cmn-Hans'],
+        main_score='cosine_spearman',
         date=None,
         domains=None,
         task_subtypes=None,
@@ -161,40 +171,43 @@ class PAWSX(AbsTaskSTS):
         dialect=None,
         sample_creation=None,
         bibtex_citation="""@misc{xiao2024cpackpackagedresourcesadvance,
-              title={C-Pack: Packaged Resources To Advance General Chinese Embedding},
+              title={C-Pack: Packaged Resources To Advance General Chinese Embedding},
               author={Shitao Xiao and Zheng Liu and Peitian Zhang and Niklas Muennighoff and Defu Lian and Jian-Yun Nie},
               year={2024},
               eprint={2309.07597},
               archivePrefix={arXiv},
               primaryClass={cs.CL},
-              url={https://arxiv.org/abs/2309.07597},
+              url={https://arxiv.org/abs/2309.07597},
         }""",
-        descriptive_stats={
+        descriptive_stats={
+            'n_samples': None,
+            'avg_character_length': None
+        },
    )
 
     @property
     def metadata_dict(self) -> dict[str, str]:
         metadata_dict = super().metadata_dict
-        metadata_dict[
-        metadata_dict[
+        metadata_dict['min_score'] = 0
+        metadata_dict['max_score'] = 1
         return metadata_dict
 
 
 class STSB(AbsTaskSTS):
     metadata = TaskMetadata(
-        name=
+        name='STSB',
         dataset={
-
-
+            'path': 'C-MTEB/STSB',
+            'revision': '0cde68302b3541bb8b3c340dc0644b0b745b3dc0',
         },
-        description=
-        reference=
-        type=
-        category=
-        modalities=[
-        eval_splits=[
-        eval_langs=[
-        main_score=
+        description='A Chinese dataset for textual relatedness',
+        reference='https://aclanthology.org/2021.emnlp-main.357',
+        type='STS',
+        category='s2s',
+        modalities=['text'],
+        eval_splits=['validation', 'test'],
+        eval_langs=['cmn-Hans'],
+        main_score='cosine_spearman',
         date=None,
         domains=None,
         task_subtypes=None,
@@ -203,40 +216,43 @@ class STSB(AbsTaskSTS):
         dialect=None,
         sample_creation=None,
         bibtex_citation="""@misc{xiao2024cpackpackagedresourcesadvance,
-              title={C-Pack: Packaged Resources To Advance General Chinese Embedding},
+              title={C-Pack: Packaged Resources To Advance General Chinese Embedding},
               author={Shitao Xiao and Zheng Liu and Peitian Zhang and Niklas Muennighoff and Defu Lian and Jian-Yun Nie},
               year={2024},
               eprint={2309.07597},
               archivePrefix={arXiv},
               primaryClass={cs.CL},
-              url={https://arxiv.org/abs/2309.07597},
+              url={https://arxiv.org/abs/2309.07597},
         }""",
-        descriptive_stats={
+        descriptive_stats={
+            'n_samples': None,
+            'avg_character_length': None
+        },
     )
 
     @property
     def metadata_dict(self) -> dict[str, str]:
         metadata_dict = super().metadata_dict
-        metadata_dict[
-        metadata_dict[
+        metadata_dict['min_score'] = 0
+        metadata_dict['max_score'] = 5
         return metadata_dict
 
 
 class AFQMC(AbsTaskSTS):
     metadata = TaskMetadata(
-        name=
+        name='AFQMC',
         dataset={
-
-
+            'path': 'C-MTEB/AFQMC',
+            'revision': 'b44c3b011063adb25877c13823db83bb193913c4',
         },
-        description=
-        reference=
-        type=
-        category=
-        modalities=[
-        eval_splits=[
-        eval_langs=[
-        main_score=
+        description='A Chinese dataset for textual relatedness',
+        reference='https://aclanthology.org/2021.emnlp-main.357',
+        type='STS',
+        category='s2s',
+        modalities=['text'],
+        eval_splits=['validation'],
+        eval_langs=['cmn-Hans'],
+        main_score='cosine_spearman',
         date=None,
         domains=None,
         task_subtypes=None,
@@ -264,32 +280,35 @@ class AFQMC(AbsTaskSTS):
         pages = "4348--4366",
         abstract = "We propose a novel problem within end-to-end learning of task oriented dialogs (TOD), in which the dialog system mimics a troubleshooting agent who helps a user by diagnosing their problem (e.g., car not starting). Such dialogs are grounded in domain-specific flowcharts, which the agent is supposed to follow during the conversation. Our task exposes novel technical challenges for neural TOD, such as grounding an utterance to the flowchart without explicit annotation, referring to additional manual pages when user asks a clarification question, and ability to follow unseen flowcharts at test time. We release a dataset (FLODIAL) consisting of 2,738 dialogs grounded on 12 different troubleshooting flowcharts. We also design a neural model, FLONET, which uses a retrieval-augmented generation architecture to train the dialog agent. Our experiments find that FLONET can do zero-shot transfer to unseen flowcharts, and sets a strong baseline for future research.",
         }""",
-        descriptive_stats={
+        descriptive_stats={
+            'n_samples': None,
+            'avg_character_length': None
+        },
     )
 
     @property
     def metadata_dict(self) -> dict[str, str]:
         metadata_dict = super().metadata_dict
-        metadata_dict[
-        metadata_dict[
+        metadata_dict['min_score'] = 0
+        metadata_dict['max_score'] = 1
         return metadata_dict
 
 
 class QBQTC(AbsTaskSTS):
     metadata = TaskMetadata(
-        name=
+        name='QBQTC',
         dataset={
-
-
+            'path': 'C-MTEB/QBQTC',
+            'revision': '790b0510dc52b1553e8c49f3d2afb48c0e5c48b7',
         },
-        description=
-        reference=
-        type=
-        category=
-        modalities=[
-        eval_splits=[
-        eval_langs=[
-        main_score=
+        description='',
+        reference='https://github.com/CLUEbenchmark/QBQTC/tree/main/dataset',
+        type='STS',
+        category='s2s',
+        modalities=['text'],
+        eval_splits=['test'],
+        eval_langs=['cmn-Hans'],
+        main_score='cosine_spearman',
         date=None,
         domains=None,
         task_subtypes=None,
@@ -298,5 +317,8 @@ class QBQTC(AbsTaskSTS):
         dialect=None,
         sample_creation=None,
         bibtex_citation=None,
-        descriptive_stats={
+        descriptive_stats={
+            'n_samples': None,
+            'avg_character_length': None
+        },
     )
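Every class in this file now follows the same shape: a pinned C-MTEB dataset revision, inline metadata values, and a metadata_dict property that sets the score range (0 to 1 for the paraphrase-style sets, 0 to 5 for STSB). As a rough sketch of what these task classes plug into, the snippet below runs one of them through the standard mteb runner; the embedding model and output folder are illustrative choices, not values shipped in this release.

# Sketch: evaluate the rewritten ATEC task with the mteb runner.
# Assumes `mteb` and `sentence-transformers` are installed; the model
# name below is only an example embedding model.
import mteb
from sentence_transformers import SentenceTransformer

from evalscope.backend.rag_eval.cmteb.tasks.STS import ATEC

model = SentenceTransformer('BAAI/bge-small-zh-v1.5')  # example model
evaluation = mteb.MTEB(tasks=[ATEC()])
results = evaluation.run(model, output_folder='outputs/ATEC')  # example path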
evalscope/backend/rag_eval/cmteb/tasks/__init__.py
CHANGED

@@ -1,63 +1,62 @@
 from .Classification import *
 from .Clustering import *
+from .CustomTask import *
 from .PairClassification import *
 from .Reranking import *
 from .Retrieval import *
 from .STS import *
-from .CustomTask import *
-
 
 CLS_CLASSIFICATION = {
-
-
-
-
-
-
+    'TNews': TNews,
+    'IFlyTek': IFlyTek,
+    'MultilingualSentiment': MultilingualSentiment,
+    'JDReview': JDReview,
+    'OnlineShopping': OnlineShopping,
+    'Waimai': Waimai,
 }
 
 CLS_CLUSTERING = {
-
-
-
-
+    'CLSClusteringS2S': CLSClusteringFastS2S,
+    'CLSClusteringP2P': CLSClusteringFastP2P,
+    'ThuNewsClusteringS2S': ThuNewsClusteringFastS2S,
+    'ThuNewsClusteringP2P': ThuNewsClusteringFastP2P,
 }
 
 CLS_PAIR_CLASSIFICATION = {
-
-
+    'Ocnli': Ocnli,
+    'Cmnli': Cmnli,
 }
 
 CLS_RERANKING = {
-
-
-
-
+    'T2Reranking': T2Reranking,
+    'MMarcoReranking': MMarcoReranking,
+    'CMedQAv1': CMedQAv1,
+    'CMedQAv2': CMedQAv2,
 }
 
 CLS_RETRIEVAL = {
-
-
-
-
-
-
-
-
+    'T2Retrieval': T2Retrieval,
+    'MMarcoRetrieval': MMarcoRetrieval,
+    'DuRetrieval': DuRetrieval,
+    'CovidRetrieval': CovidRetrieval,
+    'CmedqaRetrieval': CmedqaRetrieval,
+    'EcomRetrieval': EcomRetrieval,
+    'MedicalRetrieval': MedicalRetrieval,
+    'VideoRetrieval': VideoRetrieval,
 }
 
 CLS_STS = {
-
-
-
-
-
-
-
+    'ATEC': ATEC,
+    'BQ': BQ,
+    'LCQMC': LCQMC,
+    'PAWSX': PAWSX,
+    'STSB': STSB,
+    'AFQMC': AFQMC,
+    'QBQTC': QBQTC,
 }
 
 CLS_CUSTOM = {
-
+    'CustomRetrieval': CustomRetrieval,
 }
 
 CLS_DICT = {
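The rewritten __init__.py spells out each per-category registry as a plain name-to-class mapping (the diff is cut off at CLS_DICT, which presumably merges them). A minimal sketch of how such a registry is typically consumed; resolve_sts_task is a hypothetical helper for illustration, not an evalscope API:

from evalscope.backend.rag_eval.cmteb.tasks import CLS_STS

def resolve_sts_task(name: str):
    """Hypothetical helper: map a registry key like 'ATEC' to its task class."""
    try:
        return CLS_STS[name]
    except KeyError:
        raise ValueError(f'unknown STS task {name!r}; expected one of {sorted(CLS_STS)}')

task = resolve_sts_task('ATEC')()  # instantiate the mteb task class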
evalscope/backend/rag_eval/ragas/__init__.py
CHANGED

@@ -1,2 +1,2 @@
-from evalscope.backend.rag_eval.ragas.arguments import
-from evalscope.backend.rag_eval.ragas.task_template import rag_eval
+from evalscope.backend.rag_eval.ragas.arguments import EvaluationArguments, TestsetGenerationArguments
+from evalscope.backend.rag_eval.ragas.task_template import rag_eval
evalscope/backend/rag_eval/ragas/arguments.py
CHANGED

@@ -1,5 +1,5 @@
 from dataclasses import dataclass, field
-from typing import
+from typing import Any, Dict, List, Optional, Union
 
 
 @dataclass
@@ -12,7 +12,6 @@ class TestsetGenerationArguments:
     For local LLM support, you can use the following fields:
     model_name_or_path: str
     model_revision: str = "master"
-    template_type: str = "default"
     generation_config: Optional[Dict]
 
     For API LLM support, you can use the following fields:
@@ -22,9 +21,7 @@ class TestsetGenerationArguments:
     """
     generator_llm: Dict = field(default_factory=dict)
     embeddings: Dict = field(default_factory=dict)
-    distribution: str = field(
-        default_factory=lambda: {'simple': 0.5, 'multi_context': 0.4, 'reasoning': 0.1}
-    )
+    distribution: str = field(default_factory=lambda: {'simple': 0.5, 'multi_context': 0.4, 'reasoning': 0.1})
     # For LLM based evaluation
     # available: ['english', 'hindi', 'marathi', 'chinese', 'spanish', 'amharic', 'arabic',
     # 'armenian', 'bulgarian', 'urdu', 'russian', 'polish', 'persian', 'dutch', 'danish',
@@ -37,9 +34,7 @@ class EvaluationArguments:
     testset_file: str
     critic_llm: Dict = field(default_factory=dict)
     embeddings: Dict = field(default_factory=dict)
-    metrics: List[str] = field(
-        default_factory=lambda: ['answer_relevancy', 'faithfulness']
-    )
+    metrics: List[str] = field(default_factory=lambda: ['answer_relevancy', 'faithfulness'])
     # For LLM based evaluation
     # available: ['english', 'hindi', 'marathi', 'chinese', 'spanish', 'amharic', 'arabic',
     # 'armenian', 'bulgarian', 'urdu', 'russian', 'polish', 'persian', 'dutch', 'danish',
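Functionally, the only removal here is template_type; distribution and metrics keep the same defaults, just collapsed onto one line each. A minimal sketch of constructing both dataclasses with the fields visible in this hunk (the connection values and path are placeholders, not defaults shipped by the package):

from evalscope.backend.rag_eval.ragas.arguments import EvaluationArguments, TestsetGenerationArguments

gen_args = TestsetGenerationArguments(
    generator_llm={'model_name': 'qwen-plus'},       # placeholder LLM config
    embeddings={'model_name': 'text-embedding-v1'},  # placeholder embedding config
    # distribution keeps its default: {'simple': 0.5, 'multi_context': 0.4, 'reasoning': 0.1}
)

eval_args = EvaluationArguments(
    testset_file='outputs/testset.json',  # placeholder path
    # metrics keeps its default: ['answer_relevancy', 'faithfulness']
)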
evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json
CHANGED

@@ -1,18 +1,18 @@
 {
-    "ragas_version": "0.2.
-    "original_hash":
+    "ragas_version": "0.2.7",
+    "original_hash": -492257975294377194,
     "language": "chinese",
-    "instruction": "给定一个真实情况和一个答案陈述,分析每个陈述并将其分类为以下类别之一:TP(真正):答案中存在的陈述也直接由一个或多个真实情况中的陈述支持,FP(假正):答案中存在的陈述但没有被任何真实情况中的陈述直接支持,FN
+    "instruction": "给定一个真实情况和一个答案陈述,分析每个陈述并将其分类为以下类别之一:TP(真正):答案中存在的陈述也直接由一个或多个真实情况中的陈述支持,FP(假正):答案中存在的陈述但没有被任何真实情况中的陈述直接支持,FN(假负):在真实情况中发现但在答案中不存在的陈述。每个陈述只能属于其中一个类别。为每个分类提供理由。",
     "examples": [
         {
             "input": {
                 "question": "是什么为太阳提供能量,它的主要功能是什么?",
                 "answer": [
-                    "
+                    "太阳的能量来自核裂变,类似于地球上的核反应堆。",
                     "太阳的主要功能是为太阳系提供光。"
                 ],
                 "ground_truth": [
-                    "
+                    "太阳的能量来自核聚变,其中氢原子融合形成氦。",
                     "太阳核心的这种聚变过程释放出巨大的能量。",
                     "来自太阳的能量提供热量和光,这对地球上的生命至关重要。",
                     "太阳的光在地球的气候系统中起着关键作用。",
@@ -28,13 +28,13 @@
                 ],
                 "FP": [
                     {
-                        "statement": "
-                        "reason": "
+                        "statement": "太阳的能量来自核裂变,类似于地球上的核反应堆。",
+                        "reason": "这一说法是不正确的,与地面事实相矛盾,地面事实指出太阳的能量来自核聚变。"
                     }
                 ],
                 "FN": [
                     {
-                        "statement": "
+                        "statement": "太阳的能量来自核聚变,其中氢原子融合形成氦。",
                         "reason": "这种对太阳能量来源的准确描述没有包含在答案中。"
                     },
                     {
@@ -71,7 +71,7 @@
                 "TP": [
                     {
                         "statement": "水的沸点在海平面上是100摄氏度。",
-                        "reason": "
+                        "reason": "这一说法直接得到了地面事实的支持,地面事实具体说明了水的沸点在海平面上是100摄氏度。"
                     }
                 ],
                 "FP": [],
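All of the translated prompt files share the same envelope: ragas_version, original_hash, language, instruction, and an examples list. A quick stdlib-only sketch for sanity-checking that a prompt file still parses and keeps that shape after edits like the ones above:

import json
from pathlib import Path

# Path of the file changed above; adjust to wherever the package is installed.
path = Path('evalscope/backend/rag_eval/ragas/prompts/chinese/'
            'AnswerCorrectness/correctness_prompt_chinese.json')
data = json.loads(path.read_text(encoding='utf-8'))

for key in ('ragas_version', 'original_hash', 'language', 'instruction', 'examples'):
    assert key in data, f'missing key: {key}'
assert data['language'] == 'chinese'
print(data['ragas_version'], len(data['examples']), 'examples')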
evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json
CHANGED

@@ -1,6 +1,6 @@
 {
-    "ragas_version": "0.2.
-    "original_hash":
+    "ragas_version": "0.2.7",
+    "original_hash": -8546983388246528139,
     "language": "chinese",
     "instruction": "给定一个问题、一个答案和答案中的句子,分析在“句子”下给出的每个句子的复杂性,并将每个句子分解为一个或多个完全可理解的陈述,同时确保每个陈述中不使用代词。将输出格式化为JSON。",
     "examples": [
evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json
CHANGED

@@ -1,8 +1,8 @@
 {
-    "ragas_version": "0.2.
-    "original_hash":
+    "ragas_version": "0.2.7",
+    "original_hash": 7951911230338252816,
     "language": "chinese",
-    "instruction": "
+    "instruction": "为给定的答案生成一个问题,并识别答案是否含糊不清。如果答案含糊不清,则给出1;如果答案明确,则给出0。含糊不清的答案是指那些回避的、模糊的或不明确的答案。例如,“我不知道”或“我不确定”是含糊不清的答案。",
     "examples": [
         {
             "input": {
evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json
CHANGED

@@ -1,14 +1,14 @@
 {
-    "ragas_version": "0.2.
-    "original_hash":
+    "ragas_version": "0.2.7",
+    "original_hash": -5318808809674890018,
     "language": "chinese",
-    "instruction": "
+    "instruction": "给定问题、答案和背景,验证背景在得出给定答案时是否有用。如果有用,判定为“1”,如果没有用,判定为“0”,并以json格式输出。",
     "examples": [
         {
             "input": {
                 "question": "你能告诉我关于阿尔伯特·爱因斯坦的什么?",
-                "context": "阿尔伯特·爱因斯坦(1879年3月14日-1955年4月18
-                "answer": "阿尔伯特·爱因斯坦,生于1879年3月14
+                "context": "阿尔伯特·爱因斯坦(1879年3月14日-1955年4月18日)是一位德国出生的理论物理学家,被广泛认为是有史以来最伟大和最有影响力的科学家之一。他因发展相对论而闻名,同时也对量子力学做出了重要贡献,因此在20世纪前几十年现代物理学对自然科学理解的革命性重塑中起到了核心作用。他的质能等价公式E=mc²,源于相对论,被称为“世界上最著名的方程”。他因“对理论物理学的贡献,特别是发现光电效应定律”而获得1921年诺贝尔物理学奖,这是量子理论发展的关键一步。他的工作也因其对科学哲学的影响而闻名。在1999年由英国《物理世界》杂志对全球130位顶尖物理学家的调查中,爱因斯坦被评为有史以来最伟大的物理学家。他的智力成就和原创性使爱因斯坦成为天才的代名词。",
+                "answer": "阿尔伯特·爱因斯坦,生于1879年3月14日,是一位德国出生的理论物理学家,被广泛认为是有史以来最伟大和最有影响力的科学家之一。他因对理论物理学的贡献而获得1921年诺贝尔物理学奖。"
             },
             "output": {
                 "reason": "提供的背景确实有助于得出给定的答案。背景包括关于阿尔伯特·爱因斯坦的生活和贡献的关键信息,这些信息在答案中得到了反映。",