evalscope 0.7.1__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of evalscope might be problematic.

Files changed (233)
  1. evalscope/__init__.py +1 -1
  2. evalscope/arguments.py +73 -0
  3. evalscope/backend/base.py +5 -1
  4. evalscope/backend/opencompass/api_meta_template.py +8 -14
  5. evalscope/backend/opencompass/backend_manager.py +24 -15
  6. evalscope/backend/opencompass/tasks/eval_api.py +1 -6
  7. evalscope/backend/opencompass/tasks/eval_datasets.py +26 -28
  8. evalscope/backend/rag_eval/__init__.py +3 -3
  9. evalscope/backend/rag_eval/backend_manager.py +21 -25
  10. evalscope/backend/rag_eval/clip_benchmark/__init__.py +1 -1
  11. evalscope/backend/rag_eval/clip_benchmark/arguments.py +6 -6
  12. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +62 -79
  13. evalscope/backend/rag_eval/clip_benchmark/task_template.py +29 -43
  14. evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +20 -22
  15. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +16 -23
  16. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +14 -35
  17. evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +69 -90
  18. evalscope/backend/rag_eval/cmteb/__init__.py +3 -3
  19. evalscope/backend/rag_eval/cmteb/arguments.py +25 -27
  20. evalscope/backend/rag_eval/cmteb/base.py +22 -23
  21. evalscope/backend/rag_eval/cmteb/task_template.py +15 -17
  22. evalscope/backend/rag_eval/cmteb/tasks/Classification.py +98 -79
  23. evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +17 -22
  24. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +17 -19
  25. evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +35 -29
  26. evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +18 -5
  27. evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +163 -163
  28. evalscope/backend/rag_eval/cmteb/tasks/STS.py +126 -104
  29. evalscope/backend/rag_eval/cmteb/tasks/__init__.py +33 -34
  30. evalscope/backend/rag_eval/ragas/__init__.py +2 -2
  31. evalscope/backend/rag_eval/ragas/arguments.py +3 -8
  32. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +9 -9
  33. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +2 -2
  34. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +3 -3
  35. evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +5 -5
  36. evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +7 -0
  37. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +8 -8
  38. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +5 -5
  39. evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +7 -5
  40. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +2 -2
  41. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
  42. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
  43. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
  44. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
  45. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +2 -2
  46. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +5 -5
  47. evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +3 -3
  48. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +21 -4
  49. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +3 -3
  50. evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +4 -4
  51. evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +2 -2
  52. evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -1
  53. evalscope/backend/rag_eval/ragas/task_template.py +10 -15
  54. evalscope/backend/rag_eval/ragas/tasks/__init__.py +1 -1
  55. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +45 -0
  56. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +135 -0
  57. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +17 -133
  58. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +8 -18
  59. evalscope/backend/rag_eval/utils/clip.py +46 -50
  60. evalscope/backend/rag_eval/utils/embedding.py +12 -11
  61. evalscope/backend/rag_eval/utils/llm.py +8 -6
  62. evalscope/backend/rag_eval/utils/tools.py +12 -11
  63. evalscope/backend/vlm_eval_kit/__init__.py +1 -1
  64. evalscope/backend/vlm_eval_kit/custom_dataset.py +7 -8
  65. evalscope/benchmarks/arc/__init__.py +3 -2
  66. evalscope/benchmarks/arc/ai2_arc.py +19 -16
  67. evalscope/benchmarks/arc/arc_adapter.py +32 -24
  68. evalscope/benchmarks/bbh/__init__.py +1 -2
  69. evalscope/benchmarks/bbh/bbh_adapter.py +28 -25
  70. evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +1 -1
  71. evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +1 -1
  72. evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +1 -1
  73. evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +1 -1
  74. evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +1 -1
  75. evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +1 -1
  76. evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +1 -1
  77. evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +1 -1
  78. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +1 -1
  79. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +1 -1
  80. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +1 -1
  81. evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +1 -1
  82. evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +1 -1
  83. evalscope/benchmarks/bbh/cot_prompts/navigate.txt +1 -1
  84. evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +1 -1
  85. evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +1 -1
  86. evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +1 -1
  87. evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +1 -1
  88. evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +1 -1
  89. evalscope/benchmarks/bbh/cot_prompts/snarks.txt +1 -1
  90. evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +1 -1
  91. evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +1 -1
  92. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +1 -1
  93. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +1 -1
  94. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +1 -1
  95. evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +1 -1
  96. evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +1 -1
  97. evalscope/benchmarks/benchmark.py +16 -16
  98. evalscope/benchmarks/ceval/__init__.py +3 -2
  99. evalscope/benchmarks/ceval/ceval_adapter.py +80 -69
  100. evalscope/benchmarks/ceval/ceval_exam.py +18 -31
  101. evalscope/benchmarks/cmmlu/__init__.py +3 -2
  102. evalscope/benchmarks/cmmlu/cmmlu.py +87 -92
  103. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +109 -155
  104. evalscope/benchmarks/cmmlu/samples.jsonl +1 -1
  105. evalscope/benchmarks/competition_math/__init__.py +3 -2
  106. evalscope/benchmarks/competition_math/competition_math.py +7 -16
  107. evalscope/benchmarks/competition_math/competition_math_adapter.py +32 -34
  108. evalscope/benchmarks/data_adapter.py +24 -24
  109. evalscope/benchmarks/general_qa/__init__.py +3 -2
  110. evalscope/benchmarks/general_qa/general_qa_adapter.py +34 -38
  111. evalscope/benchmarks/gsm8k/__init__.py +1 -1
  112. evalscope/benchmarks/gsm8k/gsm8k.py +6 -12
  113. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +26 -24
  114. evalscope/benchmarks/hellaswag/__init__.py +3 -2
  115. evalscope/benchmarks/hellaswag/hellaswag.py +15 -19
  116. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +27 -23
  117. evalscope/benchmarks/humaneval/__init__.py +1 -1
  118. evalscope/benchmarks/humaneval/humaneval.py +15 -18
  119. evalscope/benchmarks/humaneval/humaneval_adapter.py +0 -1
  120. evalscope/benchmarks/mmlu/__init__.py +3 -2
  121. evalscope/benchmarks/mmlu/mmlu.py +15 -29
  122. evalscope/benchmarks/mmlu/mmlu_adapter.py +85 -77
  123. evalscope/benchmarks/race/__init__.py +3 -2
  124. evalscope/benchmarks/race/race.py +21 -35
  125. evalscope/benchmarks/race/race_adapter.py +32 -29
  126. evalscope/benchmarks/race/samples.jsonl +1 -1
  127. evalscope/benchmarks/trivia_qa/__init__.py +3 -2
  128. evalscope/benchmarks/trivia_qa/samples.jsonl +1 -1
  129. evalscope/benchmarks/trivia_qa/trivia_qa.py +19 -34
  130. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +27 -22
  131. evalscope/benchmarks/truthful_qa/__init__.py +3 -2
  132. evalscope/benchmarks/truthful_qa/truthful_qa.py +25 -29
  133. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +36 -37
  134. evalscope/cli/cli.py +6 -5
  135. evalscope/cli/start_eval.py +31 -0
  136. evalscope/cli/start_perf.py +0 -3
  137. evalscope/cli/start_server.py +27 -41
  138. evalscope/config.py +119 -95
  139. evalscope/constants.py +61 -29
  140. evalscope/evaluator/__init__.py +1 -0
  141. evalscope/evaluator/evaluator.py +96 -377
  142. evalscope/evaluator/humaneval_evaluator.py +158 -0
  143. evalscope/evaluator/rating_eval.py +12 -33
  144. evalscope/evaluator/reviewer/auto_reviewer.py +47 -76
  145. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +10 -20
  146. evalscope/metrics/code_metric.py +3 -9
  147. evalscope/metrics/math_accuracy.py +3 -6
  148. evalscope/metrics/metrics.py +21 -21
  149. evalscope/metrics/rouge_metric.py +11 -25
  150. evalscope/models/__init__.py +1 -2
  151. evalscope/models/api/openai_api.py +40 -29
  152. evalscope/models/custom/__init__.py +0 -1
  153. evalscope/models/custom/custom_model.py +3 -3
  154. evalscope/models/dummy_chat_model.py +7 -8
  155. evalscope/models/model_adapter.py +89 -156
  156. evalscope/models/openai_model.py +20 -20
  157. evalscope/perf/arguments.py +15 -3
  158. evalscope/perf/benchmark.py +7 -9
  159. evalscope/perf/http_client.py +3 -8
  160. evalscope/perf/main.py +10 -0
  161. evalscope/perf/plugin/api/custom_api.py +1 -2
  162. evalscope/perf/plugin/api/dashscope_api.py +1 -2
  163. evalscope/perf/plugin/api/openai_api.py +3 -4
  164. evalscope/perf/plugin/datasets/base.py +1 -2
  165. evalscope/perf/plugin/datasets/flickr8k.py +1 -2
  166. evalscope/perf/plugin/datasets/longalpaca.py +1 -2
  167. evalscope/perf/plugin/datasets/openqa.py +1 -2
  168. evalscope/perf/utils/analysis_result.py +1 -2
  169. evalscope/perf/utils/benchmark_util.py +1 -2
  170. evalscope/perf/utils/db_util.py +11 -8
  171. evalscope/perf/utils/local_server.py +19 -13
  172. evalscope/registry/config/cfg_arena_zhihu.yaml +1 -1
  173. evalscope/registry/tasks/arc.yaml +2 -3
  174. evalscope/registry/tasks/bbh.yaml +3 -4
  175. evalscope/registry/tasks/bbh_mini.yaml +3 -4
  176. evalscope/registry/tasks/ceval.yaml +3 -3
  177. evalscope/registry/tasks/ceval_mini.yaml +3 -4
  178. evalscope/registry/tasks/cmmlu.yaml +3 -3
  179. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +1 -1
  180. evalscope/registry/tasks/general_qa.yaml +1 -1
  181. evalscope/registry/tasks/gsm8k.yaml +2 -2
  182. evalscope/registry/tasks/mmlu.yaml +3 -3
  183. evalscope/registry/tasks/mmlu_mini.yaml +3 -3
  184. evalscope/run.py +184 -375
  185. evalscope/run_arena.py +20 -25
  186. evalscope/summarizer.py +16 -17
  187. evalscope/third_party/longbench_write/README.md +99 -42
  188. evalscope/third_party/longbench_write/default_task.json +1 -1
  189. evalscope/third_party/longbench_write/default_task.yaml +8 -7
  190. evalscope/third_party/longbench_write/eval.py +29 -28
  191. evalscope/third_party/longbench_write/infer.py +16 -104
  192. evalscope/third_party/longbench_write/longbench_write.py +5 -5
  193. evalscope/third_party/longbench_write/resources/judge.txt +1 -1
  194. evalscope/third_party/longbench_write/tools/data_etl.py +4 -5
  195. evalscope/third_party/longbench_write/utils.py +0 -1
  196. evalscope/third_party/toolbench_static/eval.py +14 -15
  197. evalscope/third_party/toolbench_static/infer.py +48 -69
  198. evalscope/third_party/toolbench_static/llm/swift_infer.py +4 -12
  199. evalscope/third_party/toolbench_static/requirements.txt +1 -1
  200. evalscope/third_party/toolbench_static/toolbench_static.py +3 -3
  201. evalscope/tools/combine_reports.py +25 -30
  202. evalscope/tools/rewrite_eval_results.py +14 -46
  203. evalscope/utils/__init__.py +0 -1
  204. evalscope/utils/arena_utils.py +18 -48
  205. evalscope/{perf/utils → utils}/chat_service.py +3 -4
  206. evalscope/utils/completion_parsers.py +3 -8
  207. evalscope/utils/logger.py +9 -7
  208. evalscope/utils/model_utils.py +11 -0
  209. evalscope/utils/utils.py +12 -138
  210. evalscope/version.py +2 -2
  211. {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/METADATA +125 -120
  212. evalscope-0.8.0.dist-info/RECORD +285 -0
  213. tests/cli/test_run.py +54 -15
  214. tests/perf/test_perf.py +4 -0
  215. tests/rag/test_clip_benchmark.py +38 -38
  216. tests/rag/test_mteb.py +3 -2
  217. tests/rag/test_ragas.py +5 -5
  218. tests/swift/test_run_swift_eval.py +2 -3
  219. tests/swift/test_run_swift_vlm_eval.py +2 -3
  220. tests/swift/test_run_swift_vlm_jugde_eval.py +2 -3
  221. evalscope/backend/rag_eval/ragas/metrics/__init__.py +0 -2
  222. evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py +0 -91
  223. evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py +0 -99
  224. evalscope/cache.py +0 -98
  225. evalscope/models/template.py +0 -1446
  226. evalscope/run_ms.py +0 -140
  227. evalscope/utils/task_cfg_parser.py +0 -10
  228. evalscope/utils/task_utils.py +0 -22
  229. evalscope-0.7.1.dist-info/RECORD +0 -286
  230. {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/LICENSE +0 -0
  231. {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/WHEEL +0 -0
  232. {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/entry_points.txt +0 -0
  233. {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/top_level.txt +0 -0
evalscope/backend/rag_eval/cmteb/tasks/STS.py
@@ -1,21 +1,22 @@
 from mteb.abstasks.AbsTaskSTS import AbsTaskSTS
 from mteb.abstasks.TaskMetadata import TaskMetadata
 
+
 class ATEC(AbsTaskSTS):
     metadata = TaskMetadata(
-        name="ATEC",
+        name='ATEC',
         dataset={
-            "path": "C-MTEB/ATEC",
-            "revision": "0f319b1142f28d00e055a6770f3f726ae9b7d865",
+            'path': 'C-MTEB/ATEC',
+            'revision': '0f319b1142f28d00e055a6770f3f726ae9b7d865',
         },
-        description="A Chinese dataset for textual relatedness",
-        reference="https://aclanthology.org/2021.emnlp-main.357",
-        type="STS",
-        category="s2s",
-        modalities=["text"],
-        eval_splits=["validation", "test"],
-        eval_langs=["cmn-Hans"],
-        main_score="cosine_spearman",
+        description='A Chinese dataset for textual relatedness',
+        reference='https://aclanthology.org/2021.emnlp-main.357',
+        type='STS',
+        category='s2s',
+        modalities=['text'],
+        eval_splits=['validation', 'test'],
+        eval_langs=['cmn-Hans'],
+        main_score='cosine_spearman',
         date=None,
         domains=None,
         task_subtypes=None,
@@ -43,32 +44,35 @@ class ATEC(AbsTaskSTS):
         pages = "4348--4366",
         abstract = "We propose a novel problem within end-to-end learning of task oriented dialogs (TOD), in which the dialog system mimics a troubleshooting agent who helps a user by diagnosing their problem (e.g., car not starting). Such dialogs are grounded in domain-specific flowcharts, which the agent is supposed to follow during the conversation. Our task exposes novel technical challenges for neural TOD, such as grounding an utterance to the flowchart without explicit annotation, referring to additional manual pages when user asks a clarification question, and ability to follow unseen flowcharts at test time. We release a dataset (FLODIAL) consisting of 2,738 dialogs grounded on 12 different troubleshooting flowcharts. We also design a neural model, FLONET, which uses a retrieval-augmented generation architecture to train the dialog agent. Our experiments find that FLONET can do zero-shot transfer to unseen flowcharts, and sets a strong baseline for future research.",
         }""",
-        descriptive_stats={"n_samples": None, "avg_character_length": None},
+        descriptive_stats={
+            'n_samples': None,
+            'avg_character_length': None
+        },
     )
 
     @property
     def metadata_dict(self) -> dict[str, str]:
         metadata_dict = super().metadata_dict
-        metadata_dict["min_score"] = 0
-        metadata_dict["max_score"] = 1
+        metadata_dict['min_score'] = 0
+        metadata_dict['max_score'] = 1
         return metadata_dict
 
 
 class BQ(AbsTaskSTS):
     metadata = TaskMetadata(
-        name="BQ",
+        name='BQ',
         dataset={
-            "path": "C-MTEB/BQ",
-            "revision": "e3dda5e115e487b39ec7e618c0c6a29137052a55",
+            'path': 'C-MTEB/BQ',
+            'revision': 'e3dda5e115e487b39ec7e618c0c6a29137052a55',
         },
-        description="A Chinese dataset for textual relatedness",
-        reference="https://aclanthology.org/2021.emnlp-main.357",
-        type="STS",
-        category="s2s",
-        modalities=["text"],
-        eval_splits=["validation", "test"],
-        eval_langs=["cmn-Hans"],
-        main_score="cosine_spearman",
+        description='A Chinese dataset for textual relatedness',
+        reference='https://aclanthology.org/2021.emnlp-main.357',
+        type='STS',
+        category='s2s',
+        modalities=['text'],
+        eval_splits=['validation', 'test'],
+        eval_langs=['cmn-Hans'],
+        main_score='cosine_spearman',
         date=None,
         domains=None,
         task_subtypes=None,
@@ -77,40 +81,43 @@ class BQ(AbsTaskSTS):
         dialect=None,
         sample_creation=None,
         bibtex_citation="""@misc{xiao2024cpackpackagedresourcesadvance,
-        title={C-Pack: Packaged Resources To Advance General Chinese Embedding},
+        title={C-Pack: Packaged Resources To Advance General Chinese Embedding},
         author={Shitao Xiao and Zheng Liu and Peitian Zhang and Niklas Muennighoff and Defu Lian and Jian-Yun Nie},
         year={2024},
         eprint={2309.07597},
         archivePrefix={arXiv},
         primaryClass={cs.CL},
-        url={https://arxiv.org/abs/2309.07597},
+        url={https://arxiv.org/abs/2309.07597},
         }""",
-        descriptive_stats={"n_samples": None, "avg_character_length": None},
+        descriptive_stats={
+            'n_samples': None,
+            'avg_character_length': None
+        },
     )
 
     @property
     def metadata_dict(self) -> dict[str, str]:
         metadata_dict = super().metadata_dict
-        metadata_dict["min_score"] = 0
-        metadata_dict["max_score"] = 1
+        metadata_dict['min_score'] = 0
+        metadata_dict['max_score'] = 1
         return metadata_dict
 
 
 class LCQMC(AbsTaskSTS):
     metadata = TaskMetadata(
-        name="LCQMC",
+        name='LCQMC',
         dataset={
-            "path": "C-MTEB/LCQMC",
-            "revision": "17f9b096f80380fce5ed12a9be8be7784b337daf",
+            'path': 'C-MTEB/LCQMC',
+            'revision': '17f9b096f80380fce5ed12a9be8be7784b337daf',
         },
-        description="A Chinese dataset for textual relatedness",
-        reference="https://aclanthology.org/2021.emnlp-main.357",
-        type="STS",
-        category="s2s",
-        modalities=["text"],
-        eval_splits=["test"],
-        eval_langs=["cmn-Hans"],
-        main_score="cosine_spearman",
+        description='A Chinese dataset for textual relatedness',
+        reference='https://aclanthology.org/2021.emnlp-main.357',
+        type='STS',
+        category='s2s',
+        modalities=['text'],
+        eval_splits=['test'],
+        eval_langs=['cmn-Hans'],
+        main_score='cosine_spearman',
         date=None,
         domains=None,
         task_subtypes=None,
@@ -119,40 +126,43 @@ class LCQMC(AbsTaskSTS):
         dialect=None,
         sample_creation=None,
         bibtex_citation="""@misc{xiao2024cpackpackagedresourcesadvance,
-        title={C-Pack: Packaged Resources To Advance General Chinese Embedding},
+        title={C-Pack: Packaged Resources To Advance General Chinese Embedding},
         author={Shitao Xiao and Zheng Liu and Peitian Zhang and Niklas Muennighoff and Defu Lian and Jian-Yun Nie},
         year={2024},
         eprint={2309.07597},
         archivePrefix={arXiv},
         primaryClass={cs.CL},
-        url={https://arxiv.org/abs/2309.07597},
+        url={https://arxiv.org/abs/2309.07597},
         }""",
-        descriptive_stats={"n_samples": None, "avg_character_length": None},
+        descriptive_stats={
+            'n_samples': None,
+            'avg_character_length': None
+        },
     )
 
     @property
     def metadata_dict(self) -> dict[str, str]:
         metadata_dict = super().metadata_dict
-        metadata_dict["min_score"] = 0
-        metadata_dict["max_score"] = 1
+        metadata_dict['min_score'] = 0
+        metadata_dict['max_score'] = 1
         return metadata_dict
 
 
 class PAWSX(AbsTaskSTS):
     metadata = TaskMetadata(
-        name="PAWSX",
+        name='PAWSX',
         dataset={
-            "path": "C-MTEB/PAWSX",
-            "revision": "9c6a90e430ac22b5779fb019a23e820b11a8b5e1",
+            'path': 'C-MTEB/PAWSX',
+            'revision': '9c6a90e430ac22b5779fb019a23e820b11a8b5e1',
         },
-        description="A Chinese dataset for textual relatedness",
-        reference="https://aclanthology.org/2021.emnlp-main.357",
-        type="STS",
-        category="s2s",
-        modalities=["text"],
-        eval_splits=["test"],
-        eval_langs=["cmn-Hans"],
-        main_score="cosine_spearman",
+        description='A Chinese dataset for textual relatedness',
+        reference='https://aclanthology.org/2021.emnlp-main.357',
+        type='STS',
+        category='s2s',
+        modalities=['text'],
+        eval_splits=['test'],
+        eval_langs=['cmn-Hans'],
+        main_score='cosine_spearman',
         date=None,
         domains=None,
         task_subtypes=None,
@@ -161,40 +171,43 @@ class PAWSX(AbsTaskSTS):
         dialect=None,
         sample_creation=None,
         bibtex_citation="""@misc{xiao2024cpackpackagedresourcesadvance,
-        title={C-Pack: Packaged Resources To Advance General Chinese Embedding},
+        title={C-Pack: Packaged Resources To Advance General Chinese Embedding},
         author={Shitao Xiao and Zheng Liu and Peitian Zhang and Niklas Muennighoff and Defu Lian and Jian-Yun Nie},
         year={2024},
         eprint={2309.07597},
         archivePrefix={arXiv},
         primaryClass={cs.CL},
-        url={https://arxiv.org/abs/2309.07597},
+        url={https://arxiv.org/abs/2309.07597},
         }""",
-        descriptive_stats={"n_samples": None, "avg_character_length": None},
+        descriptive_stats={
+            'n_samples': None,
+            'avg_character_length': None
+        },
     )
 
     @property
     def metadata_dict(self) -> dict[str, str]:
         metadata_dict = super().metadata_dict
-        metadata_dict["min_score"] = 0
-        metadata_dict["max_score"] = 1
+        metadata_dict['min_score'] = 0
+        metadata_dict['max_score'] = 1
         return metadata_dict
 
 
 class STSB(AbsTaskSTS):
     metadata = TaskMetadata(
-        name="STSB",
+        name='STSB',
         dataset={
-            "path": "C-MTEB/STSB",
-            "revision": "0cde68302b3541bb8b3c340dc0644b0b745b3dc0",
+            'path': 'C-MTEB/STSB',
+            'revision': '0cde68302b3541bb8b3c340dc0644b0b745b3dc0',
         },
-        description="A Chinese dataset for textual relatedness",
-        reference="https://aclanthology.org/2021.emnlp-main.357",
-        type="STS",
-        category="s2s",
-        modalities=["text"],
-        eval_splits=["validation", "test"],
-        eval_langs=["cmn-Hans"],
-        main_score="cosine_spearman",
+        description='A Chinese dataset for textual relatedness',
+        reference='https://aclanthology.org/2021.emnlp-main.357',
+        type='STS',
+        category='s2s',
+        modalities=['text'],
+        eval_splits=['validation', 'test'],
+        eval_langs=['cmn-Hans'],
+        main_score='cosine_spearman',
         date=None,
         domains=None,
         task_subtypes=None,
@@ -203,40 +216,43 @@ class STSB(AbsTaskSTS):
         dialect=None,
         sample_creation=None,
         bibtex_citation="""@misc{xiao2024cpackpackagedresourcesadvance,
-        title={C-Pack: Packaged Resources To Advance General Chinese Embedding},
+        title={C-Pack: Packaged Resources To Advance General Chinese Embedding},
         author={Shitao Xiao and Zheng Liu and Peitian Zhang and Niklas Muennighoff and Defu Lian and Jian-Yun Nie},
         year={2024},
         eprint={2309.07597},
         archivePrefix={arXiv},
         primaryClass={cs.CL},
-        url={https://arxiv.org/abs/2309.07597},
+        url={https://arxiv.org/abs/2309.07597},
         }""",
-        descriptive_stats={"n_samples": None, "avg_character_length": None},
+        descriptive_stats={
+            'n_samples': None,
+            'avg_character_length': None
+        },
     )
 
     @property
     def metadata_dict(self) -> dict[str, str]:
         metadata_dict = super().metadata_dict
-        metadata_dict["min_score"] = 0
-        metadata_dict["max_score"] = 5
+        metadata_dict['min_score'] = 0
+        metadata_dict['max_score'] = 5
         return metadata_dict
 
 
 class AFQMC(AbsTaskSTS):
     metadata = TaskMetadata(
-        name="AFQMC",
+        name='AFQMC',
         dataset={
-            "path": "C-MTEB/AFQMC",
-            "revision": "b44c3b011063adb25877c13823db83bb193913c4",
+            'path': 'C-MTEB/AFQMC',
+            'revision': 'b44c3b011063adb25877c13823db83bb193913c4',
         },
-        description="A Chinese dataset for textual relatedness",
-        reference="https://aclanthology.org/2021.emnlp-main.357",
-        type="STS",
-        category="s2s",
-        modalities=["text"],
-        eval_splits=["validation"],
-        eval_langs=["cmn-Hans"],
-        main_score="cosine_spearman",
+        description='A Chinese dataset for textual relatedness',
+        reference='https://aclanthology.org/2021.emnlp-main.357',
+        type='STS',
+        category='s2s',
+        modalities=['text'],
+        eval_splits=['validation'],
+        eval_langs=['cmn-Hans'],
+        main_score='cosine_spearman',
         date=None,
         domains=None,
         task_subtypes=None,
@@ -264,32 +280,35 @@ class AFQMC(AbsTaskSTS):
         pages = "4348--4366",
         abstract = "We propose a novel problem within end-to-end learning of task oriented dialogs (TOD), in which the dialog system mimics a troubleshooting agent who helps a user by diagnosing their problem (e.g., car not starting). Such dialogs are grounded in domain-specific flowcharts, which the agent is supposed to follow during the conversation. Our task exposes novel technical challenges for neural TOD, such as grounding an utterance to the flowchart without explicit annotation, referring to additional manual pages when user asks a clarification question, and ability to follow unseen flowcharts at test time. We release a dataset (FLODIAL) consisting of 2,738 dialogs grounded on 12 different troubleshooting flowcharts. We also design a neural model, FLONET, which uses a retrieval-augmented generation architecture to train the dialog agent. Our experiments find that FLONET can do zero-shot transfer to unseen flowcharts, and sets a strong baseline for future research.",
         }""",
-        descriptive_stats={"n_samples": None, "avg_character_length": None},
+        descriptive_stats={
+            'n_samples': None,
+            'avg_character_length': None
+        },
     )
 
     @property
     def metadata_dict(self) -> dict[str, str]:
         metadata_dict = super().metadata_dict
-        metadata_dict["min_score"] = 0
-        metadata_dict["max_score"] = 1
+        metadata_dict['min_score'] = 0
+        metadata_dict['max_score'] = 1
         return metadata_dict
 
 
 class QBQTC(AbsTaskSTS):
     metadata = TaskMetadata(
-        name="QBQTC",
+        name='QBQTC',
         dataset={
-            "path": "C-MTEB/QBQTC",
-            "revision": "790b0510dc52b1553e8c49f3d2afb48c0e5c48b7",
+            'path': 'C-MTEB/QBQTC',
+            'revision': '790b0510dc52b1553e8c49f3d2afb48c0e5c48b7',
         },
-        description="",
-        reference="https://github.com/CLUEbenchmark/QBQTC/tree/main/dataset",
-        type="STS",
-        category="s2s",
-        modalities=["text"],
-        eval_splits=["test"],
-        eval_langs=["cmn-Hans"],
-        main_score="cosine_spearman",
+        description='',
+        reference='https://github.com/CLUEbenchmark/QBQTC/tree/main/dataset',
+        type='STS',
+        category='s2s',
+        modalities=['text'],
+        eval_splits=['test'],
+        eval_langs=['cmn-Hans'],
+        main_score='cosine_spearman',
         date=None,
         domains=None,
         task_subtypes=None,
@@ -298,5 +317,8 @@ class QBQTC(AbsTaskSTS):
         dialect=None,
         sample_creation=None,
         bibtex_citation=None,
-        descriptive_stats={"n_samples": None, "avg_character_length": None},
+        descriptive_stats={
+            'n_samples': None,
+            'avg_character_length': None
+        },
     )
evalscope/backend/rag_eval/cmteb/tasks/__init__.py
@@ -1,63 +1,62 @@
 from .Classification import *
 from .Clustering import *
+from .CustomTask import *
 from .PairClassification import *
 from .Reranking import *
 from .Retrieval import *
 from .STS import *
-from .CustomTask import *
-
 
 CLS_CLASSIFICATION = {
-    "TNews": TNews,
-    "IFlyTek": IFlyTek,
-    "MultilingualSentiment": MultilingualSentiment,
-    "JDReview": JDReview,
-    "OnlineShopping": OnlineShopping,
-    "Waimai": Waimai,
+    'TNews': TNews,
+    'IFlyTek': IFlyTek,
+    'MultilingualSentiment': MultilingualSentiment,
+    'JDReview': JDReview,
+    'OnlineShopping': OnlineShopping,
+    'Waimai': Waimai,
 }
 
 CLS_CLUSTERING = {
-    "CLSClusteringS2S": CLSClusteringFastS2S,
-    "CLSClusteringP2P": CLSClusteringFastP2P,
-    "ThuNewsClusteringS2S": ThuNewsClusteringFastS2S,
-    "ThuNewsClusteringP2P": ThuNewsClusteringFastP2P,
+    'CLSClusteringS2S': CLSClusteringFastS2S,
+    'CLSClusteringP2P': CLSClusteringFastP2P,
+    'ThuNewsClusteringS2S': ThuNewsClusteringFastS2S,
+    'ThuNewsClusteringP2P': ThuNewsClusteringFastP2P,
 }
 
 CLS_PAIR_CLASSIFICATION = {
-    "Ocnli": Ocnli,
-    "Cmnli": Cmnli,
+    'Ocnli': Ocnli,
+    'Cmnli': Cmnli,
 }
 
 CLS_RERANKING = {
-    "T2Reranking": T2Reranking,
-    "MMarcoReranking": MMarcoReranking,
-    "CMedQAv1": CMedQAv1,
-    "CMedQAv2": CMedQAv2,
+    'T2Reranking': T2Reranking,
+    'MMarcoReranking': MMarcoReranking,
+    'CMedQAv1': CMedQAv1,
+    'CMedQAv2': CMedQAv2,
 }
 
 CLS_RETRIEVAL = {
-    "T2Retrieval": T2Retrieval,
-    "MMarcoRetrieval": MMarcoRetrieval,
-    "DuRetrieval": DuRetrieval,
-    "CovidRetrieval": CovidRetrieval,
-    "CmedqaRetrieval": CmedqaRetrieval,
-    "EcomRetrieval": EcomRetrieval,
-    "MedicalRetrieval": MedicalRetrieval,
-    "VideoRetrieval": VideoRetrieval,
+    'T2Retrieval': T2Retrieval,
+    'MMarcoRetrieval': MMarcoRetrieval,
+    'DuRetrieval': DuRetrieval,
+    'CovidRetrieval': CovidRetrieval,
+    'CmedqaRetrieval': CmedqaRetrieval,
+    'EcomRetrieval': EcomRetrieval,
+    'MedicalRetrieval': MedicalRetrieval,
+    'VideoRetrieval': VideoRetrieval,
 }
 
 CLS_STS = {
-    "ATEC": ATEC,
-    "BQ": BQ,
-    "LCQMC": LCQMC,
-    "PAWSX": PAWSX,
-    "STSB": STSB,
-    "AFQMC": AFQMC,
-    "QBQTC": QBQTC,
+    'ATEC': ATEC,
+    'BQ': BQ,
+    'LCQMC': LCQMC,
+    'PAWSX': PAWSX,
+    'STSB': STSB,
+    'AFQMC': AFQMC,
+    'QBQTC': QBQTC,
 }
 
 CLS_CUSTOM = {
-    "CustomRetrieval": CustomRetrieval,
+    'CustomRetrieval': CustomRetrieval,
 }
 
 CLS_DICT = {
evalscope/backend/rag_eval/ragas/__init__.py
@@ -1,2 +1,2 @@
-from evalscope.backend.rag_eval.ragas.arguments import TestsetGenerationArguments, EvaluationArguments
-from evalscope.backend.rag_eval.ragas.task_template import rag_eval
+from evalscope.backend.rag_eval.ragas.arguments import EvaluationArguments, TestsetGenerationArguments
+from evalscope.backend.rag_eval.ragas.task_template import rag_eval
evalscope/backend/rag_eval/ragas/arguments.py
@@ -1,5 +1,5 @@
 from dataclasses import dataclass, field
-from typing import List, Optional, Union, Dict, Any
+from typing import Any, Dict, List, Optional, Union
 
 
 @dataclass
@@ -12,7 +12,6 @@ class TestsetGenerationArguments:
     For local LLM support, you can use the following fields:
     model_name_or_path: str
     model_revision: str = "master"
-    template_type: str = "default"
     generation_config: Optional[Dict]
 
     For API LLM support, you can use the following fields:
@@ -22,9 +21,7 @@ class TestsetGenerationArguments:
     """
     generator_llm: Dict = field(default_factory=dict)
     embeddings: Dict = field(default_factory=dict)
-    distribution: str = field(
-        default_factory=lambda: {'simple': 0.5, 'multi_context': 0.4, 'reasoning': 0.1}
-    )
+    distribution: str = field(default_factory=lambda: {'simple': 0.5, 'multi_context': 0.4, 'reasoning': 0.1})
     # For LLM based evaluation
     # available: ['english', 'hindi', 'marathi', 'chinese', 'spanish', 'amharic', 'arabic',
     # 'armenian', 'bulgarian', 'urdu', 'russian', 'polish', 'persian', 'dutch', 'danish',
@@ -37,9 +34,7 @@ class EvaluationArguments:
     testset_file: str
     critic_llm: Dict = field(default_factory=dict)
     embeddings: Dict = field(default_factory=dict)
-    metrics: List[str] = field(
-        default_factory=lambda: ['answer_relevancy', 'faithfulness']
-    )
+    metrics: List[str] = field(default_factory=lambda: ['answer_relevancy', 'faithfulness'])
     # For LLM based evaluation
     # available: ['english', 'hindi', 'marathi', 'chinese', 'spanish', 'amharic', 'arabic',
     # 'armenian', 'bulgarian', 'urdu', 'russian', 'polish', 'persian', 'dutch', 'danish',
evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json
@@ -1,18 +1,18 @@
 {
-    "ragas_version": "0.2.5",
-    "original_hash": 963876325390538086,
+    "ragas_version": "0.2.7",
+    "original_hash": -492257975294377194,
     "language": "chinese",
-    "instruction": "给定一个真实情况和一个答案陈述,分析每个陈述并将其分类为以下类别之一:TP(真正):答案中存在的陈述也直接由一个或多个真实情况中的陈述支持,FP(假正):答案中存在的陈述但没有被任何真实情况中的陈述直接支持,FN(假负):在真实情况中发现但在答案中不存在的陈述。每个陈述只能属于一个类别。为每个分类提供理由。",
+    "instruction": "给定一个真实情况和一个答案陈述,分析每个陈述并将其分类为以下类别之一:TP(真正):答案中存在的陈述也直接由一个或多个真实情况中的陈述支持,FP(假正):答案中存在的陈述但没有被任何真实情况中的陈述直接支持,FN(假负):在真实情况中发现但在答案中不存在的陈述。每个陈述只能属于其中一个类别。为每个分类提供理由。",
     "examples": [
         {
             "input": {
                 "question": "是什么为太阳提供能量,它的主要功能是什么?",
                 "answer": [
-                    "太阳的能量来源于核裂变,类似于地球上的核反应堆。",
+                    "太阳的能量来自核裂变,类似于地球上的核反应堆。",
                     "太阳的主要功能是为太阳系提供光。"
                 ],
                 "ground_truth": [
-                    "太阳的能量来源于核聚变,其中氢原子融合形成氦。",
+                    "太阳的能量来自核聚变,其中氢原子融合形成氦。",
                     "太阳核心的这种聚变过程释放出巨大的能量。",
                     "来自太阳的能量提供热量和光,这对地球上的生命至关重要。",
                     "太阳的光在地球的气候系统中起着关键作用。",
@@ -28,13 +28,13 @@
                 ],
                 "FP": [
                     {
-                        "statement": "太阳的能量来源于核裂变,类似于地球上的核反应堆。",
-                        "reason": "这一说法是不正确的,与地面事实相矛盾,地面事实指出太阳的能量来源于核聚变。"
+                        "statement": "太阳的能量来自核裂变,类似于地球上的核反应堆。",
+                        "reason": "这一说法是不正确的,与地面事实相矛盾,地面事实指出太阳的能量来自核聚变。"
                     }
                 ],
                 "FN": [
                     {
-                        "statement": "太阳的能量来源于核聚变,其中氢原子融合形成氦。",
+                        "statement": "太阳的能量来自核聚变,其中氢原子融合形成氦。",
                         "reason": "这种对太阳能量来源的准确描述没有包含在答案中。"
                     },
                     {
@@ -71,7 +71,7 @@
                 "TP": [
                     {
                         "statement": "水的沸点在海平面上是100摄氏度。",
-                        "reason": "这一说法得到了地面事实的直接支持,地面事实明确指出水的沸点在海平面上是100摄氏度。"
+                        "reason": "这一说法直接得到了地面事实的支持,地面事实具体说明了水的沸点在海平面上是100摄氏度。"
                     }
                 ],
                 "FP": [],
evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json
@@ -1,6 +1,6 @@
 {
-    "ragas_version": "0.2.5",
-    "original_hash": 8370494081602031492,
+    "ragas_version": "0.2.7",
+    "original_hash": -8546983388246528139,
     "language": "chinese",
     "instruction": "给定一个问题、一个答案和答案中的句子,分析在“句子”下给出的每个句子的复杂性,并将每个句子分解为一个或多个完全可理解的陈述,同时确保每个陈述中不使用代词。将输出格式化为JSON。",
     "examples": [
evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json
@@ -1,8 +1,8 @@
 {
-    "ragas_version": "0.2.5",
-    "original_hash": -6199619726952258368,
+    "ragas_version": "0.2.7",
+    "original_hash": 7951911230338252816,
     "language": "chinese",
-    "instruction": "为给定的答案生成一个问题,并识别答案是否是不明确的。如果答案是不明确的,则给出1;如果答案是明确的,则给出0。不明确的答案是指那些含糊其辞、模棱两可或不清楚的答案。例如,“我不知道”或“我不确定”是不明确的答案。",
+    "instruction": "为给定的答案生成一个问题,并识别答案是否含糊不清。如果答案含糊不清,则给出1;如果答案明确,则给出0。含糊不清的答案是指那些回避的、模糊的或不明确的答案。例如,“我不知道”或“我不确定”是含糊不清的答案。",
     "examples": [
         {
             "input": {
evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json
@@ -1,14 +1,14 @@
 {
-    "ragas_version": "0.2.5",
-    "original_hash": 6611742689846464445,
+    "ragas_version": "0.2.7",
+    "original_hash": -5318808809674890018,
     "language": "chinese",
-    "instruction": "给定问题、答案和上下文,验证上下文在得出给定答案时是否有用。如果有用,给出判决为“1”,如果没有用,给出判决为“0”,并以json格式输出。",
+    "instruction": "给定问题、答案和背景,验证背景在得出给定答案时是否有用。如果有用,判定为“1”,如果没有用,判定为“0”,并以json格式输出。",
     "examples": [
         {
             "input": {
                 "question": "你能告诉我关于阿尔伯特·爱因斯坦的什么?",
-                "context": "阿尔伯特·爱因斯坦(1879年3月14日-1955年4月18日)是一位出生于德国的理论物理学家,被广泛认为是有史以来最伟大和最具影响力的科学家之一。他因发展相对论而闻名,同时也对量子力学做出了重要贡献,因此在20世纪前几十年现代物理学对自然科学理解的革命性重塑中起到了核心作用。他的质能等价公式E=mc²源于相对论,被称为“世界上最著名的方程”。他因“对理论物理学的贡献,特别是发现光电效应定律”而获得1921年诺贝尔物理学奖,这是量子理论发展的关键一步。他的工作也因其对科学哲学的影响而闻名。在1999年由英国《物理世界》杂志对全球130位顶尖物理学家的调查中,爱因斯坦被评为有史以来最伟大的物理学家。他的智力成就和原创性使爱因斯坦成为天才的代名词。",
-                "answer": "阿尔伯特·爱因斯坦,生于1879年3月14日,是一位出生于德国的理论物理学家,被广泛认为是有史以来最伟大和最具影响力的科学家之一。他因对理论物理学的贡献而获得1921年诺贝尔物理学奖。"
+                "context": "阿尔伯特·爱因斯坦(1879年3月14日-1955年4月18日)是一位德国出生的理论物理学家,被广泛认为是有史以来最伟大和最有影响力的科学家之一。他因发展相对论而闻名,同时也对量子力学做出了重要贡献,因此在20世纪前几十年现代物理学对自然科学理解的革命性重塑中起到了核心作用。他的质能等价公式E=mc²,源于相对论,被称为“世界上最著名的方程”。他因“对理论物理学的贡献,特别是发现光电效应定律”而获得1921年诺贝尔物理学奖,这是量子理论发展的关键一步。他的工作也因其对科学哲学的影响而闻名。在1999年由英国《物理世界》杂志对全球130位顶尖物理学家的调查中,爱因斯坦被评为有史以来最伟大的物理学家。他的智力成就和原创性使爱因斯坦成为天才的代名词。",
+                "answer": "阿尔伯特·爱因斯坦,生于1879年3月14日,是一位德国出生的理论物理学家,被广泛认为是有史以来最伟大和最有影响力的科学家之一。他因对理论物理学的贡献而获得1921年诺贝尔物理学奖。"
             },
             "output": {
                 "reason": "提供的背景确实有助于得出给定的答案。背景包括关于阿尔伯特·爱因斯坦的生活和贡献的关键信息,这些信息在答案中得到了反映。",
evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json
@@ -0,0 +1,7 @@
+{
+    "ragas_version": "0.2.7",
+    "original_hash": -1333942410710431097,
+    "language": "chinese",
+    "instruction": "给定文档摘要和节点内容,将节点内容评分在1到5的范围内。",
+    "examples": []
+}