evalscope 0.7.1__py3-none-any.whl → 0.8.0__py3-none-any.whl
This diff shows the contents of two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their public registry.
Potentially problematic release.
This version of evalscope might be problematic.
- evalscope/__init__.py +1 -1
- evalscope/arguments.py +73 -0
- evalscope/backend/base.py +5 -1
- evalscope/backend/opencompass/api_meta_template.py +8 -14
- evalscope/backend/opencompass/backend_manager.py +24 -15
- evalscope/backend/opencompass/tasks/eval_api.py +1 -6
- evalscope/backend/opencompass/tasks/eval_datasets.py +26 -28
- evalscope/backend/rag_eval/__init__.py +3 -3
- evalscope/backend/rag_eval/backend_manager.py +21 -25
- evalscope/backend/rag_eval/clip_benchmark/__init__.py +1 -1
- evalscope/backend/rag_eval/clip_benchmark/arguments.py +6 -6
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +62 -79
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +29 -43
- evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +20 -22
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +16 -23
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +14 -35
- evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +69 -90
- evalscope/backend/rag_eval/cmteb/__init__.py +3 -3
- evalscope/backend/rag_eval/cmteb/arguments.py +25 -27
- evalscope/backend/rag_eval/cmteb/base.py +22 -23
- evalscope/backend/rag_eval/cmteb/task_template.py +15 -17
- evalscope/backend/rag_eval/cmteb/tasks/Classification.py +98 -79
- evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +17 -22
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +17 -19
- evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +35 -29
- evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +18 -5
- evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +163 -163
- evalscope/backend/rag_eval/cmteb/tasks/STS.py +126 -104
- evalscope/backend/rag_eval/cmteb/tasks/__init__.py +33 -34
- evalscope/backend/rag_eval/ragas/__init__.py +2 -2
- evalscope/backend/rag_eval/ragas/arguments.py +3 -8
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +9 -9
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +7 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +8 -8
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +7 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +21 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +4 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -1
- evalscope/backend/rag_eval/ragas/task_template.py +10 -15
- evalscope/backend/rag_eval/ragas/tasks/__init__.py +1 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +45 -0
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +135 -0
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +17 -133
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +8 -18
- evalscope/backend/rag_eval/utils/clip.py +46 -50
- evalscope/backend/rag_eval/utils/embedding.py +12 -11
- evalscope/backend/rag_eval/utils/llm.py +8 -6
- evalscope/backend/rag_eval/utils/tools.py +12 -11
- evalscope/backend/vlm_eval_kit/__init__.py +1 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +7 -8
- evalscope/benchmarks/arc/__init__.py +3 -2
- evalscope/benchmarks/arc/ai2_arc.py +19 -16
- evalscope/benchmarks/arc/arc_adapter.py +32 -24
- evalscope/benchmarks/bbh/__init__.py +1 -2
- evalscope/benchmarks/bbh/bbh_adapter.py +28 -25
- evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/navigate.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/snarks.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +1 -1
- evalscope/benchmarks/benchmark.py +16 -16
- evalscope/benchmarks/ceval/__init__.py +3 -2
- evalscope/benchmarks/ceval/ceval_adapter.py +80 -69
- evalscope/benchmarks/ceval/ceval_exam.py +18 -31
- evalscope/benchmarks/cmmlu/__init__.py +3 -2
- evalscope/benchmarks/cmmlu/cmmlu.py +87 -92
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +109 -155
- evalscope/benchmarks/cmmlu/samples.jsonl +1 -1
- evalscope/benchmarks/competition_math/__init__.py +3 -2
- evalscope/benchmarks/competition_math/competition_math.py +7 -16
- evalscope/benchmarks/competition_math/competition_math_adapter.py +32 -34
- evalscope/benchmarks/data_adapter.py +24 -24
- evalscope/benchmarks/general_qa/__init__.py +3 -2
- evalscope/benchmarks/general_qa/general_qa_adapter.py +34 -38
- evalscope/benchmarks/gsm8k/__init__.py +1 -1
- evalscope/benchmarks/gsm8k/gsm8k.py +6 -12
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +26 -24
- evalscope/benchmarks/hellaswag/__init__.py +3 -2
- evalscope/benchmarks/hellaswag/hellaswag.py +15 -19
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +27 -23
- evalscope/benchmarks/humaneval/__init__.py +1 -1
- evalscope/benchmarks/humaneval/humaneval.py +15 -18
- evalscope/benchmarks/humaneval/humaneval_adapter.py +0 -1
- evalscope/benchmarks/mmlu/__init__.py +3 -2
- evalscope/benchmarks/mmlu/mmlu.py +15 -29
- evalscope/benchmarks/mmlu/mmlu_adapter.py +85 -77
- evalscope/benchmarks/race/__init__.py +3 -2
- evalscope/benchmarks/race/race.py +21 -35
- evalscope/benchmarks/race/race_adapter.py +32 -29
- evalscope/benchmarks/race/samples.jsonl +1 -1
- evalscope/benchmarks/trivia_qa/__init__.py +3 -2
- evalscope/benchmarks/trivia_qa/samples.jsonl +1 -1
- evalscope/benchmarks/trivia_qa/trivia_qa.py +19 -34
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +27 -22
- evalscope/benchmarks/truthful_qa/__init__.py +3 -2
- evalscope/benchmarks/truthful_qa/truthful_qa.py +25 -29
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +36 -37
- evalscope/cli/cli.py +6 -5
- evalscope/cli/start_eval.py +31 -0
- evalscope/cli/start_perf.py +0 -3
- evalscope/cli/start_server.py +27 -41
- evalscope/config.py +119 -95
- evalscope/constants.py +61 -29
- evalscope/evaluator/__init__.py +1 -0
- evalscope/evaluator/evaluator.py +96 -377
- evalscope/evaluator/humaneval_evaluator.py +158 -0
- evalscope/evaluator/rating_eval.py +12 -33
- evalscope/evaluator/reviewer/auto_reviewer.py +47 -76
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +10 -20
- evalscope/metrics/code_metric.py +3 -9
- evalscope/metrics/math_accuracy.py +3 -6
- evalscope/metrics/metrics.py +21 -21
- evalscope/metrics/rouge_metric.py +11 -25
- evalscope/models/__init__.py +1 -2
- evalscope/models/api/openai_api.py +40 -29
- evalscope/models/custom/__init__.py +0 -1
- evalscope/models/custom/custom_model.py +3 -3
- evalscope/models/dummy_chat_model.py +7 -8
- evalscope/models/model_adapter.py +89 -156
- evalscope/models/openai_model.py +20 -20
- evalscope/perf/arguments.py +15 -3
- evalscope/perf/benchmark.py +7 -9
- evalscope/perf/http_client.py +3 -8
- evalscope/perf/main.py +10 -0
- evalscope/perf/plugin/api/custom_api.py +1 -2
- evalscope/perf/plugin/api/dashscope_api.py +1 -2
- evalscope/perf/plugin/api/openai_api.py +3 -4
- evalscope/perf/plugin/datasets/base.py +1 -2
- evalscope/perf/plugin/datasets/flickr8k.py +1 -2
- evalscope/perf/plugin/datasets/longalpaca.py +1 -2
- evalscope/perf/plugin/datasets/openqa.py +1 -2
- evalscope/perf/utils/analysis_result.py +1 -2
- evalscope/perf/utils/benchmark_util.py +1 -2
- evalscope/perf/utils/db_util.py +11 -8
- evalscope/perf/utils/local_server.py +19 -13
- evalscope/registry/config/cfg_arena_zhihu.yaml +1 -1
- evalscope/registry/tasks/arc.yaml +2 -3
- evalscope/registry/tasks/bbh.yaml +3 -4
- evalscope/registry/tasks/bbh_mini.yaml +3 -4
- evalscope/registry/tasks/ceval.yaml +3 -3
- evalscope/registry/tasks/ceval_mini.yaml +3 -4
- evalscope/registry/tasks/cmmlu.yaml +3 -3
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +1 -1
- evalscope/registry/tasks/general_qa.yaml +1 -1
- evalscope/registry/tasks/gsm8k.yaml +2 -2
- evalscope/registry/tasks/mmlu.yaml +3 -3
- evalscope/registry/tasks/mmlu_mini.yaml +3 -3
- evalscope/run.py +184 -375
- evalscope/run_arena.py +20 -25
- evalscope/summarizer.py +16 -17
- evalscope/third_party/longbench_write/README.md +99 -42
- evalscope/third_party/longbench_write/default_task.json +1 -1
- evalscope/third_party/longbench_write/default_task.yaml +8 -7
- evalscope/third_party/longbench_write/eval.py +29 -28
- evalscope/third_party/longbench_write/infer.py +16 -104
- evalscope/third_party/longbench_write/longbench_write.py +5 -5
- evalscope/third_party/longbench_write/resources/judge.txt +1 -1
- evalscope/third_party/longbench_write/tools/data_etl.py +4 -5
- evalscope/third_party/longbench_write/utils.py +0 -1
- evalscope/third_party/toolbench_static/eval.py +14 -15
- evalscope/third_party/toolbench_static/infer.py +48 -69
- evalscope/third_party/toolbench_static/llm/swift_infer.py +4 -12
- evalscope/third_party/toolbench_static/requirements.txt +1 -1
- evalscope/third_party/toolbench_static/toolbench_static.py +3 -3
- evalscope/tools/combine_reports.py +25 -30
- evalscope/tools/rewrite_eval_results.py +14 -46
- evalscope/utils/__init__.py +0 -1
- evalscope/utils/arena_utils.py +18 -48
- evalscope/{perf/utils → utils}/chat_service.py +3 -4
- evalscope/utils/completion_parsers.py +3 -8
- evalscope/utils/logger.py +9 -7
- evalscope/utils/model_utils.py +11 -0
- evalscope/utils/utils.py +12 -138
- evalscope/version.py +2 -2
- {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/METADATA +125 -120
- evalscope-0.8.0.dist-info/RECORD +285 -0
- tests/cli/test_run.py +54 -15
- tests/perf/test_perf.py +4 -0
- tests/rag/test_clip_benchmark.py +38 -38
- tests/rag/test_mteb.py +3 -2
- tests/rag/test_ragas.py +5 -5
- tests/swift/test_run_swift_eval.py +2 -3
- tests/swift/test_run_swift_vlm_eval.py +2 -3
- tests/swift/test_run_swift_vlm_jugde_eval.py +2 -3
- evalscope/backend/rag_eval/ragas/metrics/__init__.py +0 -2
- evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py +0 -91
- evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py +0 -99
- evalscope/cache.py +0 -98
- evalscope/models/template.py +0 -1446
- evalscope/run_ms.py +0 -140
- evalscope/utils/task_cfg_parser.py +0 -10
- evalscope/utils/task_utils.py +0 -22
- evalscope-0.7.1.dist-info/RECORD +0 -286
- {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/LICENSE +0 -0
- {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/WHEEL +0 -0
- {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/top_level.txt +0 -0
evalscope/backend/rag_eval/cmteb/tasks/STS.py
CHANGED

@@ -1,21 +1,22 @@
 from mteb.abstasks.AbsTaskSTS import AbsTaskSTS
 from mteb.abstasks.TaskMetadata import TaskMetadata
 
+
 class ATEC(AbsTaskSTS):
     metadata = TaskMetadata(
-        name=
+        name='ATEC',
         dataset={
-
-
+            'path': 'C-MTEB/ATEC',
+            'revision': '0f319b1142f28d00e055a6770f3f726ae9b7d865',
         },
-        description=
-        reference=
-        type=
-        category=
-        modalities=[
-        eval_splits=[
-        eval_langs=[
-        main_score=
+        description='A Chinese dataset for textual relatedness',
+        reference='https://aclanthology.org/2021.emnlp-main.357',
+        type='STS',
+        category='s2s',
+        modalities=['text'],
+        eval_splits=['validation', 'test'],
+        eval_langs=['cmn-Hans'],
+        main_score='cosine_spearman',
         date=None,
         domains=None,
         task_subtypes=None,
@@ -43,32 +44,35 @@ class ATEC(AbsTaskSTS):
         pages = "4348--4366",
         abstract = "We propose a novel problem within end-to-end learning of task oriented dialogs (TOD), in which the dialog system mimics a troubleshooting agent who helps a user by diagnosing their problem (e.g., car not starting). Such dialogs are grounded in domain-specific flowcharts, which the agent is supposed to follow during the conversation. Our task exposes novel technical challenges for neural TOD, such as grounding an utterance to the flowchart without explicit annotation, referring to additional manual pages when user asks a clarification question, and ability to follow unseen flowcharts at test time. We release a dataset (FLODIAL) consisting of 2,738 dialogs grounded on 12 different troubleshooting flowcharts. We also design a neural model, FLONET, which uses a retrieval-augmented generation architecture to train the dialog agent. Our experiments find that FLONET can do zero-shot transfer to unseen flowcharts, and sets a strong baseline for future research.",
         }""",
-        descriptive_stats={
+        descriptive_stats={
+            'n_samples': None,
+            'avg_character_length': None
+        },
     )
 
     @property
     def metadata_dict(self) -> dict[str, str]:
         metadata_dict = super().metadata_dict
-        metadata_dict[
-        metadata_dict[
+        metadata_dict['min_score'] = 0
+        metadata_dict['max_score'] = 1
         return metadata_dict
 
 
 class BQ(AbsTaskSTS):
     metadata = TaskMetadata(
-        name=
+        name='BQ',
         dataset={
-
-
+            'path': 'C-MTEB/BQ',
+            'revision': 'e3dda5e115e487b39ec7e618c0c6a29137052a55',
         },
-        description=
-        reference=
-        type=
-        category=
-        modalities=[
-        eval_splits=[
-        eval_langs=[
-        main_score=
+        description='A Chinese dataset for textual relatedness',
+        reference='https://aclanthology.org/2021.emnlp-main.357',
+        type='STS',
+        category='s2s',
+        modalities=['text'],
+        eval_splits=['validation', 'test'],
+        eval_langs=['cmn-Hans'],
+        main_score='cosine_spearman',
         date=None,
         domains=None,
         task_subtypes=None,
@@ -77,40 +81,43 @@ class BQ(AbsTaskSTS):
         dialect=None,
         sample_creation=None,
         bibtex_citation="""@misc{xiao2024cpackpackagedresourcesadvance,
-              title={C-Pack: Packaged Resources To Advance General Chinese Embedding},
+              title={C-Pack: Packaged Resources To Advance General Chinese Embedding},
               author={Shitao Xiao and Zheng Liu and Peitian Zhang and Niklas Muennighoff and Defu Lian and Jian-Yun Nie},
               year={2024},
               eprint={2309.07597},
               archivePrefix={arXiv},
               primaryClass={cs.CL},
-              url={https://arxiv.org/abs/2309.07597},
+              url={https://arxiv.org/abs/2309.07597},
         }""",
-        descriptive_stats={
+        descriptive_stats={
+            'n_samples': None,
+            'avg_character_length': None
+        },
     )
 
     @property
     def metadata_dict(self) -> dict[str, str]:
         metadata_dict = super().metadata_dict
-        metadata_dict[
-        metadata_dict[
+        metadata_dict['min_score'] = 0
+        metadata_dict['max_score'] = 1
         return metadata_dict
 
 
 class LCQMC(AbsTaskSTS):
     metadata = TaskMetadata(
-        name=
+        name='LCQMC',
         dataset={
-
-
+            'path': 'C-MTEB/LCQMC',
+            'revision': '17f9b096f80380fce5ed12a9be8be7784b337daf',
         },
-        description=
-        reference=
-        type=
-        category=
-        modalities=[
-        eval_splits=[
-        eval_langs=[
-        main_score=
+        description='A Chinese dataset for textual relatedness',
+        reference='https://aclanthology.org/2021.emnlp-main.357',
+        type='STS',
+        category='s2s',
+        modalities=['text'],
+        eval_splits=['test'],
+        eval_langs=['cmn-Hans'],
+        main_score='cosine_spearman',
         date=None,
         domains=None,
         task_subtypes=None,
@@ -119,40 +126,43 @@ class LCQMC(AbsTaskSTS):
         dialect=None,
         sample_creation=None,
         bibtex_citation="""@misc{xiao2024cpackpackagedresourcesadvance,
-              title={C-Pack: Packaged Resources To Advance General Chinese Embedding},
+              title={C-Pack: Packaged Resources To Advance General Chinese Embedding},
               author={Shitao Xiao and Zheng Liu and Peitian Zhang and Niklas Muennighoff and Defu Lian and Jian-Yun Nie},
               year={2024},
               eprint={2309.07597},
               archivePrefix={arXiv},
               primaryClass={cs.CL},
-              url={https://arxiv.org/abs/2309.07597},
+              url={https://arxiv.org/abs/2309.07597},
         }""",
-        descriptive_stats={
+        descriptive_stats={
+            'n_samples': None,
+            'avg_character_length': None
+        },
     )
 
     @property
     def metadata_dict(self) -> dict[str, str]:
         metadata_dict = super().metadata_dict
-        metadata_dict[
-        metadata_dict[
+        metadata_dict['min_score'] = 0
+        metadata_dict['max_score'] = 1
         return metadata_dict
 
 
 class PAWSX(AbsTaskSTS):
     metadata = TaskMetadata(
-        name=
+        name='PAWSX',
         dataset={
-
-
+            'path': 'C-MTEB/PAWSX',
+            'revision': '9c6a90e430ac22b5779fb019a23e820b11a8b5e1',
         },
-        description=
-        reference=
-        type=
-        category=
-        modalities=[
-        eval_splits=[
-        eval_langs=[
-        main_score=
+        description='A Chinese dataset for textual relatedness',
+        reference='https://aclanthology.org/2021.emnlp-main.357',
+        type='STS',
+        category='s2s',
+        modalities=['text'],
+        eval_splits=['test'],
+        eval_langs=['cmn-Hans'],
+        main_score='cosine_spearman',
         date=None,
         domains=None,
         task_subtypes=None,
@@ -161,40 +171,43 @@ class PAWSX(AbsTaskSTS):
         dialect=None,
         sample_creation=None,
         bibtex_citation="""@misc{xiao2024cpackpackagedresourcesadvance,
-              title={C-Pack: Packaged Resources To Advance General Chinese Embedding},
+              title={C-Pack: Packaged Resources To Advance General Chinese Embedding},
               author={Shitao Xiao and Zheng Liu and Peitian Zhang and Niklas Muennighoff and Defu Lian and Jian-Yun Nie},
               year={2024},
               eprint={2309.07597},
               archivePrefix={arXiv},
               primaryClass={cs.CL},
-              url={https://arxiv.org/abs/2309.07597},
+              url={https://arxiv.org/abs/2309.07597},
         }""",
-        descriptive_stats={
+        descriptive_stats={
+            'n_samples': None,
+            'avg_character_length': None
+        },
    )
 
     @property
     def metadata_dict(self) -> dict[str, str]:
         metadata_dict = super().metadata_dict
-        metadata_dict[
-        metadata_dict[
+        metadata_dict['min_score'] = 0
+        metadata_dict['max_score'] = 1
         return metadata_dict
 
 
 class STSB(AbsTaskSTS):
     metadata = TaskMetadata(
-        name=
+        name='STSB',
         dataset={
-
-
+            'path': 'C-MTEB/STSB',
+            'revision': '0cde68302b3541bb8b3c340dc0644b0b745b3dc0',
         },
-        description=
-        reference=
-        type=
-        category=
-        modalities=[
-        eval_splits=[
-        eval_langs=[
-        main_score=
+        description='A Chinese dataset for textual relatedness',
+        reference='https://aclanthology.org/2021.emnlp-main.357',
+        type='STS',
+        category='s2s',
+        modalities=['text'],
+        eval_splits=['validation', 'test'],
+        eval_langs=['cmn-Hans'],
+        main_score='cosine_spearman',
         date=None,
         domains=None,
         task_subtypes=None,
@@ -203,40 +216,43 @@ class STSB(AbsTaskSTS):
         dialect=None,
         sample_creation=None,
         bibtex_citation="""@misc{xiao2024cpackpackagedresourcesadvance,
-              title={C-Pack: Packaged Resources To Advance General Chinese Embedding},
+              title={C-Pack: Packaged Resources To Advance General Chinese Embedding},
               author={Shitao Xiao and Zheng Liu and Peitian Zhang and Niklas Muennighoff and Defu Lian and Jian-Yun Nie},
               year={2024},
               eprint={2309.07597},
               archivePrefix={arXiv},
               primaryClass={cs.CL},
-              url={https://arxiv.org/abs/2309.07597},
+              url={https://arxiv.org/abs/2309.07597},
         }""",
-        descriptive_stats={
+        descriptive_stats={
+            'n_samples': None,
+            'avg_character_length': None
+        },
     )
 
     @property
     def metadata_dict(self) -> dict[str, str]:
         metadata_dict = super().metadata_dict
-        metadata_dict[
-        metadata_dict[
+        metadata_dict['min_score'] = 0
+        metadata_dict['max_score'] = 5
         return metadata_dict
 
 
 class AFQMC(AbsTaskSTS):
     metadata = TaskMetadata(
-        name=
+        name='AFQMC',
         dataset={
-
-
+            'path': 'C-MTEB/AFQMC',
+            'revision': 'b44c3b011063adb25877c13823db83bb193913c4',
         },
-        description=
-        reference=
-        type=
-        category=
-        modalities=[
-        eval_splits=[
-        eval_langs=[
-        main_score=
+        description='A Chinese dataset for textual relatedness',
+        reference='https://aclanthology.org/2021.emnlp-main.357',
+        type='STS',
+        category='s2s',
+        modalities=['text'],
+        eval_splits=['validation'],
+        eval_langs=['cmn-Hans'],
+        main_score='cosine_spearman',
         date=None,
         domains=None,
         task_subtypes=None,
@@ -264,32 +280,35 @@ class AFQMC(AbsTaskSTS):
         pages = "4348--4366",
         abstract = "We propose a novel problem within end-to-end learning of task oriented dialogs (TOD), in which the dialog system mimics a troubleshooting agent who helps a user by diagnosing their problem (e.g., car not starting). Such dialogs are grounded in domain-specific flowcharts, which the agent is supposed to follow during the conversation. Our task exposes novel technical challenges for neural TOD, such as grounding an utterance to the flowchart without explicit annotation, referring to additional manual pages when user asks a clarification question, and ability to follow unseen flowcharts at test time. We release a dataset (FLODIAL) consisting of 2,738 dialogs grounded on 12 different troubleshooting flowcharts. We also design a neural model, FLONET, which uses a retrieval-augmented generation architecture to train the dialog agent. Our experiments find that FLONET can do zero-shot transfer to unseen flowcharts, and sets a strong baseline for future research.",
         }""",
-        descriptive_stats={
+        descriptive_stats={
+            'n_samples': None,
+            'avg_character_length': None
+        },
     )
 
     @property
     def metadata_dict(self) -> dict[str, str]:
         metadata_dict = super().metadata_dict
-        metadata_dict[
-        metadata_dict[
+        metadata_dict['min_score'] = 0
+        metadata_dict['max_score'] = 1
         return metadata_dict
 
 
 class QBQTC(AbsTaskSTS):
     metadata = TaskMetadata(
-        name=
+        name='QBQTC',
         dataset={
-
-
+            'path': 'C-MTEB/QBQTC',
+            'revision': '790b0510dc52b1553e8c49f3d2afb48c0e5c48b7',
         },
-        description=
-        reference=
-        type=
-        category=
-        modalities=[
-        eval_splits=[
-        eval_langs=[
-        main_score=
+        description='',
+        reference='https://github.com/CLUEbenchmark/QBQTC/tree/main/dataset',
+        type='STS',
+        category='s2s',
+        modalities=['text'],
+        eval_splits=['test'],
+        eval_langs=['cmn-Hans'],
+        main_score='cosine_spearman',
         date=None,
         domains=None,
         task_subtypes=None,
@@ -298,5 +317,8 @@ class QBQTC(AbsTaskSTS):
         dialect=None,
         sample_creation=None,
         bibtex_citation=None,
-        descriptive_stats={
+        descriptive_stats={
+            'n_samples': None,
+            'avg_character_length': None
+        },
     )
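Every class in this file now follows the same shape: a pinned C-MTEB dataset revision, inline metadata values, and a metadata_dict property that sets the score range (0 to 1 for the paraphrase-style sets, 0 to 5 for STSB). As a rough sketch of what these task classes plug into, the snippet below runs one of them through the standard mteb runner; the embedding model and output folder are illustrative choices, not values shipped in this release.

# Sketch: evaluate the rewritten ATEC task with the mteb runner.
# Assumes `mteb` and `sentence-transformers` are installed; the model
# name below is only an example embedding model.
import mteb
from sentence_transformers import SentenceTransformer

from evalscope.backend.rag_eval.cmteb.tasks.STS import ATEC

model = SentenceTransformer('BAAI/bge-small-zh-v1.5')  # example model
evaluation = mteb.MTEB(tasks=[ATEC()])
results = evaluation.run(model, output_folder='outputs/ATEC')  # example path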
evalscope/backend/rag_eval/cmteb/tasks/__init__.py
CHANGED

@@ -1,63 +1,62 @@
 from .Classification import *
 from .Clustering import *
+from .CustomTask import *
 from .PairClassification import *
 from .Reranking import *
 from .Retrieval import *
 from .STS import *
-from .CustomTask import *
-
 
 CLS_CLASSIFICATION = {
-
-
-
-
-
-
+    'TNews': TNews,
+    'IFlyTek': IFlyTek,
+    'MultilingualSentiment': MultilingualSentiment,
+    'JDReview': JDReview,
+    'OnlineShopping': OnlineShopping,
+    'Waimai': Waimai,
 }
 
 CLS_CLUSTERING = {
-
-
-
-
+    'CLSClusteringS2S': CLSClusteringFastS2S,
+    'CLSClusteringP2P': CLSClusteringFastP2P,
+    'ThuNewsClusteringS2S': ThuNewsClusteringFastS2S,
+    'ThuNewsClusteringP2P': ThuNewsClusteringFastP2P,
 }
 
 CLS_PAIR_CLASSIFICATION = {
-
-
+    'Ocnli': Ocnli,
+    'Cmnli': Cmnli,
 }
 
 CLS_RERANKING = {
-
-
-
-
+    'T2Reranking': T2Reranking,
+    'MMarcoReranking': MMarcoReranking,
+    'CMedQAv1': CMedQAv1,
+    'CMedQAv2': CMedQAv2,
 }
 
 CLS_RETRIEVAL = {
-
-
-
-
-
-
-
-
+    'T2Retrieval': T2Retrieval,
+    'MMarcoRetrieval': MMarcoRetrieval,
+    'DuRetrieval': DuRetrieval,
+    'CovidRetrieval': CovidRetrieval,
+    'CmedqaRetrieval': CmedqaRetrieval,
+    'EcomRetrieval': EcomRetrieval,
+    'MedicalRetrieval': MedicalRetrieval,
+    'VideoRetrieval': VideoRetrieval,
 }
 
 CLS_STS = {
-
-
-
-
-
-
-
+    'ATEC': ATEC,
+    'BQ': BQ,
+    'LCQMC': LCQMC,
+    'PAWSX': PAWSX,
+    'STSB': STSB,
+    'AFQMC': AFQMC,
+    'QBQTC': QBQTC,
 }
 
 CLS_CUSTOM = {
-
+    'CustomRetrieval': CustomRetrieval,
 }
 
 CLS_DICT = {
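The rewritten __init__.py spells out each per-category registry as a plain name-to-class mapping (the diff is cut off at CLS_DICT, which presumably merges them). A minimal sketch of how such a registry is typically consumed; resolve_sts_task is a hypothetical helper for illustration, not an evalscope API:

from evalscope.backend.rag_eval.cmteb.tasks import CLS_STS

def resolve_sts_task(name: str):
    """Hypothetical helper: map a registry key like 'ATEC' to its task class."""
    try:
        return CLS_STS[name]
    except KeyError:
        raise ValueError(f'unknown STS task {name!r}; expected one of {sorted(CLS_STS)}')

task = resolve_sts_task('ATEC')()  # instantiate the mteb task class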
evalscope/backend/rag_eval/ragas/__init__.py
CHANGED

@@ -1,2 +1,2 @@
-from evalscope.backend.rag_eval.ragas.arguments import
-from evalscope.backend.rag_eval.ragas.task_template import rag_eval
+from evalscope.backend.rag_eval.ragas.arguments import EvaluationArguments, TestsetGenerationArguments
+from evalscope.backend.rag_eval.ragas.task_template import rag_eval
evalscope/backend/rag_eval/ragas/arguments.py
CHANGED

@@ -1,5 +1,5 @@
 from dataclasses import dataclass, field
-from typing import
+from typing import Any, Dict, List, Optional, Union
 
 
 @dataclass
@@ -12,7 +12,6 @@ class TestsetGenerationArguments:
     For local LLM support, you can use the following fields:
     model_name_or_path: str
     model_revision: str = "master"
-    template_type: str = "default"
     generation_config: Optional[Dict]
 
     For API LLM support, you can use the following fields:
@@ -22,9 +21,7 @@ class TestsetGenerationArguments:
     """
     generator_llm: Dict = field(default_factory=dict)
     embeddings: Dict = field(default_factory=dict)
-    distribution: str = field(
-        default_factory=lambda: {'simple': 0.5, 'multi_context': 0.4, 'reasoning': 0.1}
-    )
+    distribution: str = field(default_factory=lambda: {'simple': 0.5, 'multi_context': 0.4, 'reasoning': 0.1})
     # For LLM based evaluation
     # available: ['english', 'hindi', 'marathi', 'chinese', 'spanish', 'amharic', 'arabic',
     # 'armenian', 'bulgarian', 'urdu', 'russian', 'polish', 'persian', 'dutch', 'danish',
@@ -37,9 +34,7 @@ class EvaluationArguments:
     testset_file: str
     critic_llm: Dict = field(default_factory=dict)
     embeddings: Dict = field(default_factory=dict)
-    metrics: List[str] = field(
-        default_factory=lambda: ['answer_relevancy', 'faithfulness']
-    )
+    metrics: List[str] = field(default_factory=lambda: ['answer_relevancy', 'faithfulness'])
     # For LLM based evaluation
     # available: ['english', 'hindi', 'marathi', 'chinese', 'spanish', 'amharic', 'arabic',
     # 'armenian', 'bulgarian', 'urdu', 'russian', 'polish', 'persian', 'dutch', 'danish',
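Functionally, the only removal here is template_type; distribution and metrics keep the same defaults, just collapsed onto one line each. A minimal sketch of constructing both dataclasses with the fields visible in this hunk (the connection values and path are placeholders, not defaults shipped by the package):

from evalscope.backend.rag_eval.ragas.arguments import EvaluationArguments, TestsetGenerationArguments

gen_args = TestsetGenerationArguments(
    generator_llm={'model_name': 'qwen-plus'},       # placeholder LLM config
    embeddings={'model_name': 'text-embedding-v1'},  # placeholder embedding config
    # distribution keeps its default: {'simple': 0.5, 'multi_context': 0.4, 'reasoning': 0.1}
)

eval_args = EvaluationArguments(
    testset_file='outputs/testset.json',  # placeholder path
    # metrics keeps its default: ['answer_relevancy', 'faithfulness']
)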
evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json
CHANGED

@@ -1,18 +1,18 @@
 {
-    "ragas_version": "0.2.
-    "original_hash":
+    "ragas_version": "0.2.7",
+    "original_hash": -492257975294377194,
     "language": "chinese",
-    "instruction": "给定一个真实情况和一个答案陈述,分析每个陈述并将其分类为以下类别之一:TP(真正):答案中存在的陈述也直接由一个或多个真实情况中的陈述支持,FP(假正):答案中存在的陈述但没有被任何真实情况中的陈述直接支持,FN
+    "instruction": "给定一个真实情况和一个答案陈述,分析每个陈述并将其分类为以下类别之一:TP(真正):答案中存在的陈述也直接由一个或多个真实情况中的陈述支持,FP(假正):答案中存在的陈述但没有被任何真实情况中的陈述直接支持,FN(假负):在真实情况中发现但在答案中不存在的陈述。每个陈述只能属于其中一个类别。为每个分类提供理由。",
     "examples": [
         {
             "input": {
                 "question": "是什么为太阳提供能量,它的主要功能是什么?",
                 "answer": [
-                    "
+                    "太阳的能量来自核裂变,类似于地球上的核反应堆。",
                     "太阳的主要功能是为太阳系提供光。"
                 ],
                 "ground_truth": [
-                    "
+                    "太阳的能量来自核聚变,其中氢原子融合形成氦。",
                     "太阳核心的这种聚变过程释放出巨大的能量。",
                     "来自太阳的能量提供热量和光,这对地球上的生命至关重要。",
                     "太阳的光在地球的气候系统中起着关键作用。",
@@ -28,13 +28,13 @@
                 ],
                 "FP": [
                     {
-                        "statement": "
-                        "reason": "
+                        "statement": "太阳的能量来自核裂变,类似于地球上的核反应堆。",
+                        "reason": "这一说法是不正确的,与地面事实相矛盾,地面事实指出太阳的能量来自核聚变。"
                     }
                 ],
                 "FN": [
                     {
-                        "statement": "
+                        "statement": "太阳的能量来自核聚变,其中氢原子融合形成氦。",
                         "reason": "这种对太阳能量来源的准确描述没有包含在答案中。"
                     },
                     {
@@ -71,7 +71,7 @@
                 "TP": [
                     {
                         "statement": "水的沸点在海平面上是100摄氏度。",
-                        "reason": "
+                        "reason": "这一说法直接得到了地面事实的支持,地面事实具体说明了水的沸点在海平面上是100摄氏度。"
                     }
                 ],
                 "FP": [],
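All of the translated prompt files share the same envelope: ragas_version, original_hash, language, instruction, and an examples list. A quick stdlib-only sketch for sanity-checking that a prompt file still parses and keeps that shape after edits like the ones above:

import json
from pathlib import Path

# Path of the file changed above; adjust to wherever the package is installed.
path = Path('evalscope/backend/rag_eval/ragas/prompts/chinese/'
            'AnswerCorrectness/correctness_prompt_chinese.json')
data = json.loads(path.read_text(encoding='utf-8'))

for key in ('ragas_version', 'original_hash', 'language', 'instruction', 'examples'):
    assert key in data, f'missing key: {key}'
assert data['language'] == 'chinese'
print(data['ragas_version'], len(data['examples']), 'examples')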
evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json
CHANGED

@@ -1,6 +1,6 @@
 {
-    "ragas_version": "0.2.
-    "original_hash":
+    "ragas_version": "0.2.7",
+    "original_hash": -8546983388246528139,
     "language": "chinese",
     "instruction": "给定一个问题、一个答案和答案中的句子,分析在“句子”下给出的每个句子的复杂性,并将每个句子分解为一个或多个完全可理解的陈述,同时确保每个陈述中不使用代词。将输出格式化为JSON。",
     "examples": [
evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json
CHANGED

@@ -1,8 +1,8 @@
 {
-    "ragas_version": "0.2.
-    "original_hash":
+    "ragas_version": "0.2.7",
+    "original_hash": 7951911230338252816,
     "language": "chinese",
-    "instruction": "
+    "instruction": "为给定的答案生成一个问题,并识别答案是否含糊不清。如果答案含糊不清,则给出1;如果答案明确,则给出0。含糊不清的答案是指那些回避的、模糊的或不明确的答案。例如,“我不知道”或“我不确定”是含糊不清的答案。",
     "examples": [
         {
             "input": {
evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json
CHANGED

@@ -1,14 +1,14 @@
 {
-    "ragas_version": "0.2.
-    "original_hash":
+    "ragas_version": "0.2.7",
+    "original_hash": -5318808809674890018,
     "language": "chinese",
-    "instruction": "
+    "instruction": "给定问题、答案和背景,验证背景在得出给定答案时是否有用。如果有用,判定为“1”,如果没有用,判定为“0”,并以json格式输出。",
     "examples": [
         {
             "input": {
                 "question": "你能告诉我关于阿尔伯特·爱因斯坦的什么?",
-                "context": "阿尔伯特·爱因斯坦(1879年3月14日-1955年4月18
-                "answer": "阿尔伯特·爱因斯坦,生于1879年3月14
+                "context": "阿尔伯特·爱因斯坦(1879年3月14日-1955年4月18日)是一位德国出生的理论物理学家,被广泛认为是有史以来最伟大和最有影响力的科学家之一。他因发展相对论而闻名,同时也对量子力学做出了重要贡献,因此在20世纪前几十年现代物理学对自然科学理解的革命性重塑中起到了核心作用。他的质能等价公式E=mc²,源于相对论,被称为“世界上最著名的方程”。他因“对理论物理学的贡献,特别是发现光电效应定律”而获得1921年诺贝尔物理学奖,这是量子理论发展的关键一步。他的工作也因其对科学哲学的影响而闻名。在1999年由英国《物理世界》杂志对全球130位顶尖物理学家的调查中,爱因斯坦被评为有史以来最伟大的物理学家。他的智力成就和原创性使爱因斯坦成为天才的代名词。",
+                "answer": "阿尔伯特·爱因斯坦,生于1879年3月14日,是一位德国出生的理论物理学家,被广泛认为是有史以来最伟大和最有影响力的科学家之一。他因对理论物理学的贡献而获得1921年诺贝尔物理学奖。"
             },
             "output": {
                 "reason": "提供的背景确实有助于得出给定的答案。背景包括关于阿尔伯特·爱因斯坦的生活和贡献的关键信息,这些信息在答案中得到了反映。",