evalscope 0.7.2__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of evalscope might be problematic.

Files changed (233)
  1. evalscope/__init__.py +1 -1
  2. evalscope/arguments.py +73 -0
  3. evalscope/backend/base.py +5 -1
  4. evalscope/backend/opencompass/api_meta_template.py +8 -14
  5. evalscope/backend/opencompass/backend_manager.py +24 -15
  6. evalscope/backend/opencompass/tasks/eval_api.py +1 -6
  7. evalscope/backend/opencompass/tasks/eval_datasets.py +26 -28
  8. evalscope/backend/rag_eval/__init__.py +3 -3
  9. evalscope/backend/rag_eval/backend_manager.py +21 -25
  10. evalscope/backend/rag_eval/clip_benchmark/__init__.py +1 -1
  11. evalscope/backend/rag_eval/clip_benchmark/arguments.py +6 -6
  12. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +62 -79
  13. evalscope/backend/rag_eval/clip_benchmark/task_template.py +29 -43
  14. evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +20 -22
  15. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +16 -23
  16. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +14 -35
  17. evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +69 -90
  18. evalscope/backend/rag_eval/cmteb/__init__.py +3 -3
  19. evalscope/backend/rag_eval/cmteb/arguments.py +25 -27
  20. evalscope/backend/rag_eval/cmteb/base.py +22 -23
  21. evalscope/backend/rag_eval/cmteb/task_template.py +15 -17
  22. evalscope/backend/rag_eval/cmteb/tasks/Classification.py +98 -79
  23. evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +17 -22
  24. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +17 -19
  25. evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +35 -29
  26. evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +18 -5
  27. evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +163 -163
  28. evalscope/backend/rag_eval/cmteb/tasks/STS.py +126 -104
  29. evalscope/backend/rag_eval/cmteb/tasks/__init__.py +33 -34
  30. evalscope/backend/rag_eval/ragas/__init__.py +2 -2
  31. evalscope/backend/rag_eval/ragas/arguments.py +3 -8
  32. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +9 -9
  33. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +2 -2
  34. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +3 -3
  35. evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +5 -5
  36. evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +7 -0
  37. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +8 -8
  38. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +5 -5
  39. evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +7 -5
  40. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +2 -2
  41. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
  42. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
  43. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
  44. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
  45. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +2 -2
  46. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +5 -5
  47. evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +3 -3
  48. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +21 -4
  49. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +3 -3
  50. evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +4 -4
  51. evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +2 -2
  52. evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -1
  53. evalscope/backend/rag_eval/ragas/task_template.py +10 -15
  54. evalscope/backend/rag_eval/ragas/tasks/__init__.py +1 -1
  55. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +45 -0
  56. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +135 -0
  57. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +17 -133
  58. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +8 -18
  59. evalscope/backend/rag_eval/utils/clip.py +46 -50
  60. evalscope/backend/rag_eval/utils/embedding.py +12 -11
  61. evalscope/backend/rag_eval/utils/llm.py +8 -6
  62. evalscope/backend/rag_eval/utils/tools.py +12 -11
  63. evalscope/backend/vlm_eval_kit/__init__.py +1 -1
  64. evalscope/backend/vlm_eval_kit/custom_dataset.py +7 -8
  65. evalscope/benchmarks/arc/__init__.py +3 -2
  66. evalscope/benchmarks/arc/ai2_arc.py +19 -16
  67. evalscope/benchmarks/arc/arc_adapter.py +32 -24
  68. evalscope/benchmarks/bbh/__init__.py +1 -2
  69. evalscope/benchmarks/bbh/bbh_adapter.py +28 -25
  70. evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +1 -1
  71. evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +1 -1
  72. evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +1 -1
  73. evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +1 -1
  74. evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +1 -1
  75. evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +1 -1
  76. evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +1 -1
  77. evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +1 -1
  78. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +1 -1
  79. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +1 -1
  80. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +1 -1
  81. evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +1 -1
  82. evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +1 -1
  83. evalscope/benchmarks/bbh/cot_prompts/navigate.txt +1 -1
  84. evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +1 -1
  85. evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +1 -1
  86. evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +1 -1
  87. evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +1 -1
  88. evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +1 -1
  89. evalscope/benchmarks/bbh/cot_prompts/snarks.txt +1 -1
  90. evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +1 -1
  91. evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +1 -1
  92. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +1 -1
  93. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +1 -1
  94. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +1 -1
  95. evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +1 -1
  96. evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +1 -1
  97. evalscope/benchmarks/benchmark.py +16 -16
  98. evalscope/benchmarks/ceval/__init__.py +3 -2
  99. evalscope/benchmarks/ceval/ceval_adapter.py +80 -69
  100. evalscope/benchmarks/ceval/ceval_exam.py +18 -31
  101. evalscope/benchmarks/cmmlu/__init__.py +3 -2
  102. evalscope/benchmarks/cmmlu/cmmlu.py +87 -92
  103. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +109 -155
  104. evalscope/benchmarks/cmmlu/samples.jsonl +1 -1
  105. evalscope/benchmarks/competition_math/__init__.py +3 -2
  106. evalscope/benchmarks/competition_math/competition_math.py +7 -16
  107. evalscope/benchmarks/competition_math/competition_math_adapter.py +32 -34
  108. evalscope/benchmarks/data_adapter.py +24 -24
  109. evalscope/benchmarks/general_qa/__init__.py +3 -2
  110. evalscope/benchmarks/general_qa/general_qa_adapter.py +34 -38
  111. evalscope/benchmarks/gsm8k/__init__.py +1 -1
  112. evalscope/benchmarks/gsm8k/gsm8k.py +6 -12
  113. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +26 -24
  114. evalscope/benchmarks/hellaswag/__init__.py +3 -2
  115. evalscope/benchmarks/hellaswag/hellaswag.py +15 -19
  116. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +27 -23
  117. evalscope/benchmarks/humaneval/__init__.py +1 -1
  118. evalscope/benchmarks/humaneval/humaneval.py +15 -18
  119. evalscope/benchmarks/humaneval/humaneval_adapter.py +0 -1
  120. evalscope/benchmarks/mmlu/__init__.py +3 -2
  121. evalscope/benchmarks/mmlu/mmlu.py +15 -29
  122. evalscope/benchmarks/mmlu/mmlu_adapter.py +85 -77
  123. evalscope/benchmarks/race/__init__.py +3 -2
  124. evalscope/benchmarks/race/race.py +21 -35
  125. evalscope/benchmarks/race/race_adapter.py +32 -29
  126. evalscope/benchmarks/race/samples.jsonl +1 -1
  127. evalscope/benchmarks/trivia_qa/__init__.py +3 -2
  128. evalscope/benchmarks/trivia_qa/samples.jsonl +1 -1
  129. evalscope/benchmarks/trivia_qa/trivia_qa.py +19 -34
  130. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +27 -22
  131. evalscope/benchmarks/truthful_qa/__init__.py +3 -2
  132. evalscope/benchmarks/truthful_qa/truthful_qa.py +25 -29
  133. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +36 -37
  134. evalscope/cli/cli.py +6 -5
  135. evalscope/cli/start_eval.py +31 -0
  136. evalscope/cli/start_perf.py +0 -3
  137. evalscope/cli/start_server.py +27 -41
  138. evalscope/config.py +119 -95
  139. evalscope/constants.py +61 -29
  140. evalscope/evaluator/__init__.py +1 -0
  141. evalscope/evaluator/evaluator.py +96 -377
  142. evalscope/evaluator/humaneval_evaluator.py +158 -0
  143. evalscope/evaluator/rating_eval.py +12 -33
  144. evalscope/evaluator/reviewer/auto_reviewer.py +47 -76
  145. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +10 -20
  146. evalscope/metrics/code_metric.py +3 -9
  147. evalscope/metrics/math_accuracy.py +3 -6
  148. evalscope/metrics/metrics.py +21 -21
  149. evalscope/metrics/rouge_metric.py +11 -25
  150. evalscope/models/__init__.py +1 -2
  151. evalscope/models/api/openai_api.py +40 -29
  152. evalscope/models/custom/__init__.py +0 -1
  153. evalscope/models/custom/custom_model.py +3 -3
  154. evalscope/models/dummy_chat_model.py +7 -8
  155. evalscope/models/model_adapter.py +89 -156
  156. evalscope/models/openai_model.py +20 -20
  157. evalscope/perf/arguments.py +15 -3
  158. evalscope/perf/benchmark.py +7 -9
  159. evalscope/perf/http_client.py +3 -8
  160. evalscope/perf/main.py +10 -0
  161. evalscope/perf/plugin/api/custom_api.py +1 -2
  162. evalscope/perf/plugin/api/dashscope_api.py +1 -2
  163. evalscope/perf/plugin/api/openai_api.py +2 -3
  164. evalscope/perf/plugin/datasets/base.py +1 -2
  165. evalscope/perf/plugin/datasets/flickr8k.py +1 -2
  166. evalscope/perf/plugin/datasets/longalpaca.py +1 -2
  167. evalscope/perf/plugin/datasets/openqa.py +1 -2
  168. evalscope/perf/utils/analysis_result.py +1 -2
  169. evalscope/perf/utils/benchmark_util.py +1 -2
  170. evalscope/perf/utils/db_util.py +11 -8
  171. evalscope/perf/utils/local_server.py +19 -13
  172. evalscope/registry/config/cfg_arena_zhihu.yaml +1 -1
  173. evalscope/registry/tasks/arc.yaml +2 -3
  174. evalscope/registry/tasks/bbh.yaml +3 -4
  175. evalscope/registry/tasks/bbh_mini.yaml +3 -4
  176. evalscope/registry/tasks/ceval.yaml +3 -3
  177. evalscope/registry/tasks/ceval_mini.yaml +3 -4
  178. evalscope/registry/tasks/cmmlu.yaml +3 -3
  179. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +1 -1
  180. evalscope/registry/tasks/general_qa.yaml +1 -1
  181. evalscope/registry/tasks/gsm8k.yaml +2 -2
  182. evalscope/registry/tasks/mmlu.yaml +3 -3
  183. evalscope/registry/tasks/mmlu_mini.yaml +3 -3
  184. evalscope/run.py +184 -375
  185. evalscope/run_arena.py +20 -25
  186. evalscope/summarizer.py +16 -17
  187. evalscope/third_party/longbench_write/README.md +99 -42
  188. evalscope/third_party/longbench_write/default_task.json +1 -1
  189. evalscope/third_party/longbench_write/default_task.yaml +8 -7
  190. evalscope/third_party/longbench_write/eval.py +29 -28
  191. evalscope/third_party/longbench_write/infer.py +16 -104
  192. evalscope/third_party/longbench_write/longbench_write.py +5 -5
  193. evalscope/third_party/longbench_write/resources/judge.txt +1 -1
  194. evalscope/third_party/longbench_write/tools/data_etl.py +4 -5
  195. evalscope/third_party/longbench_write/utils.py +0 -1
  196. evalscope/third_party/toolbench_static/eval.py +14 -15
  197. evalscope/third_party/toolbench_static/infer.py +48 -69
  198. evalscope/third_party/toolbench_static/llm/swift_infer.py +4 -12
  199. evalscope/third_party/toolbench_static/requirements.txt +1 -1
  200. evalscope/third_party/toolbench_static/toolbench_static.py +3 -3
  201. evalscope/tools/combine_reports.py +25 -30
  202. evalscope/tools/rewrite_eval_results.py +14 -46
  203. evalscope/utils/__init__.py +0 -1
  204. evalscope/utils/arena_utils.py +18 -48
  205. evalscope/{perf/utils → utils}/chat_service.py +3 -4
  206. evalscope/utils/completion_parsers.py +3 -8
  207. evalscope/utils/logger.py +9 -7
  208. evalscope/utils/model_utils.py +11 -0
  209. evalscope/utils/utils.py +12 -138
  210. evalscope/version.py +2 -2
  211. {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/METADATA +123 -118
  212. evalscope-0.8.0.dist-info/RECORD +285 -0
  213. tests/cli/test_run.py +54 -15
  214. tests/perf/test_perf.py +4 -0
  215. tests/rag/test_clip_benchmark.py +38 -38
  216. tests/rag/test_mteb.py +3 -2
  217. tests/rag/test_ragas.py +5 -5
  218. tests/swift/test_run_swift_eval.py +2 -3
  219. tests/swift/test_run_swift_vlm_eval.py +2 -3
  220. tests/swift/test_run_swift_vlm_jugde_eval.py +2 -3
  221. evalscope/backend/rag_eval/ragas/metrics/__init__.py +0 -2
  222. evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py +0 -91
  223. evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py +0 -99
  224. evalscope/cache.py +0 -98
  225. evalscope/models/template.py +0 -1446
  226. evalscope/run_ms.py +0 -140
  227. evalscope/utils/task_cfg_parser.py +0 -10
  228. evalscope/utils/task_utils.py +0 -22
  229. evalscope-0.7.2.dist-info/RECORD +0 -286
  230. {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/LICENSE +0 -0
  231. {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/WHEEL +0 -0
  232. {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/entry_points.txt +0 -0
  233. {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/top_level.txt +0 -0
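
The cmteb task changes in the hunks below are largely mechanical: the MTEB TaskMetadata definitions are rewritten from double to single quotes and their dict literals are expanded. For orientation, here is a minimal sketch of how one of these task classes (TNews, defined in evalscope/backend/rag_eval/cmteb/tasks/Classification.py) could be run directly through the standard mteb evaluation API. The embedding model name is illustrative and not part of this release, and within evalscope these tasks appear to be driven through the rag_eval backend (see task_template.py above), so this is only an illustration of the underlying interface:

# Hedged sketch: evaluate the TNews task with the standard mteb API.
# Assumes mteb and sentence-transformers are installed; the model name below
# is illustrative, not something prescribed by this release.
from mteb import MTEB
from sentence_transformers import SentenceTransformer

from evalscope.backend.rag_eval.cmteb.tasks.Classification import TNews

model = SentenceTransformer('BAAI/bge-small-zh-v1.5')  # any text-embedding model
evaluation = MTEB(tasks=[TNews()])  # task class shown in the diff below
evaluation.run(model, output_folder='outputs/mteb_results')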

evalscope/backend/rag_eval/cmteb/tasks/Classification.py

@@ -4,19 +4,19 @@ from mteb.abstasks.TaskMetadata import TaskMetadata
 
  class TNews(AbsTaskClassification):
  metadata = TaskMetadata(
- name="TNews",
- description="Short Text Classification for News",
- reference="https://www.cluebenchmarks.com/introduce.html",
+ name='TNews',
+ description='Short Text Classification for News',
+ reference='https://www.cluebenchmarks.com/introduce.html',
  dataset={
- "path": "C-MTEB/TNews-classification",
- "revision": "317f262bf1e6126357bbe89e875451e4b0938fe4",
+ 'path': 'C-MTEB/TNews-classification',
+ 'revision': '317f262bf1e6126357bbe89e875451e4b0938fe4',
  },
- type="Classification",
- category="s2s",
- modalities=["text"],
- eval_splits=["validation"],
- eval_langs=["cmn-Hans"],
- main_score="accuracy",
+ type='Classification',
+ category='s2s',
+ modalities=['text'],
+ eval_splits=['validation'],
+ eval_langs=['cmn-Hans'],
+ main_score='accuracy',
  date=None,
  domains=None,
  task_subtypes=None,
@@ -67,31 +67,34 @@ class TNews(AbsTaskClassification):
  doi = "10.18653/v1/2020.coling-main.419",
  pages = "4762--4772",
  }""",
- descriptive_stats={"n_samples": None, "avg_character_length": None},
+ descriptive_stats={
+ 'n_samples': None,
+ 'avg_character_length': None
+ },
  )
 
  @property
  def metadata_dict(self) -> dict[str, str]:
  metadata_dict = super().metadata_dict
- metadata_dict["samples_per_label"] = 32
+ metadata_dict['samples_per_label'] = 32
  return metadata_dict
 
 
  class IFlyTek(AbsTaskClassification):
  metadata = TaskMetadata(
- name="IFlyTek",
- description="Long Text classification for the description of Apps",
- reference="https://www.cluebenchmarks.com/introduce.html",
+ name='IFlyTek',
+ description='Long Text classification for the description of Apps',
+ reference='https://www.cluebenchmarks.com/introduce.html',
  dataset={
- "path": "C-MTEB/IFlyTek-classification",
- "revision": "421605374b29664c5fc098418fe20ada9bd55f8a",
+ 'path': 'C-MTEB/IFlyTek-classification',
+ 'revision': '421605374b29664c5fc098418fe20ada9bd55f8a',
  },
- type="Classification",
- category="s2s",
- modalities=["text"],
- eval_splits=["validation"],
- eval_langs=["cmn-Hans"],
- main_score="accuracy",
+ type='Classification',
+ category='s2s',
+ modalities=['text'],
+ eval_splits=['validation'],
+ eval_langs=['cmn-Hans'],
+ main_score='accuracy',
  date=None,
  domains=None,
  task_subtypes=None,
@@ -143,32 +146,36 @@ class IFlyTek(AbsTaskClassification):
  pages = "4762--4772",
  abstract = "The advent of natural language understanding (NLU) benchmarks for English, such as GLUE and SuperGLUE allows new NLU models to be evaluated across a diverse set of tasks. These comprehensive benchmarks have facilitated a broad range of research and applications in natural language processing (NLP). The problem, however, is that most such benchmarks are limited to English, which has made it difficult to replicate many of the successes in English NLU for other languages. To help remedy this issue, we introduce the first large-scale Chinese Language Understanding Evaluation (CLUE) benchmark. CLUE is an open-ended, community-driven project that brings together 9 tasks spanning several well-established single-sentence/sentence-pair classification tasks, as well as machine reading comprehension, all on original Chinese text. To establish results on these tasks, we report scores using an exhaustive set of current state-of-the-art pre-trained Chinese models (9 in total). We also introduce a number of supplementary datasets and additional tools to help facilitate further progress on Chinese NLU. Our benchmark is released at https://www.cluebenchmarks.com",
  }""",
- descriptive_stats={"n_samples": None, "avg_character_length": None},
+ descriptive_stats={
+ 'n_samples': None,
+ 'avg_character_length': None
+ },
  )
 
  @property
  def metadata_dict(self) -> dict[str, str]:
  metadata_dict = super().metadata_dict
- metadata_dict["samples_per_label"] = 32
- metadata_dict["n_experiments"] = 5
+ metadata_dict['samples_per_label'] = 32
+ metadata_dict['n_experiments'] = 5
  return metadata_dict
 
 
  class MultilingualSentiment(AbsTaskClassification):
  metadata = TaskMetadata(
- name="MultilingualSentiment",
- description="A collection of multilingual sentiments datasets grouped into 3 classes -- positive, neutral, negative",
- reference="https://github.com/tyqiangz/multilingual-sentiment-datasets",
+ name='MultilingualSentiment',
+ description=
+ 'A collection of multilingual sentiments datasets grouped into 3 classes -- positive, neutral, negative',
+ reference='https://github.com/tyqiangz/multilingual-sentiment-datasets',
  dataset={
- "path": "C-MTEB/MultilingualSentiment-classification",
- "revision": "46958b007a63fdbf239b7672c25d0bea67b5ea1a",
+ 'path': 'C-MTEB/MultilingualSentiment-classification',
+ 'revision': '46958b007a63fdbf239b7672c25d0bea67b5ea1a',
  },
- type="Classification",
- category="s2s",
- modalities=["text"],
- eval_splits=["validation", "test"],
- eval_langs=["cmn-Hans"],
- main_score="accuracy",
+ type='Classification',
+ category='s2s',
+ modalities=['text'],
+ eval_splits=['validation', 'test'],
+ eval_langs=['cmn-Hans'],
+ main_score='accuracy',
  date=None,
  domains=None,
  task_subtypes=None,
@@ -177,31 +184,34 @@ class MultilingualSentiment(AbsTaskClassification):
  dialect=None,
  sample_creation=None,
  bibtex_citation=None,
- descriptive_stats={"n_samples": None, "avg_character_length": None},
+ descriptive_stats={
+ 'n_samples': None,
+ 'avg_character_length': None
+ },
  )
 
  @property
  def metadata_dict(self) -> dict[str, str]:
  metadata_dict = super().metadata_dict
- metadata_dict["samples_per_label"] = 32
+ metadata_dict['samples_per_label'] = 32
  return metadata_dict
 
 
  class JDReview(AbsTaskClassification):
  metadata = TaskMetadata(
- name="JDReview",
- description="review for iphone",
- reference="https://aclanthology.org/2023.nodalida-1.20/",
+ name='JDReview',
+ description='review for iphone',
+ reference='https://aclanthology.org/2023.nodalida-1.20/',
  dataset={
- "path": "C-MTEB/JDReview-classification",
- "revision": "b7c64bd89eb87f8ded463478346f76731f07bf8b",
+ 'path': 'C-MTEB/JDReview-classification',
+ 'revision': 'b7c64bd89eb87f8ded463478346f76731f07bf8b',
  },
- type="Classification",
- category="s2s",
- modalities=["text"],
- eval_splits=["test"],
- eval_langs=["cmn-Hans"],
- main_score="accuracy",
+ type='Classification',
+ category='s2s',
+ modalities=['text'],
+ eval_splits=['test'],
+ eval_langs=['cmn-Hans'],
+ main_score='accuracy',
  date=None,
  domains=None,
  task_subtypes=None,
@@ -215,31 +225,34 @@ class JDReview(AbsTaskClassification):
  journal={arXiv preprint arXiv:2309.07597},
  year={2023}
  }""",
- descriptive_stats={"n_samples": None, "avg_character_length": None},
+ descriptive_stats={
+ 'n_samples': None,
+ 'avg_character_length': None
+ },
  )
 
  @property
  def metadata_dict(self) -> dict[str, str]:
  metadata_dict = super().metadata_dict
- metadata_dict["samples_per_label"] = 32
+ metadata_dict['samples_per_label'] = 32
  return metadata_dict
 
 
  class OnlineShopping(AbsTaskClassification):
  metadata = TaskMetadata(
- name="OnlineShopping",
- description="Sentiment Analysis of User Reviews on Online Shopping Websites",
- reference="https://aclanthology.org/2023.nodalida-1.20/",
+ name='OnlineShopping',
+ description='Sentiment Analysis of User Reviews on Online Shopping Websites',
+ reference='https://aclanthology.org/2023.nodalida-1.20/',
  dataset={
- "path": "C-MTEB/OnlineShopping-classification",
- "revision": "e610f2ebd179a8fda30ae534c3878750a96db120",
+ 'path': 'C-MTEB/OnlineShopping-classification',
+ 'revision': 'e610f2ebd179a8fda30ae534c3878750a96db120',
  },
- type="Classification",
- category="s2s",
- modalities=["text"],
- eval_splits=["test"],
- eval_langs=["cmn-Hans"],
- main_score="accuracy",
+ type='Classification',
+ category='s2s',
+ modalities=['text'],
+ eval_splits=['test'],
+ eval_langs=['cmn-Hans'],
+ main_score='accuracy',
  date=None,
  domains=None,
  task_subtypes=None,
@@ -253,31 +266,34 @@ class OnlineShopping(AbsTaskClassification):
  journal={arXiv preprint arXiv:2309.07597},
  year={2023}
  }""",
- descriptive_stats={"n_samples": None, "avg_character_length": None},
+ descriptive_stats={
+ 'n_samples': None,
+ 'avg_character_length': None
+ },
  )
 
  @property
  def metadata_dict(self) -> dict[str, str]:
  metadata_dict = super().metadata_dict
- metadata_dict["samples_per_label"] = 32
+ metadata_dict['samples_per_label'] = 32
  return metadata_dict
 
 
  class Waimai(AbsTaskClassification):
  metadata = TaskMetadata(
- name="Waimai",
- description="Sentiment Analysis of user reviews on takeaway platforms",
- reference="https://aclanthology.org/2023.nodalida-1.20/",
+ name='Waimai',
+ description='Sentiment Analysis of user reviews on takeaway platforms',
+ reference='https://aclanthology.org/2023.nodalida-1.20/',
  dataset={
- "path": "C-MTEB/waimai-classification",
- "revision": "339287def212450dcaa9df8c22bf93e9980c7023",
+ 'path': 'C-MTEB/waimai-classification',
+ 'revision': '339287def212450dcaa9df8c22bf93e9980c7023',
  },
- type="Classification",
- category="s2s",
- modalities=["text"],
- eval_splits=["test"],
- eval_langs=["cmn-Hans"],
- main_score="accuracy",
+ type='Classification',
+ category='s2s',
+ modalities=['text'],
+ eval_splits=['test'],
+ eval_langs=['cmn-Hans'],
+ main_score='accuracy',
  date=None,
  domains=None,
  task_subtypes=None,
@@ -291,12 +307,15 @@ class Waimai(AbsTaskClassification):
  journal={arXiv preprint arXiv:2309.07597},
  year={2023}
  }""",
- descriptive_stats={"n_samples": None, "avg_character_length": None},
+ descriptive_stats={
+ 'n_samples': None,
+ 'avg_character_length': None
+ },
  )
 
  @property
  def metadata_dict(self) -> dict[str, str]:
  metadata_dict = super().metadata_dict
- metadata_dict["samples_per_label"] = 32
+ metadata_dict['samples_per_label'] = 32
 
  return metadata_dict

evalscope/backend/rag_eval/cmteb/tasks/Clustering.py

@@ -1,12 +1,7 @@
  import itertools
-
  from datasets import Dataset, DatasetDict
-
  from mteb.abstasks.AbsTaskClustering import AbsTaskClustering
- from mteb.abstasks.AbsTaskClusteringFast import (
- AbsTaskClusteringFast,
- check_label_distribution,
- )
+ from mteb.abstasks.AbsTaskClusteringFast import AbsTaskClusteringFast, check_label_distribution
  from mteb.abstasks.TaskMetadata import TaskMetadata
 
  NUM_SAMPLES = 2048
@@ -46,7 +41,9 @@ class CLSClusteringFastS2S(AbsTaskClusteringFast):
  primaryClass={cs.CL}
  }""", # noqa
  descriptive_stats={
- 'n_samples': {'test': NUM_SAMPLES},
+ 'n_samples': {
+ 'test': NUM_SAMPLES
+ },
  'avg_character_length': {},
  },
  )
@@ -55,9 +52,7 @@ class CLSClusteringFastS2S(AbsTaskClusteringFast):
  ds = {}
  for split in self.metadata.eval_splits:
  labels = list(itertools.chain.from_iterable(self.dataset[split]['labels']))
- sentences = list(
- itertools.chain.from_iterable(self.dataset[split]['sentences'])
- )
+ sentences = list(itertools.chain.from_iterable(self.dataset[split]['sentences']))
 
  check_label_distribution(self.dataset[split])
 
@@ -106,7 +101,9 @@ class CLSClusteringFastP2P(AbsTaskClusteringFast):
  primaryClass={cs.CL}
  }""", # noqa
  descriptive_stats={
- 'n_samples': {'test': NUM_SAMPLES},
+ 'n_samples': {
+ 'test': NUM_SAMPLES
+ },
  'avg_character_length': {},
  },
  )
@@ -115,9 +112,7 @@ class CLSClusteringFastP2P(AbsTaskClusteringFast):
  ds = {}
  for split in self.metadata.eval_splits:
  labels = list(itertools.chain.from_iterable(self.dataset[split]['labels']))
- sentences = list(
- itertools.chain.from_iterable(self.dataset[split]['sentences'])
- )
+ sentences = list(itertools.chain.from_iterable(self.dataset[split]['sentences']))
 
  check_label_distribution(self.dataset[split])
 
@@ -166,7 +161,9 @@ class ThuNewsClusteringFastS2S(AbsTaskClusteringFast):
  url = {https://github.com/thunlp/THUCTC}
  }""",
  descriptive_stats={
- 'n_samples': {'test': NUM_SAMPLES},
+ 'n_samples': {
+ 'test': NUM_SAMPLES
+ },
  'avg_character_length': {},
  },
  )
@@ -175,9 +172,7 @@ class ThuNewsClusteringFastS2S(AbsTaskClusteringFast):
  ds = {}
  for split in self.metadata.eval_splits:
  labels = list(itertools.chain.from_iterable(self.dataset[split]['labels']))
- sentences = list(
- itertools.chain.from_iterable(self.dataset[split]['sentences'])
- )
+ sentences = list(itertools.chain.from_iterable(self.dataset[split]['sentences']))
 
  check_label_distribution(self.dataset[split])
 
@@ -226,7 +221,9 @@ class ThuNewsClusteringFastP2P(AbsTaskClusteringFast):
  url = {https://github.com/thunlp/THUCTC}
  }""",
  descriptive_stats={
- 'n_samples': {'test': NUM_SAMPLES},
+ 'n_samples': {
+ 'test': NUM_SAMPLES
+ },
  'avg_character_length': {},
  },
  )
@@ -235,9 +232,7 @@ class ThuNewsClusteringFastP2P(AbsTaskClusteringFast):
  ds = {}
  for split in self.metadata.eval_splits:
  labels = list(itertools.chain.from_iterable(self.dataset[split]['labels']))
- sentences = list(
- itertools.chain.from_iterable(self.dataset[split]['sentences'])
- )
+ sentences = list(itertools.chain.from_iterable(self.dataset[split]['sentences']))
 
  check_label_distribution(self.dataset[split])
 
evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py

@@ -1,31 +1,29 @@
- from typing import Optional
  from mteb import AbsTaskRetrieval
  from mteb import HFDataLoader as CustomDataLoader
  from mteb.abstasks.TaskMetadata import TaskMetadata
+ from typing import Optional
 
 
  class CustomRetrieval(AbsTaskRetrieval):
  metadata: TaskMetadata
  ignore_identical_ids: bool = True
 
- def __init__(
- self, dataset_path: Optional[str] = "custom_eval/text/retrieval", **kwargs
- ):
+ def __init__(self, dataset_path: Optional[str] = 'custom_eval/text/retrieval', **kwargs):
  super().__init__(**kwargs)
  self.metadata = TaskMetadata(
- name="CustomRetrieval",
- description="CustomRetrieval Task",
+ name='CustomRetrieval',
+ description='CustomRetrieval Task',
  reference=None,
  dataset={
- "path": dataset_path,
- "revision": "v1",
+ 'path': dataset_path,
+ 'revision': 'v1',
  },
- type="Retrieval",
- category="s2p",
- modalities=["text"],
- eval_splits=["test"],
- eval_langs=["cmn-Hans"],
- main_score="recall_at_5",
+ type='Retrieval',
+ category='s2p',
+ modalities=['text'],
+ eval_splits=['test'],
+ eval_langs=['cmn-Hans'],
+ main_score='recall_at_5',
  date=None,
  domains=None,
  task_subtypes=None,
@@ -33,7 +31,7 @@ class CustomRetrieval(AbsTaskRetrieval):
  annotations_creators=None,
  dialect=None,
  sample_creation=None,
- bibtex_citation="",
+ bibtex_citation='',
  descriptive_stats={},
  )
 
@@ -41,17 +39,17 @@
  if self.data_loaded:
  return
  self.corpus, self.queries, self.relevant_docs = {}, {}, {}
- dataset_path = self.metadata_dict["dataset"]["path"]
+ dataset_path = self.metadata_dict['dataset']['path']
 
- for split in kwargs.get("eval_splits", self.metadata_dict["eval_splits"]):
+ for split in kwargs.get('eval_splits', self.metadata_dict['eval_splits']):
  corpus, queries, qrels = CustomDataLoader(
  data_folder=dataset_path,
  streaming=False,
  keep_in_memory=False,
  ).load(split=split)
  # Conversion from DataSet
- queries = {query["id"]: query["text"] for query in queries}
- corpus = {doc["id"]: {"text": doc["text"]} for doc in corpus}
+ queries = {query['id']: query['text'] for query in queries}
+ corpus = {doc['id']: {'text': doc['text']} for doc in corpus}
  self.corpus[split], self.queries[split], self.relevant_docs[split] = (
  corpus,
  queries,

evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py

@@ -4,19 +4,19 @@ from mteb.abstasks.TaskMetadata import TaskMetadata
 
  class Ocnli(AbsTaskPairClassification):
  metadata = TaskMetadata(
- name="Ocnli",
- description="Original Chinese Natural Language Inference dataset",
- reference="https://arxiv.org/abs/2010.05444",
+ name='Ocnli',
+ description='Original Chinese Natural Language Inference dataset',
+ reference='https://arxiv.org/abs/2010.05444',
  dataset={
- "path": "C-MTEB/OCNLI",
- "revision": "66e76a618a34d6d565d5538088562851e6daa7ec",
+ 'path': 'C-MTEB/OCNLI',
+ 'revision': '66e76a618a34d6d565d5538088562851e6daa7ec',
  },
- type="PairClassification",
- category="s2s",
- modalities=["text"],
- eval_splits=["validation"],
- eval_langs=["cmn-Hans"],
- main_score="max_accuracy",
+ type='PairClassification',
+ category='s2s',
+ modalities=['text'],
+ eval_splits=['validation'],
+ eval_langs=['cmn-Hans'],
+ main_score='max_accuracy',
  date=None,
  domains=None,
  task_subtypes=None,
@@ -25,36 +25,39 @@ class Ocnli(AbsTaskPairClassification):
  dialect=None,
  sample_creation=None,
  bibtex_citation="""@misc{hu2020ocnli,
- title={OCNLI: Original Chinese Natural Language Inference},
+ title={OCNLI: Original Chinese Natural Language Inference},
  author={Hai Hu and Kyle Richardson and Liang Xu and Lu Li and Sandra Kuebler and Lawrence S. Moss},
  year={2020},
  eprint={2010.05444},
  archivePrefix={arXiv},
  primaryClass={cs.CL}
  }""",
- descriptive_stats={"n_samples": None, "avg_character_length": None},
+ descriptive_stats={
+ 'n_samples': None,
+ 'avg_character_length': None
+ },
  )
 
  def dataset_transform(self):
- self.dataset = self.dataset.rename_column("sent1", "sentence1")
- self.dataset = self.dataset.rename_column("sent2", "sentence2")
+ self.dataset = self.dataset.rename_column('sent1', 'sentence1')
+ self.dataset = self.dataset.rename_column('sent2', 'sentence2')
 
 
  class Cmnli(AbsTaskPairClassification):
  metadata = TaskMetadata(
- name="Cmnli",
- description="Chinese Multi-Genre NLI",
- reference="https://huggingface.co/datasets/clue/viewer/cmnli",
+ name='Cmnli',
+ description='Chinese Multi-Genre NLI',
+ reference='https://huggingface.co/datasets/clue/viewer/cmnli',
  dataset={
- "path": "C-MTEB/CMNLI",
- "revision": "41bc36f332156f7adc9e38f53777c959b2ae9766",
+ 'path': 'C-MTEB/CMNLI',
+ 'revision': '41bc36f332156f7adc9e38f53777c959b2ae9766',
  },
- type="PairClassification",
- category="s2s",
- modalities=["text"],
- eval_splits=["validation", "test"],
- eval_langs=["cmn-Hans"],
- main_score="max_accuracy",
+ type='PairClassification',
+ category='s2s',
+ modalities=['text'],
+ eval_splits=['validation', 'test'],
+ eval_langs=['cmn-Hans'],
+ main_score='max_accuracy',
  date=None,
  domains=None,
  task_subtypes=None,
@@ -105,9 +108,12 @@ class Cmnli(AbsTaskPairClassification):
  doi = "10.18653/v1/2020.coling-main.419",
  pages = "4762--4772",
  }""",
- descriptive_stats={"n_samples": None, "avg_character_length": None},
+ descriptive_stats={
+ 'n_samples': None,
+ 'avg_character_length': None
+ },
  )
 
  def dataset_transform(self):
- self.dataset = self.dataset.rename_column("sent1", "sentence1")
- self.dataset = self.dataset.rename_column("sent2", "sentence2")
+ self.dataset = self.dataset.rename_column('sent1', 'sentence1')
+ self.dataset = self.dataset.rename_column('sent2', 'sentence2')

evalscope/backend/rag_eval/cmteb/tasks/Reranking.py

@@ -33,7 +33,10 @@ class T2Reranking(AbsTaskReranking):
  archivePrefix={arXiv},
  primaryClass={cs.IR}
  }""", # noqa
- descriptive_stats={'n_samples': None, 'avg_character_length': None},
+ descriptive_stats={
+ 'n_samples': None,
+ 'avg_character_length': None
+ },
  )
 
 
@@ -68,7 +71,10 @@ class MMarcoReranking(AbsTaskReranking):
  archivePrefix={arXiv},
  primaryClass={cs.CL}
  }""", # noqa
- descriptive_stats={'n_samples': None, 'avg_character_length': None},
+ descriptive_stats={
+ 'n_samples': None,
+ 'avg_character_length': None
+ },
  )
 
 
@@ -105,8 +111,12 @@ class CMedQAv1(AbsTaskReranking):
  publisher={Multidisciplinary Digital Publishing Institute}
  }""",
  descriptive_stats={
- 'n_samples': {'test': 2000},
- 'avg_character_length': {'test': 165},
+ 'n_samples': {
+ 'test': 2000
+ },
+ 'avg_character_length': {
+ 'test': 165
+ },
  },
  )
 
@@ -146,5 +156,8 @@ keywords={Biomedical imaging;Data mining;Semantics;Medical services;Feature extr
  doi={10.1109/ACCESS.2018.2883637},
  ISSN={2169-3536},
  month={},}""", # noqa
- descriptive_stats={'n_samples': None, 'avg_character_length': None},
+ descriptive_stats={
+ 'n_samples': None,
+ 'avg_character_length': None
+ },
  )