evalscope 0.6.0__tar.gz → 0.6.0rc0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of evalscope might be problematic.
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/PKG-INFO +9 -9
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/backend/opencompass/tasks/eval_datasets.py +2 -1
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +96 -96
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +70 -71
- evalscope-0.6.0rc0/evalscope/version.py +4 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope.egg-info/PKG-INFO +9 -9
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope.egg-info/requires.txt +8 -8
- evalscope-0.6.0/evalscope/version.py +0 -4
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/README.md +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/backend/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/backend/base.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/backend/opencompass/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/backend/opencompass/api_meta_template.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/backend/opencompass/backend_manager.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/backend/opencompass/tasks/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/backend/opencompass/tasks/eval_api.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/backend/rag_eval/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/backend/rag_eval/backend_manager.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/backend/rag_eval/clip_benchmark/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/backend/rag_eval/clip_benchmark/arguments.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/backend/rag_eval/clip_benchmark/task_template.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/backend/rag_eval/clip_benchmark/tasks/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/backend/rag_eval/cmteb/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/backend/rag_eval/cmteb/arguments.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/backend/rag_eval/cmteb/base.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/backend/rag_eval/cmteb/task_template.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/backend/rag_eval/cmteb/tasks/Classification.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/backend/rag_eval/cmteb/tasks/STS.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/backend/rag_eval/cmteb/tasks/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/backend/rag_eval/ragas/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/backend/rag_eval/ragas/arguments.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/backend/rag_eval/ragas/metrics/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/backend/rag_eval/ragas/task_template.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/backend/rag_eval/ragas/tasks/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/backend/vlm_eval_kit/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/backend/vlm_eval_kit/backend_manager.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/arc/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/arc/ai2_arc.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/arc/arc_adapter.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/bbh/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/bbh/bbh_adapter.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/bbh/cot_prompts/navigate.txt +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/bbh/cot_prompts/snarks.txt +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/benchmark.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/ceval/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/ceval/ceval_adapter.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/ceval/ceval_exam.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/cmmlu/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/cmmlu/cmmlu.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/cmmlu/cmmlu_adapter.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/competition_math/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/competition_math/competition_math.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/competition_math/competition_math_adapter.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/data_adapter.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/general_qa/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/general_qa/general_qa_adapter.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/gsm8k/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/gsm8k/gsm8k.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/gsm8k/gsm8k_adapter.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/hellaswag/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/hellaswag/hellaswag.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/hellaswag/hellaswag_adapter.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/humaneval/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/humaneval/humaneval.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/humaneval/humaneval_adapter.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/mmlu/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/mmlu/mmlu.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/mmlu/mmlu_adapter.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/race/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/race/race.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/race/race_adapter.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/trivia_qa/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/truthful_qa/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/cache.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/cli/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/cli/base.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/cli/cli.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/cli/start_perf.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/cli/start_server.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/config.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/constants.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/evaluator/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/evaluator/evaluator.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/evaluator/rating_eval.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/evaluator/reviewer/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/evaluator/reviewer/auto_reviewer.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/metrics/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/metrics/bundled_rouge_score/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/metrics/bundled_rouge_score/rouge_scorer.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/metrics/code_metric.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/metrics/math_accuracy.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/metrics/metrics.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/metrics/rouge_metric.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/models/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/models/api/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/models/api/openai_api.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/models/custom/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/models/custom/custom_model.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/models/dummy_chat_model.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/models/model.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/models/model_adapter.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/models/openai_model.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/models/template.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/perf/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/perf/_logging.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/perf/api_plugin_base.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/perf/custom_api.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/perf/dashscope_api.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/perf/dataset_plugin_base.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/perf/datasets/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/perf/datasets/line_by_line.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/perf/datasets/longalpaca_12k.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/perf/datasets/openqa.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/perf/how_to_analysis_result.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/perf/http_client.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/perf/openai_api.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/perf/plugin_registry.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/perf/query_parameters.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/perf/server_sent_event.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/preprocess/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/preprocess/tokenizers/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/preprocess/tokenizers/gpt2_tokenizer.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/registry/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/registry/tasks/arc.yaml +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/registry/tasks/bbh.yaml +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/registry/tasks/bbh_mini.yaml +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/registry/tasks/ceval.yaml +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/registry/tasks/ceval_mini.yaml +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/registry/tasks/cmmlu.yaml +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/registry/tasks/general_qa.yaml +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/registry/tasks/gsm8k.yaml +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/registry/tasks/mmlu.yaml +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/registry/tasks/mmlu_mini.yaml +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/run.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/run_arena.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/run_ms.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/summarizer.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/third_party/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/third_party/longbench_write/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/third_party/longbench_write/eval.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/third_party/longbench_write/infer.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/third_party/longbench_write/longbench_write.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/third_party/longbench_write/resources/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/third_party/longbench_write/resources/judge.txt +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/third_party/longbench_write/resources/longbench_write.jsonl +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/third_party/longbench_write/resources/longbench_write_en.jsonl +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/third_party/longbench_write/resources/longwrite_ruler.jsonl +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/third_party/longbench_write/tools/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/third_party/longbench_write/tools/data_etl.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/third_party/longbench_write/utils.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/third_party/toolbench_static/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/third_party/toolbench_static/eval.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/third_party/toolbench_static/infer.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/third_party/toolbench_static/llm/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/third_party/toolbench_static/toolbench_static.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/tools/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/tools/combine_reports.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/tools/gen_mmlu_subject_mapping.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/tools/rewrite_eval_results.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/utils/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/utils/arena_utils.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/utils/completion_parsers.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/utils/logger.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/utils/task_cfg_parser.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/utils/task_utils.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/utils/utils.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope.egg-info/SOURCES.txt +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope.egg-info/dependency_links.txt +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope.egg-info/entry_points.txt +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope.egg-info/not-zip-safe +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope.egg-info/top_level.txt +0 -0
- {evalscope-0.6.0 → evalscope-0.6.0rc0}/setup.cfg +0 -0
{evalscope-0.6.0 → evalscope-0.6.0rc0}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: evalscope
-Version: 0.6.0
+Version: 0.6.0rc0
 Summary: EvalScope: Lightweight LLMs Evaluation Framework
 Home-page: https://github.com/modelscope/evalscope
 Author: ModelScope team
@@ -28,7 +28,7 @@ Requires-Dist: nltk>=3.9
 Requires-Dist: openai
 Requires-Dist: pandas
 Requires-Dist: plotly
-Requires-Dist: pyarrow
+Requires-Dist: pyarrow<=17.0.0
 Requires-Dist: pympler
 Requires-Dist: pyyaml
 Requires-Dist: regex
@@ -48,12 +48,12 @@ Requires-Dist: transformers_stream_generator
 Requires-Dist: jieba
 Requires-Dist: rouge-chinese
 Provides-Extra: opencompass
-Requires-Dist: ms-opencompass>=0.1.
+Requires-Dist: ms-opencompass>=0.1.3; extra == "opencompass"
 Provides-Extra: vlmeval
 Requires-Dist: ms-vlmeval>=0.0.5; extra == "vlmeval"
 Provides-Extra: rag
-Requires-Dist: mteb
-Requires-Dist: ragas
+Requires-Dist: mteb==1.19.4; extra == "rag"
+Requires-Dist: ragas==0.2.3; extra == "rag"
 Requires-Dist: webdataset>0.2.0; extra == "rag"
 Provides-Extra: inner
 Requires-Dist: absl-py; extra == "inner"
@@ -96,7 +96,7 @@ Requires-Dist: nltk>=3.9; extra == "all"
 Requires-Dist: openai; extra == "all"
 Requires-Dist: pandas; extra == "all"
 Requires-Dist: plotly; extra == "all"
-Requires-Dist: pyarrow; extra == "all"
+Requires-Dist: pyarrow<=17.0.0; extra == "all"
 Requires-Dist: pympler; extra == "all"
 Requires-Dist: pyyaml; extra == "all"
 Requires-Dist: regex; extra == "all"
@@ -115,10 +115,10 @@ Requires-Dist: transformers>=4.33; extra == "all"
 Requires-Dist: transformers_stream_generator; extra == "all"
 Requires-Dist: jieba; extra == "all"
 Requires-Dist: rouge-chinese; extra == "all"
-Requires-Dist: ms-opencompass>=0.1.
+Requires-Dist: ms-opencompass>=0.1.3; extra == "all"
 Requires-Dist: ms-vlmeval>=0.0.5; extra == "all"
-Requires-Dist: mteb
-Requires-Dist: ragas
+Requires-Dist: mteb==1.19.4; extra == "all"
+Requires-Dist: ragas==0.2.3; extra == "all"
 Requires-Dist: webdataset>0.2.0; extra == "all"
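Taken together, the metadata changes above pin the optional stacks: installing evalscope[rag] on this rc resolves mteb to exactly 1.19.4 and ragas to 0.2.3, and pyarrow is capped at <=17.0.0 everywhere. A minimal sketch for confirming what the resolver actually installed, using only the standard library (the distribution names are taken from the Requires-Dist lines above):

    from importlib.metadata import PackageNotFoundError, version

    # Report the resolved versions of the distributions this release pins.
    for dist in ('mteb', 'ragas', 'pyarrow', 'ms-opencompass'):
        try:
            print(f'{dist}=={version(dist)}')
        except PackageNotFoundError:
            print(f'{dist}: not installed')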
{evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/backend/opencompass/tasks/eval_datasets.py

@@ -50,12 +50,13 @@ with read_base():
     from opencompass.configs.datasets.nq.nq_gen_c788f6 import nq_datasets
     from opencompass.configs.datasets.triviaqa.triviaqa_gen_2121ce import triviaqa_datasets
     from opencompass.configs.datasets.cmb.cmb_gen_dfb5c4 import cmb_datasets
+    from opencompass.configs.datasets.cmmlu.cmmlu_gen_c13365 import cmmlu_datasets
+    from opencompass.configs.datasets.bbh.bbh_gen_5b92b0 import bbh_datasets

     # Note: to be supported
     # from opencompass.configs.datasets.flores.flores_gen_806ede import flores_datasets
     # from opencompass.configs.datasets.TheoremQA.TheoremQA_5shot_gen_6f0af8 import TheoremQA_datasets
     # from opencompass.configs.datasets.commonsenseqa.commonsenseqa_gen_c946f2 import commonsenseqa_datasets
-    # from opencompass.configs.datasets.bbh.bbh_gen_5b92b0 import bbh_datasets


 datasets = []
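Each import inside with read_base() binds a *_datasets list of OpenCompass dataset configs, so the rc adds CMMLU and promotes BBH out of the "to be supported" comment block. The datasets = [] shown as context is then filled from those lists; a hedged sketch of the usual OpenCompass aggregation idiom (illustrative only, this file's actual population code is not shown in the diff):

    # Illustrative OpenCompass config idiom: gather every imported
    # *_datasets list into the single `datasets` list the runner consumes.
    datasets = []
    for _name, _value in list(locals().items()):
        if _name.endswith('_datasets') and isinstance(_value, list):
            datasets += _value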
{evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/backend/rag_eval/cmteb/tasks/Clustering.py

@@ -17,57 +17,57 @@ class CLSClusteringFastS2S(AbsTaskClusteringFast):
     max_fraction_of_documents_to_embed = None

     metadata = TaskMetadata(
-        name=
-        description=
-        reference=
+        name='CLSClusteringS2S',
+        description='Clustering of titles from CLS dataset. Clustering of 13 sets on the main category.',
+        reference='https://arxiv.org/abs/2209.05034',
         dataset={
-
-
+            'path': 'C-MTEB/CLSClusteringS2S',
+            'revision': 'e458b3f5414b62b7f9f83499ac1f5497ae2e869f',
         },
-        type=
-        category=
-        modalities=[
-        eval_splits=[
-        eval_langs=[
-        main_score=
-        date=(
-        domains=[
-        task_subtypes=[
-        license=
-        annotations_creators=
+        type='Clustering',
+        category='s2s',
+        modalities=['text'],
+        eval_splits=['test'],
+        eval_langs=['cmn-Hans'],
+        main_score='v_measure',
+        date=('2022-01-01', '2022-09-12'),
+        domains=['Academic', 'Written'],
+        task_subtypes=['Thematic clustering', 'Topic classification'],
+        license='apache-2.0',
+        annotations_creators='derived',
         dialect=[],
-        sample_creation=
+        sample_creation='found',
         bibtex_citation="""@misc{li2022csl,
-        title={CSL: A Large-scale Chinese Scientific Literature Dataset},
+        title={CSL: A Large-scale Chinese Scientific Literature Dataset},
         author={Yudong Li and Yuqing Zhang and Zhe Zhao and Linlin Shen and Weijie Liu and Weiquan Mao and Hui Zhang},
         year={2022},
         eprint={2209.05034},
         archivePrefix={arXiv},
         primaryClass={cs.CL}
-        }""",
+        }""", # noqa
         descriptive_stats={
-
-
+            'n_samples': {'test': NUM_SAMPLES},
+            'avg_character_length': {},
         },
     )

     def dataset_transform(self):
         ds = {}
         for split in self.metadata.eval_splits:
-            labels = list(itertools.chain.from_iterable(self.dataset[split][
+            labels = list(itertools.chain.from_iterable(self.dataset[split]['labels']))
             sentences = list(
-                itertools.chain.from_iterable(self.dataset[split][
+                itertools.chain.from_iterable(self.dataset[split]['sentences'])
             )

             check_label_distribution(self.dataset[split])

-            ds[split] = Dataset.from_dict({
+            ds[split] = Dataset.from_dict({'labels': labels, 'sentences': sentences})
         self.dataset = DatasetDict(ds)
         self.dataset = self.stratified_subsampling(
             self.dataset,
             self.seed,
             self.metadata.eval_splits,
-            label=
+            label='labels',
             n_samples=NUM_SAMPLES,
         )
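The rewritten dataset_transform flattens C-MTEB's nested clustering format, where each row carries a list of sentences and a parallel list of labels, into flat columns before subsampling. A self-contained toy illustration of that flattening step (the data values are hypothetical):

    import itertools

    # Hypothetical rows in the nested C-MTEB clustering layout.
    split = {
        'sentences': [['title a', 'title b'], ['title c']],
        'labels': [['physics', 'physics'], ['biology']],
    }

    labels = list(itertools.chain.from_iterable(split['labels']))
    sentences = list(itertools.chain.from_iterable(split['sentences']))

    assert sentences == ['title a', 'title b', 'title c']
    assert labels == ['physics', 'physics', 'biology']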
@@ -77,57 +77,57 @@ class CLSClusteringFastP2P(AbsTaskClusteringFast):
     max_fraction_of_documents_to_embed = None

     metadata = TaskMetadata(
-        name=
-        description=
-        reference=
+        name='CLSClusteringP2P',
+        description='Clustering of titles + abstract from CLS dataset. Clustering of 13 sets on the main category.',
+        reference='https://arxiv.org/abs/2209.05034',
         dataset={
-
-
+            'path': 'C-MTEB/CLSClusteringP2P',
+            'revision': '4b6227591c6c1a73bc76b1055f3b7f3588e72476',
         },
-        type=
-        category=
-        modalities=[
-        eval_splits=[
-        eval_langs=[
-        main_score=
-        date=(
-        domains=[
-        task_subtypes=[
-        license=
-        annotations_creators=
+        type='Clustering',
+        category='p2p',
+        modalities=['text'],
+        eval_splits=['test'],
+        eval_langs=['cmn-Hans'],
+        main_score='v_measure',
+        date=('2022-01-01', '2022-09-12'),
+        domains=['Academic', 'Written'],
+        task_subtypes=['Thematic clustering', 'Topic classification'],
+        license='apache-2.0',
+        annotations_creators='derived',
         dialect=[],
-        sample_creation=
+        sample_creation='found',
         bibtex_citation="""@misc{li2022csl,
-        title={CSL: A Large-scale Chinese Scientific Literature Dataset},
+        title={CSL: A Large-scale Chinese Scientific Literature Dataset},
         author={Yudong Li and Yuqing Zhang and Zhe Zhao and Linlin Shen and Weijie Liu and Weiquan Mao and Hui Zhang},
         year={2022},
         eprint={2209.05034},
         archivePrefix={arXiv},
         primaryClass={cs.CL}
-        }""",
+        }""", # noqa
         descriptive_stats={
-
-
+            'n_samples': {'test': NUM_SAMPLES},
+            'avg_character_length': {},
         },
     )

     def dataset_transform(self):
         ds = {}
         for split in self.metadata.eval_splits:
-            labels = list(itertools.chain.from_iterable(self.dataset[split][
+            labels = list(itertools.chain.from_iterable(self.dataset[split]['labels']))
             sentences = list(
-                itertools.chain.from_iterable(self.dataset[split][
+                itertools.chain.from_iterable(self.dataset[split]['sentences'])
             )

             check_label_distribution(self.dataset[split])

-            ds[split] = Dataset.from_dict({
+            ds[split] = Dataset.from_dict({'labels': labels, 'sentences': sentences})
         self.dataset = DatasetDict(ds)
         self.dataset = self.stratified_subsampling(
             self.dataset,
             self.seed,
             self.metadata.eval_splits,
-            label=
+            label='labels',
             n_samples=NUM_SAMPLES,
         )
@@ -137,26 +137,26 @@ class ThuNewsClusteringFastS2S(AbsTaskClusteringFast):
     max_fraction_of_documents_to_embed = None

     metadata = TaskMetadata(
-        name=
+        name='ThuNewsClusteringS2S',
         dataset={
-
-
+            'path': 'C-MTEB/ThuNewsClusteringS2S',
+            'revision': '8a8b2caeda43f39e13c4bc5bea0f8a667896e10d',
         },
-        description=
-        reference=
-        type=
-        category=
-        modalities=[
-        eval_splits=[
-        eval_langs=[
-        main_score=
-        date=(
-        domains=[
-        task_subtypes=[
-        license=
-        annotations_creators=
+        description='Clustering of titles from the THUCNews dataset',
+        reference='http://thuctc.thunlp.org/',
+        type='Clustering',
+        category='s2s',
+        modalities=['text'],
+        eval_splits=['test'],
+        eval_langs=['cmn-Hans'],
+        main_score='v_measure',
+        date=('2006-01-01', '2007-01-01'),
+        domains=['News', 'Written'],
+        task_subtypes=['Thematic clustering', 'Topic classification'],
+        license='apache-2.0',
+        annotations_creators='derived',
         dialect=[],
-        sample_creation=
+        sample_creation='found',
         bibtex_citation="""@software{THUCTC,
         author = {Sun, M. and Li, J. and Guo, Z. and Yu, Z. and Zheng, Y. and Si, X. and Liu, Z.},
         title = {THUCTC: An Efficient Chinese Text Classifier},
@@ -166,28 +166,28 @@ class ThuNewsClusteringFastS2S(AbsTaskClusteringFast):
         url = {https://github.com/thunlp/THUCTC}
         }""",
         descriptive_stats={
-
-
+            'n_samples': {'test': NUM_SAMPLES},
+            'avg_character_length': {},
         },
     )

     def dataset_transform(self):
         ds = {}
         for split in self.metadata.eval_splits:
-            labels = list(itertools.chain.from_iterable(self.dataset[split][
+            labels = list(itertools.chain.from_iterable(self.dataset[split]['labels']))
             sentences = list(
-                itertools.chain.from_iterable(self.dataset[split][
+                itertools.chain.from_iterable(self.dataset[split]['sentences'])
             )

             check_label_distribution(self.dataset[split])

-            ds[split] = Dataset.from_dict({
+            ds[split] = Dataset.from_dict({'labels': labels, 'sentences': sentences})
         self.dataset = DatasetDict(ds)
         self.dataset = self.stratified_subsampling(
             self.dataset,
             self.seed,
             self.metadata.eval_splits,
-            label=
+            label='labels',
             n_samples=NUM_SAMPLES,
         )
@@ -197,26 +197,26 @@ class ThuNewsClusteringFastP2P(AbsTaskClusteringFast):
     max_fraction_of_documents_to_embed = None

     metadata = TaskMetadata(
-        name=
+        name='ThuNewsClusteringP2P',
         dataset={
-
-
+            'path': 'C-MTEB/ThuNewsClusteringP2P',
+            'revision': '5798586b105c0434e4f0fe5e767abe619442cf93',
         },
-        description=
-        reference=
-        type=
-        category=
-        modalities=[
-        eval_splits=[
-        eval_langs=[
-        main_score=
-        date=(
-        domains=[
-        task_subtypes=[
-        license=
-        annotations_creators=
+        description='Clustering of titles + abstracts from the THUCNews dataset',
+        reference='http://thuctc.thunlp.org/',
+        type='Clustering',
+        category='p2p',
+        modalities=['text'],
+        eval_splits=['test'],
+        eval_langs=['cmn-Hans'],
+        main_score='v_measure',
+        date=('2006-01-01', '2007-01-01'),
+        domains=['News', 'Written'],
+        task_subtypes=['Thematic clustering', 'Topic classification'],
+        license='apache-2.0',
+        annotations_creators='derived',
         dialect=[],
-        sample_creation=
+        sample_creation='found',
         bibtex_citation="""@software{THUCTC,
         author = {Sun, M. and Li, J. and Guo, Z. and Yu, Z. and Zheng, Y. and Si, X. and Liu, Z.},
         title = {THUCTC: An Efficient Chinese Text Classifier},
@@ -226,27 +226,27 @@ class ThuNewsClusteringFastP2P(AbsTaskClusteringFast):
         url = {https://github.com/thunlp/THUCTC}
         }""",
         descriptive_stats={
-
-
+            'n_samples': {'test': NUM_SAMPLES},
+            'avg_character_length': {},
         },
     )

     def dataset_transform(self):
         ds = {}
         for split in self.metadata.eval_splits:
-            labels = list(itertools.chain.from_iterable(self.dataset[split][
+            labels = list(itertools.chain.from_iterable(self.dataset[split]['labels']))
             sentences = list(
-                itertools.chain.from_iterable(self.dataset[split][
+                itertools.chain.from_iterable(self.dataset[split]['sentences'])
             )

             check_label_distribution(self.dataset[split])

-            ds[split] = Dataset.from_dict({
+            ds[split] = Dataset.from_dict({'labels': labels, 'sentences': sentences})
         self.dataset = DatasetDict(ds)
         self.dataset = self.stratified_subsampling(
             self.dataset,
             self.seed,
             self.metadata.eval_splits,
-            label=
+            label='labels',
             n_samples=NUM_SAMPLES,
         )
{evalscope-0.6.0 → evalscope-0.6.0rc0}/evalscope/backend/rag_eval/cmteb/tasks/Reranking.py

@@ -2,22 +2,21 @@ from mteb.abstasks.AbsTaskReranking import AbsTaskReranking
 from mteb.abstasks.TaskMetadata import HFSubset, TaskMetadata


-
 class T2Reranking(AbsTaskReranking):
     metadata = TaskMetadata(
-        name=
-        description=
-        reference=
+        name='T2Reranking',
+        description='T2Ranking: A large-scale Chinese Benchmark for Passage Ranking',
+        reference='https://arxiv.org/abs/2304.03679',
         dataset={
-
-
+            'path': 'C-MTEB/T2Reranking',
+            'revision': '76631901a18387f85eaa53e5450019b87ad58ef9',
         },
-        type=
-        category=
-        modalities=[
-        eval_splits=[
-        eval_langs=[
-        main_score=
+        type='Reranking',
+        category='s2s',
+        modalities=['text'],
+        eval_splits=['dev'],
+        eval_langs=['cmn-Hans'],
+        main_score='map',
         date=None,
         form=None,
         domains=None,
@@ -27,32 +26,32 @@ class T2Reranking(AbsTaskReranking):
         dialect=None,
         sample_creation=None,
         bibtex_citation="""@misc{xie2023t2ranking,
-        title={T2Ranking: A large-scale Chinese Benchmark for Passage Ranking},
+        title={T2Ranking: A large-scale Chinese Benchmark for Passage Ranking},
         author={Xiaohui Xie and Qian Dong and Bingning Wang and Feiyang Lv and Ting Yao and Weinan Gan and Zhijing Wu and Xiangsheng Li and Haitao Li and Yiqun Liu and Jin Ma},
         year={2023},
         eprint={2304.03679},
         archivePrefix={arXiv},
         primaryClass={cs.IR}
-        }""",
-        descriptive_stats={
+        }""", # noqa
+        descriptive_stats={'n_samples': None, 'avg_character_length': None},
     )


 class MMarcoReranking(AbsTaskReranking):
     metadata = TaskMetadata(
-        name=
-        description=
-        reference=
+        name='MMarcoReranking',
+        description='mMARCO is a multilingual version of the MS MARCO passage ranking dataset',
+        reference='https://github.com/unicamp-dl/mMARCO',
         dataset={
-
-
+            'path': 'C-MTEB/Mmarco-reranking',
+            'revision': '8e0c766dbe9e16e1d221116a3f36795fbade07f6',
         },
-        type=
-        category=
-        modalities=[
-        eval_splits=[
-        eval_langs=[
-        main_score=
+        type='Reranking',
+        category='s2s',
+        modalities=['text'],
+        eval_splits=['dev'],
+        eval_langs=['cmn-Hans'],
+        main_score='map',
         date=None,
         form=None,
         domains=None,
@@ -62,39 +61,39 @@ class MMarcoReranking(AbsTaskReranking):
         dialect=None,
         sample_creation=None,
         bibtex_citation="""@misc{bonifacio2021mmarco,
-        title={mMARCO: A Multilingual Version of MS MARCO Passage Ranking Dataset},
+        title={mMARCO: A Multilingual Version of MS MARCO Passage Ranking Dataset},
         author={Luiz Henrique Bonifacio and Vitor Jeronymo and Hugo Queiroz Abonizio and Israel Campiotti and Marzieh Fadaee and and Roberto Lotufo and Rodrigo Nogueira},
         year={2021},
         eprint={2108.13897},
         archivePrefix={arXiv},
         primaryClass={cs.CL}
-        }""",
-        descriptive_stats={
+        }""", # noqa
+        descriptive_stats={'n_samples': None, 'avg_character_length': None},
     )


 class CMedQAv1(AbsTaskReranking):
     metadata = TaskMetadata(
-        name=
-        description=
-        reference=
+        name='CMedQAv1',
+        description='Chinese community medical question answering',
+        reference='https://github.com/zhangsheng93/cMedQA',
         dataset={
-
-
+            'path': 'C-MTEB/CMedQAv1-reranking',
+            'revision': '8d7f1e942507dac42dc58017c1a001c3717da7df',
         },
-        type=
-        category=
-        modalities=[
-        eval_splits=[
-        eval_langs=[
-        main_score=
-        date=(
-        domains=[
+        type='Reranking',
+        category='s2s',
+        modalities=['text'],
+        eval_splits=['test'],
+        eval_langs=['cmn-Hans'],
+        main_score='map',
+        date=('2017-01-01', '2017-07-26'),
+        domains=['Medical', 'Written'],
         task_subtypes=[],
-        license=
-        annotations_creators=
+        license='apache-2.0',
+        annotations_creators='expert-annotated',
         dialect=[],
-        sample_creation=
+        sample_creation='found',
         bibtex_citation="""@article{zhang2017chinese,
         title={Chinese Medical Question Answer Matching Using End-to-End Character-Level Multi-Scale CNNs},
         author={Zhang, Sheng and Zhang, Xin and Wang, Hui and Cheng, Jiajun and Li, Pei and Ding, Zhaoyun},
@@ -106,27 +105,27 @@ class CMedQAv1(AbsTaskReranking):
         publisher={Multidisciplinary Digital Publishing Institute}
         }""",
         descriptive_stats={
-
-
+            'n_samples': {'test': 2000},
+            'avg_character_length': {'test': 165},
         },
     )


 class CMedQAv2(AbsTaskReranking):
     metadata = TaskMetadata(
-        name=
-        description=
-        reference=
+        name='CMedQAv2',
+        description='Chinese community medical question answering',
+        reference='https://github.com/zhangsheng93/cMedQA2',
         dataset={
-
-
+            'path': 'C-MTEB/CMedQAv2-reranking',
+            'revision': '23d186750531a14a0357ca22cd92d712fd512ea0',
         },
-        type=
-        category=
-        modalities=[
-        eval_splits=[
-        eval_langs=[
-        main_score=
+        type='Reranking',
+        category='s2s',
+        modalities=['text'],
+        eval_splits=['test'],
+        eval_langs=['cmn-Hans'],
+        main_score='map',
         date=None,
         form=None,
         domains=None,
@@ -135,17 +134,17 @@ class CMedQAv2(AbsTaskReranking):
         annotations_creators=None,
         dialect=None,
         sample_creation=None,
-        bibtex_citation="""@ARTICLE{8548603,
-        author={S. Zhang and X. Zhang and H. Wang and L. Guo and S. Liu},
-        journal={IEEE Access},
-        title={Multi-Scale Attentive Interaction Networks for Chinese Medical Question Answer Selection},
-        year={2018},
-        volume={6},
-        number={},
-        pages={74061-74071},
-        keywords={Biomedical imaging;Data mining;Semantics;Medical services;Feature extraction;Knowledge discovery;Medical question answering;interactive attention;deep learning;deep neural networks},
-        doi={10.1109/ACCESS.2018.2883637},
-        ISSN={2169-3536},
-        month={},}""",
-        descriptive_stats={
+        bibtex_citation="""@ARTICLE{8548603,
+        author={S. Zhang and X. Zhang and H. Wang and L. Guo and S. Liu},
+        journal={IEEE Access},
+        title={Multi-Scale Attentive Interaction Networks for Chinese Medical Question Answer Selection},
+        year={2018},
+        volume={6},
+        number={},
+        pages={74061-74071},
+        keywords={Biomedical imaging;Data mining;Semantics;Medical services;Feature extraction;Knowledge discovery;Medical question answering;interactive attention;deep learning;deep neural networks},
+        doi={10.1109/ACCESS.2018.2883637},
+        ISSN={2169-3536},
+        month={},}""", # noqa
+        descriptive_stats={'n_samples': None, 'avg_character_length': None},
     )