evalscope 0.6.0__tar.gz → 0.6.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of evalscope might be problematic.
- {evalscope-0.6.0 → evalscope-0.6.1}/PKG-INFO +14 -13
- {evalscope-0.6.0 → evalscope-0.6.1}/README.md +6 -5
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/backend/opencompass/tasks/eval_datasets.py +1 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +96 -96
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +70 -71
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +120 -100
- evalscope-0.6.1/evalscope/backend/rag_eval/utils/clip.py +149 -0
- evalscope-0.6.1/evalscope/backend/rag_eval/utils/embedding.py +183 -0
- evalscope-0.6.1/evalscope/backend/rag_eval/utils/llm.py +72 -0
- evalscope-0.6.1/evalscope/backend/rag_eval/utils/tools.py +63 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
- evalscope-0.6.1/evalscope/preprocess/tokenizers/__init__.py +0 -0
- evalscope-0.6.1/evalscope/version.py +4 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope.egg-info/PKG-INFO +14 -13
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope.egg-info/SOURCES.txt +5 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope.egg-info/requires.txt +8 -8
- evalscope-0.6.0/evalscope/version.py +0 -4
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/backend/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/backend/base.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/backend/opencompass/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/backend/opencompass/api_meta_template.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/backend/opencompass/backend_manager.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/backend/opencompass/tasks/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/backend/opencompass/tasks/eval_api.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/backend/rag_eval/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/backend/rag_eval/backend_manager.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/backend/rag_eval/clip_benchmark/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/backend/rag_eval/clip_benchmark/arguments.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/backend/rag_eval/clip_benchmark/task_template.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/backend/rag_eval/clip_benchmark/tasks/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/backend/rag_eval/cmteb/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/backend/rag_eval/cmteb/arguments.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/backend/rag_eval/cmteb/base.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/backend/rag_eval/cmteb/task_template.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/backend/rag_eval/cmteb/tasks/Classification.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/backend/rag_eval/cmteb/tasks/STS.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/backend/rag_eval/cmteb/tasks/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/backend/rag_eval/ragas/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/backend/rag_eval/ragas/arguments.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/backend/rag_eval/ragas/metrics/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/backend/rag_eval/ragas/task_template.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/backend/rag_eval/ragas/tasks/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +0 -0
- {evalscope-0.6.0/evalscope/perf → evalscope-0.6.1/evalscope/backend/rag_eval/utils}/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/backend/vlm_eval_kit/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/backend/vlm_eval_kit/backend_manager.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/arc/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/arc/ai2_arc.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/arc/arc_adapter.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/bbh/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/bbh/bbh_adapter.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/bbh/cot_prompts/navigate.txt +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/bbh/cot_prompts/snarks.txt +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/benchmark.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/ceval/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/ceval/ceval_adapter.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/ceval/ceval_exam.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/cmmlu/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/cmmlu/cmmlu.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/cmmlu/cmmlu_adapter.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/competition_math/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/competition_math/competition_math.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/competition_math/competition_math_adapter.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/data_adapter.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/general_qa/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/general_qa/general_qa_adapter.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/gsm8k/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/gsm8k/gsm8k.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/gsm8k/gsm8k_adapter.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/hellaswag/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/hellaswag/hellaswag.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/hellaswag/hellaswag_adapter.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/humaneval/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/humaneval/humaneval.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/humaneval/humaneval_adapter.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/mmlu/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/mmlu/mmlu.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/mmlu/mmlu_adapter.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/race/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/race/race.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/race/race_adapter.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/trivia_qa/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/truthful_qa/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/cache.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/cli/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/cli/base.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/cli/cli.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/cli/start_perf.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/cli/start_server.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/config.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/constants.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/evaluator/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/evaluator/evaluator.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/evaluator/rating_eval.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/evaluator/reviewer/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/evaluator/reviewer/auto_reviewer.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/metrics/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/metrics/bundled_rouge_score/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/metrics/code_metric.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/metrics/math_accuracy.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/metrics/metrics.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/metrics/rouge_metric.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/models/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/models/api/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/models/api/openai_api.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/models/custom/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/models/custom/custom_model.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/models/dummy_chat_model.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/models/model.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/models/model_adapter.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/models/openai_model.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/models/template.py +0 -0
- {evalscope-0.6.0/evalscope/perf/datasets → evalscope-0.6.1/evalscope/perf}/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/perf/_logging.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/perf/api_plugin_base.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/perf/custom_api.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/perf/dashscope_api.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/perf/dataset_plugin_base.py +0 -0
- {evalscope-0.6.0/evalscope/preprocess/tokenizers → evalscope-0.6.1/evalscope/perf/datasets}/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/perf/datasets/line_by_line.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/perf/datasets/longalpaca_12k.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/perf/datasets/openqa.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/perf/how_to_analysis_result.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/perf/http_client.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/perf/openai_api.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/perf/plugin_registry.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/perf/query_parameters.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/perf/server_sent_event.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/preprocess/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/preprocess/tokenizers/gpt2_tokenizer.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/registry/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/registry/tasks/arc.yaml +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/registry/tasks/bbh.yaml +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/registry/tasks/bbh_mini.yaml +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/registry/tasks/ceval.yaml +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/registry/tasks/ceval_mini.yaml +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/registry/tasks/cmmlu.yaml +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/registry/tasks/general_qa.yaml +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/registry/tasks/gsm8k.yaml +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/registry/tasks/mmlu.yaml +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/registry/tasks/mmlu_mini.yaml +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/run.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/run_arena.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/run_ms.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/summarizer.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/third_party/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/third_party/longbench_write/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/third_party/longbench_write/eval.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/third_party/longbench_write/infer.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/third_party/longbench_write/longbench_write.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/third_party/longbench_write/resources/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/third_party/longbench_write/resources/judge.txt +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/third_party/longbench_write/resources/longbench_write.jsonl +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/third_party/longbench_write/resources/longbench_write_en.jsonl +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/third_party/longbench_write/resources/longwrite_ruler.jsonl +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/third_party/longbench_write/tools/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/third_party/longbench_write/tools/data_etl.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/third_party/longbench_write/utils.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/third_party/toolbench_static/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/third_party/toolbench_static/eval.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/third_party/toolbench_static/infer.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/third_party/toolbench_static/llm/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/third_party/toolbench_static/toolbench_static.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/tools/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/tools/combine_reports.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/tools/gen_mmlu_subject_mapping.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/tools/rewrite_eval_results.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/utils/__init__.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/utils/arena_utils.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/utils/completion_parsers.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/utils/logger.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/utils/task_cfg_parser.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/utils/task_utils.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope/utils/utils.py +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope.egg-info/dependency_links.txt +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope.egg-info/entry_points.txt +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope.egg-info/not-zip-safe +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/evalscope.egg-info/top_level.txt +0 -0
- {evalscope-0.6.0 → evalscope-0.6.1}/setup.cfg +0 -0
{evalscope-0.6.0 → evalscope-0.6.1}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: evalscope
-Version: 0.6.0
+Version: 0.6.1
 Summary: EvalScope: Lightweight LLMs Evaluation Framework
 Home-page: https://github.com/modelscope/evalscope
 Author: ModelScope team
@@ -28,7 +28,7 @@ Requires-Dist: nltk>=3.9
 Requires-Dist: openai
 Requires-Dist: pandas
 Requires-Dist: plotly
-Requires-Dist: pyarrow
+Requires-Dist: pyarrow<=17.0.0
 Requires-Dist: pympler
 Requires-Dist: pyyaml
 Requires-Dist: regex
@@ -48,12 +48,12 @@ Requires-Dist: transformers_stream_generator
 Requires-Dist: jieba
 Requires-Dist: rouge-chinese
 Provides-Extra: opencompass
-Requires-Dist: ms-opencompass>=0.1.
+Requires-Dist: ms-opencompass>=0.1.3; extra == "opencompass"
 Provides-Extra: vlmeval
 Requires-Dist: ms-vlmeval>=0.0.5; extra == "vlmeval"
 Provides-Extra: rag
-Requires-Dist: mteb
-Requires-Dist: ragas
+Requires-Dist: mteb==1.19.4; extra == "rag"
+Requires-Dist: ragas==0.2.5; extra == "rag"
 Requires-Dist: webdataset>0.2.0; extra == "rag"
 Provides-Extra: inner
 Requires-Dist: absl-py; extra == "inner"
@@ -96,7 +96,7 @@ Requires-Dist: nltk>=3.9; extra == "all"
 Requires-Dist: openai; extra == "all"
 Requires-Dist: pandas; extra == "all"
 Requires-Dist: plotly; extra == "all"
-Requires-Dist: pyarrow; extra == "all"
+Requires-Dist: pyarrow<=17.0.0; extra == "all"
 Requires-Dist: pympler; extra == "all"
 Requires-Dist: pyyaml; extra == "all"
 Requires-Dist: regex; extra == "all"
@@ -115,10 +115,10 @@ Requires-Dist: transformers>=4.33; extra == "all"
 Requires-Dist: transformers_stream_generator; extra == "all"
 Requires-Dist: jieba; extra == "all"
 Requires-Dist: rouge-chinese; extra == "all"
-Requires-Dist: ms-opencompass>=0.1.
+Requires-Dist: ms-opencompass>=0.1.3; extra == "all"
 Requires-Dist: ms-vlmeval>=0.0.5; extra == "all"
-Requires-Dist: mteb
-Requires-Dist: ragas
+Requires-Dist: mteb==1.19.4; extra == "all"
+Requires-Dist: ragas==0.2.5; extra == "all"
 Requires-Dist: webdataset>0.2.0; extra == "all"


@@ -140,6 +140,7 @@ Requires-Dist: webdataset>0.2.0; extra == "all"
 <a href="https://evalscope.readthedocs.io/en/latest/">📖 Documents</a>
 <p>

+> ⭐ If you like this project, please click the "Star" button at the top right to support us. Your support is our motivation to keep going!

 ## 📋 Table of Contents
 - [Introduction](#introduction)
@@ -165,7 +166,7 @@ EvalScope is the official model evaluation and performance benchmarking framewor
 The architecture includes the following modules:
 1. **Model Adapter**: The model adapter is used to convert the outputs of specific models into the format required by the framework, supporting both API call models and locally run models.
 2. **Data Adapter**: The data adapter is responsible for converting and processing input data to meet various evaluation needs and formats.
-3. **Evaluation Backend**:
+3. **Evaluation Backend**:
    - **Native**: EvalScope’s own **default evaluation framework**, supporting various evaluation modes, including single model evaluation, arena mode, baseline model comparison mode, etc.
    - **OpenCompass**: Supports [OpenCompass](https://github.com/open-compass/opencompass) as the evaluation backend, providing advanced encapsulation and task simplification, allowing you to submit tasks for evaluation more easily.
    - **VLMEvalKit**: Supports [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) as the evaluation backend, enabling easy initiation of multi-modal evaluation tasks, supporting various multi-modal models and datasets.
@@ -252,7 +253,7 @@ You can execute this command from any directory:
 python -m evalscope.run \
  --model qwen/Qwen2-0.5B-Instruct \
  --template-type qwen \
- --datasets arc
+ --datasets arc
 ```

 #### Install from source
@@ -359,13 +360,13 @@ EvalScope supports using third-party evaluation frameworks to initiate evaluatio
 EvalScope supports custom dataset evaluation. For detailed information, please refer to the Custom Dataset Evaluation [📖User Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/custom_dataset.html)

 ## Offline Evaluation
-You can use local dataset to evaluate the model without internet connection.
+You can use local dataset to evaluate the model without internet connection.

 Refer to: Offline Evaluation [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/offline_evaluation.html)


 ## Arena Mode
-The Arena mode allows multiple candidate models to be evaluated through pairwise battles, and can choose to use the AI Enhanced Auto-Reviewer (AAR) automatic evaluation process or manual evaluation to obtain the evaluation report.
+The Arena mode allows multiple candidate models to be evaluated through pairwise battles, and can choose to use the AI Enhanced Auto-Reviewer (AAR) automatic evaluation process or manual evaluation to obtain the evaluation report.

 Refer to: Arena Mode [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/arena.html)

{evalscope-0.6.0 → evalscope-0.6.1}/README.md

@@ -17,6 +17,7 @@
 <a href="https://evalscope.readthedocs.io/en/latest/">📖 Documents</a>
 <p>

+> ⭐ If you like this project, please click the "Star" button at the top right to support us. Your support is our motivation to keep going!

 ## 📋 Table of Contents
 - [Introduction](#introduction)
@@ -42,7 +43,7 @@ EvalScope is the official model evaluation and performance benchmarking framewor
 The architecture includes the following modules:
 1. **Model Adapter**: The model adapter is used to convert the outputs of specific models into the format required by the framework, supporting both API call models and locally run models.
 2. **Data Adapter**: The data adapter is responsible for converting and processing input data to meet various evaluation needs and formats.
-3. **Evaluation Backend**:
+3. **Evaluation Backend**:
    - **Native**: EvalScope’s own **default evaluation framework**, supporting various evaluation modes, including single model evaluation, arena mode, baseline model comparison mode, etc.
    - **OpenCompass**: Supports [OpenCompass](https://github.com/open-compass/opencompass) as the evaluation backend, providing advanced encapsulation and task simplification, allowing you to submit tasks for evaluation more easily.
    - **VLMEvalKit**: Supports [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) as the evaluation backend, enabling easy initiation of multi-modal evaluation tasks, supporting various multi-modal models and datasets.
@@ -129,7 +130,7 @@ You can execute this command from any directory:
 python -m evalscope.run \
  --model qwen/Qwen2-0.5B-Instruct \
  --template-type qwen \
- --datasets arc
+ --datasets arc
 ```

 #### Install from source
@@ -236,13 +237,13 @@ EvalScope supports using third-party evaluation frameworks to initiate evaluatio
 EvalScope supports custom dataset evaluation. For detailed information, please refer to the Custom Dataset Evaluation [📖User Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/custom_dataset.html)

 ## Offline Evaluation
-You can use local dataset to evaluate the model without internet connection.
+You can use local dataset to evaluate the model without internet connection.

 Refer to: Offline Evaluation [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/offline_evaluation.html)


 ## Arena Mode
-The Arena mode allows multiple candidate models to be evaluated through pairwise battles, and can choose to use the AI Enhanced Auto-Reviewer (AAR) automatic evaluation process or manual evaluation to obtain the evaluation report.
+The Arena mode allows multiple candidate models to be evaluated through pairwise battles, and can choose to use the AI Enhanced Auto-Reviewer (AAR) automatic evaluation process or manual evaluation to obtain the evaluation report.

 Refer to: Arena Mode [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/arena.html)

@@ -270,4 +271,4 @@ Refer to : Model Serving Performance Evaluation [📖 User Guide](https://evalsc

 ## Star History

-[](https://star-history.com/#modelscope/evalscope&Date)
+[](https://star-history.com/#modelscope/evalscope&Date)
{evalscope-0.6.0 → evalscope-0.6.1}/evalscope/backend/opencompass/tasks/eval_datasets.py

@@ -50,6 +50,7 @@ with read_base():
     from opencompass.configs.datasets.nq.nq_gen_c788f6 import nq_datasets
     from opencompass.configs.datasets.triviaqa.triviaqa_gen_2121ce import triviaqa_datasets
     from opencompass.configs.datasets.cmb.cmb_gen_dfb5c4 import cmb_datasets
+    from opencompass.configs.datasets.cmmlu.cmmlu_gen_c13365 import cmmlu_datasets

     # Note: to be supported
     # from opencompass.configs.datasets.flores.flores_gen_806ede import flores_datasets
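The new `cmmlu_gen_c13365` import above only brings the dataset list into scope; an OpenCompass-style task config normally aggregates every imported `*_datasets` list afterwards. A minimal sketch of that convention (the aggregation line itself is illustrative and not part of this diff):

```python
# Illustrative OpenCompass config idiom (assumed, not shown in the diff):
# collect every list imported in the read_base() block whose name ends with
# `_datasets`, so the newly added cmmlu_datasets is picked up automatically.
datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
```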
{evalscope-0.6.0 → evalscope-0.6.1}/evalscope/backend/rag_eval/cmteb/tasks/Clustering.py

@@ -17,57 +17,57 @@ class CLSClusteringFastS2S(AbsTaskClusteringFast):
     max_fraction_of_documents_to_embed = None

     metadata = TaskMetadata(
-        name=
-        description=
-        reference=
+        name='CLSClusteringS2S',
+        description='Clustering of titles from CLS dataset. Clustering of 13 sets on the main category.',
+        reference='https://arxiv.org/abs/2209.05034',
         dataset={
-
-
+            'path': 'C-MTEB/CLSClusteringS2S',
+            'revision': 'e458b3f5414b62b7f9f83499ac1f5497ae2e869f',
         },
-        type=
-        category=
-        modalities=[
-        eval_splits=[
-        eval_langs=[
-        main_score=
-        date=(
-        domains=[
-        task_subtypes=[
-        license=
-        annotations_creators=
+        type='Clustering',
+        category='s2s',
+        modalities=['text'],
+        eval_splits=['test'],
+        eval_langs=['cmn-Hans'],
+        main_score='v_measure',
+        date=('2022-01-01', '2022-09-12'),
+        domains=['Academic', 'Written'],
+        task_subtypes=['Thematic clustering', 'Topic classification'],
+        license='apache-2.0',
+        annotations_creators='derived',
         dialect=[],
-        sample_creation=
+        sample_creation='found',
         bibtex_citation="""@misc{li2022csl,
-        title={CSL: A Large-scale Chinese Scientific Literature Dataset},
+        title={CSL: A Large-scale Chinese Scientific Literature Dataset},
         author={Yudong Li and Yuqing Zhang and Zhe Zhao and Linlin Shen and Weijie Liu and Weiquan Mao and Hui Zhang},
         year={2022},
         eprint={2209.05034},
         archivePrefix={arXiv},
         primaryClass={cs.CL}
-        }""",
+        }""", # noqa
         descriptive_stats={
-
-
+            'n_samples': {'test': NUM_SAMPLES},
+            'avg_character_length': {},
         },
     )

     def dataset_transform(self):
         ds = {}
         for split in self.metadata.eval_splits:
-            labels = list(itertools.chain.from_iterable(self.dataset[split][
+            labels = list(itertools.chain.from_iterable(self.dataset[split]['labels']))
             sentences = list(
-                itertools.chain.from_iterable(self.dataset[split][
+                itertools.chain.from_iterable(self.dataset[split]['sentences'])
             )

             check_label_distribution(self.dataset[split])

-            ds[split] = Dataset.from_dict({
+            ds[split] = Dataset.from_dict({'labels': labels, 'sentences': sentences})
         self.dataset = DatasetDict(ds)
         self.dataset = self.stratified_subsampling(
             self.dataset,
             self.seed,
             self.metadata.eval_splits,
-            label=
+            label='labels',
             n_samples=NUM_SAMPLES,
         )

@@ -77,57 +77,57 @@ class CLSClusteringFastP2P(AbsTaskClusteringFast):
     max_fraction_of_documents_to_embed = None

     metadata = TaskMetadata(
-        name=
-        description=
-        reference=
+        name='CLSClusteringP2P',
+        description='Clustering of titles + abstract from CLS dataset. Clustering of 13 sets on the main category.',
+        reference='https://arxiv.org/abs/2209.05034',
         dataset={
-
-
+            'path': 'C-MTEB/CLSClusteringP2P',
+            'revision': '4b6227591c6c1a73bc76b1055f3b7f3588e72476',
         },
-        type=
-        category=
-        modalities=[
-        eval_splits=[
-        eval_langs=[
-        main_score=
-        date=(
-        domains=[
-        task_subtypes=[
-        license=
-        annotations_creators=
+        type='Clustering',
+        category='p2p',
+        modalities=['text'],
+        eval_splits=['test'],
+        eval_langs=['cmn-Hans'],
+        main_score='v_measure',
+        date=('2022-01-01', '2022-09-12'),
+        domains=['Academic', 'Written'],
+        task_subtypes=['Thematic clustering', 'Topic classification'],
+        license='apache-2.0',
+        annotations_creators='derived',
         dialect=[],
-        sample_creation=
+        sample_creation='found',
         bibtex_citation="""@misc{li2022csl,
-        title={CSL: A Large-scale Chinese Scientific Literature Dataset},
+        title={CSL: A Large-scale Chinese Scientific Literature Dataset},
         author={Yudong Li and Yuqing Zhang and Zhe Zhao and Linlin Shen and Weijie Liu and Weiquan Mao and Hui Zhang},
         year={2022},
         eprint={2209.05034},
         archivePrefix={arXiv},
         primaryClass={cs.CL}
-        }""",
+        }""", # noqa
         descriptive_stats={
-
-
+            'n_samples': {'test': NUM_SAMPLES},
+            'avg_character_length': {},
         },
     )

     def dataset_transform(self):
         ds = {}
         for split in self.metadata.eval_splits:
-            labels = list(itertools.chain.from_iterable(self.dataset[split][
+            labels = list(itertools.chain.from_iterable(self.dataset[split]['labels']))
             sentences = list(
-                itertools.chain.from_iterable(self.dataset[split][
+                itertools.chain.from_iterable(self.dataset[split]['sentences'])
             )

             check_label_distribution(self.dataset[split])

-            ds[split] = Dataset.from_dict({
+            ds[split] = Dataset.from_dict({'labels': labels, 'sentences': sentences})
         self.dataset = DatasetDict(ds)
         self.dataset = self.stratified_subsampling(
             self.dataset,
             self.seed,
             self.metadata.eval_splits,
-            label=
+            label='labels',
             n_samples=NUM_SAMPLES,
         )

@@ -137,26 +137,26 @@ class ThuNewsClusteringFastS2S(AbsTaskClusteringFast):
     max_fraction_of_documents_to_embed = None

     metadata = TaskMetadata(
-        name=
+        name='ThuNewsClusteringS2S',
         dataset={
-
-
+            'path': 'C-MTEB/ThuNewsClusteringS2S',
+            'revision': '8a8b2caeda43f39e13c4bc5bea0f8a667896e10d',
         },
-        description=
-        reference=
-        type=
-        category=
-        modalities=[
-        eval_splits=[
-        eval_langs=[
-        main_score=
-        date=(
-        domains=[
-        task_subtypes=[
-        license=
-        annotations_creators=
+        description='Clustering of titles from the THUCNews dataset',
+        reference='http://thuctc.thunlp.org/',
+        type='Clustering',
+        category='s2s',
+        modalities=['text'],
+        eval_splits=['test'],
+        eval_langs=['cmn-Hans'],
+        main_score='v_measure',
+        date=('2006-01-01', '2007-01-01'),
+        domains=['News', 'Written'],
+        task_subtypes=['Thematic clustering', 'Topic classification'],
+        license='apache-2.0',
+        annotations_creators='derived',
         dialect=[],
-        sample_creation=
+        sample_creation='found',
         bibtex_citation="""@software{THUCTC,
         author = {Sun, M. and Li, J. and Guo, Z. and Yu, Z. and Zheng, Y. and Si, X. and Liu, Z.},
         title = {THUCTC: An Efficient Chinese Text Classifier},
@@ -166,28 +166,28 @@ class ThuNewsClusteringFastS2S(AbsTaskClusteringFast):
         url = {https://github.com/thunlp/THUCTC}
         }""",
         descriptive_stats={
-
-
+            'n_samples': {'test': NUM_SAMPLES},
+            'avg_character_length': {},
         },
     )

     def dataset_transform(self):
         ds = {}
         for split in self.metadata.eval_splits:
-            labels = list(itertools.chain.from_iterable(self.dataset[split][
+            labels = list(itertools.chain.from_iterable(self.dataset[split]['labels']))
             sentences = list(
-                itertools.chain.from_iterable(self.dataset[split][
+                itertools.chain.from_iterable(self.dataset[split]['sentences'])
             )

             check_label_distribution(self.dataset[split])

-            ds[split] = Dataset.from_dict({
+            ds[split] = Dataset.from_dict({'labels': labels, 'sentences': sentences})
         self.dataset = DatasetDict(ds)
         self.dataset = self.stratified_subsampling(
             self.dataset,
             self.seed,
             self.metadata.eval_splits,
-            label=
+            label='labels',
             n_samples=NUM_SAMPLES,
         )

@@ -197,26 +197,26 @@ class ThuNewsClusteringFastP2P(AbsTaskClusteringFast):
     max_fraction_of_documents_to_embed = None

     metadata = TaskMetadata(
-        name=
+        name='ThuNewsClusteringP2P',
         dataset={
-
-
+            'path': 'C-MTEB/ThuNewsClusteringP2P',
+            'revision': '5798586b105c0434e4f0fe5e767abe619442cf93',
         },
-        description=
-        reference=
-        type=
-        category=
-        modalities=[
-        eval_splits=[
-        eval_langs=[
-        main_score=
-        date=(
-        domains=[
-        task_subtypes=[
-        license=
-        annotations_creators=
+        description='Clustering of titles + abstracts from the THUCNews dataset',
+        reference='http://thuctc.thunlp.org/',
+        type='Clustering',
+        category='p2p',
+        modalities=['text'],
+        eval_splits=['test'],
+        eval_langs=['cmn-Hans'],
+        main_score='v_measure',
+        date=('2006-01-01', '2007-01-01'),
+        domains=['News', 'Written'],
+        task_subtypes=['Thematic clustering', 'Topic classification'],
+        license='apache-2.0',
+        annotations_creators='derived',
         dialect=[],
-        sample_creation=
+        sample_creation='found',
         bibtex_citation="""@software{THUCTC,
         author = {Sun, M. and Li, J. and Guo, Z. and Yu, Z. and Zheng, Y. and Si, X. and Liu, Z.},
         title = {THUCTC: An Efficient Chinese Text Classifier},
@@ -226,27 +226,27 @@ class ThuNewsClusteringFastP2P(AbsTaskClusteringFast):
         url = {https://github.com/thunlp/THUCTC}
         }""",
         descriptive_stats={
-
-
+            'n_samples': {'test': NUM_SAMPLES},
+            'avg_character_length': {},
         },
     )

     def dataset_transform(self):
         ds = {}
         for split in self.metadata.eval_splits:
-            labels = list(itertools.chain.from_iterable(self.dataset[split][
+            labels = list(itertools.chain.from_iterable(self.dataset[split]['labels']))
             sentences = list(
-                itertools.chain.from_iterable(self.dataset[split][
+                itertools.chain.from_iterable(self.dataset[split]['sentences'])
             )

             check_label_distribution(self.dataset[split])

-            ds[split] = Dataset.from_dict({
+            ds[split] = Dataset.from_dict({'labels': labels, 'sentences': sentences})
         self.dataset = DatasetDict(ds)
         self.dataset = self.stratified_subsampling(
             self.dataset,
             self.seed,
             self.metadata.eval_splits,
-            label=
+            label='labels',
             n_samples=NUM_SAMPLES,
         )
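The clustering tasks touched above subclass mteb's AbsTaskClusteringFast, so they can be exercised directly with any embedding model. A minimal, hypothetical usage sketch (the embedding model and output folder are assumptions, not part of this release):

```python
# Illustrative only: run one of the C-MTEB clustering tasks defined above via mteb.
import mteb
from sentence_transformers import SentenceTransformer

from evalscope.backend.rag_eval.cmteb.tasks.Clustering import CLSClusteringFastS2S

model = SentenceTransformer('BAAI/bge-small-zh-v1.5')   # assumed embedding model
evaluation = mteb.MTEB(tasks=[CLSClusteringFastS2S()])  # task class from the diff above
results = evaluation.run(model, output_folder='outputs/cmteb')  # assumed output path
```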