evalscope 0.7.2__py3-none-any.whl → 0.8.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
Potentially problematic release: this version of evalscope has been flagged as potentially problematic.
- evalscope/__init__.py +1 -1
- evalscope/arguments.py +73 -0
- evalscope/backend/base.py +5 -1
- evalscope/backend/opencompass/api_meta_template.py +8 -14
- evalscope/backend/opencompass/backend_manager.py +24 -15
- evalscope/backend/opencompass/tasks/eval_api.py +1 -6
- evalscope/backend/opencompass/tasks/eval_datasets.py +26 -28
- evalscope/backend/rag_eval/__init__.py +3 -3
- evalscope/backend/rag_eval/backend_manager.py +21 -25
- evalscope/backend/rag_eval/clip_benchmark/__init__.py +1 -1
- evalscope/backend/rag_eval/clip_benchmark/arguments.py +6 -6
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +62 -79
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +29 -43
- evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +20 -22
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +16 -23
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +14 -35
- evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +69 -90
- evalscope/backend/rag_eval/cmteb/__init__.py +3 -3
- evalscope/backend/rag_eval/cmteb/arguments.py +25 -27
- evalscope/backend/rag_eval/cmteb/base.py +22 -23
- evalscope/backend/rag_eval/cmteb/task_template.py +15 -17
- evalscope/backend/rag_eval/cmteb/tasks/Classification.py +98 -79
- evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +17 -22
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +17 -19
- evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +35 -29
- evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +18 -5
- evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +163 -163
- evalscope/backend/rag_eval/cmteb/tasks/STS.py +126 -104
- evalscope/backend/rag_eval/cmteb/tasks/__init__.py +33 -34
- evalscope/backend/rag_eval/ragas/__init__.py +2 -2
- evalscope/backend/rag_eval/ragas/arguments.py +3 -8
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +9 -9
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +7 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +8 -8
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +7 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +21 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +4 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -1
- evalscope/backend/rag_eval/ragas/task_template.py +10 -15
- evalscope/backend/rag_eval/ragas/tasks/__init__.py +1 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +45 -0
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +135 -0
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +17 -133
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +8 -18
- evalscope/backend/rag_eval/utils/clip.py +46 -50
- evalscope/backend/rag_eval/utils/embedding.py +12 -11
- evalscope/backend/rag_eval/utils/llm.py +8 -6
- evalscope/backend/rag_eval/utils/tools.py +12 -11
- evalscope/backend/vlm_eval_kit/__init__.py +1 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +7 -8
- evalscope/benchmarks/arc/__init__.py +3 -2
- evalscope/benchmarks/arc/ai2_arc.py +19 -16
- evalscope/benchmarks/arc/arc_adapter.py +32 -24
- evalscope/benchmarks/bbh/__init__.py +1 -2
- evalscope/benchmarks/bbh/bbh_adapter.py +28 -25
- evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/navigate.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/snarks.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +1 -1
- evalscope/benchmarks/benchmark.py +16 -16
- evalscope/benchmarks/ceval/__init__.py +3 -2
- evalscope/benchmarks/ceval/ceval_adapter.py +80 -69
- evalscope/benchmarks/ceval/ceval_exam.py +18 -31
- evalscope/benchmarks/cmmlu/__init__.py +3 -2
- evalscope/benchmarks/cmmlu/cmmlu.py +87 -92
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +109 -155
- evalscope/benchmarks/cmmlu/samples.jsonl +1 -1
- evalscope/benchmarks/competition_math/__init__.py +3 -2
- evalscope/benchmarks/competition_math/competition_math.py +7 -16
- evalscope/benchmarks/competition_math/competition_math_adapter.py +32 -34
- evalscope/benchmarks/data_adapter.py +24 -24
- evalscope/benchmarks/general_qa/__init__.py +3 -2
- evalscope/benchmarks/general_qa/general_qa_adapter.py +34 -38
- evalscope/benchmarks/gsm8k/__init__.py +1 -1
- evalscope/benchmarks/gsm8k/gsm8k.py +6 -12
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +26 -24
- evalscope/benchmarks/hellaswag/__init__.py +3 -2
- evalscope/benchmarks/hellaswag/hellaswag.py +15 -19
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +27 -23
- evalscope/benchmarks/humaneval/__init__.py +1 -1
- evalscope/benchmarks/humaneval/humaneval.py +15 -18
- evalscope/benchmarks/humaneval/humaneval_adapter.py +0 -1
- evalscope/benchmarks/mmlu/__init__.py +3 -2
- evalscope/benchmarks/mmlu/mmlu.py +15 -29
- evalscope/benchmarks/mmlu/mmlu_adapter.py +85 -77
- evalscope/benchmarks/race/__init__.py +3 -2
- evalscope/benchmarks/race/race.py +21 -35
- evalscope/benchmarks/race/race_adapter.py +32 -29
- evalscope/benchmarks/race/samples.jsonl +1 -1
- evalscope/benchmarks/trivia_qa/__init__.py +3 -2
- evalscope/benchmarks/trivia_qa/samples.jsonl +1 -1
- evalscope/benchmarks/trivia_qa/trivia_qa.py +19 -34
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +27 -22
- evalscope/benchmarks/truthful_qa/__init__.py +3 -2
- evalscope/benchmarks/truthful_qa/truthful_qa.py +25 -29
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +36 -37
- evalscope/cli/cli.py +6 -5
- evalscope/cli/start_eval.py +31 -0
- evalscope/cli/start_perf.py +0 -3
- evalscope/cli/start_server.py +27 -41
- evalscope/config.py +119 -95
- evalscope/constants.py +61 -29
- evalscope/evaluator/__init__.py +1 -0
- evalscope/evaluator/evaluator.py +96 -377
- evalscope/evaluator/humaneval_evaluator.py +158 -0
- evalscope/evaluator/rating_eval.py +12 -33
- evalscope/evaluator/reviewer/auto_reviewer.py +47 -76
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +10 -20
- evalscope/metrics/code_metric.py +3 -9
- evalscope/metrics/math_accuracy.py +3 -6
- evalscope/metrics/metrics.py +21 -21
- evalscope/metrics/rouge_metric.py +11 -25
- evalscope/models/__init__.py +1 -2
- evalscope/models/api/openai_api.py +40 -29
- evalscope/models/custom/__init__.py +0 -1
- evalscope/models/custom/custom_model.py +3 -3
- evalscope/models/dummy_chat_model.py +7 -8
- evalscope/models/model_adapter.py +89 -156
- evalscope/models/openai_model.py +20 -20
- evalscope/perf/arguments.py +15 -3
- evalscope/perf/benchmark.py +7 -9
- evalscope/perf/http_client.py +3 -8
- evalscope/perf/main.py +10 -0
- evalscope/perf/plugin/api/custom_api.py +1 -2
- evalscope/perf/plugin/api/dashscope_api.py +1 -2
- evalscope/perf/plugin/api/openai_api.py +2 -3
- evalscope/perf/plugin/datasets/base.py +1 -2
- evalscope/perf/plugin/datasets/flickr8k.py +1 -2
- evalscope/perf/plugin/datasets/longalpaca.py +1 -2
- evalscope/perf/plugin/datasets/openqa.py +1 -2
- evalscope/perf/utils/analysis_result.py +1 -2
- evalscope/perf/utils/benchmark_util.py +1 -2
- evalscope/perf/utils/db_util.py +11 -8
- evalscope/perf/utils/local_server.py +19 -13
- evalscope/registry/config/cfg_arena_zhihu.yaml +1 -1
- evalscope/registry/tasks/arc.yaml +2 -3
- evalscope/registry/tasks/bbh.yaml +3 -4
- evalscope/registry/tasks/bbh_mini.yaml +3 -4
- evalscope/registry/tasks/ceval.yaml +3 -3
- evalscope/registry/tasks/ceval_mini.yaml +3 -4
- evalscope/registry/tasks/cmmlu.yaml +3 -3
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +1 -1
- evalscope/registry/tasks/general_qa.yaml +1 -1
- evalscope/registry/tasks/gsm8k.yaml +2 -2
- evalscope/registry/tasks/mmlu.yaml +3 -3
- evalscope/registry/tasks/mmlu_mini.yaml +3 -3
- evalscope/run.py +184 -375
- evalscope/run_arena.py +20 -25
- evalscope/summarizer.py +16 -17
- evalscope/third_party/longbench_write/README.md +99 -42
- evalscope/third_party/longbench_write/default_task.json +1 -1
- evalscope/third_party/longbench_write/default_task.yaml +8 -7
- evalscope/third_party/longbench_write/eval.py +29 -28
- evalscope/third_party/longbench_write/infer.py +16 -104
- evalscope/third_party/longbench_write/longbench_write.py +5 -5
- evalscope/third_party/longbench_write/resources/judge.txt +1 -1
- evalscope/third_party/longbench_write/tools/data_etl.py +4 -5
- evalscope/third_party/longbench_write/utils.py +0 -1
- evalscope/third_party/toolbench_static/eval.py +14 -15
- evalscope/third_party/toolbench_static/infer.py +48 -69
- evalscope/third_party/toolbench_static/llm/swift_infer.py +4 -12
- evalscope/third_party/toolbench_static/requirements.txt +1 -1
- evalscope/third_party/toolbench_static/toolbench_static.py +3 -3
- evalscope/tools/combine_reports.py +25 -30
- evalscope/tools/rewrite_eval_results.py +14 -46
- evalscope/utils/__init__.py +0 -1
- evalscope/utils/arena_utils.py +18 -48
- evalscope/{perf/utils → utils}/chat_service.py +3 -4
- evalscope/utils/completion_parsers.py +3 -8
- evalscope/utils/logger.py +9 -7
- evalscope/utils/model_utils.py +11 -0
- evalscope/utils/utils.py +12 -138
- evalscope/version.py +2 -2
- {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/METADATA +123 -118
- evalscope-0.8.0.dist-info/RECORD +285 -0
- tests/cli/test_run.py +54 -15
- tests/perf/test_perf.py +4 -0
- tests/rag/test_clip_benchmark.py +38 -38
- tests/rag/test_mteb.py +3 -2
- tests/rag/test_ragas.py +5 -5
- tests/swift/test_run_swift_eval.py +2 -3
- tests/swift/test_run_swift_vlm_eval.py +2 -3
- tests/swift/test_run_swift_vlm_jugde_eval.py +2 -3
- evalscope/backend/rag_eval/ragas/metrics/__init__.py +0 -2
- evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py +0 -91
- evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py +0 -99
- evalscope/cache.py +0 -98
- evalscope/models/template.py +0 -1446
- evalscope/run_ms.py +0 -140
- evalscope/utils/task_cfg_parser.py +0 -10
- evalscope/utils/task_utils.py +0 -22
- evalscope-0.7.2.dist-info/RECORD +0 -286
- {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/LICENSE +0 -0
- {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/WHEEL +0 -0
- {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/top_level.txt +0 -0
evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py:

```diff
@@ -1,11 +1,12 @@
 # Convert datasets to webdataset format
 import os
-from tqdm import tqdm
 import torch
 import torch.utils.data
 import webdataset
+from tqdm import tqdm
+
 from evalscope.backend.rag_eval.clip_benchmark.dataset_builder import DatasetWrapper
-from evalscope.backend.rag_eval.utils.tools import
+from evalscope.backend.rag_eval.utils.tools import PIL_to_bytes, path_to_bytes
 from evalscope.utils.logger import get_logger

 logger = get_logger()
@@ -17,19 +18,21 @@ def convert_dataset(
 output_folder,
 *,
 transform=None,
-image_format=
+image_format='webp',
 max_count=10_000,
 max_size=1_000_000_000,
 multilabel=False,
 verbose=True,
 ):
 """
-Convert an iterable `dataset` of (image, label) pairs to webdataset (.tar) format, and store in
+Convert an iterable `dataset` of (image, label) pairs to webdataset (.tar) format, and store in
+`output_folder/split`.

 Images may be passed in as either:
 * File paths: pass in `transform=path_to_bytes`;
 * PIL images: pass in `transform=PIL_to_bytes(image_format)` where `image_format` is e.g. "webp"; or
-* Raw binary data: use a PyTorch `Dataset` that supports `transform=PIL_to_bytes(image_format)`, and
+* Raw binary data: use a PyTorch `Dataset` that supports `transform=PIL_to_bytes(image_format)`, and
+pass in `transform=None` here.
 Be sure that the transform is not applied twice.

 Copying image files directly or writing raw binary data is fastest since it allows multiprocessing;
@@ -37,9 +40,10 @@ def convert_dataset(

 Labels must be zero-indexed integers (for multilabel datasets, labels must be arrays/tensors).

-Classnames and zero-shot classification templates can be provided as attributes of the dataset (`.classes`
-or filled in manually afterward. `dataset.classes` should be a list of strings indexed by
-and `dataset.templates` should be a list of strings containing `{c}` to specify where classnames
+Classnames and zero-shot classification templates can be provided as attributes of the dataset (`.classes`
+and `.templates`) or filled in manually afterward. `dataset.classes` should be a list of strings indexed by
+the labels, and `dataset.templates` should be a list of strings containing `{c}` to specify where classnames
+are to be inserted.
 """
 # Create output directory
 os.makedirs(os.path.join(output_folder, split), exist_ok=True)
@@ -52,52 +56,44 @@ def convert_dataset(
 )
 if verbose:
 try:
-logger.info(f
+logger.info(f'Dataset size: {len(dataset)}')
 except TypeError:
-logger.info(
+logger.info('IterableDataset has no len()')
 # Save classnames
-if hasattr(dataset,
-classnames_fname = os.path.join(output_folder,
-with open(classnames_fname,
-logger.info(*dataset.classes, sep=
+if hasattr(dataset, 'classes') and dataset.classes:
+classnames_fname = os.path.join(output_folder, 'classnames.txt')
+with open(classnames_fname, 'w') as classnames_file:
+logger.info(*dataset.classes, sep='\n', end='\n', file=classnames_file)
 if verbose:
 logger.info("Saved class names to '%s'" % classnames_fname)
 elif verbose:
-logger.info(
+logger.info('WARNING: No class names found')
 # Save zeroshot templates
-if hasattr(dataset,
-templates_fname = os.path.join(
-
-
-with open(templates_fname, "w") as templates_file:
-logger.info(*dataset.templates, sep="\n", end="\n", file=templates_file)
+if hasattr(dataset, 'templates') and dataset.templates:
+templates_fname = os.path.join(output_folder, 'zeroshot_classification_templates.txt')
+with open(templates_fname, 'w') as templates_file:
+logger.info(*dataset.templates, sep='\n', end='\n', file=templates_file)
 if verbose:
 logger.info("Saved class names to '%s'" % templates_fname)
 elif verbose:
-logger.info(
+logger.info('WARNING: No zeroshot classification templates found')
 # Save dataset type
 if multilabel:
-type_fname = os.path.join(output_folder,
-with open(type_fname,
-logger.info(
+type_fname = os.path.join(output_folder, 'dataset_type.txt')
+with open(type_fname, 'w') as type_file:
+logger.info('multilabel', end='\n', file=type_file)
 if verbose:
 logger.info("Saved dataset type to '%s'" % type_fname)
 # Write to TAR files
-data_fname = os.path.join(output_folder, split, r
+data_fname = os.path.join(output_folder, split, r'%d.tar')
 sink = webdataset.ShardWriter(data_fname, maxcount=max_count, maxsize=max_size)
 nsamples = 0
-label_type =
-for index, (input, output) in enumerate(tqdm(dataloader, desc=
+label_type = 'npy' if multilabel else 'cls'
+for index, (input, output) in enumerate(tqdm(dataloader, desc='Converting')):
 nsamples += 1
 if isinstance(input, str) and transform is path_to_bytes:
 # If copying file, determine image format from extension
-extension = (
-os.path.splitext(input)[1]
-.replace(".", "")
-.lower()
-.replace("jpeg", "jpg")
-or image_format
-)
+extension = (os.path.splitext(input)[1].replace('.', '').lower().replace('jpeg', 'jpg') or image_format)
 else:
 extension = image_format
 # Convert label if necessary
@@ -107,27 +103,22 @@ def convert_dataset(
 else:
 output = output.item()
 # Write example
-sink.write(
-
-
-
-
-}
-)
+sink.write({
+'__key__': 's%07d' % index,
+extension: transform(input) if transform else input,
+label_type: output,
+})
 num_shards = sink.shard
 sink.close()
 if verbose:
-logger.info(
-"Saved dataset to '%s'"
-% data_fname.replace(r"%d", "{0..%d}" % (num_shards - 1))
-)
+logger.info("Saved dataset to '%s'" % data_fname.replace(r'%d', '{0..%d}' % (num_shards - 1)))
 # Save number of shards
-nshards_fname = os.path.join(output_folder, split,
-with open(nshards_fname,
-logger.info(num_shards, end=
+nshards_fname = os.path.join(output_folder, split, 'nshards.txt')
+with open(nshards_fname, 'w') as nshards_file:
+logger.info(num_shards, end='\n', file=nshards_file)
 if verbose:
 logger.info("Saved number of shards = %d to '%s'" % (num_shards, nshards_fname))
-logger.info(
+logger.info('Final dataset size:', nsamples)


 def convert_retrieval_dataset(
@@ -136,13 +127,14 @@ def convert_retrieval_dataset(
 output_folder,
 *,
 transform=None,
-image_format=
+image_format='webp',
 max_count=10_000,
 max_size=1_000_000_000,
 verbose=True,
 ):
 """
-Convert an iterable `dataset` of (image, [caption1, caption2, ...]) pairs to webdataset (.tar) format,
+Convert an iterable `dataset` of (image, [caption1, caption2, ...]) pairs to webdataset (.tar) format,
+and store in `output_folder/split`.

 Labels must be lists of strings, with no newlines.

@@ -159,72 +151,59 @@ def convert_retrieval_dataset(
 )
 if verbose:
 try:
-logger.info(f
+logger.info(f'Dataset size: {len(dataset)}')
 except TypeError:
-logger.info(
+logger.info('IterableDataset has no len()')
 # No classnames
 # No zeroshot templates
 # Save dataset type
-type_fname = os.path.join(output_folder,
-with open(type_fname,
-logger.info(
+type_fname = os.path.join(output_folder, 'dataset_type.txt')
+with open(type_fname, 'w') as type_file:
+logger.info('retrieval', end='\n', file=type_file)
 if verbose:
 logger.info("Saved dataset type to '%s'" % type_fname)
 # Write to TAR files
-data_fname = os.path.join(output_folder, split, r
+data_fname = os.path.join(output_folder, split, r'%d.tar')
 sink = webdataset.ShardWriter(data_fname, maxcount=max_count, maxsize=max_size)
 nsamples = 0
-for index, (input, output) in enumerate(tqdm(dataloader, desc=
+for index, (input, output) in enumerate(tqdm(dataloader, desc='Converting')):
 nsamples += 1
 if isinstance(input, str) and transform is path_to_bytes:
 # If copying file, determine image format from extension
-extension = (
-os.path.splitext(input)[1]
-.replace(".", "")
-.lower()
-.replace("jpeg", "jpg")
-or image_format
-)
+extension = (os.path.splitext(input)[1].replace('.', '').lower().replace('jpeg', 'jpg') or image_format)
 else:
 extension = image_format
-sink.write(
-
-
-
-
-}
-)
+sink.write({
+'__key__': 's%07d' % index,
+extension: transform(input) if transform else input,
+'txt': '\n'.join(caption.replace('\n', r'\n') for caption in output),
+})
 num_shards = sink.shard
 sink.close()
 if verbose:
-logger.info(
-"Saved dataset to '%s'"
-% data_fname.replace(r"%d", "{0..%d}" % (num_shards - 1))
-)
+logger.info("Saved dataset to '%s'" % data_fname.replace(r'%d', '{0..%d}' % (num_shards - 1)))
 # Save number of shards
-nshards_fname = os.path.join(output_folder, split,
-with open(nshards_fname,
-logger.info(num_shards, end=
+nshards_fname = os.path.join(output_folder, split, 'nshards.txt')
+with open(nshards_fname, 'w') as nshards_file:
+logger.info(num_shards, end='\n', file=nshards_file)
 if verbose:
 logger.info("Saved number of shards = %d to '%s'" % (num_shards, nshards_fname))
-logger.info(
+logger.info('Final dataset size:', nsamples)


-if __name__ ==
+if __name__ == '__main__':
 from modelscope.msdatasets import MsDataset

-splits = [
+splits = ['train', 'validation']
 for split in splits:
-ds = MsDataset.load(
+ds = MsDataset.load('modelscope/muge', split=split)
 hf_dataset = ds.to_hf_dataset()
-pytorch_dataset = DatasetWrapper(
-hf_dataset, image_key="image", text_key="query"
-)
+pytorch_dataset = DatasetWrapper(hf_dataset, image_key='image', text_key='query')
 convert_retrieval_dataset(
 pytorch_dataset,
 split,
-
-transform=PIL_to_bytes(
-image_format=
+'data/muge',
+transform=PIL_to_bytes('jpg'),
+image_format='jpg',
 max_count=50_000,
 )
```
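For orientation, the converter can be driven for classification data the same way the `__main__` block above drives retrieval data. The sketch below is illustrative only, not part of the diff: it assumes a torchvision-style dataset that yields (PIL image, int label) pairs and exposes `.classes`, and it assumes the positional order `(dataset, split, output_folder)` seen in the `__main__` example; the dataset choice and output path are placeholders.

```python
# Illustrative sketch (assumptions noted above): convert a classification
# dataset into webdataset shards under data/cifar10_wds/test/%d.tar.
from torchvision.datasets import CIFAR10  # assumed example dataset with a `.classes` attribute

from evalscope.backend.rag_eval.clip_benchmark.utils.webdataset_convert import convert_dataset
from evalscope.backend.rag_eval.utils.tools import PIL_to_bytes

dataset = CIFAR10(root='data/cifar10', train=False, download=True)
convert_dataset(
    dataset,
    'test',                    # split name
    'data/cifar10_wds',        # placeholder output folder
    transform=PIL_to_bytes('webp'),
    image_format='webp',
    max_count=10_000,
)
```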
evalscope/backend/rag_eval/cmteb/__init__.py:

```diff
@@ -1,4 +1,4 @@
-from evalscope.backend.rag_eval.cmteb.
+from evalscope.backend.rag_eval.cmteb.arguments import EvalArguments, ModelArguments
 from evalscope.backend.rag_eval.cmteb.base import *
-from evalscope.backend.rag_eval.cmteb.
-from evalscope.backend.rag_eval.cmteb.
+from evalscope.backend.rag_eval.cmteb.task_template import one_stage_eval, two_stage_eval
+from evalscope.backend.rag_eval.cmteb.tasks import *
```
evalscope/backend/rag_eval/cmteb/arguments.py:

```diff
@@ -1,38 +1,36 @@
 from dataclasses import dataclass, field
-from typing import
+from typing import Any, Dict, List, Optional, Union


 @dataclass
 class ModelArguments:
 # Arguments for embeding model: sentence transformer or cross encoder
-model_name_or_path: str =
+model_name_or_path: str = '' # model name or path
 is_cross_encoder: bool = False # whether the model is a cross encoder
 # pooling mode: Either “cls”, “lasttoken”, “max”, “mean”, “mean_sqrt_len_tokens”, or “weightedmean”.
 pooling_mode: Optional[str] = None
 max_seq_length: int = 512 # max sequence length
 # prompt for llm based model
-prompt: str =
+prompt: str = ''
 # model kwargs
 model_kwargs: dict = field(default_factory=dict)
 # config kwargs
 config_kwargs: Dict[str, Any] = field(default_factory=dict)
 # encode kwargs
-encode_kwargs: dict = field(
-
-)
-hub: str = "modelscope" # modelscope or huggingface
+encode_kwargs: dict = field(default_factory=lambda: {'show_progress_bar': True, 'batch_size': 32})
+hub: str = 'modelscope' # modelscope or huggingface

 def to_dict(self) -> Dict[str, Any]:
 return {
-
-
-
-
-
-
-
-
-
+'model_name_or_path': self.model_name_or_path,
+'is_cross_encoder': self.is_cross_encoder,
+'pooling_mode': self.pooling_mode,
+'max_seq_length': self.max_seq_length,
+'prompt': self.prompt,
+'model_kwargs': self.model_kwargs,
+'config_kwargs': self.config_kwargs,
+'encode_kwargs': self.encode_kwargs,
+'hub': self.hub,
 }


@@ -42,20 +40,20 @@ class EvalArguments:
 tasks: List[str] = field(default_factory=list) # task names
 dataset_path: Optional[str] = None # custom dataset path
 verbosity: int = 2 # verbosity level 0-3
-output_folder: str =
+output_folder: str = 'outputs' # output folder
 overwrite_results: bool = True # overwrite results
 limits: Optional[int] = None # limit number of samples
-hub: str =
-top_k: int = 5
+hub: str = 'modelscope' # modelscope or huggingface
+top_k: int = 5 # top k for reranking

 def to_dict(self) -> Dict[str, Any]:
 return {
-
-
-
-
-
-
-
-
+'tasks': self.tasks,
+'dataset_path': self.dataset_path,
+'verbosity': self.verbosity,
+'output_folder': self.output_folder,
+'overwrite_results': self.overwrite_results,
+'limits': self.limits,
+'hub': self.hub,
+'top_k': self.top_k,
 }
```
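These dataclasses are plain containers; `to_dict()` just re-exposes the fields so they can be passed into the CMTEB task templates as keyword dictionaries. A minimal sketch, with a placeholder model id and an assumed task name:

```python
from evalscope.backend.rag_eval.cmteb import EvalArguments, ModelArguments

# Placeholder checkpoint id; any sentence-transformers-compatible model would do.
model_args = ModelArguments(
    model_name_or_path='AI-ModelScope/bge-large-zh',
    max_seq_length=512,
    hub='modelscope',
)
eval_args = EvalArguments(
    tasks=['TNews'],          # assumed CMTEB task name
    output_folder='outputs',
    limits=100,               # only evaluate the first 100 samples per split
)

print(model_args.to_dict()['model_name_or_path'])
print(eval_args.to_dict()['tasks'])
```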
evalscope/backend/rag_eval/cmteb/base.py:

```diff
@@ -1,12 +1,14 @@
+import datasets
 from collections import defaultdict
-from typing import List
-from mteb import AbsTask
 from datasets import DatasetDict
 from modelscope import MsDataset
-import
-from
+from mteb import AbsTask
+from typing import List
+
+from evalscope.backend.rag_eval.cmteb.tasks import CLS_CUSTOM, CLS_DICT, CLS_RETRIEVAL
+from evalscope.constants import HubType

-__all__ = [
+__all__ = ['TaskBase']


 class TaskBase:
@@ -28,7 +30,7 @@ class TaskBase:
 from mteb.overview import TASKS_REGISTRY

 task_cls = TASKS_REGISTRY[task_name]
-if task_cls.metadata.type !=
+if task_cls.metadata.type != 'Retrieval':
 task_cls.load_data = load_data

 # init task instance
@@ -41,33 +43,30 @@ def load_data(self, **kwargs):
 if self.data_loaded:
 return

-limits = kwargs.get(
-hub = kwargs.get(
-name = self.metadata_dict.get(
-path = self.metadata_dict[
+limits = kwargs.get('limits', None)
+hub = kwargs.get('hub', HubType.MODELSCOPE)
+name = self.metadata_dict.get('name')
+path = self.metadata_dict['dataset'].get('path')

-assert path is not None,
+assert path is not None, 'Path must be specified in dataset'

 # Loading the dataset based on the source hub
-if hub ==
+if hub == HubType.MODELSCOPE:
 import re

-path = re.sub(r
+path = re.sub(r'^mteb/', 'MTEB/', path)
 dataset = MsDataset.load(path)
 else:
-dataset = datasets.load_dataset(**self.metadata_dict[
+dataset = datasets.load_dataset(**self.metadata_dict['dataset']) # type: ignore

 if limits is not None:
-dataset = {
-split: dataset[split].select(range(min(limits, len(dataset[split]))))
-for split in dataset.keys()
-}
+dataset = {split: dataset[split].select(range(min(limits, len(dataset[split])))) for split in dataset.keys()}

 if name in CLS_RETRIEVAL:
 self.corpus, self.queries, self.relevant_docs = load_retrieval_data(
 dataset,
 path,
-self.metadata_dict[
+self.metadata_dict['eval_splits'],
 )

 self.dataset = dataset
@@ -77,13 +76,13 @@ def load_data(self, **kwargs):

 def load_retrieval_data(dataset, dataset_name: str, eval_splits: list) -> tuple:
 eval_split = eval_splits[0]
-qrels = MsDataset.load(dataset_name +
+qrels = MsDataset.load(dataset_name + '-qrels')[eval_split]

-corpus = {e[
-queries = {e[
+corpus = {e['id']: {'text': e['text']} for e in dataset['corpus']}
+queries = {e['id']: e['text'] for e in dataset['queries']}
 relevant_docs = defaultdict(dict)
 for e in qrels:
-relevant_docs[e[
+relevant_docs[e['qid']][e['pid']] = e['score']

 corpus = DatasetDict({eval_split: corpus})
 queries = DatasetDict({eval_split: queries})
```
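For orientation, the retrieval loader above reshapes the raw corpus, queries, and qrels rows into the nested dict layout that mteb's retrieval evaluator consumes. A toy illustration of that mapping, with made-up ids and scores rather than real data:

```python
from collections import defaultdict

# Toy rows in the same shape as the corpus/queries/qrels splits above.
corpus_rows = [{'id': 'doc1', 'text': 'A passage about pandas.'}]
query_rows = [{'id': 'q1', 'text': 'Where do pandas live?'}]
qrel_rows = [{'qid': 'q1', 'pid': 'doc1', 'score': 1}]

corpus = {e['id']: {'text': e['text']} for e in corpus_rows}
queries = {e['id']: e['text'] for e in query_rows}
relevant_docs = defaultdict(dict)
for e in qrel_rows:
    relevant_docs[e['qid']][e['pid']] = e['score']

# corpus        -> {'doc1': {'text': 'A passage about pandas.'}}
# queries       -> {'q1': 'Where do pandas live?'}
# relevant_docs -> {'q1': {'doc1': 1}}
```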
evalscope/backend/rag_eval/cmteb/task_template.py:

```diff
@@ -1,8 +1,8 @@
-import os
 import mteb
-
-from evalscope.backend.rag_eval import cmteb
+import os
 from mteb.task_selection import results_to_dataframe
+
+from evalscope.backend.rag_eval import EmbeddingModel, cmteb
 from evalscope.utils.logger import get_logger

 logger = get_logger()
@@ -19,8 +19,8 @@ def show_results(output_folder, model, results):
 model_name,
 revision,
 )
-logger.info(f
-logger.info(f
+logger.info(f'Evaluation results:\n{results_df.to_markdown()}')
+logger.info(f'Evaluation results saved in {os.path.abspath(save_path)}')


 def one_stage_eval(
@@ -29,18 +29,16 @@ def one_stage_eval(
 ) -> None:
 # load model
 model = EmbeddingModel.load(**model_args)
-custom_dataset_path = eval_args.pop(
+custom_dataset_path = eval_args.pop('dataset_path', None)
 # load task first to update instructions
-tasks = cmteb.TaskBase.get_tasks(
-task_names=eval_args["tasks"], dataset_path=custom_dataset_path
-)
+tasks = cmteb.TaskBase.get_tasks(task_names=eval_args['tasks'], dataset_path=custom_dataset_path)
 evaluation = mteb.MTEB(tasks=tasks)

 # run evaluation
 results = evaluation.run(model, **eval_args)

 # save and log results
-show_results(eval_args[
+show_results(eval_args['output_folder'], model, results)


 def two_stage_eval(
@@ -56,7 +54,7 @@ def two_stage_eval(
 first_stage_path = f"{eval_args['output_folder']}/stage1"
 second_stage_path = f"{eval_args['output_folder']}/stage2"

-tasks = cmteb.TaskBase.get_tasks(task_names=eval_args[
+tasks = cmteb.TaskBase.get_tasks(task_names=eval_args['tasks'])
 for task in tasks:
 evaluation = mteb.MTEB(tasks=[task])

@@ -66,19 +64,19 @@ def two_stage_eval(
 save_predictions=True,
 output_folder=first_stage_path,
 overwrite_results=True,
-hub=eval_args[
-limits=eval_args[
+hub=eval_args['hub'],
+limits=eval_args['limits'],
 )
 # stage 2: run cross encoder
 results = evaluation.run(
 cross_encoder,
-top_k=eval_args[
+top_k=eval_args['top_k'],
 save_predictions=True,
 output_folder=second_stage_path,
-previous_results=f
+previous_results=f'{first_stage_path}/{task.metadata.name}_default_predictions.json',
 overwrite_results=True,
-hub=eval_args[
-limits=eval_args[
+hub=eval_args['hub'],
+limits=eval_args['limits'],
 )

 # save and log results
```