evalscope 0.7.2__py3-none-any.whl → 0.8.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two package versions as they appear in their public registries.
Potentially problematic release.
This version of evalscope might be problematic.
- evalscope/__init__.py +1 -1
- evalscope/arguments.py +73 -0
- evalscope/backend/base.py +6 -2
- evalscope/backend/opencompass/api_meta_template.py +8 -14
- evalscope/backend/opencompass/backend_manager.py +24 -15
- evalscope/backend/opencompass/tasks/eval_api.py +1 -6
- evalscope/backend/opencompass/tasks/eval_datasets.py +26 -28
- evalscope/backend/rag_eval/__init__.py +3 -3
- evalscope/backend/rag_eval/backend_manager.py +21 -25
- evalscope/backend/rag_eval/clip_benchmark/__init__.py +1 -1
- evalscope/backend/rag_eval/clip_benchmark/arguments.py +6 -6
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +62 -79
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +29 -43
- evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +20 -22
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +16 -23
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +14 -35
- evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +69 -90
- evalscope/backend/rag_eval/cmteb/__init__.py +3 -3
- evalscope/backend/rag_eval/cmteb/arguments.py +25 -27
- evalscope/backend/rag_eval/cmteb/base.py +22 -23
- evalscope/backend/rag_eval/cmteb/task_template.py +15 -17
- evalscope/backend/rag_eval/cmteb/tasks/Classification.py +98 -79
- evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +17 -22
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +17 -19
- evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +35 -29
- evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +18 -5
- evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +163 -163
- evalscope/backend/rag_eval/cmteb/tasks/STS.py +126 -104
- evalscope/backend/rag_eval/cmteb/tasks/__init__.py +33 -34
- evalscope/backend/rag_eval/ragas/__init__.py +2 -2
- evalscope/backend/rag_eval/ragas/arguments.py +3 -8
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +9 -9
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +7 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +8 -8
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +7 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +21 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +4 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -1
- evalscope/backend/rag_eval/ragas/task_template.py +10 -15
- evalscope/backend/rag_eval/ragas/tasks/__init__.py +1 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +45 -0
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +135 -0
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +17 -133
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +8 -18
- evalscope/backend/rag_eval/utils/clip.py +47 -51
- evalscope/backend/rag_eval/utils/embedding.py +13 -12
- evalscope/backend/rag_eval/utils/llm.py +8 -6
- evalscope/backend/rag_eval/utils/tools.py +12 -11
- evalscope/backend/vlm_eval_kit/__init__.py +1 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +7 -8
- evalscope/benchmarks/arc/__init__.py +3 -2
- evalscope/benchmarks/arc/ai2_arc.py +19 -16
- evalscope/benchmarks/arc/arc_adapter.py +32 -24
- evalscope/benchmarks/bbh/__init__.py +1 -2
- evalscope/benchmarks/bbh/bbh_adapter.py +28 -25
- evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/navigate.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/snarks.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +1 -1
- evalscope/benchmarks/benchmark.py +16 -16
- evalscope/benchmarks/ceval/__init__.py +3 -2
- evalscope/benchmarks/ceval/ceval_adapter.py +80 -69
- evalscope/benchmarks/ceval/ceval_exam.py +18 -31
- evalscope/benchmarks/cmmlu/__init__.py +3 -2
- evalscope/benchmarks/cmmlu/cmmlu.py +87 -92
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +109 -155
- evalscope/benchmarks/cmmlu/samples.jsonl +1 -1
- evalscope/benchmarks/competition_math/__init__.py +3 -2
- evalscope/benchmarks/competition_math/competition_math.py +7 -16
- evalscope/benchmarks/competition_math/competition_math_adapter.py +32 -34
- evalscope/benchmarks/data_adapter.py +24 -24
- evalscope/benchmarks/general_qa/__init__.py +3 -2
- evalscope/benchmarks/general_qa/general_qa_adapter.py +35 -39
- evalscope/benchmarks/gsm8k/__init__.py +1 -1
- evalscope/benchmarks/gsm8k/gsm8k.py +6 -12
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +27 -24
- evalscope/benchmarks/hellaswag/__init__.py +3 -2
- evalscope/benchmarks/hellaswag/hellaswag.py +15 -19
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +28 -23
- evalscope/benchmarks/humaneval/__init__.py +1 -1
- evalscope/benchmarks/humaneval/humaneval.py +15 -18
- evalscope/benchmarks/humaneval/humaneval_adapter.py +192 -7
- evalscope/benchmarks/mmlu/__init__.py +3 -2
- evalscope/benchmarks/mmlu/mmlu.py +15 -29
- evalscope/benchmarks/mmlu/mmlu_adapter.py +85 -77
- evalscope/benchmarks/race/__init__.py +3 -2
- evalscope/benchmarks/race/race.py +21 -35
- evalscope/benchmarks/race/race_adapter.py +33 -29
- evalscope/benchmarks/race/samples.jsonl +1 -1
- evalscope/benchmarks/trivia_qa/__init__.py +3 -2
- evalscope/benchmarks/trivia_qa/samples.jsonl +1 -1
- evalscope/benchmarks/trivia_qa/trivia_qa.py +19 -34
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +27 -22
- evalscope/benchmarks/truthful_qa/__init__.py +3 -2
- evalscope/benchmarks/truthful_qa/truthful_qa.py +25 -29
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +36 -37
- evalscope/cli/cli.py +6 -5
- evalscope/cli/start_eval.py +31 -0
- evalscope/cli/start_perf.py +0 -3
- evalscope/cli/start_server.py +27 -41
- evalscope/config.py +154 -96
- evalscope/constants.py +50 -32
- evalscope/evaluator/evaluator.py +97 -377
- evalscope/evaluator/rating_eval.py +12 -33
- evalscope/evaluator/reviewer/auto_reviewer.py +48 -76
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +10 -20
- evalscope/metrics/code_metric.py +3 -9
- evalscope/metrics/math_accuracy.py +3 -6
- evalscope/metrics/metrics.py +21 -21
- evalscope/metrics/rouge_metric.py +11 -25
- evalscope/models/__init__.py +1 -2
- evalscope/models/api/openai_api.py +40 -29
- evalscope/models/custom/__init__.py +0 -1
- evalscope/models/custom/custom_model.py +3 -3
- evalscope/models/dummy_chat_model.py +7 -8
- evalscope/models/model_adapter.py +89 -156
- evalscope/models/openai_model.py +20 -20
- evalscope/perf/arguments.py +16 -3
- evalscope/perf/benchmark.py +9 -11
- evalscope/perf/http_client.py +3 -8
- evalscope/perf/main.py +8 -1
- evalscope/perf/plugin/api/custom_api.py +1 -2
- evalscope/perf/plugin/api/dashscope_api.py +1 -2
- evalscope/perf/plugin/api/openai_api.py +3 -4
- evalscope/perf/plugin/datasets/base.py +1 -2
- evalscope/perf/plugin/datasets/flickr8k.py +1 -2
- evalscope/perf/plugin/datasets/longalpaca.py +1 -2
- evalscope/perf/plugin/datasets/openqa.py +1 -2
- evalscope/perf/plugin/registry.py +3 -3
- evalscope/perf/utils/analysis_result.py +1 -2
- evalscope/perf/utils/benchmark_util.py +5 -6
- evalscope/perf/utils/db_util.py +77 -30
- evalscope/perf/utils/local_server.py +21 -13
- evalscope/registry/config/cfg_arena_zhihu.yaml +1 -1
- evalscope/registry/tasks/arc.yaml +2 -3
- evalscope/registry/tasks/bbh.yaml +3 -4
- evalscope/registry/tasks/bbh_mini.yaml +3 -4
- evalscope/registry/tasks/ceval.yaml +3 -3
- evalscope/registry/tasks/ceval_mini.yaml +3 -4
- evalscope/registry/tasks/cmmlu.yaml +3 -3
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +1 -1
- evalscope/registry/tasks/general_qa.yaml +1 -1
- evalscope/registry/tasks/gsm8k.yaml +2 -2
- evalscope/registry/tasks/mmlu.yaml +3 -3
- evalscope/registry/tasks/mmlu_mini.yaml +3 -3
- evalscope/run.py +153 -381
- evalscope/run_arena.py +21 -25
- evalscope/summarizer.py +27 -40
- evalscope/third_party/longbench_write/README.md +99 -42
- evalscope/third_party/longbench_write/default_task.json +1 -1
- evalscope/third_party/longbench_write/default_task.yaml +8 -7
- evalscope/third_party/longbench_write/eval.py +29 -27
- evalscope/third_party/longbench_write/infer.py +16 -104
- evalscope/third_party/longbench_write/longbench_write.py +5 -4
- evalscope/third_party/longbench_write/resources/judge.txt +1 -1
- evalscope/third_party/longbench_write/tools/data_etl.py +5 -6
- evalscope/third_party/longbench_write/utils.py +0 -1
- evalscope/third_party/toolbench_static/eval.py +14 -15
- evalscope/third_party/toolbench_static/infer.py +48 -69
- evalscope/third_party/toolbench_static/llm/swift_infer.py +4 -12
- evalscope/third_party/toolbench_static/requirements.txt +1 -1
- evalscope/third_party/toolbench_static/toolbench_static.py +4 -3
- evalscope/tools/combine_reports.py +27 -34
- evalscope/tools/rewrite_eval_results.py +15 -47
- evalscope/utils/__init__.py +1 -1
- evalscope/utils/arena_utils.py +18 -48
- evalscope/{perf/utils → utils}/chat_service.py +4 -5
- evalscope/utils/completion_parsers.py +3 -8
- evalscope/utils/io_utils.py +162 -0
- evalscope/utils/logger.py +17 -7
- evalscope/utils/model_utils.py +11 -0
- evalscope/utils/utils.py +5 -306
- evalscope/version.py +2 -2
- {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/METADATA +123 -118
- evalscope-0.8.1.dist-info/RECORD +285 -0
- tests/cli/test_run.py +53 -15
- tests/perf/test_perf.py +6 -1
- tests/rag/test_clip_benchmark.py +38 -38
- tests/rag/test_mteb.py +3 -2
- tests/rag/test_ragas.py +5 -5
- tests/swift/test_run_swift_eval.py +2 -3
- tests/swift/test_run_swift_vlm_eval.py +2 -3
- tests/swift/test_run_swift_vlm_jugde_eval.py +2 -3
- tests/vlm/test_vlmeval.py +3 -2
- evalscope/backend/rag_eval/ragas/metrics/__init__.py +0 -2
- evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py +0 -91
- evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py +0 -99
- evalscope/cache.py +0 -98
- evalscope/models/template.py +0 -1446
- evalscope/run_ms.py +0 -140
- evalscope/utils/task_cfg_parser.py +0 -10
- evalscope/utils/task_utils.py +0 -22
- evalscope-0.7.2.dist-info/RECORD +0 -286
- {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/LICENSE +0 -0
- {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/WHEEL +0 -0
- {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/top_level.txt +0 -0
evalscope/evaluator/evaluator.py
CHANGED
@@ -1,28 +1,27 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.

+import json
 import os
 import time
-import json
-import re
-from copy import deepcopy
 from collections import OrderedDict
-
+from copy import deepcopy
 from tqdm import tqdm
-from typing import
+from typing import Any, Dict, List, Optional, Union

 from evalscope.benchmarks import DataAdapter
-from evalscope.
+from evalscope.config import TaskConfig
+from evalscope.constants import (DEFAULT_DATASET_CACHE_DIR, AnswerKeys, DumpMode, EvalStage, EvalType, HubType,
+                                 ReviewKeys)
 from evalscope.models.model_adapter import BaseModelAdapter, CustomModelAdapter
 from evalscope.tools.combine_reports import gen_table
-from evalscope.utils import
-
+from evalscope.utils import dict_torch_dtype_to_str, gen_hash
+from evalscope.utils.io_utils import OutputsStructure, dump_jsonl_data, jsonl_to_list
 from evalscope.utils.logger import get_logger

 logger = get_logger()


 class Evaluator(object):
-
     """
     The evaluator for model on datasets.

@@ -33,11 +32,8 @@ class Evaluator(object):
         data_adapter: DataAdapter, the data adapter for the dataset.
         subset_list: list, the subset list for the dataset.
         model_adapter: BaseModelAdapter, the model adapter for the model.
-        use_cache:
-
-        root_cache_dir: str, the root cache dir. Default: DEFAULT_ROOT_CACHE_DIR
-        outputs_dir: str, the outputs dir. Default: ''
-        is_custom_outputs_dir: bool, whether to use custom outputs dir. Default: False (deprecated)
+        use_cache: str, path to local cache. Default: None
+        outputs_dir: OutputsStructure, the outputs dir. Default: None
         datasets_dir: str, the datasets dir. Default: DEFAULT_ROOT_CACHE_DIR
         datasets_hub: str, the datasets hub. `Local`, `ModelScope` or `HuggingFace`. Default: 'ModelScope'
         stage: str, the stage of evaluation. `all` or `infer` or `review`. Default: 'all'
@@ -51,24 +47,20 @@ class Evaluator(object):
                  data_adapter: DataAdapter,
                  subset_list: Optional[list] = None,
                  model_adapter: Optional[BaseModelAdapter] = None,
-                 use_cache:
-
-
-
-
-
-
-                 stage: Optional[str] = 'all',  # refer to evalscope.constants.EvalStage
-                 eval_type: Optional[str] = 'checkpoint',  # `checkpoint` or `service` or `custom`
-                 overall_task_cfg: Optional[dict] = None,
+                 use_cache: Optional[str] = None,
+                 outputs: Optional[OutputsStructure] = None,
+                 datasets_dir: Optional[str] = DEFAULT_DATASET_CACHE_DIR,
+                 datasets_hub: Optional[str] = HubType.MODELSCOPE,
+                 stage: Optional[str] = EvalStage.ALL,
+                 eval_type: Optional[str] = EvalType.CHECKPOINT,
+                 overall_task_cfg: Optional[TaskConfig] = None,
                  **kwargs):

         self.dataset_name_or_path = os.path.expanduser(dataset_name_or_path)
-        self.
-
-
+        self.dataset_name = os.path.basename(self.dataset_name_or_path.rstrip(os.sep)).split('.')[0]
+        self.model_name = overall_task_cfg.model_id
+        self.custom_task_name = f'{self.model_name}_{self.dataset_name}'

-        self.root_cache_dir = os.path.expanduser(root_cache_dir)
         self.datasets_dir = os.path.expanduser(datasets_dir)
         self.kwargs = kwargs
         self.data_adapter = data_adapter
@@ -78,70 +70,31 @@ class Evaluator(object):
         self.use_cache = use_cache
         self.overall_task_cfg = overall_task_cfg
         if isinstance(self.model_adapter, CustomModelAdapter):
-            self.overall_task_cfg.
+            self.overall_task_cfg.model_args = self.model_adapter.custom_model.config

         self.model_cfg = self.model_adapter.model_cfg
-        self.model_id = self.model_cfg['model_id']
-        self.model_revision = self.model_cfg.get('revision', None)
-        self.model_revision_str = self.model_revision if self.model_revision is not None else 'none'
-
-        # Get default outputs_dir
-        # TODO: refactor outputs_dir, del timestamp concat
-        # if not is_custom_outputs_dir:
-        #     outputs_dir = make_outputs_dir(work_dir=outputs_dir,
-        #                                    model_id=self.model_id,
-        #                                    model_revision=self.model_revision_str)
-
-        self.outputs_dir = os.path.expanduser(outputs_dir)

         # Deal with the output paths
-        self.outputs_structure =
+        self.outputs_structure = outputs

         # Load dataset
-        self.dataset = self.data_adapter.load(
-
-
-
-
+        self.dataset = self.data_adapter.load(
+            dataset_name_or_path=dataset_name_or_path,
+            subset_list=subset_list,
+            work_dir=self.datasets_dir,
+            datasets_hub=datasets_hub,
+            **kwargs)

         # Get prompts from dataset
         self.prompts = self.data_adapter.gen_prompts(data_dict=self.dataset)
         del self.dataset

-
-        # TODO: refactor mem cache manager
-        # mem_cache_file_name = self.dataset_name_or_path.replace('/', '_') + \
-        #     '_' + self.model_id.replace('/', '_') + \
-        #     '_' + self.model_revision_str + \
-        #     '_cache.pkl'
-        # self.mem_cache_path = os.path.join(self.root_cache_dir, 'mem_cache', mem_cache_file_name)
-
-        # Note: mem_cache is deprecated, use `use_cache` instead
-        self.mem_cache = None
-        self.mem_cache_method = mem_cache_method
-        # if self.use_cache:
-        #     self.mem_cache = init_mem_cache(method=self.mem_cache_method, cache_file_path=self.mem_cache_path)
-        #     logger.info(f'** Using memory cache with size: {len(self.mem_cache)}')
-
-    def _pred_answer(self,
-                     input_d: dict,
-                     infer_cfg: dict,
-                     subset_name: str,
-                     answer_id: str = None) -> dict:
-
-        # Get answer from memory cache
-        if self.mem_cache is not None:
-            if answer_id in self.mem_cache:
-                logger.info(f'** Reusing answer `{answer_id}` in memory cache.')
-                return self.mem_cache[answer_id]
+    def _pred_answer(self, input_d: dict, infer_cfg: dict, subset_name: str, answer_id: str = None) -> dict:

         ans: dict = self.model_adapter.predict(inputs=input_d, infer_cfg=infer_cfg)
         ans[AnswerKeys.ANSWER_ID] = answer_id
         ans[AnswerKeys.SUBSET_NAME] = subset_name

-        if self.mem_cache is not None:
-            self.mem_cache[answer_id] = ans
-
         return ans

     def get_answers(self,
@@ -177,26 +130,21 @@ class Evaluator(object):
         assert len(prompts_list) > 0, 'prompts_list must not be empty when calling func get_answers() !'

         answers_list = []
-
-
-
-            pred_file_name: str = self.custom_task_name + '_' + subset_name + '.jsonl'
-        else:
-            pred_file_name: str = self.dataset_name_or_path.replace(os.sep, '_') + '_' + subset_name + '.jsonl'
-
-        pred_file_path: str = os.path.join(pred_dir, pred_file_name)
+        pred_file_name = self.dataset_name + '_' + subset_name + '.jsonl'
+        pred_file_path = os.path.join(self.outputs_structure.predictions_dir, self.model_name, pred_file_name)
+        os.makedirs(os.path.dirname(pred_file_path), exist_ok=True)

         if self.use_cache and os.path.exists(pred_file_path):
             answers_list = jsonl_to_list(pred_file_path)
-            logger.info(f'
-
-
+            logger.info(f'Reusing predictions from {pred_file_path}, got {len(answers_list)} answers.')
+            # Note: assume prediction in order of prompts_list
+            prompts_list = prompts_list[len(answers_list):]

         if isinstance(self.model_adapter, CustomModelAdapter):
             # Batch inference for custom model

-            resp_answers_list: List[Dict[str, Any]] = self.model_adapter.predict(
-
+            resp_answers_list: List[Dict[str, Any]] = self.model_adapter.predict(
+                inputs=prompts_list, infer_cfg=infer_cfg)

             assert len(prompts_list) == len(resp_answers_list), \
                 f'Length of prompts_list({len(prompts_list)}) != Length of resp_answers_list({len(resp_answers_list)})'
@@ -207,10 +155,10 @@ class Evaluator(object):
                 model_cfg_str = json.dumps(
                     OrderedDict(sorted(dict_torch_dtype_to_str(self.model_adapter.model_cfg).items())),
                     ensure_ascii=False)
-                input_prompt_str = json.dumps(
-
-                infer_cfg_str = json.dumps(
-
+                input_prompt_str = json.dumps(
+                    OrderedDict(sorted(dict_torch_dtype_to_str(in_d).items())), ensure_ascii=False)
+                infer_cfg_str = json.dumps(
+                    OrderedDict(sorted(dict_torch_dtype_to_str(infer_cfg).items())), ensure_ascii=False)
                 answer_id = 'answer-' + gen_hash(model_cfg_str + input_prompt_str + infer_cfg_str)

                 resp_d[AnswerKeys.MODEL_SPEC] = self.model_adapter.model_cfg
@@ -220,6 +168,7 @@ class Evaluator(object):
                 resp_d[AnswerKeys.ORIGIN_PROMPT] = in_d

                 answers_list.append(resp_d)
+                dump_jsonl_data(resp_d, pred_file_path, dump_mode=DumpMode.APPEND)

         else:
             for input_prompt in tqdm(prompts_list, total=len(prompts_list), desc=f'Predicting({subset_name}): '):
@@ -228,17 +177,15 @@ class Evaluator(object):
                 model_cfg_str = json.dumps(
                     OrderedDict(sorted(dict_torch_dtype_to_str(self.model_adapter.model_cfg).items())),
                     ensure_ascii=False)
-                input_prompt_str = json.dumps(
-
-                infer_cfg_str = json.dumps(
-
+                input_prompt_str = json.dumps(
+                    OrderedDict(sorted(dict_torch_dtype_to_str(input_prompt).items())), ensure_ascii=False)
+                infer_cfg_str = json.dumps(
+                    OrderedDict(sorted(dict_torch_dtype_to_str(infer_cfg).items())), ensure_ascii=False)
                 answer_id = 'answer-' + gen_hash(model_cfg_str + input_prompt_str + infer_cfg_str)

                 # Get answers
-                answer_d: dict = self._pred_answer(
-
-                    subset_name=subset_name,
-                    answer_id=answer_id)
+                answer_d: dict = self._pred_answer(
+                    input_d=input_prompt, infer_cfg=infer_cfg, subset_name=subset_name, answer_id=answer_id)

                 answer_d[AnswerKeys.MODEL_SPEC] = self.model_adapter.model_cfg
                 answer_d[AnswerKeys.RAW_INPUT] = input_prompt[AnswerKeys.RAW_INPUT]
@@ -249,26 +196,12 @@ class Evaluator(object):
                     logger.info(f'**predicted ans: {json.dumps(answer_d, ensure_ascii=False)} \n')

                 answers_list.append(answer_d)
+                dump_jsonl_data(answer_d, pred_file_path, dump_mode=DumpMode.APPEND)

-
-            logger.error(f'** Got empty predictions on subset {subset_name} of dataset: {self.dataset_name_or_path}')
-
-        # Dump answers
-        os.makedirs(pred_dir, exist_ok=True)
-        dump_jsonl_data(answers_list, pred_file_path)
-
+        logger.info(f'Dump predictions to {pred_file_path}.')
         return answers_list

-    def _get_review(self,
-                    answer_d: dict,
-                    review_id: str = None,
-                    reviewer_spec: dict = None) -> dict:
-
-        # Get review from memory cache
-        if self.mem_cache is not None:
-            if review_id in self.mem_cache:
-                logger.info(f'** Reusing review `{review_id}` in memory cache.')
-                return self.mem_cache[review_id]
+    def _get_review(self, answer_d: dict, review_id: str = None, reviewer_spec: dict = None) -> dict:

         if reviewer_spec is None:
             reviewer_spec = {}
@@ -286,15 +219,16 @@ class Evaluator(object):
         for choice in choices:
             raw_input_d: dict = review_res[AnswerKeys.RAW_INPUT]
             answer_content = choice[ReviewKeys.MESSAGE][ReviewKeys.CONTENT]
-            answer_content = self.data_adapter.parse_pred_result(
-
-                eval_type=self.eval_type)
+            answer_content = self.data_adapter.parse_pred_result(
+                result=answer_content, raw_input_d=raw_input_d, eval_type=self.eval_type)
             gold_content = self.data_adapter.get_gold_answer(raw_input_d)

             review_result = self.data_adapter.match(gold_content, answer_content)
-            choice[ReviewKeys.REVIEW] = {
-
-
+            choice[ReviewKeys.REVIEW] = {
+                ReviewKeys.GOLD: gold_content,
+                ReviewKeys.PRED: answer_content,
+                ReviewKeys.RESULT: review_result
+            }

             rev_choices.append(choice)

@@ -304,9 +238,6 @@ class Evaluator(object):
         review_res[ReviewKeys.REVIEWER_SPEC] = reviewer_spec
         review_res[ReviewKeys.REVIEW_TIME] = time.time()

-        if self.mem_cache is not None:
-            self.mem_cache[review_id] = review_res
-
         return review_res

     def get_reviews(self, subset_name: str, answers_list: List[dict], debug: bool = False, **kwargs) -> list:
@@ -324,26 +255,25 @@ class Evaluator(object):
         """
         reviews_list = []

-
-
-
-        else:
-            review_file_name: str = self.dataset_name_or_path.replace(os.sep, '_') + '_' + subset_name + '.jsonl'
-        review_file_path: str = os.path.join(review_dir, review_file_name)
+        review_file_name = self.dataset_name + '_' + subset_name + '.jsonl'
+        review_file_path = os.path.join(self.outputs_structure.reviews_dir, self.model_name, review_file_name)
+        os.makedirs(os.path.dirname(review_file_path), exist_ok=True)

         if self.use_cache and os.path.exists(review_file_path):
-            logger.warning(f'
+            logger.warning(f'Ignore use_cache={self.use_cache}, updating the review file: {review_file_path} ...')

         for answer_d in tqdm(answers_list, total=len(answers_list), desc=f'Reviewing({subset_name}): '):

             # Gen review_id (concat: answer_id + reviewer_spec)
             answer_id = answer_d[AnswerKeys.ANSWER_ID]

-            reviewer_spec: dict = {
-
-
-
-
+            reviewer_spec: dict = {
+                'metric': [metric_d['name'] for metric_d in self.data_adapter.metric_list],
+                'reviewer': ['Evaluator'],
+                'revision': ['default']
+            }
+            reviewer_spec_str = json.dumps(
+                OrderedDict(sorted(dict_torch_dtype_to_str(reviewer_spec).items())), ensure_ascii=False)
             review_id = 'review-' + gen_hash(answer_id + reviewer_spec_str)

             # Get review
@@ -354,9 +284,8 @@ class Evaluator(object):

             reviews_list.append(review_d)

-
-
-        dump_jsonl_data(reviews_list, review_file_path)
+            # Dump reviews
+            dump_jsonl_data(review_d, review_file_path, dump_mode=DumpMode.APPEND)

         return reviews_list

@@ -375,7 +304,7 @@ class Evaluator(object):
         review_res_list = []
         for review_d in reviews_list:
             if not review_d[ReviewKeys.REVIEWED]:
-                logger.warning(f'
+                logger.warning(f'Review not finished for answer_id: {review_d[AnswerKeys.ANSWER_ID]}')
                 continue

             review_res = review_d[AnswerKeys.CHOICES][0][ReviewKeys.REVIEW][ReviewKeys.RESULT]
@@ -385,7 +314,7 @@ class Evaluator(object):

         return metric_score

-    def dump_report(self,
+    def dump_report(self, reviews_score_all: dict, use_table: bool = True):
         """
         Get report for total reviews of specific dataset.
         It is required to rewrite this method to support your own evaluator.
@@ -396,50 +325,31 @@ class Evaluator(object):

         Returns: None
         """
+        # Get report map
+        report_map: dict = self.data_adapter.gen_report(
+            subset_score_map=reviews_score_all, report_name=self.custom_task_name)
+        report_map.update(dict(model_name=self.model_name, dataset_name=self.dataset_name))

         # Dump report
-
-
-
-            report_file_name: str = self.custom_task_name + '.json'
-        else:
-            report_file_name: str = self.dataset_name_or_path.replace(os.sep, '_') + '.json'
+        report_path: str = os.path.join(self.outputs_structure.reports_dir, self.model_name,
+                                        self.dataset_name + '.json')
+        os.makedirs(os.path.dirname(report_path), exist_ok=True)

-
-        report_path: str = os.path.join(report_dir, report_file_name)
+        # Write report
         with open(report_path, 'w') as f:
             f.write(json.dumps(report_map, ensure_ascii=False, indent=4))
-
-        logger.info(f'** Dump report: {report_file_name} \n')
+        logger.info(f'Dump report: {report_path} \n')

+        # Make table
         if use_table:
             try:
-
-
-
-            except:
+                report_table: str = gen_table([self.outputs_structure.reports_dir])
+                logger.info(f'Report table: \n{report_table} \n')
+            except Exception:
                 logger.error('Failed to generate report table.')
+        return report_map

-
-        # if self.mem_cache is not None:
-        #     logger.info(f'** Saving memory cache with size: {len(self.mem_cache)}')
-        #     Cache.save(cache=self.mem_cache, path=self.mem_cache_path)
-
-        # def clear_cache(self):
-        #     """
-        #     Clear memory cache.
-        #
-        #     Returns: None
-        #     """
-        #     if self.mem_cache is not None:
-        #         cache_len = len(self.mem_cache)
-        #         self.mem_cache.clear()
-        #         logger.info(f'** Memory cache cleared, length changed: {cache_len} -> {len(self.mem_cache)}')
-
-    def eval(self,
-             infer_cfg: dict = None,
-             debug: bool = False,
-             **kwargs) -> dict:
+    def eval(self, infer_cfg: dict = None, debug: bool = False, **kwargs) -> dict:
         """
         Evaluate the model on the specific benchmark. Streaming & parallel mode is supported.
         It is required to rewrite this method to support your own evaluator.
@@ -465,27 +375,22 @@ class Evaluator(object):

         logger.info(f'**** Start evaluating on dataset {self.dataset_name_or_path} ****')

-        reviews_score_all = {}
+        reviews_score_all = {}  # {subset_name: (score, num)}
         stage_answers_dict = {}
         stage_reviews_dict = {}

         for subset_name, prompts_list in self.prompts.items():
-            limit =
+            limit = kwargs.get('limit', len(prompts_list))
             prompts_list = prompts_list[:limit]

-            answers_list: list = self.get_answers(
-
-                infer_cfg=infer_cfg,
-                debug=debug,
-                **kwargs)
+            answers_list: list = self.get_answers(
+                subset_name=subset_name, prompts_list=prompts_list, infer_cfg=infer_cfg, debug=debug, **kwargs)
             if self.stage == EvalStage.INFER:
                 stage_answers_dict[subset_name] = answers_list
                 continue

-            reviews_list: list = self.get_reviews(
-
-                debug=debug,
-                **kwargs)
+            reviews_list: list = self.get_reviews(
+                subset_name=subset_name, answers_list=answers_list, debug=debug, **kwargs)

             metric_res = self.compute_metrics(reviews_list=reviews_list)
             reviews_score_all[subset_name] = (metric_res, len(reviews_list))
@@ -498,193 +403,8 @@ class Evaluator(object):
             return stage_reviews_dict

         # Generate report
-        report_map
-            report_name=self.custom_task_name)
-        self.dump_report(report_map=report_map)
-
-        # Dump overall task config
-        overall_task_cfg_file: str = os.path.join(self.outputs_structure.get(OutputsStructure.CONFIGS_DIR),
-                                                  'task_output_config.yaml')
-        overall_task_cfg_file = os.path.abspath(overall_task_cfg_file)
-
-        # TODO: check the robustness of dump yaml
-        try:
-            logger.info(f'** Dump overall task config to {overall_task_cfg_file}')
-            logger.info(f'** The overall task config:\n {self.overall_task_cfg}')
-            if 'model' in self.overall_task_cfg and not isinstance(self.overall_task_cfg['model'], str):
-                self.overall_task_cfg['model'] = None
-                logger.info(f'>> Overwrite overall_task_cfg for `model` due to it is not a string')
-            if 'model_args' in self.overall_task_cfg and self.overall_task_cfg.get('model_args') is not None:
-                self.overall_task_cfg['model_args'].update({'precision': str(self.overall_task_cfg['model_args']['precision'])})
-                logger.info(f'>> Overwrite overall_task_cfg for `model_args.precision` due to it is not a string')
-
-            dict_to_yaml(self.overall_task_cfg, overall_task_cfg_file)
-        except Exception as e:
-            logger.warning(f'Failed to dump overall task config: {e}')
-
-        # Note: deprecated
-        # self.save_cache()
-        # self.clear_cache()
-
-        logger.info(f'\n**** Evaluation finished on {self.dataset_name_or_path} ****\n')
-
-        return report_map
-
-
-class HumanevalEvaluator(object):
+        report_map = self.dump_report(reviews_score_all)

-
-                 problem_file: str,
-                 model_id: str,
-                 model_revision: str,
-                 model_adapter: BaseModelAdapter,
-                 outputs_dir: Optional[str] = '',
-                 is_custom_outputs_dir: bool = False,
-                 k: List[int] = [1, 10, 100],
-                 n_workers: int = 4,
-                 timeout: float = 3.0,):
-        try:
-            from human_eval.evaluation import evaluate_functional_correctness
-            from human_eval.data import read_problems, write_jsonl
-        except ImportError:
-            raise ImportError('Please install human_eval:'
-                              'https://github.com/openai/human-eval/tree/master#installation , '
-                              'Note that you need to enable the execution code in the human_eval/execution.py first.')
-
-        self.problem_file = problem_file
-        self.k = k
-        self.num_workers = n_workers
-        self.timeout = timeout
-        self.model_adapter = model_adapter
-
-        self.read_problems_func = read_problems
-        self.write_jsonl_func = write_jsonl
-        self.eval_func = evaluate_functional_correctness
-
-        # {'task_id': {'task_id': '', 'prompt': '', 'entry_point': '', 'canonical_solution': '', 'test': ''}, ...}
-        self.problems = self.read_problems_func(self.problem_file)
-
-        # Get default outputs_dir
-        model_revision_str: str = model_revision if model_revision is not None else 'none'
-        # if not is_custom_outputs_dir:
-        #     outputs_dir = make_outputs_dir(work_dir=outputs_dir,
-        #                                    model_id=model_id,
-        #                                    model_revision=model_revision_str)
-        self.outputs_dir = os.path.expanduser(outputs_dir)
-
-        # Deal with the output paths
-        self.outputs_structure = process_outputs_structure(self.outputs_dir)
-
-    def get_answers(self, infer_cfg: dict) -> List[dict]:
-        ans_list: list = []
-        system_prompt: str = 'Complete the following python code:\n'
-        for task_id, data_d in tqdm(self.problems.items(), total=len(self.problems), desc='Predicting(problems)'):
-            prompt: str = system_prompt + data_d['prompt']
-            inputs: dict = {'data': [prompt]}
-            # pred_res: dict = self.model_adapter.predict(inputs)
-
-            pred_res: dict = self.model_adapter.predict(inputs=inputs, infer_cfg=infer_cfg)
-
-            pred_ans: str = pred_res['choices'][0]['message']['content']
-            pred_ans = self._postprocess(pred_ans)
-
-            ans_list.append({'task_id': task_id, 'completion': pred_ans})
-
-        return ans_list
-
-    def eval(self, infer_cfg: dict, **kwargs):
+        logger.info(f'**** Evaluation finished on {self.dataset_name_or_path} ****\n')

-
-        ans_list: list = self.get_answers(infer_cfg)
-        ans_out_file: str = os.path.join(self.outputs_structure.get(OutputsStructure.PREDICTIONS_DIR),
-                                         'human_eval_predictions.jsonl')
-
-        self.write_jsonl_func(filename=ans_out_file, data=ans_list)
-        # logger.info(f'** Dump predictions to {ans_out_file} successfully.')
-        logger.info('** Dump predictions successfully.')
-
-        # evaluate results: e.g. {'pass@1': 0.333, 'pass@10': 0.111}
-        results = self.eval_func(sample_file=ans_out_file,
-                                 k=self.k,
-                                 n_workers=self.num_workers,
-                                 timeout=self.timeout,
-                                 problem_file=self.problem_file)
-
-        # output: report
-        report_map: dict = self.gen_report(results=results)
-        report_dir: str = self.outputs_structure.get(OutputsStructure.REPORTS_DIR)
-        report_file: str = os.path.join(report_dir, 'human_eval_report.json')
-
-        with open(report_file, 'w') as f:
-            f.write(json.dumps(report_map, ensure_ascii=False, indent=4))
-        # logger.info(f'** Dump report to {report_file} \n')
-        logger.info(f'** Dump report \n')
-
-        try:
-            # Make table
-            report_table: str = gen_table([report_dir])
-            logger.info(f'** Report table: \n {report_table} \n')
-        except:
-            logger.error('Failed to generate report table.')
-
-    def gen_report(self, results: dict) -> dict:
-        """
-        Generate report from evaluation results.
-
-        Returns:
-        {
-            "name":"ARC-Challenge",
-            "metric":"WeightedAverageAccuracy",
-            "score":0.3389,
-            "category":[
-                {
-                    "name":"DEFAULT",
-                    "score":0.3389,
-                    "subset":[
-                        {
-                            "name":"ARC-Challenge",
-                            "score":0.3389
-                        },
-                    ]
-                }
-            ],
-            "total_num":100
-        }
-        """
-        results = {k: normalize_score(score=v) for k, v in results.items()}
-
-        category_d = dict(name='DEFAULT',
-                          score=results,
-                          subset=[])
-
-        res_map = dict(name='HumanEval',
-                       metric='pass@k',
-                       score=results,
-                       category=[category_d],
-                       total_num=len(self.problems))
-
-        return res_map
-
-    @classmethod
-    def _postprocess(cls, text: str) -> str:
-        if '```' in text:
-            blocks = re.findall(r'```(.*?)```', text, re.DOTALL)
-            if len(blocks) == 0:
-                text = text.split('```')[1]  # fall back to default strategy
-            else:
-                text = blocks[0]  # fetch the first code block
-                if not text.startswith('\n'):  # in case starting with ```python
-                    text = text[max(text.find('\n') + 1, 0):]
-        if text.strip().startswith('from') or text.strip().startswith('import'):
-            def_idx = text.find('def')
-            if def_idx != -1:
-                text = text[max(text.find('\n', def_idx) + 1, 0):]
-        text = text.split('\n\n')[0]
-        if text.strip().startswith('def'):
-            text = '\n'.join(text.split('\n')[1:])
-        if not text.startswith('    '):
-            if text.startswith(' '):
-                text = '    ' + text.lstrip()
-            else:
-                text = '\n'.join(['    ' + line for line in text.split('\n')])
-        return text
+        return report_map
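Note on the caching behavior visible in the diff above: in 0.8.1 each answer and each review is appended to a per-subset JSONL file under the run's output structure (predictions_dir / reviews_dir) via dump_jsonl_data(..., dump_mode=DumpMode.APPEND), and when use_cache points at an existing prediction file the evaluator reloads it with jsonl_to_list and only runs inference for the remaining prompts. The sketch below is a minimal, self-contained illustration of that resume pattern, not evalscope's actual code: gen_id, append_jsonl and load_jsonl are hypothetical stand-ins for gen_hash, dump_jsonl_data and jsonl_to_list, the md5-based id is an assumption (the diff does not show how gen_hash is implemented), and the record layout is simplified.

import hashlib
import json
import os
from collections import OrderedDict


def gen_id(prefix: str, *cfgs: dict) -> str:
    # Deterministic id: sort keys before dumping so identical configs hash identically
    # (mirrors the sorted-OrderedDict json.dumps + gen_hash pattern in the diff).
    payload = ''.join(json.dumps(OrderedDict(sorted(c.items())), ensure_ascii=False) for c in cfgs)
    return prefix + hashlib.md5(payload.encode('utf-8')).hexdigest()


def append_jsonl(record: dict, path: str) -> None:
    # Stand-in for dump_jsonl_data(..., dump_mode=DumpMode.APPEND): one JSON object per line.
    os.makedirs(os.path.dirname(path) or '.', exist_ok=True)
    with open(path, 'a', encoding='utf-8') as f:
        f.write(json.dumps(record, ensure_ascii=False) + '\n')


def load_jsonl(path: str) -> list:
    # Stand-in for jsonl_to_list().
    with open(path, encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]


def predict_with_resume(prompts: list, model_cfg: dict, infer_cfg: dict, pred_file: str) -> list:
    # Reload any previously dumped answers, then only process the prompts that remain.
    answers = load_jsonl(pred_file) if os.path.exists(pred_file) else []
    for prompt in prompts[len(answers):]:
        answer_id = gen_id('answer-', model_cfg, prompt, infer_cfg)
        answer = {'answer_id': answer_id, 'raw_input': prompt}  # real model call omitted
        answers.append(answer)
        append_jsonl(answer, pred_file)  # persist each answer as soon as it is produced
    return answers

As in the new get_answers(), resuming relies on the cached records being in the same prompt order, so an interrupted run picks up exactly where the previous one stopped instead of re-running the whole subset.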