evalscope 0.7.2__py3-none-any.whl → 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/__init__.py +1 -1
- evalscope/arguments.py +73 -0
- evalscope/backend/base.py +5 -1
- evalscope/backend/opencompass/api_meta_template.py +8 -14
- evalscope/backend/opencompass/backend_manager.py +24 -15
- evalscope/backend/opencompass/tasks/eval_api.py +1 -6
- evalscope/backend/opencompass/tasks/eval_datasets.py +26 -28
- evalscope/backend/rag_eval/__init__.py +3 -3
- evalscope/backend/rag_eval/backend_manager.py +21 -25
- evalscope/backend/rag_eval/clip_benchmark/__init__.py +1 -1
- evalscope/backend/rag_eval/clip_benchmark/arguments.py +6 -6
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +62 -79
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +29 -43
- evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +20 -22
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +16 -23
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +14 -35
- evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +69 -90
- evalscope/backend/rag_eval/cmteb/__init__.py +3 -3
- evalscope/backend/rag_eval/cmteb/arguments.py +25 -27
- evalscope/backend/rag_eval/cmteb/base.py +22 -23
- evalscope/backend/rag_eval/cmteb/task_template.py +15 -17
- evalscope/backend/rag_eval/cmteb/tasks/Classification.py +98 -79
- evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +17 -22
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +17 -19
- evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +35 -29
- evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +18 -5
- evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +163 -163
- evalscope/backend/rag_eval/cmteb/tasks/STS.py +126 -104
- evalscope/backend/rag_eval/cmteb/tasks/__init__.py +33 -34
- evalscope/backend/rag_eval/ragas/__init__.py +2 -2
- evalscope/backend/rag_eval/ragas/arguments.py +3 -8
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +9 -9
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +7 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +8 -8
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +7 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +21 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +4 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -1
- evalscope/backend/rag_eval/ragas/task_template.py +10 -15
- evalscope/backend/rag_eval/ragas/tasks/__init__.py +1 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +45 -0
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +135 -0
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +17 -133
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +8 -18
- evalscope/backend/rag_eval/utils/clip.py +46 -50
- evalscope/backend/rag_eval/utils/embedding.py +12 -11
- evalscope/backend/rag_eval/utils/llm.py +8 -6
- evalscope/backend/rag_eval/utils/tools.py +12 -11
- evalscope/backend/vlm_eval_kit/__init__.py +1 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +7 -8
- evalscope/benchmarks/arc/__init__.py +3 -2
- evalscope/benchmarks/arc/ai2_arc.py +19 -16
- evalscope/benchmarks/arc/arc_adapter.py +32 -24
- evalscope/benchmarks/bbh/__init__.py +1 -2
- evalscope/benchmarks/bbh/bbh_adapter.py +28 -25
- evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/navigate.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/snarks.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +1 -1
- evalscope/benchmarks/benchmark.py +16 -16
- evalscope/benchmarks/ceval/__init__.py +3 -2
- evalscope/benchmarks/ceval/ceval_adapter.py +80 -69
- evalscope/benchmarks/ceval/ceval_exam.py +18 -31
- evalscope/benchmarks/cmmlu/__init__.py +3 -2
- evalscope/benchmarks/cmmlu/cmmlu.py +87 -92
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +109 -155
- evalscope/benchmarks/cmmlu/samples.jsonl +1 -1
- evalscope/benchmarks/competition_math/__init__.py +3 -2
- evalscope/benchmarks/competition_math/competition_math.py +7 -16
- evalscope/benchmarks/competition_math/competition_math_adapter.py +32 -34
- evalscope/benchmarks/data_adapter.py +24 -24
- evalscope/benchmarks/general_qa/__init__.py +3 -2
- evalscope/benchmarks/general_qa/general_qa_adapter.py +34 -38
- evalscope/benchmarks/gsm8k/__init__.py +1 -1
- evalscope/benchmarks/gsm8k/gsm8k.py +6 -12
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +26 -24
- evalscope/benchmarks/hellaswag/__init__.py +3 -2
- evalscope/benchmarks/hellaswag/hellaswag.py +15 -19
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +27 -23
- evalscope/benchmarks/humaneval/__init__.py +1 -1
- evalscope/benchmarks/humaneval/humaneval.py +15 -18
- evalscope/benchmarks/humaneval/humaneval_adapter.py +0 -1
- evalscope/benchmarks/mmlu/__init__.py +3 -2
- evalscope/benchmarks/mmlu/mmlu.py +15 -29
- evalscope/benchmarks/mmlu/mmlu_adapter.py +85 -77
- evalscope/benchmarks/race/__init__.py +3 -2
- evalscope/benchmarks/race/race.py +21 -35
- evalscope/benchmarks/race/race_adapter.py +32 -29
- evalscope/benchmarks/race/samples.jsonl +1 -1
- evalscope/benchmarks/trivia_qa/__init__.py +3 -2
- evalscope/benchmarks/trivia_qa/samples.jsonl +1 -1
- evalscope/benchmarks/trivia_qa/trivia_qa.py +19 -34
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +27 -22
- evalscope/benchmarks/truthful_qa/__init__.py +3 -2
- evalscope/benchmarks/truthful_qa/truthful_qa.py +25 -29
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +36 -37
- evalscope/cli/cli.py +6 -5
- evalscope/cli/start_eval.py +31 -0
- evalscope/cli/start_perf.py +0 -3
- evalscope/cli/start_server.py +27 -41
- evalscope/config.py +119 -95
- evalscope/constants.py +61 -29
- evalscope/evaluator/__init__.py +1 -0
- evalscope/evaluator/evaluator.py +96 -377
- evalscope/evaluator/humaneval_evaluator.py +158 -0
- evalscope/evaluator/rating_eval.py +12 -33
- evalscope/evaluator/reviewer/auto_reviewer.py +47 -76
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +10 -20
- evalscope/metrics/code_metric.py +3 -9
- evalscope/metrics/math_accuracy.py +3 -6
- evalscope/metrics/metrics.py +21 -21
- evalscope/metrics/rouge_metric.py +11 -25
- evalscope/models/__init__.py +1 -2
- evalscope/models/api/openai_api.py +40 -29
- evalscope/models/custom/__init__.py +0 -1
- evalscope/models/custom/custom_model.py +3 -3
- evalscope/models/dummy_chat_model.py +7 -8
- evalscope/models/model_adapter.py +89 -156
- evalscope/models/openai_model.py +20 -20
- evalscope/perf/arguments.py +15 -3
- evalscope/perf/benchmark.py +7 -9
- evalscope/perf/http_client.py +3 -8
- evalscope/perf/main.py +10 -0
- evalscope/perf/plugin/api/custom_api.py +1 -2
- evalscope/perf/plugin/api/dashscope_api.py +1 -2
- evalscope/perf/plugin/api/openai_api.py +2 -3
- evalscope/perf/plugin/datasets/base.py +1 -2
- evalscope/perf/plugin/datasets/flickr8k.py +1 -2
- evalscope/perf/plugin/datasets/longalpaca.py +1 -2
- evalscope/perf/plugin/datasets/openqa.py +1 -2
- evalscope/perf/utils/analysis_result.py +1 -2
- evalscope/perf/utils/benchmark_util.py +1 -2
- evalscope/perf/utils/db_util.py +11 -8
- evalscope/perf/utils/local_server.py +19 -13
- evalscope/registry/config/cfg_arena_zhihu.yaml +1 -1
- evalscope/registry/tasks/arc.yaml +2 -3
- evalscope/registry/tasks/bbh.yaml +3 -4
- evalscope/registry/tasks/bbh_mini.yaml +3 -4
- evalscope/registry/tasks/ceval.yaml +3 -3
- evalscope/registry/tasks/ceval_mini.yaml +3 -4
- evalscope/registry/tasks/cmmlu.yaml +3 -3
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +1 -1
- evalscope/registry/tasks/general_qa.yaml +1 -1
- evalscope/registry/tasks/gsm8k.yaml +2 -2
- evalscope/registry/tasks/mmlu.yaml +3 -3
- evalscope/registry/tasks/mmlu_mini.yaml +3 -3
- evalscope/run.py +184 -375
- evalscope/run_arena.py +20 -25
- evalscope/summarizer.py +16 -17
- evalscope/third_party/longbench_write/README.md +99 -42
- evalscope/third_party/longbench_write/default_task.json +1 -1
- evalscope/third_party/longbench_write/default_task.yaml +8 -7
- evalscope/third_party/longbench_write/eval.py +29 -28
- evalscope/third_party/longbench_write/infer.py +16 -104
- evalscope/third_party/longbench_write/longbench_write.py +5 -5
- evalscope/third_party/longbench_write/resources/judge.txt +1 -1
- evalscope/third_party/longbench_write/tools/data_etl.py +4 -5
- evalscope/third_party/longbench_write/utils.py +0 -1
- evalscope/third_party/toolbench_static/eval.py +14 -15
- evalscope/third_party/toolbench_static/infer.py +48 -69
- evalscope/third_party/toolbench_static/llm/swift_infer.py +4 -12
- evalscope/third_party/toolbench_static/requirements.txt +1 -1
- evalscope/third_party/toolbench_static/toolbench_static.py +3 -3
- evalscope/tools/combine_reports.py +25 -30
- evalscope/tools/rewrite_eval_results.py +14 -46
- evalscope/utils/__init__.py +0 -1
- evalscope/utils/arena_utils.py +18 -48
- evalscope/{perf/utils → utils}/chat_service.py +3 -4
- evalscope/utils/completion_parsers.py +3 -8
- evalscope/utils/logger.py +9 -7
- evalscope/utils/model_utils.py +11 -0
- evalscope/utils/utils.py +12 -138
- evalscope/version.py +2 -2
- {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/METADATA +123 -118
- evalscope-0.8.0.dist-info/RECORD +285 -0
- tests/cli/test_run.py +54 -15
- tests/perf/test_perf.py +4 -0
- tests/rag/test_clip_benchmark.py +38 -38
- tests/rag/test_mteb.py +3 -2
- tests/rag/test_ragas.py +5 -5
- tests/swift/test_run_swift_eval.py +2 -3
- tests/swift/test_run_swift_vlm_eval.py +2 -3
- tests/swift/test_run_swift_vlm_jugde_eval.py +2 -3
- evalscope/backend/rag_eval/ragas/metrics/__init__.py +0 -2
- evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py +0 -91
- evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py +0 -99
- evalscope/cache.py +0 -98
- evalscope/models/template.py +0 -1446
- evalscope/run_ms.py +0 -140
- evalscope/utils/task_cfg_parser.py +0 -10
- evalscope/utils/task_utils.py +0 -22
- evalscope-0.7.2.dist-info/RECORD +0 -286
- {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/LICENSE +0 -0
- {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/WHEEL +0 -0
- {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/top_level.txt +0 -0
evalscope/evaluator/evaluator.py
CHANGED
@@ -1,28 +1,26 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.

+import json
 import os
 import time
-import json
-import re
-from copy import deepcopy
 from collections import OrderedDict
-
+from copy import deepcopy
 from tqdm import tqdm
-from typing import
+from typing import Any, Dict, List, Optional, Union

 from evalscope.benchmarks import DataAdapter
-from evalscope.
+from evalscope.config import TaskConfig
+from evalscope.constants import (DEFAULT_DATASET_CACHE_DIR, AnswerKeys, DumpMode, EvalStage, EvalType, HubType,
+                                 OutputsStructure, ReviewKeys)
 from evalscope.models.model_adapter import BaseModelAdapter, CustomModelAdapter
 from evalscope.tools.combine_reports import gen_table
-from evalscope.utils import
-    normalize_score, dict_to_yaml, jsonl_to_list
+from evalscope.utils import dict_torch_dtype_to_str, dump_jsonl_data, gen_hash, jsonl_to_list
 from evalscope.utils.logger import get_logger

 logger = get_logger()


 class Evaluator(object):
-
     """
     The evaluator for model on datasets.

@@ -33,11 +31,8 @@ class Evaluator(object):
         data_adapter: DataAdapter, the data adapter for the dataset.
         subset_list: list, the subset list for the dataset.
         model_adapter: BaseModelAdapter, the model adapter for the model.
-        use_cache:
-
-        root_cache_dir: str, the root cache dir. Default: DEFAULT_ROOT_CACHE_DIR
-        outputs_dir: str, the outputs dir. Default: ''
-        is_custom_outputs_dir: bool, whether to use custom outputs dir. Default: False (deprecated)
+        use_cache: str, path to local cache. Default: None
+        outputs_dir: OutputsStructure, the outputs dir. Default: None
         datasets_dir: str, the datasets dir. Default: DEFAULT_ROOT_CACHE_DIR
         datasets_hub: str, the datasets hub. `Local`, `ModelScope` or `HuggingFace`. Default: 'ModelScope'
         stage: str, the stage of evaluation. `all` or `infer` or `review`. Default: 'all'
@@ -51,24 +46,20 @@ class Evaluator(object):
                  data_adapter: DataAdapter,
                  subset_list: Optional[list] = None,
                  model_adapter: Optional[BaseModelAdapter] = None,
-                 use_cache:
-
-
-
-
-
-
-                 stage: Optional[str] = 'all',  # refer to evalscope.constants.EvalStage
-                 eval_type: Optional[str] = 'checkpoint',  # `checkpoint` or `service` or `custom`
-                 overall_task_cfg: Optional[dict] = None,
+                 use_cache: Optional[str] = None,
+                 outputs: Optional[OutputsStructure] = None,
+                 datasets_dir: Optional[str] = DEFAULT_DATASET_CACHE_DIR,
+                 datasets_hub: Optional[str] = HubType.MODELSCOPE,
+                 stage: Optional[str] = EvalStage.ALL,
+                 eval_type: Optional[str] = EvalType.CHECKPOINT,
+                 overall_task_cfg: Optional[TaskConfig] = None,
                  **kwargs):

         self.dataset_name_or_path = os.path.expanduser(dataset_name_or_path)
-        self.
-
-
+        self.dataset_name = os.path.basename(self.dataset_name_or_path.rstrip(os.sep))
+        self.model_name = os.path.basename(str(overall_task_cfg.model).rstrip(os.sep))
+        self.custom_task_name = f'{self.model_name}_{self.dataset_name}'

-        self.root_cache_dir = os.path.expanduser(root_cache_dir)
         self.datasets_dir = os.path.expanduser(datasets_dir)
         self.kwargs = kwargs
         self.data_adapter = data_adapter
@@ -78,70 +69,31 @@ class Evaluator(object):
         self.use_cache = use_cache
         self.overall_task_cfg = overall_task_cfg
         if isinstance(self.model_adapter, CustomModelAdapter):
-            self.overall_task_cfg.
+            self.overall_task_cfg.model_args = self.model_adapter.custom_model.config

         self.model_cfg = self.model_adapter.model_cfg
-        self.model_id = self.model_cfg['model_id']
-        self.model_revision = self.model_cfg.get('revision', None)
-        self.model_revision_str = self.model_revision if self.model_revision is not None else 'none'
-
-        # Get default outputs_dir
-        # TODO: refactor outputs_dir, del timestamp concat
-        # if not is_custom_outputs_dir:
-        #     outputs_dir = make_outputs_dir(work_dir=outputs_dir,
-        #                                    model_id=self.model_id,
-        #                                    model_revision=self.model_revision_str)
-
-        self.outputs_dir = os.path.expanduser(outputs_dir)

         # Deal with the output paths
-        self.outputs_structure =
+        self.outputs_structure = outputs

         # Load dataset
-        self.dataset = self.data_adapter.load(
-
-
-
-
+        self.dataset = self.data_adapter.load(
+            dataset_name_or_path=dataset_name_or_path,
+            subset_list=subset_list,
+            work_dir=self.datasets_dir,
+            datasets_hub=datasets_hub,
+            **kwargs)

         # Get prompts from dataset
         self.prompts = self.data_adapter.gen_prompts(data_dict=self.dataset)
         del self.dataset

-
-        # TODO: refactor mem cache manager
-        # mem_cache_file_name = self.dataset_name_or_path.replace('/', '_') + \
-        #     '_' + self.model_id.replace('/', '_') + \
-        #     '_' + self.model_revision_str + \
-        #     '_cache.pkl'
-        # self.mem_cache_path = os.path.join(self.root_cache_dir, 'mem_cache', mem_cache_file_name)
-
-        # Note: mem_cache is deprecated, use `use_cache` instead
-        self.mem_cache = None
-        self.mem_cache_method = mem_cache_method
-        # if self.use_cache:
-        #     self.mem_cache = init_mem_cache(method=self.mem_cache_method, cache_file_path=self.mem_cache_path)
-        #     logger.info(f'** Using memory cache with size: {len(self.mem_cache)}')
-
-    def _pred_answer(self,
-                     input_d: dict,
-                     infer_cfg: dict,
-                     subset_name: str,
-                     answer_id: str = None) -> dict:
-
-        # Get answer from memory cache
-        if self.mem_cache is not None:
-            if answer_id in self.mem_cache:
-                logger.info(f'** Reusing answer `{answer_id}` in memory cache.')
-                return self.mem_cache[answer_id]
+    def _pred_answer(self, input_d: dict, infer_cfg: dict, subset_name: str, answer_id: str = None) -> dict:

         ans: dict = self.model_adapter.predict(inputs=input_d, infer_cfg=infer_cfg)
         ans[AnswerKeys.ANSWER_ID] = answer_id
         ans[AnswerKeys.SUBSET_NAME] = subset_name

-        if self.mem_cache is not None:
-            self.mem_cache[answer_id] = ans
-
         return ans

     def get_answers(self,
@@ -177,26 +129,21 @@ class Evaluator(object):
         assert len(prompts_list) > 0, 'prompts_list must not be empty when calling func get_answers() !'

         answers_list = []
-
-
-
-            pred_file_name: str = self.custom_task_name + '_' + subset_name + '.jsonl'
-        else:
-            pred_file_name: str = self.dataset_name_or_path.replace(os.sep, '_') + '_' + subset_name + '.jsonl'
-
-        pred_file_path: str = os.path.join(pred_dir, pred_file_name)
+        pred_file_name = self.dataset_name + '_' + subset_name + '.jsonl'
+        pred_file_path = os.path.join(self.outputs_structure.predictions_dir, self.model_name, pred_file_name)
+        os.makedirs(os.path.dirname(pred_file_path), exist_ok=True)

         if self.use_cache and os.path.exists(pred_file_path):
             answers_list = jsonl_to_list(pred_file_path)
-            logger.info(f'
-
-
+            logger.info(f'Reusing predictions from {pred_file_path}, got {len(answers_list)} answers.')
+            # Note: assume prediction in order of prompts_list
+            prompts_list = prompts_list[len(answers_list):]

         if isinstance(self.model_adapter, CustomModelAdapter):
             # Batch inference for custom model

-            resp_answers_list: List[Dict[str, Any]] = self.model_adapter.predict(
-
+            resp_answers_list: List[Dict[str, Any]] = self.model_adapter.predict(
+                inputs=prompts_list, infer_cfg=infer_cfg)

             assert len(prompts_list) == len(resp_answers_list), \
                 f'Length of prompts_list({len(prompts_list)}) != Length of resp_answers_list({len(resp_answers_list)})'
@@ -207,10 +154,10 @@ class Evaluator(object):
                 model_cfg_str = json.dumps(
                     OrderedDict(sorted(dict_torch_dtype_to_str(self.model_adapter.model_cfg).items())),
                     ensure_ascii=False)
-                input_prompt_str = json.dumps(
-
-                infer_cfg_str = json.dumps(
-
+                input_prompt_str = json.dumps(
+                    OrderedDict(sorted(dict_torch_dtype_to_str(in_d).items())), ensure_ascii=False)
+                infer_cfg_str = json.dumps(
+                    OrderedDict(sorted(dict_torch_dtype_to_str(infer_cfg).items())), ensure_ascii=False)
                 answer_id = 'answer-' + gen_hash(model_cfg_str + input_prompt_str + infer_cfg_str)

                 resp_d[AnswerKeys.MODEL_SPEC] = self.model_adapter.model_cfg
@@ -220,6 +167,7 @@ class Evaluator(object):
                 resp_d[AnswerKeys.ORIGIN_PROMPT] = in_d

                 answers_list.append(resp_d)
+                dump_jsonl_data(resp_d, pred_file_path, dump_mode=DumpMode.APPEND)

         else:
             for input_prompt in tqdm(prompts_list, total=len(prompts_list), desc=f'Predicting({subset_name}): '):
@@ -228,17 +176,15 @@ class Evaluator(object):
                 model_cfg_str = json.dumps(
                     OrderedDict(sorted(dict_torch_dtype_to_str(self.model_adapter.model_cfg).items())),
                     ensure_ascii=False)
-                input_prompt_str = json.dumps(
-
-                infer_cfg_str = json.dumps(
-
+                input_prompt_str = json.dumps(
+                    OrderedDict(sorted(dict_torch_dtype_to_str(input_prompt).items())), ensure_ascii=False)
+                infer_cfg_str = json.dumps(
+                    OrderedDict(sorted(dict_torch_dtype_to_str(infer_cfg).items())), ensure_ascii=False)
                 answer_id = 'answer-' + gen_hash(model_cfg_str + input_prompt_str + infer_cfg_str)

                 # Get answers
-                answer_d: dict = self._pred_answer(
-
-                                                   subset_name=subset_name,
-                                                   answer_id=answer_id)
+                answer_d: dict = self._pred_answer(
+                    input_d=input_prompt, infer_cfg=infer_cfg, subset_name=subset_name, answer_id=answer_id)

                 answer_d[AnswerKeys.MODEL_SPEC] = self.model_adapter.model_cfg
                 answer_d[AnswerKeys.RAW_INPUT] = input_prompt[AnswerKeys.RAW_INPUT]
@@ -249,26 +195,12 @@ class Evaluator(object):
                     logger.info(f'**predicted ans: {json.dumps(answer_d, ensure_ascii=False)} \n')

                 answers_list.append(answer_d)
+                dump_jsonl_data(answer_d, pred_file_path, dump_mode=DumpMode.APPEND)

-
-            logger.error(f'** Got empty predictions on subset {subset_name} of dataset: {self.dataset_name_or_path}')
-
-        # Dump answers
-        os.makedirs(pred_dir, exist_ok=True)
-        dump_jsonl_data(answers_list, pred_file_path)
-
+        logger.info(f'Dump predictions to {pred_file_path}.')
         return answers_list

-    def _get_review(self,
-                    answer_d: dict,
-                    review_id: str = None,
-                    reviewer_spec: dict = None) -> dict:
-
-        # Get review from memory cache
-        if self.mem_cache is not None:
-            if review_id in self.mem_cache:
-                logger.info(f'** Reusing review `{review_id}` in memory cache.')
-                return self.mem_cache[review_id]
+    def _get_review(self, answer_d: dict, review_id: str = None, reviewer_spec: dict = None) -> dict:

         if reviewer_spec is None:
             reviewer_spec = {}
@@ -286,15 +218,16 @@ class Evaluator(object):
             for choice in choices:
                 raw_input_d: dict = review_res[AnswerKeys.RAW_INPUT]
                 answer_content = choice[ReviewKeys.MESSAGE][ReviewKeys.CONTENT]
-                answer_content = self.data_adapter.parse_pred_result(
-
-                    eval_type=self.eval_type)
+                answer_content = self.data_adapter.parse_pred_result(
+                    result=answer_content, raw_input_d=raw_input_d, eval_type=self.eval_type)
                 gold_content = self.data_adapter.get_gold_answer(raw_input_d)

                 review_result = self.data_adapter.match(gold_content, answer_content)
-                choice[ReviewKeys.REVIEW] = {
-
-
+                choice[ReviewKeys.REVIEW] = {
+                    ReviewKeys.GOLD: gold_content,
+                    ReviewKeys.PRED: answer_content,
+                    ReviewKeys.RESULT: review_result
+                }

                 rev_choices.append(choice)

@@ -304,9 +237,6 @@ class Evaluator(object):
         review_res[ReviewKeys.REVIEWER_SPEC] = reviewer_spec
         review_res[ReviewKeys.REVIEW_TIME] = time.time()

-        if self.mem_cache is not None:
-            self.mem_cache[review_id] = review_res
-
         return review_res

     def get_reviews(self, subset_name: str, answers_list: List[dict], debug: bool = False, **kwargs) -> list:
@@ -324,26 +254,25 @@ class Evaluator(object):
         """
         reviews_list = []

-
-
-
-        else:
-            review_file_name: str = self.dataset_name_or_path.replace(os.sep, '_') + '_' + subset_name + '.jsonl'
-        review_file_path: str = os.path.join(review_dir, review_file_name)
+        review_file_name = self.dataset_name + '_' + subset_name + '.jsonl'
+        review_file_path = os.path.join(self.outputs_structure.reviews_dir, self.model_name, review_file_name)
+        os.makedirs(os.path.dirname(review_file_path), exist_ok=True)

         if self.use_cache and os.path.exists(review_file_path):
-            logger.warning(f'
+            logger.warning(f'Ignore use_cache={self.use_cache}, updating the review file: {review_file_path} ...')

         for answer_d in tqdm(answers_list, total=len(answers_list), desc=f'Reviewing({subset_name}): '):

             # Gen review_id (concat: answer_id + reviewer_spec)
             answer_id = answer_d[AnswerKeys.ANSWER_ID]

-            reviewer_spec: dict = {
-
-
-
-
+            reviewer_spec: dict = {
+                'metric': [metric_d['name'] for metric_d in self.data_adapter.metric_list],
+                'reviewer': ['Evaluator'],
+                'revision': ['default']
+            }
+            reviewer_spec_str = json.dumps(
+                OrderedDict(sorted(dict_torch_dtype_to_str(reviewer_spec).items())), ensure_ascii=False)
             review_id = 'review-' + gen_hash(answer_id + reviewer_spec_str)

             # Get review
@@ -354,9 +283,8 @@ class Evaluator(object):

             reviews_list.append(review_d)

-
-
-        dump_jsonl_data(reviews_list, review_file_path)
+            # Dump reviews
+            dump_jsonl_data(review_d, review_file_path, dump_mode=DumpMode.APPEND)

         return reviews_list

@@ -375,7 +303,7 @@ class Evaluator(object):
         review_res_list = []
         for review_d in reviews_list:
             if not review_d[ReviewKeys.REVIEWED]:
-                logger.warning(f'
+                logger.warning(f'Review not finished for answer_id: {review_d[AnswerKeys.ANSWER_ID]}')
                 continue

             review_res = review_d[AnswerKeys.CHOICES][0][ReviewKeys.REVIEW][ReviewKeys.RESULT]
@@ -385,7 +313,7 @@ class Evaluator(object):

         return metric_score

-    def dump_report(self,
+    def dump_report(self, reviews_score_all: dict, use_table: bool = True):
         """
         Get report for total reviews of specific dataset.
         It is required to rewrite this method to support your own evaluator.
@@ -396,50 +324,31 @@ class Evaluator(object):

         Returns: None
         """
+        # Get report map
+        report_map: dict = self.data_adapter.gen_report(
+            subset_score_map=reviews_score_all, report_name=self.custom_task_name)
+        report_map.update(dict(model_name=self.model_name, dataset_name=self.dataset_name))

         # Dump report
-
-
-
-            report_file_name: str = self.custom_task_name + '.json'
-        else:
-            report_file_name: str = self.dataset_name_or_path.replace(os.sep, '_') + '.json'
+        report_path: str = os.path.join(self.outputs_structure.reports_dir, self.model_name,
+                                        self.dataset_name + '.json')
+        os.makedirs(os.path.dirname(report_path), exist_ok=True)

-
-        report_path: str = os.path.join(report_dir, report_file_name)
+        # Write report
         with open(report_path, 'w') as f:
             f.write(json.dumps(report_map, ensure_ascii=False, indent=4))
-
-        logger.info(f'** Dump report: {report_file_name} \n')
+        logger.info(f'Dump report: {report_path} \n')

+        # Make table
         if use_table:
             try:
-
-
-
-            except:
+                report_table: str = gen_table([self.outputs_structure.reports_dir])
+                logger.info(f'Report table: \n{report_table} \n')
+            except Exception:
                 logger.error('Failed to generate report table.')
+        return report_map

-
-        # if self.mem_cache is not None:
-        #     logger.info(f'** Saving memory cache with size: {len(self.mem_cache)}')
-        #     Cache.save(cache=self.mem_cache, path=self.mem_cache_path)
-
-    # def clear_cache(self):
-    #     """
-    #     Clear memory cache.
-    #
-    #     Returns: None
-    #     """
-    #     if self.mem_cache is not None:
-    #         cache_len = len(self.mem_cache)
-    #         self.mem_cache.clear()
-    #         logger.info(f'** Memory cache cleared, length changed: {cache_len} -> {len(self.mem_cache)}')
-
-    def eval(self,
-             infer_cfg: dict = None,
-             debug: bool = False,
-             **kwargs) -> dict:
+    def eval(self, infer_cfg: dict = None, debug: bool = False, **kwargs) -> dict:
         """
         Evaluate the model on the specific benchmark. Streaming & parallel mode is supported.
         It is required to rewrite this method to support your own evaluator.
@@ -465,27 +374,22 @@ class Evaluator(object):

         logger.info(f'**** Start evaluating on dataset {self.dataset_name_or_path} ****')

-        reviews_score_all = {}
+        reviews_score_all = {}  # {subset_name: (score, num)}
         stage_answers_dict = {}
         stage_reviews_dict = {}

         for subset_name, prompts_list in self.prompts.items():
-            limit =
+            limit = kwargs.get('limit', len(prompts_list))
            prompts_list = prompts_list[:limit]

-            answers_list: list = self.get_answers(
-
-                                                  infer_cfg=infer_cfg,
-                                                  debug=debug,
-                                                  **kwargs)
+            answers_list: list = self.get_answers(
+                subset_name=subset_name, prompts_list=prompts_list, infer_cfg=infer_cfg, debug=debug, **kwargs)
             if self.stage == EvalStage.INFER:
                 stage_answers_dict[subset_name] = answers_list
                 continue

-            reviews_list: list = self.get_reviews(
-
-                                                  debug=debug,
-                                                  **kwargs)
+            reviews_list: list = self.get_reviews(
+                subset_name=subset_name, answers_list=answers_list, debug=debug, **kwargs)

             metric_res = self.compute_metrics(reviews_list=reviews_list)
             reviews_score_all[subset_name] = (metric_res, len(reviews_list))
@@ -498,193 +402,8 @@ class Evaluator(object):
             return stage_reviews_dict

         # Generate report
-        report_map
-                                                         report_name=self.custom_task_name)
-        self.dump_report(report_map=report_map)
-
-        # Dump overall task config
-        overall_task_cfg_file: str = os.path.join(self.outputs_structure.get(OutputsStructure.CONFIGS_DIR),
-                                                  'task_output_config.yaml')
-        overall_task_cfg_file = os.path.abspath(overall_task_cfg_file)
-
-        # TODO: check the robustness of dump yaml
-        try:
-            logger.info(f'** Dump overall task config to {overall_task_cfg_file}')
-            logger.info(f'** The overall task config:\n {self.overall_task_cfg}')
-            if 'model' in self.overall_task_cfg and not isinstance(self.overall_task_cfg['model'], str):
-                self.overall_task_cfg['model'] = None
-                logger.info(f'>> Overwrite overall_task_cfg for `model` due to it is not a string')
-            if 'model_args' in self.overall_task_cfg and self.overall_task_cfg.get('model_args') is not None:
-                self.overall_task_cfg['model_args'].update({'precision': str(self.overall_task_cfg['model_args']['precision'])})
-                logger.info(f'>> Overwrite overall_task_cfg for `model_args.precision` due to it is not a string')
-
-            dict_to_yaml(self.overall_task_cfg, overall_task_cfg_file)
-        except Exception as e:
-            logger.warning(f'Failed to dump overall task config: {e}')
-
-        # Note: deprecated
-        # self.save_cache()
-        # self.clear_cache()
-
-        logger.info(f'\n**** Evaluation finished on {self.dataset_name_or_path} ****\n')
-
-        return report_map
-
-
-class HumanevalEvaluator(object):
+        report_map = self.dump_report(reviews_score_all)

-
-                 problem_file: str,
-                 model_id: str,
-                 model_revision: str,
-                 model_adapter: BaseModelAdapter,
-                 outputs_dir: Optional[str] = '',
-                 is_custom_outputs_dir: bool = False,
-                 k: List[int] = [1, 10, 100],
-                 n_workers: int = 4,
-                 timeout: float = 3.0,):
-        try:
-            from human_eval.evaluation import evaluate_functional_correctness
-            from human_eval.data import read_problems, write_jsonl
-        except ImportError:
-            raise ImportError('Please install human_eval:'
-                              'https://github.com/openai/human-eval/tree/master#installation , '
-                              'Note that you need to enable the execution code in the human_eval/execution.py first.')
-
-        self.problem_file = problem_file
-        self.k = k
-        self.num_workers = n_workers
-        self.timeout = timeout
-        self.model_adapter = model_adapter
-
-        self.read_problems_func = read_problems
-        self.write_jsonl_func = write_jsonl
-        self.eval_func = evaluate_functional_correctness
-
-        # {'task_id': {'task_id': '', 'prompt': '', 'entry_point': '', 'canonical_solution': '', 'test': ''}, ...}
-        self.problems = self.read_problems_func(self.problem_file)
-
-        # Get default outputs_dir
-        model_revision_str: str = model_revision if model_revision is not None else 'none'
-        # if not is_custom_outputs_dir:
-        #     outputs_dir = make_outputs_dir(work_dir=outputs_dir,
-        #                                    model_id=model_id,
-        #                                    model_revision=model_revision_str)
-        self.outputs_dir = os.path.expanduser(outputs_dir)
-
-        # Deal with the output paths
-        self.outputs_structure = process_outputs_structure(self.outputs_dir)
-
-    def get_answers(self, infer_cfg: dict) -> List[dict]:
-        ans_list: list = []
-        system_prompt: str = 'Complete the following python code:\n'
-        for task_id, data_d in tqdm(self.problems.items(), total=len(self.problems), desc='Predicting(problems)'):
-            prompt: str = system_prompt + data_d['prompt']
-            inputs: dict = {'data': [prompt]}
-            # pred_res: dict = self.model_adapter.predict(inputs)
-
-            pred_res: dict = self.model_adapter.predict(inputs=inputs, infer_cfg=infer_cfg)
-
-            pred_ans: str = pred_res['choices'][0]['message']['content']
-            pred_ans = self._postprocess(pred_ans)
-
-            ans_list.append({'task_id': task_id, 'completion': pred_ans})
-
-        return ans_list
-
-    def eval(self, infer_cfg: dict, **kwargs):
+        logger.info(f'**** Evaluation finished on {self.dataset_name_or_path} ****\n')

-
-        ans_list: list = self.get_answers(infer_cfg)
-        ans_out_file: str = os.path.join(self.outputs_structure.get(OutputsStructure.PREDICTIONS_DIR),
-                                         'human_eval_predictions.jsonl')
-
-        self.write_jsonl_func(filename=ans_out_file, data=ans_list)
-        # logger.info(f'** Dump predictions to {ans_out_file} successfully.')
-        logger.info('** Dump predictions successfully.')
-
-        # evaluate results: e.g. {'pass@1': 0.333, 'pass@10': 0.111}
-        results = self.eval_func(sample_file=ans_out_file,
-                                 k=self.k,
-                                 n_workers=self.num_workers,
-                                 timeout=self.timeout,
-                                 problem_file=self.problem_file)
-
-        # output: report
-        report_map: dict = self.gen_report(results=results)
-        report_dir: str = self.outputs_structure.get(OutputsStructure.REPORTS_DIR)
-        report_file: str = os.path.join(report_dir, 'human_eval_report.json')
-
-        with open(report_file, 'w') as f:
-            f.write(json.dumps(report_map, ensure_ascii=False, indent=4))
-        # logger.info(f'** Dump report to {report_file} \n')
-        logger.info(f'** Dump report \n')
-
-        try:
-            # Make table
-            report_table: str = gen_table([report_dir])
-            logger.info(f'** Report table: \n {report_table} \n')
-        except:
-            logger.error('Failed to generate report table.')
-
-    def gen_report(self, results: dict) -> dict:
-        """
-        Generate report from evaluation results.
-
-        Returns:
-        {
-            "name":"ARC-Challenge",
-            "metric":"WeightedAverageAccuracy",
-            "score":0.3389,
-            "category":[
-                {
-                    "name":"DEFAULT",
-                    "score":0.3389,
-                    "subset":[
-                        {
-                            "name":"ARC-Challenge",
-                            "score":0.3389
-                        },
-                    ]
-                }
-            ],
-            "total_num":100
-        }
-        """
-        results = {k: normalize_score(score=v) for k, v in results.items()}
-
-        category_d = dict(name='DEFAULT',
-                          score=results,
-                          subset=[])
-
-        res_map = dict(name='HumanEval',
-                       metric='pass@k',
-                       score=results,
-                       category=[category_d],
-                       total_num=len(self.problems))
-
-        return res_map
-
-    @classmethod
-    def _postprocess(cls, text: str) -> str:
-        if '```' in text:
-            blocks = re.findall(r'```(.*?)```', text, re.DOTALL)
-            if len(blocks) == 0:
-                text = text.split('```')[1]  # fall back to default strategy
-            else:
-                text = blocks[0]  # fetch the first code block
-                if not text.startswith('\n'):  # in case starting with ```python
-                    text = text[max(text.find('\n') + 1, 0):]
-        if text.strip().startswith('from') or text.strip().startswith('import'):
-            def_idx = text.find('def')
-            if def_idx != -1:
-                text = text[max(text.find('\n', def_idx) + 1, 0):]
-        text = text.split('\n\n')[0]
-        if text.strip().startswith('def'):
-            text = '\n'.join(text.split('\n')[1:])
-        if not text.startswith('    '):
-            if text.startswith(' '):
-                text = '    ' + text.lstrip()
-            else:
-                text = '\n'.join(['    ' + line for line in text.split('\n')])
-        return text
+        return report_map