evalscope 0.7.2__py3-none-any.whl → 0.8.1__py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.
- evalscope/__init__.py +1 -1
- evalscope/arguments.py +73 -0
- evalscope/backend/base.py +6 -2
- evalscope/backend/opencompass/api_meta_template.py +8 -14
- evalscope/backend/opencompass/backend_manager.py +24 -15
- evalscope/backend/opencompass/tasks/eval_api.py +1 -6
- evalscope/backend/opencompass/tasks/eval_datasets.py +26 -28
- evalscope/backend/rag_eval/__init__.py +3 -3
- evalscope/backend/rag_eval/backend_manager.py +21 -25
- evalscope/backend/rag_eval/clip_benchmark/__init__.py +1 -1
- evalscope/backend/rag_eval/clip_benchmark/arguments.py +6 -6
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +62 -79
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +29 -43
- evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +20 -22
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +16 -23
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +14 -35
- evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +69 -90
- evalscope/backend/rag_eval/cmteb/__init__.py +3 -3
- evalscope/backend/rag_eval/cmteb/arguments.py +25 -27
- evalscope/backend/rag_eval/cmteb/base.py +22 -23
- evalscope/backend/rag_eval/cmteb/task_template.py +15 -17
- evalscope/backend/rag_eval/cmteb/tasks/Classification.py +98 -79
- evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +17 -22
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +17 -19
- evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +35 -29
- evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +18 -5
- evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +163 -163
- evalscope/backend/rag_eval/cmteb/tasks/STS.py +126 -104
- evalscope/backend/rag_eval/cmteb/tasks/__init__.py +33 -34
- evalscope/backend/rag_eval/ragas/__init__.py +2 -2
- evalscope/backend/rag_eval/ragas/arguments.py +3 -8
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +9 -9
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +7 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +8 -8
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +7 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +21 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +4 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -1
- evalscope/backend/rag_eval/ragas/task_template.py +10 -15
- evalscope/backend/rag_eval/ragas/tasks/__init__.py +1 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +45 -0
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +135 -0
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +17 -133
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +8 -18
- evalscope/backend/rag_eval/utils/clip.py +47 -51
- evalscope/backend/rag_eval/utils/embedding.py +13 -12
- evalscope/backend/rag_eval/utils/llm.py +8 -6
- evalscope/backend/rag_eval/utils/tools.py +12 -11
- evalscope/backend/vlm_eval_kit/__init__.py +1 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +7 -8
- evalscope/benchmarks/arc/__init__.py +3 -2
- evalscope/benchmarks/arc/ai2_arc.py +19 -16
- evalscope/benchmarks/arc/arc_adapter.py +32 -24
- evalscope/benchmarks/bbh/__init__.py +1 -2
- evalscope/benchmarks/bbh/bbh_adapter.py +28 -25
- evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/navigate.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/snarks.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +1 -1
- evalscope/benchmarks/benchmark.py +16 -16
- evalscope/benchmarks/ceval/__init__.py +3 -2
- evalscope/benchmarks/ceval/ceval_adapter.py +80 -69
- evalscope/benchmarks/ceval/ceval_exam.py +18 -31
- evalscope/benchmarks/cmmlu/__init__.py +3 -2
- evalscope/benchmarks/cmmlu/cmmlu.py +87 -92
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +109 -155
- evalscope/benchmarks/cmmlu/samples.jsonl +1 -1
- evalscope/benchmarks/competition_math/__init__.py +3 -2
- evalscope/benchmarks/competition_math/competition_math.py +7 -16
- evalscope/benchmarks/competition_math/competition_math_adapter.py +32 -34
- evalscope/benchmarks/data_adapter.py +24 -24
- evalscope/benchmarks/general_qa/__init__.py +3 -2
- evalscope/benchmarks/general_qa/general_qa_adapter.py +35 -39
- evalscope/benchmarks/gsm8k/__init__.py +1 -1
- evalscope/benchmarks/gsm8k/gsm8k.py +6 -12
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +27 -24
- evalscope/benchmarks/hellaswag/__init__.py +3 -2
- evalscope/benchmarks/hellaswag/hellaswag.py +15 -19
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +28 -23
- evalscope/benchmarks/humaneval/__init__.py +1 -1
- evalscope/benchmarks/humaneval/humaneval.py +15 -18
- evalscope/benchmarks/humaneval/humaneval_adapter.py +192 -7
- evalscope/benchmarks/mmlu/__init__.py +3 -2
- evalscope/benchmarks/mmlu/mmlu.py +15 -29
- evalscope/benchmarks/mmlu/mmlu_adapter.py +85 -77
- evalscope/benchmarks/race/__init__.py +3 -2
- evalscope/benchmarks/race/race.py +21 -35
- evalscope/benchmarks/race/race_adapter.py +33 -29
- evalscope/benchmarks/race/samples.jsonl +1 -1
- evalscope/benchmarks/trivia_qa/__init__.py +3 -2
- evalscope/benchmarks/trivia_qa/samples.jsonl +1 -1
- evalscope/benchmarks/trivia_qa/trivia_qa.py +19 -34
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +27 -22
- evalscope/benchmarks/truthful_qa/__init__.py +3 -2
- evalscope/benchmarks/truthful_qa/truthful_qa.py +25 -29
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +36 -37
- evalscope/cli/cli.py +6 -5
- evalscope/cli/start_eval.py +31 -0
- evalscope/cli/start_perf.py +0 -3
- evalscope/cli/start_server.py +27 -41
- evalscope/config.py +154 -96
- evalscope/constants.py +50 -32
- evalscope/evaluator/evaluator.py +97 -377
- evalscope/evaluator/rating_eval.py +12 -33
- evalscope/evaluator/reviewer/auto_reviewer.py +48 -76
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +10 -20
- evalscope/metrics/code_metric.py +3 -9
- evalscope/metrics/math_accuracy.py +3 -6
- evalscope/metrics/metrics.py +21 -21
- evalscope/metrics/rouge_metric.py +11 -25
- evalscope/models/__init__.py +1 -2
- evalscope/models/api/openai_api.py +40 -29
- evalscope/models/custom/__init__.py +0 -1
- evalscope/models/custom/custom_model.py +3 -3
- evalscope/models/dummy_chat_model.py +7 -8
- evalscope/models/model_adapter.py +89 -156
- evalscope/models/openai_model.py +20 -20
- evalscope/perf/arguments.py +16 -3
- evalscope/perf/benchmark.py +9 -11
- evalscope/perf/http_client.py +3 -8
- evalscope/perf/main.py +8 -1
- evalscope/perf/plugin/api/custom_api.py +1 -2
- evalscope/perf/plugin/api/dashscope_api.py +1 -2
- evalscope/perf/plugin/api/openai_api.py +3 -4
- evalscope/perf/plugin/datasets/base.py +1 -2
- evalscope/perf/plugin/datasets/flickr8k.py +1 -2
- evalscope/perf/plugin/datasets/longalpaca.py +1 -2
- evalscope/perf/plugin/datasets/openqa.py +1 -2
- evalscope/perf/plugin/registry.py +3 -3
- evalscope/perf/utils/analysis_result.py +1 -2
- evalscope/perf/utils/benchmark_util.py +5 -6
- evalscope/perf/utils/db_util.py +77 -30
- evalscope/perf/utils/local_server.py +21 -13
- evalscope/registry/config/cfg_arena_zhihu.yaml +1 -1
- evalscope/registry/tasks/arc.yaml +2 -3
- evalscope/registry/tasks/bbh.yaml +3 -4
- evalscope/registry/tasks/bbh_mini.yaml +3 -4
- evalscope/registry/tasks/ceval.yaml +3 -3
- evalscope/registry/tasks/ceval_mini.yaml +3 -4
- evalscope/registry/tasks/cmmlu.yaml +3 -3
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +1 -1
- evalscope/registry/tasks/general_qa.yaml +1 -1
- evalscope/registry/tasks/gsm8k.yaml +2 -2
- evalscope/registry/tasks/mmlu.yaml +3 -3
- evalscope/registry/tasks/mmlu_mini.yaml +3 -3
- evalscope/run.py +153 -381
- evalscope/run_arena.py +21 -25
- evalscope/summarizer.py +27 -40
- evalscope/third_party/longbench_write/README.md +99 -42
- evalscope/third_party/longbench_write/default_task.json +1 -1
- evalscope/third_party/longbench_write/default_task.yaml +8 -7
- evalscope/third_party/longbench_write/eval.py +29 -27
- evalscope/third_party/longbench_write/infer.py +16 -104
- evalscope/third_party/longbench_write/longbench_write.py +5 -4
- evalscope/third_party/longbench_write/resources/judge.txt +1 -1
- evalscope/third_party/longbench_write/tools/data_etl.py +5 -6
- evalscope/third_party/longbench_write/utils.py +0 -1
- evalscope/third_party/toolbench_static/eval.py +14 -15
- evalscope/third_party/toolbench_static/infer.py +48 -69
- evalscope/third_party/toolbench_static/llm/swift_infer.py +4 -12
- evalscope/third_party/toolbench_static/requirements.txt +1 -1
- evalscope/third_party/toolbench_static/toolbench_static.py +4 -3
- evalscope/tools/combine_reports.py +27 -34
- evalscope/tools/rewrite_eval_results.py +15 -47
- evalscope/utils/__init__.py +1 -1
- evalscope/utils/arena_utils.py +18 -48
- evalscope/{perf/utils → utils}/chat_service.py +4 -5
- evalscope/utils/completion_parsers.py +3 -8
- evalscope/utils/io_utils.py +162 -0
- evalscope/utils/logger.py +17 -7
- evalscope/utils/model_utils.py +11 -0
- evalscope/utils/utils.py +5 -306
- evalscope/version.py +2 -2
- {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/METADATA +123 -118
- evalscope-0.8.1.dist-info/RECORD +285 -0
- tests/cli/test_run.py +53 -15
- tests/perf/test_perf.py +6 -1
- tests/rag/test_clip_benchmark.py +38 -38
- tests/rag/test_mteb.py +3 -2
- tests/rag/test_ragas.py +5 -5
- tests/swift/test_run_swift_eval.py +2 -3
- tests/swift/test_run_swift_vlm_eval.py +2 -3
- tests/swift/test_run_swift_vlm_jugde_eval.py +2 -3
- tests/vlm/test_vlmeval.py +3 -2
- evalscope/backend/rag_eval/ragas/metrics/__init__.py +0 -2
- evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py +0 -91
- evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py +0 -99
- evalscope/cache.py +0 -98
- evalscope/models/template.py +0 -1446
- evalscope/run_ms.py +0 -140
- evalscope/utils/task_cfg_parser.py +0 -10
- evalscope/utils/task_utils.py +0 -22
- evalscope-0.7.2.dist-info/RECORD +0 -286
- {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/LICENSE +0 -0
- {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/WHEEL +0 -0
- {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/top_level.txt +0 -0
evalscope/tools/combine_reports.py CHANGED
@@ -1,13 +1,14 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
-import os
-import json
 import glob
+import json
+import os
+from collections import defaultdict
 from tabulate import tabulate
+
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
-
 """
 Combine and generate table for reports of LLMs.
 """
@@ -15,33 +16,29 @@ Combine and generate table for reports of LLMs.
 
 def get_report(report_file: str):
     data_d: dict = json.load(open(report_file, 'r'))
-    dataset_name = data_d['
-
+    dataset_name = data_d['dataset_name']
+    model_name = data_d['model_name']
+    score = data_d['score']  # float or dict
+    metric = data_d['metric']
     score_d = {}
     if isinstance(score, dict):
-        # score_d = dict([(k, round(v, 4) * 100) for k, v in score.items()])
         score_d = score
    elif isinstance(score, float):
-
-        score_d['acc'] = score
+        score_d[metric] = score
     else:
         raise ValueError(f'Unknown score type: {type(score)}')
-    # score_str = '\n'.join([str(v) + ' (' + k + ')' for k, v in score_d.items()])
     score_str = '\n'.join(['(' + dataset_name + '/' + k + ') ' + str(v) for k, v in score_d.items()])
 
-    return {'dataset_name': dataset_name, 'score': score_str}
+    return model_name, {'dataset_name': dataset_name, 'score': score_str}
 
 
 def get_model_reports(model_report_dir: str):
     model_report_dir = os.path.normpath(model_report_dir)
-
-    model_info = os.path.basename(os.path.normpath(model_report_dir))
-    model_name = '_'.join(model_info.split('_')[:-1][3:])
-    report_files = glob.glob(os.path.join(model_report_dir, 'reports', '*.json'))
+    report_files = glob.glob(os.path.join(model_report_dir, '**/*.json'))
 
-    model_reports_d =
+    model_reports_d = defaultdict(list)
     for file_path in report_files:
-        report_d = get_report(file_path)
+        model_name, report_d = get_report(file_path)
         model_reports_d[model_name].append(report_d)
 
     return model_reports_d
@@ -55,8 +52,6 @@ def gen_table(reports_path_list: list):
     for report_path in reports_path_list:
         model_reports_d = get_model_reports(report_path)
         for model_name, report_list in model_reports_d.items():
-            # report_list: [{'dataset_name': 'CompetitionMath', 'score': '4.42 (acc)'},
-            #               {'dataset_name': 'GSM8K', 'score': '28.51 (acc)'}]
             report_list = sorted(report_list, key=lambda x: x['dataset_name'])
             if not is_headers_set:
                 headers.extend([x['dataset_name'] for x in report_list])
@@ -71,37 +66,34 @@ def gen_table(reports_path_list: list):
     report_table = tabulate(table_values, headers=headers, tablefmt='grid')
     return report_table
 
+
 class ReportsRecorder:
     COMMON_DATASET_PATH = []
     CUSTOM_DATASET_PATH = []
 
-    def __init__(self, oss_url: str =
+    def __init__(self, oss_url: str = '', endpoint: str = ''):
         if oss_url and endpoint:
             import oss2
             from oss2.credentials import EnvironmentVariableCredentialsProvider
 
             auth = oss2.ProviderAuth(EnvironmentVariableCredentialsProvider())
-            oss_url = oss_url.replace(
+            oss_url = oss_url.replace('oss://', '').split('/')
             bucket_name = oss_url[0]
 
-            self.object_path =
+            self.object_path = '/'.join(oss_url[1:])
             self.bucket = oss2.Bucket(auth, endpoint, bucket_name)
         else:
-            self.object_path =
+            self.object_path = ''
             self.bucket = None
 
-
     def append_path(self, report_path: str, dataset_name: str):
-        if dataset_name ==
+        if dataset_name == 'general_qa':
             self.CUSTOM_DATASET_PATH.append(report_path)
         else:
             self.COMMON_DATASET_PATH.append(report_path)
-
+
     def dump_reports(self, output_dir: str):
-        result = {
-            "CommonDataset": [],
-            "CustomDataset": []
-        }
+        result = {'CommonDataset': [], 'CustomDataset': []}
         for line in self.COMMON_DATASET_PATH:
             with open(line, 'r') as f:
                 report = json.load(f)
@@ -109,20 +101,21 @@ class ReportsRecorder:
         for line in self.CUSTOM_DATASET_PATH:
             with open(line, 'r') as f:
                 report = json.load(f)
-            report.update({
+            report.update({'name': os.path.basename(line)})
             result['CustomDataset'].append(report)
-
+
         os.makedirs(output_dir, exist_ok=True)
-        output_file_name =
+        output_file_name = 'metric.json'
         output_path = os.path.join(output_dir, output_file_name)
         with open(output_path, 'w+') as f:
             f.write(json.dumps(result, ensure_ascii=False, indent=4))
-
+
         if self.bucket:
             remote_path = os.path.join(self.object_path, output_file_name)
-            logger.info(f
+            logger.info(f'** Upload report to oss: {remote_path}')
             self.bucket.put_object_from_file(remote_path, output_path)
 
+
 if __name__ == '__main__':
     report_dir_1 = '/to/path/20231129_020533_default_ZhipuAI_chatglm2-6b-base_none/reports'
     report_dir_2 = '/to/path/20231129_020533_default_ZhipuAI_chatglm2-6b_none/reports'
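
Taken together, `get_report` now returns a `(model_name, report)` tuple and reports are grouped per model via a `defaultdict`, so callers only pass output directories, which are scanned recursively via the new `**/*.json` glob. A minimal usage sketch (the paths are hypothetical):

```python
# Hypothetical sketch: combine reports from two eval runs into one grid table.
from evalscope.tools.combine_reports import gen_table

report_table = gen_table(reports_path_list=[
    './outputs/eval_run_a',  # scanned recursively for **/*.json report files
    './outputs/eval_run_b',
])
print(report_table)
```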
evalscope/tools/rewrite_eval_results.py CHANGED
@@ -4,12 +4,10 @@ import time
 
 from evalscope.models.custom import CustomModel
 from evalscope.run import run_task
-from evalscope.
-from evalscope.utils import yaml_to_dict
+from evalscope.utils.io_utils import yaml_to_dict
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
-
 """
 This script is used to rewrite the evaluation results without re-running the model predictions.
 """
@@ -26,19 +24,20 @@ class DummyCustomModel(CustomModel):
         response = 'The answer is C. NOTE: ONLY FOR TEST'
 
         res_d: dict = {
-            'choices': [
-                {
-                    'index': 0,
-                    'message': {
-                        # 'content': f'The answer is B. Raw prompt: {prompt}',
-                        'content': response,
-                        'role': 'assistant'
-                    }
+            'choices': [{
+                'index': 0,
+                'message': {
+                    # 'content': f'The answer is B. Raw prompt: {prompt}',
+                    'content': response,
+                    'role': 'assistant'
                 }
-            ],
-            'created': time.time(),
-            'model': self.config.get('model_id'),  # should be model_id
-            'object': 'chat.completion',
+            }],
+            'created':
+            time.time(),
+            'model':
+            self.config.get('model_id'),  # should be model_id
+            'object':
+            'chat.completion',
             'usage': {
                 'completion_tokens': 0,
                 'prompt_tokens': 0,
@@ -49,36 +48,6 @@ class DummyCustomModel(CustomModel):
         return [res_d for _ in prompts]
 
 
-def get_task_cfg(cfg_file: str, model_instance: CustomModel):
-    if cfg_file:
-        cfg_file: str = os.path.abspath(cfg_file)
-        logger.info(f'Loading task config from {cfg_file}')
-        task_cfg_d: dict = yaml_to_dict(yaml_file=cfg_file)
-        task_cfg_d.update({'model': model_instance})
-        logger.info(f'**Task config: {task_cfg_d}')
-    else:
-        # Default config example
-        task_cfg_d = {
-            'model_args': {},
-            'generation_config': {},
-            'dataset_args': {},
-            'dry_run': False,
-            'model': model_instance,  # NOTE: model_id or # model_dir or model_instance(CustomModel)
-            'eval_type': 'custom',  # NOTE: `checkpoint` or `custom` or `service`
-            'datasets': ['arc'],
-            'work_dir': DEFAULT_ROOT_CACHE_DIR,
-            'outputs': './outputs/eval_swift_dummy',
-            'mem_cache': False,
-            'dataset_hub': 'ModelScope',
-            'dataset_dir': DEFAULT_ROOT_CACHE_DIR,
-            'stage': 'all',
-            'limit': 10,
-            'debug': False
-        }
-
-    return task_cfg_d
-
-
 if __name__ == '__main__':
     # step1: if the outputs dir has been migrated, update the path settings in configs/task_output_config.yaml under outputs/eval_xxx
     # step2: run this script; use_cache=True is the default, so eval results are refreshed without re-running inference
@@ -91,5 +60,4 @@ if __name__ == '__main__':
     task_cfg_d.update({'model': swift_model})
 
     eval_results: dict = run_task(task_cfg=task_cfg_d)
-    print(
-
+    print('** Evaluation results finished !\n')
evalscope/utils/__init__.py CHANGED
evalscope/utils/arena_utils.py CHANGED
@@ -1,13 +1,12 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 # Copyright (c) lmsys.org.
 
-import random
-from collections import OrderedDict, defaultdict
-from typing import List, Sequence, Union
-
 import numpy as np
 import pandas as pd
 import pyarrow as pa
+import random
+from collections import OrderedDict, defaultdict
+from typing import List, Sequence, Union
 
 from evalscope.utils.logger import get_logger
 
@@ -25,9 +24,7 @@ def compute_elo(battles,
                 init_rating=1000):
     rating = defaultdict(lambda: init_rating)
 
-    for rd, model_a, model_b, win in battles[[
-            col_model_a, col_model_b, col_win
-    ]].itertuples():
+    for rd, model_a, model_b, win in battles[[col_model_a, col_model_b, col_win]].itertuples():
         ra = rating[model_a]
         rb = rating[model_b]
         ea = 1 / (1 + base**((rb - ra) / scale))
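
To ground the expected-score line above with a worked number: with the conventional Elo constants base=10 and scale=400 (the function's remaining defaults are elided in this hunk, so these values are an assumption), a 100-point rating gap gives roughly a 64/36 expected split:

```python
# Worked illustration of the expected-score formula in compute_elo.
base, scale = 10, 400  # conventional Elo constants; assumed to match the defaults
ra, rb = 1000, 1100    # hypothetical ratings for model_a and model_b
ea = 1 / (1 + base**((rb - ra) / scale))  # expected score of model_a
print(round(ea, 2))  # 0.36 -> model_a is expected to score ~36% against model_b
```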
@@ -46,9 +43,7 @@ def compute_elo(battles,
     return rating
 
 
-def merge_ques_ans(answer_list_all,
-                   merge_key: str = 'question_id',
-                   merge_mode: str = 'inner') -> pd.DataFrame:
+def merge_ques_ans(answer_list_all, merge_key: str = 'question_id', merge_mode: str = 'inner') -> pd.DataFrame:
     """
     Merge question and answer list to unifiled data.
 
@@ -67,18 +62,11 @@ def merge_ques_ans(answer_list_all,
     """
     ans_df = pd.DataFrame()
     for ans_list in answer_list_all:
-        ans_list = [{
-            'question_id': item['question_id'],
-            item['model_id']: item
-        } for item in ans_list]
+        ans_list = [{'question_id': item['question_id'], item['model_id']: item} for item in ans_list]
         if ans_df.empty:
             ans_df = pa.Table.from_pylist(ans_list).to_pandas()
         else:
-            ans_df = pd.merge(
-                ans_df,
-                pa.Table.from_pylist(ans_list).to_pandas(),
-                on=merge_key,
-                how=merge_mode)
+            ans_df = pd.merge(ans_df, pa.Table.from_pylist(ans_list).to_pandas(), on=merge_key, how=merge_mode)
 
     return ans_df
 
@@ -112,21 +100,17 @@ def get_battle_pairs(columns: List[str], baseline_idx: int = -1) -> List[tuple]:
 
     if baseline_idx != -1:
         n_column = columns[baseline_idx]
-        res_list = [(column, n_column) for column in columns
-                    if column != n_column]
+        res_list = [(column, n_column) for column in columns if column != n_column]
     else:
         mat = np.ones((cols_num, cols_num))
         mat_lower_tril = np.tril(mat, k=-1)
         x_ids, y_ids = np.where(mat_lower_tril == 1)
-        res_list = [(columns[x_id], columns[y_id])
-                    for x_id, y_id in zip(x_ids, y_ids)]
+        res_list = [(columns[x_id], columns[y_id]) for x_id, y_id in zip(x_ids, y_ids)]
 
     return res_list
 
 
-def get_battle_pairs_origin(columns: List[str],
-                            compare_base: bool = False,
-                            swap: bool = False):  # TODO: to refactor
+def get_battle_pairs_origin(columns: List[str], compare_base: bool = False, swap: bool = False):  # TODO: to refactor
     """
     Get battle pair names from columns.
 
@@ -152,8 +136,7 @@ def get_battle_pairs_origin(columns: List[str],
         mat = np.ones((cols_num, cols_num))
         mat_lower_tril = np.tril(mat, k=-1)
         x_ids, y_ids = np.where(mat_lower_tril == 1)
-        res_list = [(columns[x_id], columns[y_id])
-                    for x_id, y_id in zip(x_ids, y_ids)]
+        res_list = [(columns[x_id], columns[y_id]) for x_id, y_id in zip(x_ids, y_ids)]
     else:
         for column in columns[1:]:
             res_list.append((columns[0], column))
@@ -163,8 +146,7 @@ def get_battle_pairs_origin(columns: List[str],
     return res_list
 
 
-def shuffle_pairwise_preferences(
-        df: pd.DataFrame, arr_is_shuffle: Sequence[int]) -> pd.DataFrame:
+def shuffle_pairwise_preferences(df: pd.DataFrame, arr_is_shuffle: Sequence[int]) -> pd.DataFrame:
     """Shuffle the outputs of a pairwise preference dataframe.
 
     Examples
@@ -182,8 +164,7 @@ def shuffle_pairwise_preferences(
     df['output_2'] = np.where(arr_is_shuffle, col_1, col_2)
 
     if 'preference' in df.columns:
-        df['preference'] = np.where(arr_is_shuffle, 3 - df['preference'],
-                                    df['preference'])
+        df['preference'] = np.where(arr_is_shuffle, 3 - df['preference'], df['preference'])
 
     return df
 
@@ -202,20 +183,14 @@ class BattlePairSelection:
         # Make sure model_elo_map to be ordered when compare_base is true.
         self.model_elo_map = model_elo_map
 
-    def top_k(self,
-              k: int = DEFAULT_K,
-              compare_base: bool = False,
-              swap: bool = False) -> list:
+    def top_k(self, k: int = DEFAULT_K, compare_base: bool = False, swap: bool = False) -> list:
         if k <= 0:
             k = self.DEFAULT_K
         sorted_res = sorted(self.model_elo_map.items(), key=lambda x: x[1])[:k]
         sorted_res = list(dict(sorted_res).keys())
         return get_battle_pairs_origin(sorted_res, compare_base, swap)
 
-    def random_k(self,
-                 k: int = DEFAULT_K,
-                 compare_base: bool = False,
-                 swap: bool = False) -> list:
+    def random_k(self, k: int = DEFAULT_K, compare_base: bool = False, swap: bool = False) -> list:
         if k <= 0:
             k = self.DEFAULT_K
         if k > len(self.model_elo_map):
@@ -226,21 +201,16 @@ class BattlePairSelection:
         res = list(res.keys())
         return get_battle_pairs_origin(res, compare_base, swap)
 
-    def volatility_index(self,
-                         frac: float = 0.2,
-                         compare_base: bool = False,
-                         swap: bool = False) -> list:
+    def volatility_index(self, frac: float = 0.2, compare_base: bool = False, swap: bool = False) -> list:
         res_list = []
-        candidate_list = get_battle_pairs_origin(
-            list(self.model_elo_map.keys()), compare_base, swap)
+        candidate_list = get_battle_pairs_origin(list(self.model_elo_map.keys()), compare_base, swap)
         for t in candidate_list:
             model_a = t[0]
             model_b = t[1]
             base_elo_a = self.model_elo_map.get(model_a)
             base_elo_b = self.model_elo_map.get(model_b)
 
-            vol_frac = abs(base_elo_b - base_elo_a) / max(
-                base_elo_a, base_elo_b)
+            vol_frac = abs(base_elo_b - base_elo_a) / max(base_elo_a, base_elo_b)
             if vol_frac <= frac:
                 res_list.append(t)
 
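
To make the pairing logic concrete, a small sketch (model names hypothetical) of how `get_battle_pairs` enumerates the lower triangle of the model-by-model matrix, or pairs everything against a chosen baseline:

```python
# Hypothetical illustration of get_battle_pairs from arena_utils.
from evalscope.utils.arena_utils import get_battle_pairs

models = ['model_a', 'model_b', 'model_c']

# No baseline: each unordered pair once (lower triangle of a 3x3 ones matrix).
print(get_battle_pairs(models))
# [('model_b', 'model_a'), ('model_c', 'model_a'), ('model_c', 'model_b')]

# With a baseline: every other model battles the baseline column.
print(get_battle_pairs(models, baseline_idx=0))
# [('model_b', 'model_a'), ('model_c', 'model_a')]
```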
evalscope/{perf/utils → utils}/chat_service.py RENAMED
@@ -1,14 +1,13 @@
 import os
 import time
+import torch
 from contextlib import contextmanager
 from functools import partial
-from threading import Thread
-from typing import List, Literal, Optional, Union
-
-import torch
 from modelscope import AutoModelForCausalLM, AutoTokenizer
 from pydantic import BaseModel, Field
+from threading import Thread
 from transformers import TextIteratorStreamer
+from typing import List, Literal, Optional, Union
 
 
 class Usage(BaseModel):
@@ -44,7 +43,7 @@ class DeltaMessage(BaseModel):
 
 class ChatCompletionRequest(BaseModel):
     model: str
-    messages: List[ChatMessage]
+    messages: Union[List[ChatMessage], str]
     temperature: Optional[float] = None
     top_p: Optional[float] = None
     max_tokens: Optional[int] = 2048
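
Widening `messages` from `List[ChatMessage]` to `Union[List[ChatMessage], str]` lets the local server accept bare-prompt payloads as well as chat lists. A hedged sketch (the model name and the `ChatMessage` field names are assumptions based on the usual OpenAI-style schema):

```python
# Hypothetical sketch: both request shapes validate after this change.
from evalscope.utils.chat_service import ChatCompletionRequest, ChatMessage

chat_style = ChatCompletionRequest(
    model='my-local-model',
    messages=[ChatMessage(role='user', content='Hello!')],  # field names assumed
)
prompt_style = ChatCompletionRequest(model='my-local-model', messages='Hello!')
```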
evalscope/utils/completion_parsers.py CHANGED
@@ -4,7 +4,6 @@
 import ast
 import re
 
-
 # from . import utils as ann_utils
 from evalscope.constants import ArenaWinner
 from evalscope.utils.logger import get_logger
@@ -51,9 +50,7 @@ def lmsys_parser(completion, output_format):
             else:
                 raise Exception('Invalid score pair.')
         except Exception as e:
-            logger.error(
-                f'{e}\nContent: {completion}\nYou must manually fix the score pair.'
-            )
+            logger.error(f'{e}\nContent: {completion}\nYou must manually fix the score pair.')
             return ArenaWinner.UNKNOWN, [-1, -1]
     elif output_format == '[[A]]':
         if '[[A]]' in completion:
@@ -63,8 +60,7 @@ def lmsys_parser(completion, output_format):
         elif '[[C]]' in completion:
             winner = ArenaWinner.TIE
         else:
-            logger.error(
-                f'\nContent: {completion}\nYou must manually fix the score.')
+            logger.error(f'\nContent: {completion}\nYou must manually fix the score.')
             winner = ArenaWinner.UNKNOWN
         return winner
 
@@ -76,8 +72,7 @@ def ranking_parser(completion, **kwargs):
     else:
         ordered_completions = completion
 
-    rank = [c for c in ordered_completions
-            if c['model'] == 'model_a'][0]['rank']
+    rank = [c for c in ordered_completions if c['model'] == 'model_a'][0]['rank']
     assert rank in [1, 2]
 
     return ArenaWinner.MODEL_A if rank == 1 else ArenaWinner.MODEL_B
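
For context, `ranking_parser` expects the judge output as a ranked list (or its string form, parsed via `ast`); a small sketch with hypothetical values:

```python
# Hypothetical judge output consumed by ranking_parser.
from evalscope.utils.completion_parsers import ranking_parser

completion = [{'model': 'model_a', 'rank': 1}, {'model': 'model_b', 'rank': 2}]
winner = ranking_parser(completion)  # ArenaWinner.MODEL_A, since model_a ranked 1
```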
evalscope/utils/io_utils.py ADDED
@@ -0,0 +1,162 @@
+import json
+import jsonlines as jsonl
+import os
+import yaml
+
+from evalscope.constants import DumpMode
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+class OutputsStructure:
+    LOGS_DIR = 'logs'
+    PREDICTIONS_DIR = 'predictions'
+    REVIEWS_DIR = 'reviews'
+    REPORTS_DIR = 'reports'
+    CONFIGS_DIR = 'configs'
+
+    def __init__(self, outputs_dir: str, is_make=True):
+        self.outputs_dir = outputs_dir
+        self.is_make = is_make
+        self._dirs = {
+            'logs_dir': None,
+            'predictions_dir': None,
+            'reviews_dir': None,
+            'reports_dir': None,
+            'configs_dir': None
+        }
+
+    def _get_dir(self, attr_name, dir_name):
+        if self._dirs[attr_name] is None:
+            dir_path = os.path.join(self.outputs_dir, dir_name)
+            if self.is_make:
+                os.makedirs(dir_path, exist_ok=True)
+            self._dirs[attr_name] = dir_path
+        return self._dirs[attr_name]
+
+    @property
+    def logs_dir(self):
+        return self._get_dir('logs_dir', OutputsStructure.LOGS_DIR)
+
+    @property
+    def predictions_dir(self):
+        return self._get_dir('predictions_dir', OutputsStructure.PREDICTIONS_DIR)
+
+    @property
+    def reviews_dir(self):
+        return self._get_dir('reviews_dir', OutputsStructure.REVIEWS_DIR)
+
+    @property
+    def reports_dir(self):
+        return self._get_dir('reports_dir', OutputsStructure.REPORTS_DIR)
+
+    @property
+    def configs_dir(self):
+        return self._get_dir('configs_dir', OutputsStructure.CONFIGS_DIR)
+
+
+def jsonl_to_list(jsonl_file):
+    """
+    Read jsonl file to list.
+
+    Args:
+        jsonl_file: jsonl file path.
+
+    Returns:
+        list: list of lines. Each line is a dict.
+    """
+    res_list = []
+    with jsonl.open(jsonl_file, mode='r') as reader:
+        for line in reader.iter(type=dict, allow_none=True, skip_invalid=False):
+            res_list.append(line)
+    return res_list
+
+
+def jsonl_to_reader(jsonl_file):
+    """
+    Read jsonl file to reader object.
+
+    Args:
+        jsonl_file: jsonl file path.
+
+    Returns:
+        reader: jsonl reader object.
+    """
+    with jsonl.open(jsonl_file, mode='r') as reader:
+        return reader
+
+
+def dump_jsonl_data(data_list, jsonl_file, dump_mode=DumpMode.OVERWRITE):
+    """
+    Dump data to jsonl file.
+
+    Args:
+        data_list: data list to be dumped.  [{'a': 'aaa'}, ...]
+        jsonl_file: jsonl file path.
+        dump_mode: dump mode. It can be 'overwrite' or 'append'.
+    """
+    if not jsonl_file:
+        raise ValueError('output file must be provided.')
+
+    jsonl_file = os.path.expanduser(jsonl_file)
+
+    if not isinstance(data_list, list):
+        data_list = [data_list]
+
+    if dump_mode == DumpMode.OVERWRITE:
+        dump_mode = 'w'
+    elif dump_mode == DumpMode.APPEND:
+        dump_mode = 'a'
+    with jsonl.open(jsonl_file, mode=dump_mode) as writer:
+        writer.write_all(data_list)
+
+
+def jsonl_to_csv():
+    pass
+
+
+def yaml_to_dict(yaml_file) -> dict:
+    """
+    Read yaml file to dict.
+    """
+    with open(yaml_file, 'r') as f:
+        try:
+            stream = yaml.safe_load(f)
+        except yaml.YAMLError as e:
+            logger.error(f'{e}')
+            raise e
+
+    return stream
+
+
+def dict_to_yaml(d: dict, yaml_file: str):
+    """
+    Dump dict to yaml file.
+    """
+    with open(yaml_file, 'w') as f:
+        yaml.dump(d, f, default_flow_style=False)
+
+
+def json_to_dict(json_file) -> dict:
+    """
+    Read json file to dict.
+    """
+    with open(json_file, 'r') as f:
+        try:
+            stream = json.load(f)
+        except json.JSONDecodeError as e:
+            logger.error(f'{e}')
+            raise e
+
+    return stream
+
+
+def are_paths_same(path1, path2):
+    """
+    Check if two paths are the same.
+    """
+    real_path1 = os.path.realpath(os.path.abspath(os.path.expanduser(path1)))
+    real_path2 = os.path.realpath(os.path.abspath(os.path.expanduser(path2)))
+
+    return real_path1 == real_path2
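
The new module centralizes file I/O helpers (much of which appears to have moved out of `evalscope/utils/utils.py`, which shrinks by 306 lines in this release). A short usage sketch with hypothetical paths; note that output directories are created lazily, on first property access:

```python
# Hypothetical sketch of the new io_utils helpers.
from evalscope.constants import DumpMode
from evalscope.utils.io_utils import OutputsStructure, dump_jsonl_data, jsonl_to_list

outputs = OutputsStructure(outputs_dir='./outputs/eval_demo')
preds_file = f'{outputs.predictions_dir}/preds.jsonl'  # creates .../predictions here

# jsonl round trip: overwrite with a list, then append a single record.
dump_jsonl_data([{'id': 1, 'answer': 'A'}], preds_file, dump_mode=DumpMode.OVERWRITE)
dump_jsonl_data({'id': 2, 'answer': 'B'}, preds_file, dump_mode=DumpMode.APPEND)
assert len(jsonl_to_list(preds_file)) == 2
```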
evalscope/utils/logger.py CHANGED
@@ -1,5 +1,6 @@
 import importlib.util as iutil
 import logging
+import os
 from typing import Optional
 
 init_loggers = {}
@@ -9,11 +10,12 @@ simple_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
 
 detailed_formatter = logging.Formatter(detailed_format)
 simple_formatter = logging.Formatter(simple_format)
+DEFAULT_LEVEL = logging.DEBUG if os.getenv('LOG_LEVEL', 'INFO') == 'DEBUG' else logging.INFO
 
-logging.basicConfig(format=simple_format, level=logging.INFO)
+logging.basicConfig(format=simple_format, level=DEFAULT_LEVEL)
 
 
-def get_logger(log_file: Optional[str] = None, log_level: int = logging.INFO, file_mode: str = 'w'):
+def get_logger(log_file: Optional[str] = None, log_level: int = DEFAULT_LEVEL, file_mode: str = 'w', force=False):
     """Get logging logger
 
     Args:
@@ -29,12 +31,12 @@ def get_logger(log_file: Optional[str] = None, log_level: int = logging.INFO, fi
     logger.propagate = False
 
     if logger_name in init_loggers:
-        if
+        if force:
             logger.setLevel(log_level)
-
-
-
-
+            for handler in logger.handlers:
+                handler.setLevel(log_level)
+                handler.setFormatter(detailed_formatter if log_level == logging.DEBUG else simple_formatter)
+        add_file_handler_if_needed(logger, log_file, file_mode, log_level)
         return logger
 
     # handle duplicate logs to the console
@@ -73,6 +75,14 @@ def get_logger(log_file: Optional[str] = None, log_level: int = logging.INFO, fi
     return logger
 
 
+def configure_logging(debug: bool, log_file: Optional[str] = None):
+    """Configure logging level based on the debug flag."""
+    if log_file:
+        get_logger(log_file=log_file, force=True)
+    if debug:
+        get_logger(log_level=logging.DEBUG, force=True)
+
+
 def add_file_handler_if_needed(logger, log_file, file_mode, log_level):
     for handler in logger.handlers:
         if isinstance(handler, logging.FileHandler):