evalscope 0.7.2__py3-none-any.whl → 0.8.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/__init__.py +1 -1
- evalscope/arguments.py +73 -0
- evalscope/backend/base.py +6 -2
- evalscope/backend/opencompass/api_meta_template.py +8 -14
- evalscope/backend/opencompass/backend_manager.py +24 -15
- evalscope/backend/opencompass/tasks/eval_api.py +1 -6
- evalscope/backend/opencompass/tasks/eval_datasets.py +26 -28
- evalscope/backend/rag_eval/__init__.py +3 -3
- evalscope/backend/rag_eval/backend_manager.py +21 -25
- evalscope/backend/rag_eval/clip_benchmark/__init__.py +1 -1
- evalscope/backend/rag_eval/clip_benchmark/arguments.py +6 -6
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +62 -79
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +29 -43
- evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +20 -22
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +16 -23
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +14 -35
- evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +69 -90
- evalscope/backend/rag_eval/cmteb/__init__.py +3 -3
- evalscope/backend/rag_eval/cmteb/arguments.py +25 -27
- evalscope/backend/rag_eval/cmteb/base.py +22 -23
- evalscope/backend/rag_eval/cmteb/task_template.py +15 -17
- evalscope/backend/rag_eval/cmteb/tasks/Classification.py +98 -79
- evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +17 -22
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +17 -19
- evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +35 -29
- evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +18 -5
- evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +163 -163
- evalscope/backend/rag_eval/cmteb/tasks/STS.py +126 -104
- evalscope/backend/rag_eval/cmteb/tasks/__init__.py +33 -34
- evalscope/backend/rag_eval/ragas/__init__.py +2 -2
- evalscope/backend/rag_eval/ragas/arguments.py +3 -8
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +9 -9
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +7 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +8 -8
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +7 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +21 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +4 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -1
- evalscope/backend/rag_eval/ragas/task_template.py +10 -15
- evalscope/backend/rag_eval/ragas/tasks/__init__.py +1 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +45 -0
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +135 -0
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +17 -133
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +8 -18
- evalscope/backend/rag_eval/utils/clip.py +47 -51
- evalscope/backend/rag_eval/utils/embedding.py +13 -12
- evalscope/backend/rag_eval/utils/llm.py +8 -6
- evalscope/backend/rag_eval/utils/tools.py +12 -11
- evalscope/backend/vlm_eval_kit/__init__.py +1 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +7 -8
- evalscope/benchmarks/arc/__init__.py +3 -2
- evalscope/benchmarks/arc/ai2_arc.py +19 -16
- evalscope/benchmarks/arc/arc_adapter.py +32 -24
- evalscope/benchmarks/bbh/__init__.py +1 -2
- evalscope/benchmarks/bbh/bbh_adapter.py +28 -25
- evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/navigate.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/snarks.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +1 -1
- evalscope/benchmarks/benchmark.py +16 -16
- evalscope/benchmarks/ceval/__init__.py +3 -2
- evalscope/benchmarks/ceval/ceval_adapter.py +80 -69
- evalscope/benchmarks/ceval/ceval_exam.py +18 -31
- evalscope/benchmarks/cmmlu/__init__.py +3 -2
- evalscope/benchmarks/cmmlu/cmmlu.py +87 -92
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +109 -155
- evalscope/benchmarks/cmmlu/samples.jsonl +1 -1
- evalscope/benchmarks/competition_math/__init__.py +3 -2
- evalscope/benchmarks/competition_math/competition_math.py +7 -16
- evalscope/benchmarks/competition_math/competition_math_adapter.py +32 -34
- evalscope/benchmarks/data_adapter.py +24 -24
- evalscope/benchmarks/general_qa/__init__.py +3 -2
- evalscope/benchmarks/general_qa/general_qa_adapter.py +35 -39
- evalscope/benchmarks/gsm8k/__init__.py +1 -1
- evalscope/benchmarks/gsm8k/gsm8k.py +6 -12
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +27 -24
- evalscope/benchmarks/hellaswag/__init__.py +3 -2
- evalscope/benchmarks/hellaswag/hellaswag.py +15 -19
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +28 -23
- evalscope/benchmarks/humaneval/__init__.py +1 -1
- evalscope/benchmarks/humaneval/humaneval.py +15 -18
- evalscope/benchmarks/humaneval/humaneval_adapter.py +192 -7
- evalscope/benchmarks/mmlu/__init__.py +3 -2
- evalscope/benchmarks/mmlu/mmlu.py +15 -29
- evalscope/benchmarks/mmlu/mmlu_adapter.py +85 -77
- evalscope/benchmarks/race/__init__.py +3 -2
- evalscope/benchmarks/race/race.py +21 -35
- evalscope/benchmarks/race/race_adapter.py +33 -29
- evalscope/benchmarks/race/samples.jsonl +1 -1
- evalscope/benchmarks/trivia_qa/__init__.py +3 -2
- evalscope/benchmarks/trivia_qa/samples.jsonl +1 -1
- evalscope/benchmarks/trivia_qa/trivia_qa.py +19 -34
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +27 -22
- evalscope/benchmarks/truthful_qa/__init__.py +3 -2
- evalscope/benchmarks/truthful_qa/truthful_qa.py +25 -29
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +36 -37
- evalscope/cli/cli.py +6 -5
- evalscope/cli/start_eval.py +31 -0
- evalscope/cli/start_perf.py +0 -3
- evalscope/cli/start_server.py +27 -41
- evalscope/config.py +154 -96
- evalscope/constants.py +50 -32
- evalscope/evaluator/evaluator.py +97 -377
- evalscope/evaluator/rating_eval.py +12 -33
- evalscope/evaluator/reviewer/auto_reviewer.py +48 -76
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +10 -20
- evalscope/metrics/code_metric.py +3 -9
- evalscope/metrics/math_accuracy.py +3 -6
- evalscope/metrics/metrics.py +21 -21
- evalscope/metrics/rouge_metric.py +11 -25
- evalscope/models/__init__.py +1 -2
- evalscope/models/api/openai_api.py +40 -29
- evalscope/models/custom/__init__.py +0 -1
- evalscope/models/custom/custom_model.py +3 -3
- evalscope/models/dummy_chat_model.py +7 -8
- evalscope/models/model_adapter.py +89 -156
- evalscope/models/openai_model.py +20 -20
- evalscope/perf/arguments.py +16 -3
- evalscope/perf/benchmark.py +9 -11
- evalscope/perf/http_client.py +3 -8
- evalscope/perf/main.py +8 -1
- evalscope/perf/plugin/api/custom_api.py +1 -2
- evalscope/perf/plugin/api/dashscope_api.py +1 -2
- evalscope/perf/plugin/api/openai_api.py +3 -4
- evalscope/perf/plugin/datasets/base.py +1 -2
- evalscope/perf/plugin/datasets/flickr8k.py +1 -2
- evalscope/perf/plugin/datasets/longalpaca.py +1 -2
- evalscope/perf/plugin/datasets/openqa.py +1 -2
- evalscope/perf/plugin/registry.py +3 -3
- evalscope/perf/utils/analysis_result.py +1 -2
- evalscope/perf/utils/benchmark_util.py +5 -6
- evalscope/perf/utils/db_util.py +77 -30
- evalscope/perf/utils/local_server.py +21 -13
- evalscope/registry/config/cfg_arena_zhihu.yaml +1 -1
- evalscope/registry/tasks/arc.yaml +2 -3
- evalscope/registry/tasks/bbh.yaml +3 -4
- evalscope/registry/tasks/bbh_mini.yaml +3 -4
- evalscope/registry/tasks/ceval.yaml +3 -3
- evalscope/registry/tasks/ceval_mini.yaml +3 -4
- evalscope/registry/tasks/cmmlu.yaml +3 -3
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +1 -1
- evalscope/registry/tasks/general_qa.yaml +1 -1
- evalscope/registry/tasks/gsm8k.yaml +2 -2
- evalscope/registry/tasks/mmlu.yaml +3 -3
- evalscope/registry/tasks/mmlu_mini.yaml +3 -3
- evalscope/run.py +153 -381
- evalscope/run_arena.py +21 -25
- evalscope/summarizer.py +27 -40
- evalscope/third_party/longbench_write/README.md +99 -42
- evalscope/third_party/longbench_write/default_task.json +1 -1
- evalscope/third_party/longbench_write/default_task.yaml +8 -7
- evalscope/third_party/longbench_write/eval.py +29 -27
- evalscope/third_party/longbench_write/infer.py +16 -104
- evalscope/third_party/longbench_write/longbench_write.py +5 -4
- evalscope/third_party/longbench_write/resources/judge.txt +1 -1
- evalscope/third_party/longbench_write/tools/data_etl.py +5 -6
- evalscope/third_party/longbench_write/utils.py +0 -1
- evalscope/third_party/toolbench_static/eval.py +14 -15
- evalscope/third_party/toolbench_static/infer.py +48 -69
- evalscope/third_party/toolbench_static/llm/swift_infer.py +4 -12
- evalscope/third_party/toolbench_static/requirements.txt +1 -1
- evalscope/third_party/toolbench_static/toolbench_static.py +4 -3
- evalscope/tools/combine_reports.py +27 -34
- evalscope/tools/rewrite_eval_results.py +15 -47
- evalscope/utils/__init__.py +1 -1
- evalscope/utils/arena_utils.py +18 -48
- evalscope/{perf/utils → utils}/chat_service.py +4 -5
- evalscope/utils/completion_parsers.py +3 -8
- evalscope/utils/io_utils.py +162 -0
- evalscope/utils/logger.py +17 -7
- evalscope/utils/model_utils.py +11 -0
- evalscope/utils/utils.py +5 -306
- evalscope/version.py +2 -2
- {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/METADATA +123 -118
- evalscope-0.8.1.dist-info/RECORD +285 -0
- tests/cli/test_run.py +53 -15
- tests/perf/test_perf.py +6 -1
- tests/rag/test_clip_benchmark.py +38 -38
- tests/rag/test_mteb.py +3 -2
- tests/rag/test_ragas.py +5 -5
- tests/swift/test_run_swift_eval.py +2 -3
- tests/swift/test_run_swift_vlm_eval.py +2 -3
- tests/swift/test_run_swift_vlm_jugde_eval.py +2 -3
- tests/vlm/test_vlmeval.py +3 -2
- evalscope/backend/rag_eval/ragas/metrics/__init__.py +0 -2
- evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py +0 -91
- evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py +0 -99
- evalscope/cache.py +0 -98
- evalscope/models/template.py +0 -1446
- evalscope/run_ms.py +0 -140
- evalscope/utils/task_cfg_parser.py +0 -10
- evalscope/utils/task_utils.py +0 -22
- evalscope-0.7.2.dist-info/RECORD +0 -286
- {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/LICENSE +0 -0
- {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/WHEEL +0 -0
- {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/top_level.txt +0 -0
evalscope/utils/model_utils.py
ADDED

```diff
@@ -0,0 +1,11 @@
+from transformers import GenerationConfig
+
+
+def fix_do_sample_warning(generation_config: GenerationConfig) -> None:
+    # Use the default values of temperature/top_p/top_k in generation_config.
+    if generation_config.temperature == 0:
+        generation_config.do_sample = False
+    if generation_config.do_sample is False:
+        generation_config.temperature = 1.
+        generation_config.top_p = 1.
+        generation_config.top_k = 50
```
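A minimal usage sketch of the new helper. Both the file attribution above and the import path below are inferred from the `evalscope/utils/model_utils.py +11 -0` entry in the file list (the only new 11-line module), so treat them as assumptions:

```python
from transformers import GenerationConfig

from evalscope.utils.model_utils import fix_do_sample_warning  # assumed path

# temperature == 0 is treated as a request for greedy decoding: do_sample is
# switched off, then the sampling knobs are reset to neutral defaults so
# transformers stops warning that temperature/top_p are set while
# do_sample is False.
config = GenerationConfig(do_sample=True, temperature=0.0, top_p=0.9, top_k=20)
fix_do_sample_warning(config)
print(config.do_sample, config.temperature, config.top_p, config.top_k)
# -> False 1.0 1.0 50
```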
evalscope/utils/utils.py
CHANGED
```diff
@@ -5,20 +5,13 @@ import functools
 import hashlib
 import importlib
 import importlib.util
+import numpy as np
 import os
 import random
 import re
-import sys
-from typing import Any, Dict, List, Tuple, Union
-
-import json
-import jsonlines as jsonl
-import numpy as np
 import torch
-import torch.nn.functional as F
-import yaml
+from typing import Any, Dict, List, Tuple, Union
 
-from evalscope.constants import DumpMode, OutputsStructure
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
```
```diff
@@ -37,101 +30,6 @@ def test_level_list():
     return TEST_LEVEL_LIST
 
 
-def jsonl_to_list(jsonl_file):
-    """
-    Read jsonl file to list.
-
-    Args:
-        jsonl_file: jsonl file path.
-
-    Returns:
-        list: list of lines. Each line is a dict.
-    """
-    res_list = []
-    with jsonl.open(jsonl_file, mode='r') as reader:
-        for line in reader.iter(type=dict, allow_none=True, skip_invalid=False):
-            res_list.append(line)
-    return res_list
-
-
-def jsonl_to_reader(jsonl_file):
-    """
-    Read jsonl file to reader object.
-
-    Args:
-        jsonl_file: jsonl file path.
-
-    Returns:
-        reader: jsonl reader object.
-    """
-    with jsonl.open(jsonl_file, mode='r') as reader:
-        return reader
-
-
-def jsonl_to_csv():
-    pass
-
-
-def dump_jsonl_data(data_list, jsonl_file, dump_mode=DumpMode.OVERWRITE):
-    """
-    Dump data to jsonl file.
-
-    Args:
-        data_list: data list to be dumped. [{'a': 'aaa'}, ...]
-        jsonl_file: jsonl file path.
-        dump_mode: dump mode. It can be 'overwrite' or 'append'.
-    """
-    if not jsonl_file:
-        raise ValueError('output file must be provided.')
-
-    jsonl_file = os.path.expanduser(jsonl_file)
-
-    if dump_mode == DumpMode.OVERWRITE:
-        dump_mode = 'w'
-    elif dump_mode == DumpMode.APPEND:
-        dump_mode = 'a'
-    with jsonl.open(jsonl_file, mode=dump_mode) as writer:
-        writer.write_all(data_list)
-    logger.info(f'Dump data to {jsonl_file} successfully.')
-
-
-def yaml_to_dict(yaml_file) -> dict:
-    """
-    Read yaml file to dict.
-    """
-    with open(yaml_file, 'r') as f:
-        try:
-            stream = yaml.safe_load(f)
-        except yaml.YAMLError as e:
-            logger.error(f'{e}')
-            raise e
-
-    return stream
-
-
-def dict_to_yaml(d: dict, yaml_file: str):
-    """
-    Dump dict to yaml file.
-    """
-    with open(yaml_file, 'w') as f:
-        yaml.dump(d, f, default_flow_style=False)
-    logger.info(f'Dump data to {yaml_file} successfully.')
-
-
-def json_to_dict(json_file) -> dict:
-    """
-    Read json file to dict.
-    """
-    with open(json_file, 'r') as f:
-        try:
-            stream = json.load(f)
-        except json.JSONDecodeError as e:
-            logger.error(f'{e}')
-            raise e
-
-    return stream
-
-
 def get_obj_from_cfg(eval_class_ref: Any, *args, **kwargs) -> Any:
     module_name, spliter, cls_name = eval_class_ref.partition(':')
 
```
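All of the jsonl/yaml/json helpers leave `utils.py` here, which matches the dropped `jsonlines`/`yaml`/`json` imports in the first hunk. Given the new `evalscope/utils/io_utils.py` (+162 lines) in the file list, the likely migration for downstream imports is sketched below; the assumption that the functions kept their names in `io_utils` is not verifiable from this diff alone:

```python
# Hedged migration sketch: assumes jsonl_to_list/dump_jsonl_data/yaml_to_dict
# moved to evalscope.utils.io_utils with their names unchanged.
# 0.7.2:
#   from evalscope.utils.utils import dump_jsonl_data, jsonl_to_list, yaml_to_dict
# 0.8.1:
from evalscope.utils.io_utils import dump_jsonl_data, jsonl_to_list, yaml_to_dict

dump_jsonl_data([{'query': 'hi', 'answer': 'hello'}], 'demo.jsonl')
print(jsonl_to_list('demo.jsonl'))  # -> [{'query': 'hi', 'answer': 'hello'}]
```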
```diff
@@ -148,25 +46,13 @@ def get_obj_from_cfg(eval_class_ref: Any, *args, **kwargs) -> Any:
     return functools.partial(obj_cls, *args, **kwargs)
 
 
-def markdown_table(header_l, data_l):
-    md_str = f'| {" | ".join(header_l)} |'
-    md_str += f'\n| {" | ".join(["---"] * len(header_l))} |'
-    for data in data_l:
-        if isinstance(data, str):
-            data = [data]
-        assert len(data) <= len(header_l)
-        tmp = data + [''] * (len(header_l) - len(data))
-        md_str += f'\n| {" | ".join(tmp)} |'
-    return md_str
-
-
 def random_seeded_choice(seed: Union[int, str, float], choices, **kwargs):
     """Random choice with a (potentially string) seed."""
     return random.Random(seed).choices(choices, k=1, **kwargs)[0]
 
 
-def gen_hash(name: str):
-    return hashlib.md5(name.encode(encoding='UTF-8')).hexdigest()
+def gen_hash(name: str, bits: int = 32):
+    return hashlib.md5(name.encode(encoding='UTF-8')).hexdigest()[:bits]
 
 
 def dict_torch_dtype_to_str(d: Dict[str, Any]) -> dict:
```
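The functional change in this hunk is `gen_hash` gaining a `bits` parameter. With the default of 32 it still returns the full 32-character MD5 hex digest, so existing call sites behave identically; note the parameter counts hex characters rather than bits. A standalone check:

```python
import hashlib

def gen_hash(name: str, bits: int = 32):
    return hashlib.md5(name.encode(encoding='UTF-8')).hexdigest()[:bits]

# The default keeps the full digest, so 0.7.2 callers are unaffected.
assert gen_hash('evalscope') == hashlib.md5(b'evalscope').hexdigest()
# Smaller values truncate, e.g. an 8-character key for shorter file names.
print(gen_hash('evalscope', bits=8))
```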
```diff
@@ -312,52 +198,6 @@ class ResponseParser:
         return ''
 
 
-def make_outputs_dir(root_dir: str, datasets: list, model_id: str, model_revision: str):
-    # model_revision = model_revision if model_revision is not None else 'none'
-    # now = datetime.datetime.now()
-    # format_time = now.strftime('%Y%m%d_%H%M%S')
-    # outputs_name = format_time + '_' + 'default' + '_' + model_id.replace('/', '_') + '_' + model_revision
-    # outputs_dir = os.path.join(work_dir, outputs_name)
-    # dataset_name = dataset_id.replace('/', '_')
-    # outputs_dir = os.path.join(work_dir, dataset_name)
-
-    if not model_id:
-        model_id = 'default'
-    model_id = model_id.replace('/', '_')
-
-    if not model_revision:
-        model_revision = 'default'
-
-    outputs_dir = os.path.join(root_dir, f"eval_{'-'.join(datasets)}_{model_id}_{model_revision}")
-
-    return outputs_dir
-
-
-def process_outputs_structure(outputs_dir: str, is_make: bool = True) -> dict:
-    logs_dir = os.path.join(outputs_dir, 'logs')
-    predictions_dir = os.path.join(outputs_dir, 'predictions')
-    reviews_dir = os.path.join(outputs_dir, 'reviews')
-    reports_dir = os.path.join(outputs_dir, 'reports')
-    configs_dir = os.path.join(outputs_dir, 'configs')
-
-    if is_make:
-        os.makedirs(outputs_dir, exist_ok=True)
-        os.makedirs(logs_dir, exist_ok=True)
-        os.makedirs(predictions_dir, exist_ok=True)
-        os.makedirs(reviews_dir, exist_ok=True)
-        os.makedirs(reports_dir, exist_ok=True)
-        os.makedirs(configs_dir, exist_ok=True)
-
-    outputs_structure = {
-        OutputsStructure.LOGS_DIR: logs_dir,
-        OutputsStructure.PREDICTIONS_DIR: predictions_dir,
-        OutputsStructure.REVIEWS_DIR: reviews_dir,
-        OutputsStructure.REPORTS_DIR: reports_dir,
-        OutputsStructure.CONFIGS_DIR: configs_dir,
-    }
-
-    return outputs_structure
-
 
 def import_module_util(import_path_prefix: str, module_name: str, members_to_import: list) -> dict:
     """
```
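With `make_outputs_dir` and `process_outputs_structure` gone (and the `DumpMode`/`OutputsStructure` constants import removed in the first hunk), output-directory handling presumably moves into the heavily rewritten `config.py`/`run.py`. For reference, the removed naming scheme reduced to the trace below (inputs illustrative):

```python
import os

# Trace of the removed make_outputs_dir naming scheme (inputs illustrative).
root_dir, datasets = 'outputs', ['arc', 'gsm8k']
model_id = 'qwen/Qwen2-7B-Instruct'.replace('/', '_')
model_revision = 'master'
print(os.path.join(root_dir, f"eval_{'-'.join(datasets)}_{model_id}_{model_revision}"))
# -> outputs/eval_arc-gsm8k_qwen_Qwen2-7B-Instruct_master
```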
````diff
@@ -401,148 +241,6 @@ def normalize_score(score: Union[float, dict], keep_num: int = 4) -> Union[float
     return score
 
 
-def split_str_parts_by(text: str, delimiters: List[str]):
-    """Split the text field into parts.
-
-    Args:
-        text: A text to be split.
-        delimiters: The delimiters.
-
-    Returns:
-        The split text in list of dicts.
-    """
-    all_start_chars = [d[0] for d in delimiters]
-    all_length = [len(d) for d in delimiters]
-
-    text_list = []
-    last_words = ''
-
-    while len(text) > 0:
-        for char_idx, char in enumerate(text):
-            match_index = [idx for idx, start_char in enumerate(all_start_chars) if start_char == char]
-            is_delimiter = False
-            for index in match_index:
-                if text[char_idx:char_idx + all_length[index]] == delimiters[index]:
-                    if last_words:
-                        if text_list:
-                            text_list[-1]['content'] = last_words
-                        else:
-                            text_list.append({'key': '', 'content': last_words})
-                    last_words = ''
-                    text_list.append({'key': delimiters[index]})
-                    text = text[char_idx + all_length[index]:]
-                    is_delimiter = True
-                    break
-            if not is_delimiter:
-                last_words += char
-            else:
-                break
-        if last_words == text:
-            text = ''
-
-    text_list[-1]['content'] = last_words
-    return text_list
-
-
-def calculate_loss_scale(response: str, use_loss_scale=False) -> Tuple[List[str], List[float]]:
-    """Calculate the loss scale by splitting the agent response.
-    This algorithm comes from paper: https://arxiv.org/pdf/2309.00986.pdf
-    Agent response format:
-    ```text
-    Thought: you should always think about what to do
-    Action: the action to take, should be one of the above tools[fire_recognition,
-        fire_alert, call_police, call_fireman]
-    Action Input: the input to the action
-    Observation: the result of the action
-    ... (this Thought/Action/Action Input/Observation can be repeated zero or more times)
-    Thought: I now know the final answer
-    Final Answer: the final answer to the original input question
-    ```
-    Args:
-        response: The response text
-        use_loss_scale: Use weighted loss. With this, some part of the loss will be enhanced to improve performance.
-    Returns:
-        A tuple of agent response parts and their weights.
-    """
-    if 'Action:' in response and 'Observation:' in response and use_loss_scale:
-        agent_keyword = ['Action:', 'Action Input:', 'Thought:', 'Final Answer:', 'Observation:']
-        agent_parts = split_str_parts_by(response, agent_keyword)
-        weights = []
-        agent_content = []
-        for c in agent_parts:
-            if c['key'] in ('Action:', 'Action Input:'):
-                weights += [2.0]
-                weights += [2.0]
-            elif c['key'] in ('Thought:', 'Final Answer:', ''):
-                weights += [1.0]
-                weights += [1.0]
-            elif c['key'] in ('Observation:', ):
-                weights += [2.0]
-                weights += [0.0]
-            agent_content.append(c['key'])
-            agent_content.append(c['content'])
-        return agent_content, weights
-    else:
-        return [response], [1.0]
-
-
-def get_bucket_sizes(max_length: int) -> List[int]:
-    return [max_length // 4 * (i + 1) for i in range(4)]
-
-
-def _get_closet_bucket(bucket_sizes, data_length):
-    """Select the one from bucket_sizes that is closest in distance to
-    data_length. This is required for TorchAcc.
-    """
-    cloest_length = sys.maxsize
-    for b in bucket_sizes:
-        if b == data_length or ((b < cloest_length) and (b > data_length)):
-            cloest_length = b
-
-    if cloest_length == sys.maxsize:
-        bucket_sizes.append(data_length)
-        cloest_length = data_length
-
-    return cloest_length
-
-
-def pad_and_split_batch(padding_to, input_ids, attention_mask, labels, loss_scale, max_length, tokenizer, rank,
-                        world_size):
-    if padding_to is None:
-        longest_len = input_ids.shape[-1]
-        bucket_sizes = get_bucket_sizes(max_length)
-        bucket_data_length = _get_closet_bucket(bucket_sizes, longest_len)
-        padding_length = bucket_data_length - input_ids.shape[1]
-        input_ids = F.pad(input_ids, (0, padding_length), 'constant', tokenizer.pad_token_id)
-        attention_mask = F.pad(attention_mask, (0, padding_length), 'constant', 0)
-        if loss_scale:
-            loss_scale = F.pad(loss_scale, (0, padding_length), 'constant', 0.)
-        labels = F.pad(labels, (0, padding_length), 'constant', -100)
-
-    # manully split the batch to different DP rank.
-    batch_size = input_ids.shape[0] // world_size
-    if batch_size > 0:
-        start = rank * batch_size
-        end = (rank + 1) * batch_size
-        input_ids = input_ids[start:end, :]
-        attention_mask = attention_mask[start:end, :]
-        labels = labels[start:end, :]
-        if loss_scale:
-            loss_scale = loss_scale[start:end, :]
-    return input_ids, attention_mask, labels, loss_scale
-
-
-def get_dist_setting() -> Tuple[int, int, int, int]:
-    """return rank, local_rank, world_size, local_world_size"""
-    rank = int(os.getenv('RANK', -1))
-    local_rank = int(os.getenv('LOCAL_RANK', -1))
-    world_size = int(os.getenv('WORLD_SIZE', 1))
-    local_world_size = int(os.getenv('LOCAL_WORLD_SIZE', 1))
-    return rank, local_rank, world_size, local_world_size
-
-
-def use_torchacc() -> bool:
-    return os.getenv('USE_TORCHACC', '0') == '1'
-
-
 def is_module_installed(module_name):
     try:
         importlib.import_module(module_name)
````
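This hunk drops the agent loss-scale utilities (the weighting scheme from arXiv:2309.00986 cited in the docstring) along with the TorchAcc batch helpers. As a reference for what was removed, here is a simplified, self-contained sketch of the weighting behavior, not the deleted implementation itself:

```python
import re
from typing import List, Tuple

# Weights per segment type, mirroring the removed calculate_loss_scale:
# Action/Action Input text is up-weighted, Observation content (tool
# output) is masked out of the loss, everything else stays at 1.0.
KEY_WEIGHT = {'Action:': 2.0, 'Action Input:': 2.0, 'Thought:': 1.0,
              'Final Answer:': 1.0, 'Observation:': 2.0}
CONTENT_WEIGHT = dict(KEY_WEIGHT, **{'Observation:': 0.0})

def loss_scale_sketch(response: str) -> Tuple[List[str], List[float]]:
    pattern = '(' + '|'.join(re.escape(k) for k in KEY_WEIGHT) + ')'
    parts: List[str] = []
    weights: List[float] = []
    key = ''
    for piece in (p for p in re.split(pattern, response) if p):
        parts.append(piece)
        if piece in KEY_WEIGHT:
            key = piece
            weights.append(KEY_WEIGHT[piece])
        else:
            weights.append(CONTENT_WEIGHT.get(key, 1.0))
    return parts, weights

parts, weights = loss_scale_sketch('Thought: check the camera\n'
                                   'Action: fire_recognition\n'
                                   'Action Input: {"image": "cam0.jpg"}\n'
                                   'Observation: no fire detected\n'
                                   'Final Answer: all clear')
print(list(zip(parts, weights)))
```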
```diff
@@ -576,6 +274,7 @@ def get_valid_list(input_list, candidate_list):
 
 def get_latest_folder_path(work_dir):
     from datetime import datetime
+
     # Get all subdirectories in the work_dir
     folders = [f for f in os.listdir(work_dir) if os.path.isdir(os.path.join(work_dir, f))]
 
```
evalscope/version.py
CHANGED