evalscope 0.7.2__py3-none-any.whl → 0.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



Files changed (234)
  1. evalscope/__init__.py +1 -1
  2. evalscope/arguments.py +73 -0
  3. evalscope/backend/base.py +6 -2
  4. evalscope/backend/opencompass/api_meta_template.py +8 -14
  5. evalscope/backend/opencompass/backend_manager.py +24 -15
  6. evalscope/backend/opencompass/tasks/eval_api.py +1 -6
  7. evalscope/backend/opencompass/tasks/eval_datasets.py +26 -28
  8. evalscope/backend/rag_eval/__init__.py +3 -3
  9. evalscope/backend/rag_eval/backend_manager.py +21 -25
  10. evalscope/backend/rag_eval/clip_benchmark/__init__.py +1 -1
  11. evalscope/backend/rag_eval/clip_benchmark/arguments.py +6 -6
  12. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +62 -79
  13. evalscope/backend/rag_eval/clip_benchmark/task_template.py +29 -43
  14. evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +20 -22
  15. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +16 -23
  16. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +14 -35
  17. evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +69 -90
  18. evalscope/backend/rag_eval/cmteb/__init__.py +3 -3
  19. evalscope/backend/rag_eval/cmteb/arguments.py +25 -27
  20. evalscope/backend/rag_eval/cmteb/base.py +22 -23
  21. evalscope/backend/rag_eval/cmteb/task_template.py +15 -17
  22. evalscope/backend/rag_eval/cmteb/tasks/Classification.py +98 -79
  23. evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +17 -22
  24. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +17 -19
  25. evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +35 -29
  26. evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +18 -5
  27. evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +163 -163
  28. evalscope/backend/rag_eval/cmteb/tasks/STS.py +126 -104
  29. evalscope/backend/rag_eval/cmteb/tasks/__init__.py +33 -34
  30. evalscope/backend/rag_eval/ragas/__init__.py +2 -2
  31. evalscope/backend/rag_eval/ragas/arguments.py +3 -8
  32. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +9 -9
  33. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +2 -2
  34. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +3 -3
  35. evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +5 -5
  36. evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +7 -0
  37. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +8 -8
  38. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +5 -5
  39. evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +7 -5
  40. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +2 -2
  41. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
  42. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
  43. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
  44. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
  45. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +2 -2
  46. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +5 -5
  47. evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +3 -3
  48. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +21 -4
  49. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +3 -3
  50. evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +4 -4
  51. evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +2 -2
  52. evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -1
  53. evalscope/backend/rag_eval/ragas/task_template.py +10 -15
  54. evalscope/backend/rag_eval/ragas/tasks/__init__.py +1 -1
  55. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +45 -0
  56. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +135 -0
  57. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +17 -133
  58. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +8 -18
  59. evalscope/backend/rag_eval/utils/clip.py +47 -51
  60. evalscope/backend/rag_eval/utils/embedding.py +13 -12
  61. evalscope/backend/rag_eval/utils/llm.py +8 -6
  62. evalscope/backend/rag_eval/utils/tools.py +12 -11
  63. evalscope/backend/vlm_eval_kit/__init__.py +1 -1
  64. evalscope/backend/vlm_eval_kit/custom_dataset.py +7 -8
  65. evalscope/benchmarks/arc/__init__.py +3 -2
  66. evalscope/benchmarks/arc/ai2_arc.py +19 -16
  67. evalscope/benchmarks/arc/arc_adapter.py +32 -24
  68. evalscope/benchmarks/bbh/__init__.py +1 -2
  69. evalscope/benchmarks/bbh/bbh_adapter.py +28 -25
  70. evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +1 -1
  71. evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +1 -1
  72. evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +1 -1
  73. evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +1 -1
  74. evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +1 -1
  75. evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +1 -1
  76. evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +1 -1
  77. evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +1 -1
  78. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +1 -1
  79. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +1 -1
  80. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +1 -1
  81. evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +1 -1
  82. evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +1 -1
  83. evalscope/benchmarks/bbh/cot_prompts/navigate.txt +1 -1
  84. evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +1 -1
  85. evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +1 -1
  86. evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +1 -1
  87. evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +1 -1
  88. evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +1 -1
  89. evalscope/benchmarks/bbh/cot_prompts/snarks.txt +1 -1
  90. evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +1 -1
  91. evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +1 -1
  92. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +1 -1
  93. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +1 -1
  94. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +1 -1
  95. evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +1 -1
  96. evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +1 -1
  97. evalscope/benchmarks/benchmark.py +16 -16
  98. evalscope/benchmarks/ceval/__init__.py +3 -2
  99. evalscope/benchmarks/ceval/ceval_adapter.py +80 -69
  100. evalscope/benchmarks/ceval/ceval_exam.py +18 -31
  101. evalscope/benchmarks/cmmlu/__init__.py +3 -2
  102. evalscope/benchmarks/cmmlu/cmmlu.py +87 -92
  103. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +109 -155
  104. evalscope/benchmarks/cmmlu/samples.jsonl +1 -1
  105. evalscope/benchmarks/competition_math/__init__.py +3 -2
  106. evalscope/benchmarks/competition_math/competition_math.py +7 -16
  107. evalscope/benchmarks/competition_math/competition_math_adapter.py +32 -34
  108. evalscope/benchmarks/data_adapter.py +24 -24
  109. evalscope/benchmarks/general_qa/__init__.py +3 -2
  110. evalscope/benchmarks/general_qa/general_qa_adapter.py +35 -39
  111. evalscope/benchmarks/gsm8k/__init__.py +1 -1
  112. evalscope/benchmarks/gsm8k/gsm8k.py +6 -12
  113. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +27 -24
  114. evalscope/benchmarks/hellaswag/__init__.py +3 -2
  115. evalscope/benchmarks/hellaswag/hellaswag.py +15 -19
  116. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +28 -23
  117. evalscope/benchmarks/humaneval/__init__.py +1 -1
  118. evalscope/benchmarks/humaneval/humaneval.py +15 -18
  119. evalscope/benchmarks/humaneval/humaneval_adapter.py +192 -7
  120. evalscope/benchmarks/mmlu/__init__.py +3 -2
  121. evalscope/benchmarks/mmlu/mmlu.py +15 -29
  122. evalscope/benchmarks/mmlu/mmlu_adapter.py +85 -77
  123. evalscope/benchmarks/race/__init__.py +3 -2
  124. evalscope/benchmarks/race/race.py +21 -35
  125. evalscope/benchmarks/race/race_adapter.py +33 -29
  126. evalscope/benchmarks/race/samples.jsonl +1 -1
  127. evalscope/benchmarks/trivia_qa/__init__.py +3 -2
  128. evalscope/benchmarks/trivia_qa/samples.jsonl +1 -1
  129. evalscope/benchmarks/trivia_qa/trivia_qa.py +19 -34
  130. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +27 -22
  131. evalscope/benchmarks/truthful_qa/__init__.py +3 -2
  132. evalscope/benchmarks/truthful_qa/truthful_qa.py +25 -29
  133. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +36 -37
  134. evalscope/cli/cli.py +6 -5
  135. evalscope/cli/start_eval.py +31 -0
  136. evalscope/cli/start_perf.py +0 -3
  137. evalscope/cli/start_server.py +27 -41
  138. evalscope/config.py +154 -96
  139. evalscope/constants.py +50 -32
  140. evalscope/evaluator/evaluator.py +97 -377
  141. evalscope/evaluator/rating_eval.py +12 -33
  142. evalscope/evaluator/reviewer/auto_reviewer.py +48 -76
  143. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +10 -20
  144. evalscope/metrics/code_metric.py +3 -9
  145. evalscope/metrics/math_accuracy.py +3 -6
  146. evalscope/metrics/metrics.py +21 -21
  147. evalscope/metrics/rouge_metric.py +11 -25
  148. evalscope/models/__init__.py +1 -2
  149. evalscope/models/api/openai_api.py +40 -29
  150. evalscope/models/custom/__init__.py +0 -1
  151. evalscope/models/custom/custom_model.py +3 -3
  152. evalscope/models/dummy_chat_model.py +7 -8
  153. evalscope/models/model_adapter.py +89 -156
  154. evalscope/models/openai_model.py +20 -20
  155. evalscope/perf/arguments.py +16 -3
  156. evalscope/perf/benchmark.py +9 -11
  157. evalscope/perf/http_client.py +3 -8
  158. evalscope/perf/main.py +8 -1
  159. evalscope/perf/plugin/api/custom_api.py +1 -2
  160. evalscope/perf/plugin/api/dashscope_api.py +1 -2
  161. evalscope/perf/plugin/api/openai_api.py +3 -4
  162. evalscope/perf/plugin/datasets/base.py +1 -2
  163. evalscope/perf/plugin/datasets/flickr8k.py +1 -2
  164. evalscope/perf/plugin/datasets/longalpaca.py +1 -2
  165. evalscope/perf/plugin/datasets/openqa.py +1 -2
  166. evalscope/perf/plugin/registry.py +3 -3
  167. evalscope/perf/utils/analysis_result.py +1 -2
  168. evalscope/perf/utils/benchmark_util.py +5 -6
  169. evalscope/perf/utils/db_util.py +77 -30
  170. evalscope/perf/utils/local_server.py +21 -13
  171. evalscope/registry/config/cfg_arena_zhihu.yaml +1 -1
  172. evalscope/registry/tasks/arc.yaml +2 -3
  173. evalscope/registry/tasks/bbh.yaml +3 -4
  174. evalscope/registry/tasks/bbh_mini.yaml +3 -4
  175. evalscope/registry/tasks/ceval.yaml +3 -3
  176. evalscope/registry/tasks/ceval_mini.yaml +3 -4
  177. evalscope/registry/tasks/cmmlu.yaml +3 -3
  178. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +1 -1
  179. evalscope/registry/tasks/general_qa.yaml +1 -1
  180. evalscope/registry/tasks/gsm8k.yaml +2 -2
  181. evalscope/registry/tasks/mmlu.yaml +3 -3
  182. evalscope/registry/tasks/mmlu_mini.yaml +3 -3
  183. evalscope/run.py +153 -381
  184. evalscope/run_arena.py +21 -25
  185. evalscope/summarizer.py +27 -40
  186. evalscope/third_party/longbench_write/README.md +99 -42
  187. evalscope/third_party/longbench_write/default_task.json +1 -1
  188. evalscope/third_party/longbench_write/default_task.yaml +8 -7
  189. evalscope/third_party/longbench_write/eval.py +29 -27
  190. evalscope/third_party/longbench_write/infer.py +16 -104
  191. evalscope/third_party/longbench_write/longbench_write.py +5 -4
  192. evalscope/third_party/longbench_write/resources/judge.txt +1 -1
  193. evalscope/third_party/longbench_write/tools/data_etl.py +5 -6
  194. evalscope/third_party/longbench_write/utils.py +0 -1
  195. evalscope/third_party/toolbench_static/eval.py +14 -15
  196. evalscope/third_party/toolbench_static/infer.py +48 -69
  197. evalscope/third_party/toolbench_static/llm/swift_infer.py +4 -12
  198. evalscope/third_party/toolbench_static/requirements.txt +1 -1
  199. evalscope/third_party/toolbench_static/toolbench_static.py +4 -3
  200. evalscope/tools/combine_reports.py +27 -34
  201. evalscope/tools/rewrite_eval_results.py +15 -47
  202. evalscope/utils/__init__.py +1 -1
  203. evalscope/utils/arena_utils.py +18 -48
  204. evalscope/{perf/utils → utils}/chat_service.py +4 -5
  205. evalscope/utils/completion_parsers.py +3 -8
  206. evalscope/utils/io_utils.py +162 -0
  207. evalscope/utils/logger.py +17 -7
  208. evalscope/utils/model_utils.py +11 -0
  209. evalscope/utils/utils.py +5 -306
  210. evalscope/version.py +2 -2
  211. {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/METADATA +123 -118
  212. evalscope-0.8.1.dist-info/RECORD +285 -0
  213. tests/cli/test_run.py +53 -15
  214. tests/perf/test_perf.py +6 -1
  215. tests/rag/test_clip_benchmark.py +38 -38
  216. tests/rag/test_mteb.py +3 -2
  217. tests/rag/test_ragas.py +5 -5
  218. tests/swift/test_run_swift_eval.py +2 -3
  219. tests/swift/test_run_swift_vlm_eval.py +2 -3
  220. tests/swift/test_run_swift_vlm_jugde_eval.py +2 -3
  221. tests/vlm/test_vlmeval.py +3 -2
  222. evalscope/backend/rag_eval/ragas/metrics/__init__.py +0 -2
  223. evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py +0 -91
  224. evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py +0 -99
  225. evalscope/cache.py +0 -98
  226. evalscope/models/template.py +0 -1446
  227. evalscope/run_ms.py +0 -140
  228. evalscope/utils/task_cfg_parser.py +0 -10
  229. evalscope/utils/task_utils.py +0 -22
  230. evalscope-0.7.2.dist-info/RECORD +0 -286
  231. {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/LICENSE +0 -0
  232. {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/WHEEL +0 -0
  233. {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/entry_points.txt +0 -0
  234. {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/top_level.txt +0 -0
evalscope/utils/model_utils.py ADDED
@@ -0,0 +1,11 @@
+ from transformers import GenerationConfig
+
+
+ def fix_do_sample_warning(generation_config: GenerationConfig) -> None:
+     # Use the default values of temperature/top_p/top_k in generation_config.
+     if generation_config.temperature == 0:
+         generation_config.do_sample = False
+     if generation_config.do_sample is False:
+         generation_config.temperature = 1.
+         generation_config.top_p = 1.
+         generation_config.top_k = 50
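
The hunk above is the whole of the new module. For context, a minimal usage sketch (the import path is inferred from the `evalscope/utils/model_utils.py +11 -0` entry in the file list; the config values are made up):

```python
from transformers import GenerationConfig

from evalscope.utils.model_utils import fix_do_sample_warning  # path assumed from the file list

# temperature == 0 is treated as a request for greedy decoding: the helper switches
# do_sample off and resets temperature/top_p/top_k to their defaults, which silences
# the transformers warning about sampling parameters being set while do_sample=False.
cfg = GenerationConfig(do_sample=True, temperature=0, top_p=0.8, top_k=20)
fix_do_sample_warning(cfg)
print(cfg.do_sample, cfg.temperature, cfg.top_p, cfg.top_k)  # False 1.0 1.0 50
```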
evalscope/utils/utils.py CHANGED
@@ -5,20 +5,13 @@ import functools
  import hashlib
  import importlib
  import importlib.util
+ import numpy as np
  import os
  import random
  import re
- import sys
- from typing import Any, Dict, List, Tuple, Union
-
- import json
- import jsonlines as jsonl
- import numpy as np
  import torch
- import torch.nn.functional as F
- import yaml
+ from typing import Any, Dict, List, Tuple, Union
 
- from evalscope.constants import DumpMode, OutputsStructure
  from evalscope.utils.logger import get_logger
 
  logger = get_logger()
@@ -37,101 +30,6 @@ def test_level_list():
      return TEST_LEVEL_LIST
 
 
- def jsonl_to_list(jsonl_file):
-     """
-     Read jsonl file to list.
-
-     Args:
-         jsonl_file: jsonl file path.
-
-     Returns:
-         list: list of lines. Each line is a dict.
-     """
-     res_list = []
-     with jsonl.open(jsonl_file, mode='r') as reader:
-         for line in reader.iter(type=dict, allow_none=True, skip_invalid=False):
-             res_list.append(line)
-     return res_list
-
-
- def jsonl_to_reader(jsonl_file):
-     """
-     Read jsonl file to reader object.
-
-     Args:
-         jsonl_file: jsonl file path.
-
-     Returns:
-         reader: jsonl reader object.
-     """
-     with jsonl.open(jsonl_file, mode='r') as reader:
-         return reader
-
-
- def jsonl_to_csv():
-     pass
-
-
- def dump_jsonl_data(data_list, jsonl_file, dump_mode=DumpMode.OVERWRITE):
-     """
-     Dump data to jsonl file.
-
-     Args:
-         data_list: data list to be dumped. [{'a': 'aaa'}, ...]
-         jsonl_file: jsonl file path.
-         dump_mode: dump mode. It can be 'overwrite' or 'append'.
-     """
-     if not jsonl_file:
-         raise ValueError('output file must be provided.')
-
-     jsonl_file = os.path.expanduser(jsonl_file)
-
-     if dump_mode == DumpMode.OVERWRITE:
-         dump_mode = 'w'
-     elif dump_mode == DumpMode.APPEND:
-         dump_mode = 'a'
-     with jsonl.open(jsonl_file, mode=dump_mode) as writer:
-         writer.write_all(data_list)
-     logger.info(f'Dump data to {jsonl_file} successfully.')
-
-
- def yaml_to_dict(yaml_file) -> dict:
-     """
-     Read yaml file to dict.
-     """
-     with open(yaml_file, 'r') as f:
-         try:
-             stream = yaml.safe_load(f)
-         except yaml.YAMLError as e:
-             logger.error(f'{e}')
-             raise e
-
-     return stream
-
-
- def dict_to_yaml(d: dict, yaml_file: str):
-     """
-     Dump dict to yaml file.
-     """
-     with open(yaml_file, 'w') as f:
-         yaml.dump(d, f, default_flow_style=False)
-     logger.info(f'Dump data to {yaml_file} successfully.')
-
-
- def json_to_dict(json_file) -> dict:
-     """
-     Read json file to dict.
-     """
-     with open(json_file, 'r') as f:
-         try:
-             stream = json.load(f)
-         except json.JSONDecodeError as e:
-             logger.error(f'{e}')
-             raise e
-
-     return stream
-
-
  def get_obj_from_cfg(eval_class_ref: Any, *args, **kwargs) -> Any:
      module_name, spliter, cls_name = eval_class_ref.partition(':')
 
@@ -148,25 +46,13 @@ def get_obj_from_cfg(eval_class_ref: Any, *args, **kwargs) -> Any:
      return functools.partial(obj_cls, *args, **kwargs)
 
 
- def markdown_table(header_l, data_l):
-     md_str = f'| {" | ".join(header_l)} |'
-     md_str += f'\n| {" | ".join(["---"] * len(header_l))} |'
-     for data in data_l:
-         if isinstance(data, str):
-             data = [data]
-         assert len(data) <= len(header_l)
-         tmp = data + [''] * (len(header_l) - len(data))
-         md_str += f'\n| {" | ".join(tmp)} |'
-     return md_str
-
-
  def random_seeded_choice(seed: Union[int, str, float], choices, **kwargs):
      """Random choice with a (potentially string) seed."""
      return random.Random(seed).choices(choices, k=1, **kwargs)[0]
 
 
- def gen_hash(name: str):
-     return hashlib.md5(name.encode(encoding='UTF-8')).hexdigest()
+ def gen_hash(name: str, bits: int = 32):
+     return hashlib.md5(name.encode(encoding='UTF-8')).hexdigest()[:bits]
 
 
  def dict_torch_dtype_to_str(d: Dict[str, Any]) -> dict:
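
Two changes in the hunks above are easy to miss. The jsonl/yaml/json helpers removed from this module appear to move to the new `evalscope/utils/io_utils.py` (+162 -0 in the file list), and `gen_hash` gains a `bits` parameter that truncates the MD5 hex digest (despite the name, it counts hex characters, not bits). A minimal sketch of the new behaviour, mirroring the definition in the diff:

```python
import hashlib


def gen_hash(name: str, bits: int = 32):
    # As added in 0.8.1: keep only the first `bits` characters of the 32-character
    # MD5 hex digest; the default of 32 reproduces the old full-digest behaviour.
    return hashlib.md5(name.encode(encoding='UTF-8')).hexdigest()[:bits]


print(gen_hash('evalscope'))          # full 32-character digest
print(gen_hash('evalscope', bits=8))  # first 8 characters only
```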
@@ -312,52 +198,6 @@ class ResponseParser:
          return ''
 
 
- def make_outputs_dir(root_dir: str, datasets: list, model_id: str, model_revision: str):
-     # model_revision = model_revision if model_revision is not None else 'none'
-     # now = datetime.datetime.now()
-     # format_time = now.strftime('%Y%m%d_%H%M%S')
-     # outputs_name = format_time + '_' + 'default' + '_' + model_id.replace('/', '_') + '_' + model_revision
-     # outputs_dir = os.path.join(work_dir, outputs_name)
-     # dataset_name = dataset_id.replace('/', '_')
-     # outputs_dir = os.path.join(work_dir, dataset_name)
-
-     if not model_id:
-         model_id = 'default'
-     model_id = model_id.replace('/', '_')
-
-     if not model_revision:
-         model_revision = 'default'
-
-     outputs_dir = os.path.join(root_dir, f"eval_{'-'.join(datasets)}_{model_id}_{model_revision}")
-
-     return outputs_dir
-
-
- def process_outputs_structure(outputs_dir: str, is_make: bool = True) -> dict:
-     logs_dir = os.path.join(outputs_dir, 'logs')
-     predictions_dir = os.path.join(outputs_dir, 'predictions')
-     reviews_dir = os.path.join(outputs_dir, 'reviews')
-     reports_dir = os.path.join(outputs_dir, 'reports')
-     configs_dir = os.path.join(outputs_dir, 'configs')
-
-     if is_make:
-         os.makedirs(outputs_dir, exist_ok=True)
-         os.makedirs(logs_dir, exist_ok=True)
-         os.makedirs(predictions_dir, exist_ok=True)
-         os.makedirs(reviews_dir, exist_ok=True)
-         os.makedirs(reports_dir, exist_ok=True)
-         os.makedirs(configs_dir, exist_ok=True)
-
-     outputs_structure = {
-         OutputsStructure.LOGS_DIR: logs_dir,
-         OutputsStructure.PREDICTIONS_DIR: predictions_dir,
-         OutputsStructure.REVIEWS_DIR: reviews_dir,
-         OutputsStructure.REPORTS_DIR: reports_dir,
-         OutputsStructure.CONFIGS_DIR: configs_dir,
-     }
-
-     return outputs_structure
-
 
  def import_module_util(import_path_prefix: str, module_name: str, members_to_import: list) -> dict:
      """
@@ -401,148 +241,6 @@ def normalize_score(score: Union[float, dict], keep_num: int = 4) -> Union[float
      return score
 
 
- def split_str_parts_by(text: str, delimiters: List[str]):
-     """Split the text field into parts.
-     Args:
-         text: A text to be split.
-         delimiters: The delimiters.
-     Returns:
-         The split text in list of dicts.
-     """
-     all_start_chars = [d[0] for d in delimiters]
-     all_length = [len(d) for d in delimiters]
-
-     text_list = []
-     last_words = ''
-
-     while len(text) > 0:
-         for char_idx, char in enumerate(text):
-             match_index = [idx for idx, start_char in enumerate(all_start_chars) if start_char == char]
-             is_delimiter = False
-             for index in match_index:
-                 if text[char_idx:char_idx + all_length[index]] == delimiters[index]:
-                     if last_words:
-                         if text_list:
-                             text_list[-1]['content'] = last_words
-                         else:
-                             text_list.append({'key': '', 'content': last_words})
-                     last_words = ''
-                     text_list.append({'key': delimiters[index]})
-                     text = text[char_idx + all_length[index]:]
-                     is_delimiter = True
-                     break
-             if not is_delimiter:
-                 last_words += char
-             else:
-                 break
-         if last_words == text:
-             text = ''
-
-     text_list[-1]['content'] = last_words
-     return text_list
-
-
- def calculate_loss_scale(response: str, use_loss_scale=False) -> Tuple[List[str], List[float]]:
-     """Calculate the loss scale by splitting the agent response.
-     This algorithm comes from paper: https://arxiv.org/pdf/2309.00986.pdf
-     Agent response format:
-     ```text
-     Thought: you should always think about what to do
-     Action: the action to take, should be one of the above tools[fire_recognition,
-         fire_alert, call_police, call_fireman]
-     Action Input: the input to the action
-     Observation: the result of the action
-     ... (this Thought/Action/Action Input/Observation can be repeated zero or more times)
-     Thought: I now know the final answer
-     Final Answer: the final answer to the original input question
-     ```
-     Args:
-         response: The response text
-         use_loss_scale: Use weighted loss. With this, some part of the loss will be enhanced to improve performance.
-     Returns:
-         A tuple of agent response parts and their weights.
-     """
-     if 'Action:' in response and 'Observation:' in response and use_loss_scale:
-         agent_keyword = ['Action:', 'Action Input:', 'Thought:', 'Final Answer:', 'Observation:']
-         agent_parts = split_str_parts_by(response, agent_keyword)
-         weights = []
-         agent_content = []
-         for c in agent_parts:
-             if c['key'] in ('Action:', 'Action Input:'):
-                 weights += [2.0]
-                 weights += [2.0]
-             elif c['key'] in ('Thought:', 'Final Answer:', ''):
-                 weights += [1.0]
-                 weights += [1.0]
-             elif c['key'] in ('Observation:', ):
-                 weights += [2.0]
-                 weights += [0.0]
-             agent_content.append(c['key'])
-             agent_content.append(c['content'])
-         return agent_content, weights
-     else:
-         return [response], [1.0]
-
-
- def get_bucket_sizes(max_length: int) -> List[int]:
-     return [max_length // 4 * (i + 1) for i in range(4)]
-
-
- def _get_closet_bucket(bucket_sizes, data_length):
-     """Select the one from bucket_sizes that is closest in distance to
-     data_length. This is required for TorchAcc.
-     """
-     cloest_length = sys.maxsize
-     for b in bucket_sizes:
-         if b == data_length or ((b < cloest_length) and (b > data_length)):
-             cloest_length = b
-
-     if cloest_length == sys.maxsize:
-         bucket_sizes.append(data_length)
-         cloest_length = data_length
-
-     return cloest_length
-
-
- def pad_and_split_batch(padding_to, input_ids, attention_mask, labels, loss_scale, max_length, tokenizer, rank,
-                         world_size):
-     if padding_to is None:
-         longest_len = input_ids.shape[-1]
-         bucket_sizes = get_bucket_sizes(max_length)
-         bucket_data_length = _get_closet_bucket(bucket_sizes, longest_len)
-         padding_length = bucket_data_length - input_ids.shape[1]
-         input_ids = F.pad(input_ids, (0, padding_length), 'constant', tokenizer.pad_token_id)
-         attention_mask = F.pad(attention_mask, (0, padding_length), 'constant', 0)
-         if loss_scale:
-             loss_scale = F.pad(loss_scale, (0, padding_length), 'constant', 0.)
-         labels = F.pad(labels, (0, padding_length), 'constant', -100)
-
-     # manully split the batch to different DP rank.
-     batch_size = input_ids.shape[0] // world_size
-     if batch_size > 0:
-         start = rank * batch_size
-         end = (rank + 1) * batch_size
-         input_ids = input_ids[start:end, :]
-         attention_mask = attention_mask[start:end, :]
-         labels = labels[start:end, :]
-         if loss_scale:
-             loss_scale = loss_scale[start:end, :]
-     return input_ids, attention_mask, labels, loss_scale
-
-
- def get_dist_setting() -> Tuple[int, int, int, int]:
-     """return rank, local_rank, world_size, local_world_size"""
-     rank = int(os.getenv('RANK', -1))
-     local_rank = int(os.getenv('LOCAL_RANK', -1))
-     world_size = int(os.getenv('WORLD_SIZE', 1))
-     local_world_size = int(os.getenv('LOCAL_WORLD_SIZE', 1))
-     return rank, local_rank, world_size, local_world_size
-
-
- def use_torchacc() -> bool:
-     return os.getenv('USE_TORCHACC', '0') == '1'
-
-
  def is_module_installed(module_name):
      try:
          importlib.import_module(module_name)
@@ -576,6 +274,7 @@ def get_valid_list(input_list, candidate_list):
 
  def get_latest_folder_path(work_dir):
      from datetime import datetime
+
      # Get all subdirectories in the work_dir
      folders = [f for f in os.listdir(work_dir) if os.path.isdir(os.path.join(work_dir, f))]
 
evalscope/version.py CHANGED
@@ -1,4 +1,4 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
 
- __version__ = '0.7.2'
- __release_datetime__ = '2024-12-04 12:00:00'
+ __version__ = '0.8.1'
+ __release_datetime__ = '2024-12-17 20:00:00'
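
If you need to confirm which release is installed, the fields changed above can be read directly from the module (a quick check, assuming a standard install):

```python
# __version__ and __release_datetime__ are defined in evalscope/version.py, as shown above.
from evalscope.version import __release_datetime__, __version__

print(__version__)           # expected: 0.8.1
print(__release_datetime__)  # expected: 2024-12-17 20:00:00
```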