evalscope 0.7.1__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (233)
  1. evalscope/__init__.py +1 -1
  2. evalscope/arguments.py +73 -0
  3. evalscope/backend/base.py +5 -1
  4. evalscope/backend/opencompass/api_meta_template.py +8 -14
  5. evalscope/backend/opencompass/backend_manager.py +24 -15
  6. evalscope/backend/opencompass/tasks/eval_api.py +1 -6
  7. evalscope/backend/opencompass/tasks/eval_datasets.py +26 -28
  8. evalscope/backend/rag_eval/__init__.py +3 -3
  9. evalscope/backend/rag_eval/backend_manager.py +21 -25
  10. evalscope/backend/rag_eval/clip_benchmark/__init__.py +1 -1
  11. evalscope/backend/rag_eval/clip_benchmark/arguments.py +6 -6
  12. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +62 -79
  13. evalscope/backend/rag_eval/clip_benchmark/task_template.py +29 -43
  14. evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +20 -22
  15. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +16 -23
  16. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +14 -35
  17. evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +69 -90
  18. evalscope/backend/rag_eval/cmteb/__init__.py +3 -3
  19. evalscope/backend/rag_eval/cmteb/arguments.py +25 -27
  20. evalscope/backend/rag_eval/cmteb/base.py +22 -23
  21. evalscope/backend/rag_eval/cmteb/task_template.py +15 -17
  22. evalscope/backend/rag_eval/cmteb/tasks/Classification.py +98 -79
  23. evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +17 -22
  24. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +17 -19
  25. evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +35 -29
  26. evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +18 -5
  27. evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +163 -163
  28. evalscope/backend/rag_eval/cmteb/tasks/STS.py +126 -104
  29. evalscope/backend/rag_eval/cmteb/tasks/__init__.py +33 -34
  30. evalscope/backend/rag_eval/ragas/__init__.py +2 -2
  31. evalscope/backend/rag_eval/ragas/arguments.py +3 -8
  32. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +9 -9
  33. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +2 -2
  34. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +3 -3
  35. evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +5 -5
  36. evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +7 -0
  37. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +8 -8
  38. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +5 -5
  39. evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +7 -5
  40. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +2 -2
  41. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
  42. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
  43. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
  44. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
  45. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +2 -2
  46. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +5 -5
  47. evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +3 -3
  48. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +21 -4
  49. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +3 -3
  50. evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +4 -4
  51. evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +2 -2
  52. evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -1
  53. evalscope/backend/rag_eval/ragas/task_template.py +10 -15
  54. evalscope/backend/rag_eval/ragas/tasks/__init__.py +1 -1
  55. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +45 -0
  56. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +135 -0
  57. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +17 -133
  58. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +8 -18
  59. evalscope/backend/rag_eval/utils/clip.py +46 -50
  60. evalscope/backend/rag_eval/utils/embedding.py +12 -11
  61. evalscope/backend/rag_eval/utils/llm.py +8 -6
  62. evalscope/backend/rag_eval/utils/tools.py +12 -11
  63. evalscope/backend/vlm_eval_kit/__init__.py +1 -1
  64. evalscope/backend/vlm_eval_kit/custom_dataset.py +7 -8
  65. evalscope/benchmarks/arc/__init__.py +3 -2
  66. evalscope/benchmarks/arc/ai2_arc.py +19 -16
  67. evalscope/benchmarks/arc/arc_adapter.py +32 -24
  68. evalscope/benchmarks/bbh/__init__.py +1 -2
  69. evalscope/benchmarks/bbh/bbh_adapter.py +28 -25
  70. evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +1 -1
  71. evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +1 -1
  72. evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +1 -1
  73. evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +1 -1
  74. evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +1 -1
  75. evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +1 -1
  76. evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +1 -1
  77. evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +1 -1
  78. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +1 -1
  79. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +1 -1
  80. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +1 -1
  81. evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +1 -1
  82. evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +1 -1
  83. evalscope/benchmarks/bbh/cot_prompts/navigate.txt +1 -1
  84. evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +1 -1
  85. evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +1 -1
  86. evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +1 -1
  87. evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +1 -1
  88. evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +1 -1
  89. evalscope/benchmarks/bbh/cot_prompts/snarks.txt +1 -1
  90. evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +1 -1
  91. evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +1 -1
  92. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +1 -1
  93. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +1 -1
  94. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +1 -1
  95. evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +1 -1
  96. evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +1 -1
  97. evalscope/benchmarks/benchmark.py +16 -16
  98. evalscope/benchmarks/ceval/__init__.py +3 -2
  99. evalscope/benchmarks/ceval/ceval_adapter.py +80 -69
  100. evalscope/benchmarks/ceval/ceval_exam.py +18 -31
  101. evalscope/benchmarks/cmmlu/__init__.py +3 -2
  102. evalscope/benchmarks/cmmlu/cmmlu.py +87 -92
  103. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +109 -155
  104. evalscope/benchmarks/cmmlu/samples.jsonl +1 -1
  105. evalscope/benchmarks/competition_math/__init__.py +3 -2
  106. evalscope/benchmarks/competition_math/competition_math.py +7 -16
  107. evalscope/benchmarks/competition_math/competition_math_adapter.py +32 -34
  108. evalscope/benchmarks/data_adapter.py +24 -24
  109. evalscope/benchmarks/general_qa/__init__.py +3 -2
  110. evalscope/benchmarks/general_qa/general_qa_adapter.py +34 -38
  111. evalscope/benchmarks/gsm8k/__init__.py +1 -1
  112. evalscope/benchmarks/gsm8k/gsm8k.py +6 -12
  113. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +26 -24
  114. evalscope/benchmarks/hellaswag/__init__.py +3 -2
  115. evalscope/benchmarks/hellaswag/hellaswag.py +15 -19
  116. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +27 -23
  117. evalscope/benchmarks/humaneval/__init__.py +1 -1
  118. evalscope/benchmarks/humaneval/humaneval.py +15 -18
  119. evalscope/benchmarks/humaneval/humaneval_adapter.py +0 -1
  120. evalscope/benchmarks/mmlu/__init__.py +3 -2
  121. evalscope/benchmarks/mmlu/mmlu.py +15 -29
  122. evalscope/benchmarks/mmlu/mmlu_adapter.py +85 -77
  123. evalscope/benchmarks/race/__init__.py +3 -2
  124. evalscope/benchmarks/race/race.py +21 -35
  125. evalscope/benchmarks/race/race_adapter.py +32 -29
  126. evalscope/benchmarks/race/samples.jsonl +1 -1
  127. evalscope/benchmarks/trivia_qa/__init__.py +3 -2
  128. evalscope/benchmarks/trivia_qa/samples.jsonl +1 -1
  129. evalscope/benchmarks/trivia_qa/trivia_qa.py +19 -34
  130. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +27 -22
  131. evalscope/benchmarks/truthful_qa/__init__.py +3 -2
  132. evalscope/benchmarks/truthful_qa/truthful_qa.py +25 -29
  133. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +36 -37
  134. evalscope/cli/cli.py +6 -5
  135. evalscope/cli/start_eval.py +31 -0
  136. evalscope/cli/start_perf.py +0 -3
  137. evalscope/cli/start_server.py +27 -41
  138. evalscope/config.py +119 -95
  139. evalscope/constants.py +61 -29
  140. evalscope/evaluator/__init__.py +1 -0
  141. evalscope/evaluator/evaluator.py +96 -377
  142. evalscope/evaluator/humaneval_evaluator.py +158 -0
  143. evalscope/evaluator/rating_eval.py +12 -33
  144. evalscope/evaluator/reviewer/auto_reviewer.py +47 -76
  145. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +10 -20
  146. evalscope/metrics/code_metric.py +3 -9
  147. evalscope/metrics/math_accuracy.py +3 -6
  148. evalscope/metrics/metrics.py +21 -21
  149. evalscope/metrics/rouge_metric.py +11 -25
  150. evalscope/models/__init__.py +1 -2
  151. evalscope/models/api/openai_api.py +40 -29
  152. evalscope/models/custom/__init__.py +0 -1
  153. evalscope/models/custom/custom_model.py +3 -3
  154. evalscope/models/dummy_chat_model.py +7 -8
  155. evalscope/models/model_adapter.py +89 -156
  156. evalscope/models/openai_model.py +20 -20
  157. evalscope/perf/arguments.py +15 -3
  158. evalscope/perf/benchmark.py +7 -9
  159. evalscope/perf/http_client.py +3 -8
  160. evalscope/perf/main.py +10 -0
  161. evalscope/perf/plugin/api/custom_api.py +1 -2
  162. evalscope/perf/plugin/api/dashscope_api.py +1 -2
  163. evalscope/perf/plugin/api/openai_api.py +3 -4
  164. evalscope/perf/plugin/datasets/base.py +1 -2
  165. evalscope/perf/plugin/datasets/flickr8k.py +1 -2
  166. evalscope/perf/plugin/datasets/longalpaca.py +1 -2
  167. evalscope/perf/plugin/datasets/openqa.py +1 -2
  168. evalscope/perf/utils/analysis_result.py +1 -2
  169. evalscope/perf/utils/benchmark_util.py +1 -2
  170. evalscope/perf/utils/db_util.py +11 -8
  171. evalscope/perf/utils/local_server.py +19 -13
  172. evalscope/registry/config/cfg_arena_zhihu.yaml +1 -1
  173. evalscope/registry/tasks/arc.yaml +2 -3
  174. evalscope/registry/tasks/bbh.yaml +3 -4
  175. evalscope/registry/tasks/bbh_mini.yaml +3 -4
  176. evalscope/registry/tasks/ceval.yaml +3 -3
  177. evalscope/registry/tasks/ceval_mini.yaml +3 -4
  178. evalscope/registry/tasks/cmmlu.yaml +3 -3
  179. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +1 -1
  180. evalscope/registry/tasks/general_qa.yaml +1 -1
  181. evalscope/registry/tasks/gsm8k.yaml +2 -2
  182. evalscope/registry/tasks/mmlu.yaml +3 -3
  183. evalscope/registry/tasks/mmlu_mini.yaml +3 -3
  184. evalscope/run.py +184 -375
  185. evalscope/run_arena.py +20 -25
  186. evalscope/summarizer.py +16 -17
  187. evalscope/third_party/longbench_write/README.md +99 -42
  188. evalscope/third_party/longbench_write/default_task.json +1 -1
  189. evalscope/third_party/longbench_write/default_task.yaml +8 -7
  190. evalscope/third_party/longbench_write/eval.py +29 -28
  191. evalscope/third_party/longbench_write/infer.py +16 -104
  192. evalscope/third_party/longbench_write/longbench_write.py +5 -5
  193. evalscope/third_party/longbench_write/resources/judge.txt +1 -1
  194. evalscope/third_party/longbench_write/tools/data_etl.py +4 -5
  195. evalscope/third_party/longbench_write/utils.py +0 -1
  196. evalscope/third_party/toolbench_static/eval.py +14 -15
  197. evalscope/third_party/toolbench_static/infer.py +48 -69
  198. evalscope/third_party/toolbench_static/llm/swift_infer.py +4 -12
  199. evalscope/third_party/toolbench_static/requirements.txt +1 -1
  200. evalscope/third_party/toolbench_static/toolbench_static.py +3 -3
  201. evalscope/tools/combine_reports.py +25 -30
  202. evalscope/tools/rewrite_eval_results.py +14 -46
  203. evalscope/utils/__init__.py +0 -1
  204. evalscope/utils/arena_utils.py +18 -48
  205. evalscope/{perf/utils → utils}/chat_service.py +3 -4
  206. evalscope/utils/completion_parsers.py +3 -8
  207. evalscope/utils/logger.py +9 -7
  208. evalscope/utils/model_utils.py +11 -0
  209. evalscope/utils/utils.py +12 -138
  210. evalscope/version.py +2 -2
  211. {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/METADATA +125 -120
  212. evalscope-0.8.0.dist-info/RECORD +285 -0
  213. tests/cli/test_run.py +54 -15
  214. tests/perf/test_perf.py +4 -0
  215. tests/rag/test_clip_benchmark.py +38 -38
  216. tests/rag/test_mteb.py +3 -2
  217. tests/rag/test_ragas.py +5 -5
  218. tests/swift/test_run_swift_eval.py +2 -3
  219. tests/swift/test_run_swift_vlm_eval.py +2 -3
  220. tests/swift/test_run_swift_vlm_jugde_eval.py +2 -3
  221. evalscope/backend/rag_eval/ragas/metrics/__init__.py +0 -2
  222. evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py +0 -91
  223. evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py +0 -99
  224. evalscope/cache.py +0 -98
  225. evalscope/models/template.py +0 -1446
  226. evalscope/run_ms.py +0 -140
  227. evalscope/utils/task_cfg_parser.py +0 -10
  228. evalscope/utils/task_utils.py +0 -22
  229. evalscope-0.7.1.dist-info/RECORD +0 -286
  230. {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/LICENSE +0 -0
  231. {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/WHEEL +0 -0
  232. {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/entry_points.txt +0 -0
  233. {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/top_level.txt +0 -0
evalscope/metrics/metrics.py

@@ -2,18 +2,17 @@
 # Copyright (c) EleutherAI. and its affiliates.
 # Copyright (c) OpenAI. and its affiliates.
 import itertools
-import math
-from collections.abc import Iterable
-from collections import defaultdict
-from typing import Dict, List, Union
-from nltk.translate.bleu_score import sentence_bleu
-from nltk import word_tokenize
 import jieba
-
+import math
 import numpy as np
+import random
 import sacrebleu
 import sklearn.metrics
-import random
+from collections import defaultdict
+from collections.abc import Iterable
+from nltk import word_tokenize
+from nltk.translate.bleu_score import sentence_bleu
+from typing import Dict, List, Union


 def mean(arr):
@@ -22,12 +21,12 @@ def mean(arr):

 def pop_stddev(arr):
     mu = mean(arr)
-    return math.sqrt(sum([(x - mu) ** 2 for x in arr]) / len(arr))
+    return math.sqrt(sum([(x - mu)**2 for x in arr]) / len(arr))


 def sample_stddev(arr):
     mu = mean(arr)
-    return math.sqrt(sum([(x - mu) ** 2 for x in arr]) / (len(arr) - 1))
+    return math.sqrt(sum([(x - mu)**2 for x in arr]) / (len(arr) - 1))


 def mean_stderr(arr):
@@ -134,13 +133,14 @@ def bleu(items):
     refs, preds = _sacreformat(refs, preds)
     return sacrebleu.corpus_bleu(preds, refs).score

+
 def bleu_ngram_one_sample(predict, reference):
     """
     Calculate BLEU-1, BLEU-2, BLEU-3, and BLEU-4 scores

     Args:
         items: [(ref, pred)]
-
+
     Returns:
         {
             'bleu-1': 0.8,
@@ -150,6 +150,7 @@ def bleu_ngram_one_sample(predict, reference):
         }

     """
+
     def is_contains_chinese(strs):
         for _char in strs:
             if '\u4e00' <= _char <= '\u9fa5':
@@ -230,6 +231,7 @@ def _sacreformat(refs, preds):


 class _bootstrap_internal:
+
     def __init__(self, f, n):
         self.f = f
         self.n = n
@@ -260,11 +262,11 @@ def bootstrap_stderr(f, xs, iters):

     print('bootstrapping for stddev:', f.__name__)
     for bootstrap in tqdm(
-        pool.imap(
-            _bootstrap_internal(f, chunk_size),
-            [(i, xs) for i in range(iters // chunk_size)],
-        ),
-        total=iters // chunk_size,
+            pool.imap(
+                _bootstrap_internal(f, chunk_size),
+                [(i, xs) for i in range(iters // chunk_size)],
+            ),
+            total=iters // chunk_size,
     ):
         # sample w replacement
         res.extend(bootstrap)
@@ -372,11 +374,9 @@ def calculate_arc_accuracy(question_answers: Dict[str, str], predictions: Dict[s
     return score / len(question_answers)


-def calculate_pass_at_k(
-    num_samples: Union[int, List[int], np.ndarray],
-    num_correct: Union[List[int], np.ndarray],
-    k: int = 1
-) -> np.ndarray:
+def calculate_pass_at_k(num_samples: Union[int, List[int], np.ndarray],
+                        num_correct: Union[List[int], np.ndarray],
+                        k: int = 1) -> np.ndarray:
     """
     Estimates pass@k of each problem and returns them in an array.
     Examples:
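
The reflowed calculate_pass_at_k above changes only the signature formatting; the estimator itself stays as documented ("Estimates pass@k of each problem"). For reference, a minimal sketch of the standard unbiased pass@k estimate for a single problem, not copied from the package (the helper name pass_at_k and the sample numbers are illustrative assumptions):

import numpy as np


def pass_at_k(n: int, c: int, k: int) -> float:
    """Unbiased pass@k for one problem: n samples drawn, c of them correct."""
    if n - c < k:
        return 1.0  # any size-k draw must contain at least one correct sample
    # 1 - C(n - c, k) / C(n, k), computed as a numerically stable product
    return 1.0 - float(np.prod(1.0 - k / np.arange(n - c + 1, n + 1)))


# e.g. 10 generations per problem, 3 of them pass the unit tests
print(round(pass_at_k(n=10, c=3, k=1), 4))  # 0.3

The package version vectorizes this over arrays of sample/correct counts, which is why its signature accepts lists or numpy arrays.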
evalscope/metrics/rouge_metric.py

@@ -1,18 +1,16 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.

+import jieba
 import logging
 from collections import defaultdict
 from pathlib import Path
+from rouge_chinese import Rouge
 from statistics import mean
-
 from tqdm import tqdm

 from evalscope.constants import MetricsConstant
 from evalscope.metrics.bundled_rouge_score import rouge_scorer

-from rouge_chinese import Rouge
-import jieba
-

 class DummyTokenizer:

@@ -24,9 +22,7 @@ HERE = Path(__file__).absolute().parent

 logger = logging.getLogger(__name__)

-scorer = rouge_scorer.RougeScorer(
-    ['rouge1', 'rouge2', 'rougeL'], tokenizer=DummyTokenizer()
-)
+scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], tokenizer=DummyTokenizer())
 zh_scorer = Rouge()


@@ -52,11 +48,7 @@ def compute_rouge_score(predict_l, reference_l):
             result[rouge_key].append(one_sample[rouge_key])
     rlt = {}
     for rouge_key in MetricsConstant.ROUGE_KEYS:
-        rlt[rouge_key] = (
-            mean(result[rouge_key]) * 100
-            if rouge_key in result
-            else MetricsConstant.INVALID_VALUE
-        )
+        rlt[rouge_key] = (mean(result[rouge_key]) * 100 if rouge_key in result else MetricsConstant.INVALID_VALUE)
     return rlt


@@ -111,9 +103,9 @@ def _to_table(final_result) -> str:
             if not task:
                 continue
             elif task == 'total':
-                row.append(f'{final_result["total"]["rouge"][rouge_key] :0.2f}')
+                row.append(f'{final_result["total"]["rouge"][rouge_key]:0.2f}')
             else:
-                row.append(f'{final_result["tasks"][task]["rouge"][rouge_key] :0.2f}')
+                row.append(f'{final_result["tasks"][task]["rouge"][rouge_key]:0.2f}')
         table.append('\t'.join(row))

     return '\n'.join(table)
@@ -122,23 +114,17 @@ def _to_table(final_result) -> str:
 def run_rouge_eval(data_l, md_level=2, report_metric_key='rouge-l-f'):
     print(f"{'#' * md_level} Rouge Eval")
     for data in tqdm(data_l):
-        data['rouge'] = compute_rouge_score_one_sample(
-            data['gen_tok_str'], data['reference_tok_str']
-        )
+        data['rouge'] = compute_rouge_score_one_sample(data['gen_tok_str'], data['reference_tok_str'])
     task_data_d = defaultdict(list)
     for data in data_l:
         for task in data['task_tags']:
             task_data_d[task].append(data)

     total_rouge = mean([data['rouge'][report_metric_key] for data in data_l])
-    print(
-        f'[total], count: {len(data_l)}, {report_metric_key}: '
-        f'{total_rouge * 100:0.2f}%'
-    )
+    print(f'[total], count: {len(data_l)}, {report_metric_key}: '
+          f'{total_rouge * 100:0.2f}%')

     for task, task_data in task_data_d.items():
         task_rouge = mean([data['rouge'][report_metric_key] for data in task_data])
-        print(
-            f'[{task}], count: {len(task_data_d[task])}, {report_metric_key}: '
-            f'{task_rouge * 100:0.2f}%'
-        )
+        print(f'[{task}], count: {len(task_data_d[task])}, {report_metric_key}: '
+              f'{task_rouge * 100:0.2f}%')
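
This module pairs two scorers: the bundled rouge_scorer (a vendored copy of Google's rouge_score) with a pass-through DummyTokenizer for pre-tokenized text, and rouge_chinese plus jieba segmentation for Chinese. A minimal standalone sketch of the same pairing, assuming the upstream rouge_score and rouge_chinese packages; the sample sentences are made up:

import jieba
from rouge_chinese import Rouge
from rouge_score import rouge_scorer

# English / whitespace-tokenized text: Google's rouge_score
en_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'])
en = en_scorer.score('the cat sat on the mat', 'the cat is on the mat')
print(en['rougeL'].fmeasure)

# Chinese text: segment with jieba first, then score with rouge_chinese
zh_scorer = Rouge()
hyp = ' '.join(jieba.cut('今天天气很好'))
ref = ' '.join(jieba.cut('今天的天气不错'))
print(zh_scorer.get_scores(hyp, ref)[0]['rouge-l']['f'])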
evalscope/models/__init__.py

@@ -1,4 +1,3 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.

-from evalscope.models.model import BaseModel
-from evalscope.models.model import ChatBaseModel
+from evalscope.models.model import BaseModel, ChatBaseModel
evalscope/models/api/openai_api.py

@@ -1,34 +1,36 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.

 import json
+import requests
 import threading
 import time
 from asyncio import Queue
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from tqdm import tqdm
+from typing import Dict, List, Optional, Union

-import requests
-from typing import Union, List, Optional, Dict
-from concurrent.futures import ThreadPoolExecutor
-from modelscope.utils.logger import get_logger
+from evalscope.utils.logger import get_logger

 logger = get_logger()


 class OpenaiApi:

-    def __init__(self,
-                 model: str,
-                 openai_api_key,
-                 openai_api_base,
-                 logprobs: Optional[bool] = False,
-                 top_logprobs: Optional[int] = None,
-                 max_new_tokens: int = 4096,
-                 temperature: Optional[float] = 0.0,
-                 repetition_penalty: Optional[float] = 1.0,
-                 is_chat: bool = True,
-                 verbose: bool = True,
-                 retry: int = 3,
-                 query_per_second: int = 10,  # TODO
-                 **kwargs):
+    def __init__(
+            self,
+            model: str,
+            openai_api_key,
+            openai_api_base,
+            logprobs: Optional[bool] = False,
+            top_logprobs: Optional[int] = None,
+            max_new_tokens: int = 4096,
+            temperature: Optional[float] = 0.0,
+            repetition_penalty: Optional[float] = 1.0,
+            is_chat: bool = True,
+            verbose: bool = True,
+            retry: int = 3,
+            query_per_second: int = 10,  # TODO
+            **kwargs):

         self.temperature = temperature
         self.repetition_penalty = repetition_penalty
@@ -45,14 +47,17 @@ class OpenaiApi:

         self.token_bucket = TokenBucket(query_per_second, verbose)

-    def generate_simple(self, inputs: Union[List[str]]):
+    def generate_simple(self, inputs: Union[List[str]], num_proc: int = 8):

         def process_one(in_data: str):

             if self.is_chat:
                 data = dict(
                     model=self.model,
-                    messages=[{'role': 'user', 'content': in_data}],
+                    messages=[{
+                        'role': 'user',
+                        'content': in_data
+                    }],
                     max_tokens=self.max_tokens,
                     n=1,
                     logprobs=self.logprobs,
@@ -72,7 +77,10 @@ class OpenaiApi:

             # todo
             openai_api_key = self.openai_api_key or ''
-            header = {'Authorization': f'Bearer ', 'content-type': 'application/json', }
+            header = {
+                'Authorization': f'Bearer {openai_api_key}',
+                'content-type': 'application/json',
+            }
             data = json.dumps(data, ensure_ascii=False)

             if self.verbose:
@@ -91,14 +99,18 @@ class OpenaiApi:
             else:
                 return resp['choices'][0]['text'].strip()

-        with ThreadPoolExecutor() as executor:
-            results = list(executor.map(process_one, inputs))
+        results = []
+        with ThreadPoolExecutor(max_workers=num_proc) as executor:
+            # Submit all tasks
+            future_to_task = {executor.submit(process_one, input_one): input_one for input_one in inputs}
+
+            # Show progress bar
+            for future in tqdm(as_completed(future_to_task), total=len(inputs)):
+                results.append(future.result())

         return results

-    def generate(self,
-                 inputs: Union[List[str], List[List]],
-                 **kwargs) -> List[str]:
+    def generate(self, inputs: Union[List[str], List[List]], **kwargs) -> List[str]:
         """
         Generate responses from OpenAI API.

@@ -160,13 +172,12 @@ class OpenaiApi:

         def remove_none_val(input_d: dict):
             return {k: v for k, v in input_d.items() if v is not None}
+
         data = remove_none_val(data)

         if self.verbose:
             logger.info(f'>> Post data: {json.dumps(data, ensure_ascii=False)}')
-        raw_response = requests.post(self.url,
-                                     headers=header,
-                                     data=json.dumps(data, ensure_ascii=False))
+        raw_response = requests.post(self.url, headers=header, data=json.dumps(data, ensure_ascii=False))

         response = raw_response.json()
         if self.verbose:
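
The reworked generate_simple above collects responses with ThreadPoolExecutor, as_completed, and a tqdm progress bar instead of executor.map. A self-contained sketch of that pattern; fake_request is a stand-in assumption for the real HTTP call made by process_one:

from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm


def fake_request(prompt: str) -> str:
    # stand-in for the per-prompt API request
    return prompt.upper()


prompts = ['hello', 'world', 'foo']
results = []
with ThreadPoolExecutor(max_workers=8) as executor:
    # submit every prompt and remember which future belongs to which input
    future_to_prompt = {executor.submit(fake_request, p): p for p in prompts}
    for future in tqdm(as_completed(future_to_prompt), total=len(prompts)):
        results.append(future.result())  # completion order, not input order

print(results)

Note that as_completed yields futures in completion order, so appended results may not line up with the input order; the future-to-input mapping (future_to_task in the diff) is what lets callers restore input order if that matters.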
evalscope/models/custom/__init__.py

@@ -1,4 +1,3 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.

 from evalscope.models.custom.custom_model import *
-
evalscope/models/custom/custom_model.py

@@ -1,7 +1,7 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-from abc import ABC, abstractmethod
-from typing import Any, Union, Dict, List
 import torch
+from abc import ABC, abstractmethod
+from typing import Any, Dict, List, Union


 class CustomModel(ABC):
@@ -11,7 +11,7 @@ class CustomModel(ABC):
         self.kwargs = kwargs

         if config.get('model_id', None) is None:
-            raise ValueError(f"**Error: model_id is required in config for CustomModel. Got config: {config}")
+            raise ValueError(f'**Error: model_id is required in config for CustomModel. Got config: {config}')

     @abstractmethod
     @torch.no_grad()
evalscope/models/dummy_chat_model.py

@@ -2,6 +2,7 @@

 import random
 import time
+
 from evalscope.models import ChatBaseModel
 from evalscope.utils.logger import get_logger

@@ -32,15 +33,13 @@ class DummyChatModel(ChatBaseModel):

         # Build response
         res = {
-            'choices': [
-                {
-                    'index': 0,
-                    'message': {
-                        'content': choice,
-                        'role': 'assistant'
-                    }
+            'choices': [{
+                'index': 0,
+                'message': {
+                    'content': choice,
+                    'role': 'assistant'
                 }
-            ],
+            }],
             'created': time.time(),
             'model': self.MODEL_ID + '-' + self.REVISION,
             'object': 'chat.completion',