evalscope 0.7.2__py3-none-any.whl → 0.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (234)
  1. evalscope/__init__.py +1 -1
  2. evalscope/arguments.py +73 -0
  3. evalscope/backend/base.py +6 -2
  4. evalscope/backend/opencompass/api_meta_template.py +8 -14
  5. evalscope/backend/opencompass/backend_manager.py +24 -15
  6. evalscope/backend/opencompass/tasks/eval_api.py +1 -6
  7. evalscope/backend/opencompass/tasks/eval_datasets.py +26 -28
  8. evalscope/backend/rag_eval/__init__.py +3 -3
  9. evalscope/backend/rag_eval/backend_manager.py +21 -25
  10. evalscope/backend/rag_eval/clip_benchmark/__init__.py +1 -1
  11. evalscope/backend/rag_eval/clip_benchmark/arguments.py +6 -6
  12. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +62 -79
  13. evalscope/backend/rag_eval/clip_benchmark/task_template.py +29 -43
  14. evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +20 -22
  15. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +16 -23
  16. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +14 -35
  17. evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +69 -90
  18. evalscope/backend/rag_eval/cmteb/__init__.py +3 -3
  19. evalscope/backend/rag_eval/cmteb/arguments.py +25 -27
  20. evalscope/backend/rag_eval/cmteb/base.py +22 -23
  21. evalscope/backend/rag_eval/cmteb/task_template.py +15 -17
  22. evalscope/backend/rag_eval/cmteb/tasks/Classification.py +98 -79
  23. evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +17 -22
  24. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +17 -19
  25. evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +35 -29
  26. evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +18 -5
  27. evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +163 -163
  28. evalscope/backend/rag_eval/cmteb/tasks/STS.py +126 -104
  29. evalscope/backend/rag_eval/cmteb/tasks/__init__.py +33 -34
  30. evalscope/backend/rag_eval/ragas/__init__.py +2 -2
  31. evalscope/backend/rag_eval/ragas/arguments.py +3 -8
  32. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +9 -9
  33. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +2 -2
  34. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +3 -3
  35. evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +5 -5
  36. evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +7 -0
  37. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +8 -8
  38. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +5 -5
  39. evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +7 -5
  40. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +2 -2
  41. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
  42. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
  43. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
  44. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
  45. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +2 -2
  46. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +5 -5
  47. evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +3 -3
  48. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +21 -4
  49. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +3 -3
  50. evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +4 -4
  51. evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +2 -2
  52. evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -1
  53. evalscope/backend/rag_eval/ragas/task_template.py +10 -15
  54. evalscope/backend/rag_eval/ragas/tasks/__init__.py +1 -1
  55. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +45 -0
  56. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +135 -0
  57. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +17 -133
  58. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +8 -18
  59. evalscope/backend/rag_eval/utils/clip.py +47 -51
  60. evalscope/backend/rag_eval/utils/embedding.py +13 -12
  61. evalscope/backend/rag_eval/utils/llm.py +8 -6
  62. evalscope/backend/rag_eval/utils/tools.py +12 -11
  63. evalscope/backend/vlm_eval_kit/__init__.py +1 -1
  64. evalscope/backend/vlm_eval_kit/custom_dataset.py +7 -8
  65. evalscope/benchmarks/arc/__init__.py +3 -2
  66. evalscope/benchmarks/arc/ai2_arc.py +19 -16
  67. evalscope/benchmarks/arc/arc_adapter.py +32 -24
  68. evalscope/benchmarks/bbh/__init__.py +1 -2
  69. evalscope/benchmarks/bbh/bbh_adapter.py +28 -25
  70. evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +1 -1
  71. evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +1 -1
  72. evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +1 -1
  73. evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +1 -1
  74. evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +1 -1
  75. evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +1 -1
  76. evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +1 -1
  77. evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +1 -1
  78. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +1 -1
  79. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +1 -1
  80. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +1 -1
  81. evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +1 -1
  82. evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +1 -1
  83. evalscope/benchmarks/bbh/cot_prompts/navigate.txt +1 -1
  84. evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +1 -1
  85. evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +1 -1
  86. evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +1 -1
  87. evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +1 -1
  88. evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +1 -1
  89. evalscope/benchmarks/bbh/cot_prompts/snarks.txt +1 -1
  90. evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +1 -1
  91. evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +1 -1
  92. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +1 -1
  93. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +1 -1
  94. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +1 -1
  95. evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +1 -1
  96. evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +1 -1
  97. evalscope/benchmarks/benchmark.py +16 -16
  98. evalscope/benchmarks/ceval/__init__.py +3 -2
  99. evalscope/benchmarks/ceval/ceval_adapter.py +80 -69
  100. evalscope/benchmarks/ceval/ceval_exam.py +18 -31
  101. evalscope/benchmarks/cmmlu/__init__.py +3 -2
  102. evalscope/benchmarks/cmmlu/cmmlu.py +87 -92
  103. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +109 -155
  104. evalscope/benchmarks/cmmlu/samples.jsonl +1 -1
  105. evalscope/benchmarks/competition_math/__init__.py +3 -2
  106. evalscope/benchmarks/competition_math/competition_math.py +7 -16
  107. evalscope/benchmarks/competition_math/competition_math_adapter.py +32 -34
  108. evalscope/benchmarks/data_adapter.py +24 -24
  109. evalscope/benchmarks/general_qa/__init__.py +3 -2
  110. evalscope/benchmarks/general_qa/general_qa_adapter.py +35 -39
  111. evalscope/benchmarks/gsm8k/__init__.py +1 -1
  112. evalscope/benchmarks/gsm8k/gsm8k.py +6 -12
  113. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +27 -24
  114. evalscope/benchmarks/hellaswag/__init__.py +3 -2
  115. evalscope/benchmarks/hellaswag/hellaswag.py +15 -19
  116. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +28 -23
  117. evalscope/benchmarks/humaneval/__init__.py +1 -1
  118. evalscope/benchmarks/humaneval/humaneval.py +15 -18
  119. evalscope/benchmarks/humaneval/humaneval_adapter.py +192 -7
  120. evalscope/benchmarks/mmlu/__init__.py +3 -2
  121. evalscope/benchmarks/mmlu/mmlu.py +15 -29
  122. evalscope/benchmarks/mmlu/mmlu_adapter.py +85 -77
  123. evalscope/benchmarks/race/__init__.py +3 -2
  124. evalscope/benchmarks/race/race.py +21 -35
  125. evalscope/benchmarks/race/race_adapter.py +33 -29
  126. evalscope/benchmarks/race/samples.jsonl +1 -1
  127. evalscope/benchmarks/trivia_qa/__init__.py +3 -2
  128. evalscope/benchmarks/trivia_qa/samples.jsonl +1 -1
  129. evalscope/benchmarks/trivia_qa/trivia_qa.py +19 -34
  130. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +27 -22
  131. evalscope/benchmarks/truthful_qa/__init__.py +3 -2
  132. evalscope/benchmarks/truthful_qa/truthful_qa.py +25 -29
  133. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +36 -37
  134. evalscope/cli/cli.py +6 -5
  135. evalscope/cli/start_eval.py +31 -0
  136. evalscope/cli/start_perf.py +0 -3
  137. evalscope/cli/start_server.py +27 -41
  138. evalscope/config.py +154 -96
  139. evalscope/constants.py +50 -32
  140. evalscope/evaluator/evaluator.py +97 -377
  141. evalscope/evaluator/rating_eval.py +12 -33
  142. evalscope/evaluator/reviewer/auto_reviewer.py +48 -76
  143. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +10 -20
  144. evalscope/metrics/code_metric.py +3 -9
  145. evalscope/metrics/math_accuracy.py +3 -6
  146. evalscope/metrics/metrics.py +21 -21
  147. evalscope/metrics/rouge_metric.py +11 -25
  148. evalscope/models/__init__.py +1 -2
  149. evalscope/models/api/openai_api.py +40 -29
  150. evalscope/models/custom/__init__.py +0 -1
  151. evalscope/models/custom/custom_model.py +3 -3
  152. evalscope/models/dummy_chat_model.py +7 -8
  153. evalscope/models/model_adapter.py +89 -156
  154. evalscope/models/openai_model.py +20 -20
  155. evalscope/perf/arguments.py +16 -3
  156. evalscope/perf/benchmark.py +9 -11
  157. evalscope/perf/http_client.py +3 -8
  158. evalscope/perf/main.py +8 -1
  159. evalscope/perf/plugin/api/custom_api.py +1 -2
  160. evalscope/perf/plugin/api/dashscope_api.py +1 -2
  161. evalscope/perf/plugin/api/openai_api.py +3 -4
  162. evalscope/perf/plugin/datasets/base.py +1 -2
  163. evalscope/perf/plugin/datasets/flickr8k.py +1 -2
  164. evalscope/perf/plugin/datasets/longalpaca.py +1 -2
  165. evalscope/perf/plugin/datasets/openqa.py +1 -2
  166. evalscope/perf/plugin/registry.py +3 -3
  167. evalscope/perf/utils/analysis_result.py +1 -2
  168. evalscope/perf/utils/benchmark_util.py +5 -6
  169. evalscope/perf/utils/db_util.py +77 -30
  170. evalscope/perf/utils/local_server.py +21 -13
  171. evalscope/registry/config/cfg_arena_zhihu.yaml +1 -1
  172. evalscope/registry/tasks/arc.yaml +2 -3
  173. evalscope/registry/tasks/bbh.yaml +3 -4
  174. evalscope/registry/tasks/bbh_mini.yaml +3 -4
  175. evalscope/registry/tasks/ceval.yaml +3 -3
  176. evalscope/registry/tasks/ceval_mini.yaml +3 -4
  177. evalscope/registry/tasks/cmmlu.yaml +3 -3
  178. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +1 -1
  179. evalscope/registry/tasks/general_qa.yaml +1 -1
  180. evalscope/registry/tasks/gsm8k.yaml +2 -2
  181. evalscope/registry/tasks/mmlu.yaml +3 -3
  182. evalscope/registry/tasks/mmlu_mini.yaml +3 -3
  183. evalscope/run.py +153 -381
  184. evalscope/run_arena.py +21 -25
  185. evalscope/summarizer.py +27 -40
  186. evalscope/third_party/longbench_write/README.md +99 -42
  187. evalscope/third_party/longbench_write/default_task.json +1 -1
  188. evalscope/third_party/longbench_write/default_task.yaml +8 -7
  189. evalscope/third_party/longbench_write/eval.py +29 -27
  190. evalscope/third_party/longbench_write/infer.py +16 -104
  191. evalscope/third_party/longbench_write/longbench_write.py +5 -4
  192. evalscope/third_party/longbench_write/resources/judge.txt +1 -1
  193. evalscope/third_party/longbench_write/tools/data_etl.py +5 -6
  194. evalscope/third_party/longbench_write/utils.py +0 -1
  195. evalscope/third_party/toolbench_static/eval.py +14 -15
  196. evalscope/third_party/toolbench_static/infer.py +48 -69
  197. evalscope/third_party/toolbench_static/llm/swift_infer.py +4 -12
  198. evalscope/third_party/toolbench_static/requirements.txt +1 -1
  199. evalscope/third_party/toolbench_static/toolbench_static.py +4 -3
  200. evalscope/tools/combine_reports.py +27 -34
  201. evalscope/tools/rewrite_eval_results.py +15 -47
  202. evalscope/utils/__init__.py +1 -1
  203. evalscope/utils/arena_utils.py +18 -48
  204. evalscope/{perf/utils → utils}/chat_service.py +4 -5
  205. evalscope/utils/completion_parsers.py +3 -8
  206. evalscope/utils/io_utils.py +162 -0
  207. evalscope/utils/logger.py +17 -7
  208. evalscope/utils/model_utils.py +11 -0
  209. evalscope/utils/utils.py +5 -306
  210. evalscope/version.py +2 -2
  211. {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/METADATA +123 -118
  212. evalscope-0.8.1.dist-info/RECORD +285 -0
  213. tests/cli/test_run.py +53 -15
  214. tests/perf/test_perf.py +6 -1
  215. tests/rag/test_clip_benchmark.py +38 -38
  216. tests/rag/test_mteb.py +3 -2
  217. tests/rag/test_ragas.py +5 -5
  218. tests/swift/test_run_swift_eval.py +2 -3
  219. tests/swift/test_run_swift_vlm_eval.py +2 -3
  220. tests/swift/test_run_swift_vlm_jugde_eval.py +2 -3
  221. tests/vlm/test_vlmeval.py +3 -2
  222. evalscope/backend/rag_eval/ragas/metrics/__init__.py +0 -2
  223. evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py +0 -91
  224. evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py +0 -99
  225. evalscope/cache.py +0 -98
  226. evalscope/models/template.py +0 -1446
  227. evalscope/run_ms.py +0 -140
  228. evalscope/utils/task_cfg_parser.py +0 -10
  229. evalscope/utils/task_utils.py +0 -22
  230. evalscope-0.7.2.dist-info/RECORD +0 -286
  231. {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/LICENSE +0 -0
  232. {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/WHEEL +0 -0
  233. {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/entry_points.txt +0 -0
  234. {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/top_level.txt +0 -0
evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py

@@ -1,11 +1,12 @@
  # Convert datasets to webdataset format
  import os
- from tqdm import tqdm
  import torch
  import torch.utils.data
  import webdataset
+ from tqdm import tqdm
+
  from evalscope.backend.rag_eval.clip_benchmark.dataset_builder import DatasetWrapper
- from evalscope.backend.rag_eval.utils.tools import path_to_bytes, PIL_to_bytes
+ from evalscope.backend.rag_eval.utils.tools import PIL_to_bytes, path_to_bytes
  from evalscope.utils.logger import get_logger

  logger = get_logger()
@@ -17,19 +18,21 @@ def convert_dataset(
  output_folder,
  *,
  transform=None,
- image_format="webp",
+ image_format='webp',
  max_count=10_000,
  max_size=1_000_000_000,
  multilabel=False,
  verbose=True,
  ):
  """
- Convert an iterable `dataset` of (image, label) pairs to webdataset (.tar) format, and store in `output_folder/split`.
+ Convert an iterable `dataset` of (image, label) pairs to webdataset (.tar) format, and store in
+ `output_folder/split`.

  Images may be passed in as either:
  * File paths: pass in `transform=path_to_bytes`;
  * PIL images: pass in `transform=PIL_to_bytes(image_format)` where `image_format` is e.g. "webp"; or
- * Raw binary data: use a PyTorch `Dataset` that supports `transform=PIL_to_bytes(image_format)`, and pass in `transform=None` here.
+ * Raw binary data: use a PyTorch `Dataset` that supports `transform=PIL_to_bytes(image_format)`, and
+ pass in `transform=None` here.
  Be sure that the transform is not applied twice.

  Copying image files directly or writing raw binary data is fastest since it allows multiprocessing;
@@ -37,9 +40,10 @@ def convert_dataset(

  Labels must be zero-indexed integers (for multilabel datasets, labels must be arrays/tensors).

- Classnames and zero-shot classification templates can be provided as attributes of the dataset (`.classes` and `.templates`)
- or filled in manually afterward. `dataset.classes` should be a list of strings indexed by the labels,
- and `dataset.templates` should be a list of strings containing `{c}` to specify where classnames are to be inserted.
+ Classnames and zero-shot classification templates can be provided as attributes of the dataset (`.classes`
+ and `.templates`) or filled in manually afterward. `dataset.classes` should be a list of strings indexed by
+ the labels, and `dataset.templates` should be a list of strings containing `{c}` to specify where classnames
+ are to be inserted.
  """
  # Create output directory
  os.makedirs(os.path.join(output_folder, split), exist_ok=True)
@@ -52,52 +56,44 @@ def convert_dataset(
  )
  if verbose:
  try:
- logger.info(f"Dataset size: {len(dataset)}")
+ logger.info(f'Dataset size: {len(dataset)}')
  except TypeError:
- logger.info("IterableDataset has no len()")
+ logger.info('IterableDataset has no len()')
  # Save classnames
- if hasattr(dataset, "classes") and dataset.classes:
- classnames_fname = os.path.join(output_folder, "classnames.txt")
- with open(classnames_fname, "w") as classnames_file:
- logger.info(*dataset.classes, sep="\n", end="\n", file=classnames_file)
+ if hasattr(dataset, 'classes') and dataset.classes:
+ classnames_fname = os.path.join(output_folder, 'classnames.txt')
+ with open(classnames_fname, 'w') as classnames_file:
+ logger.info(*dataset.classes, sep='\n', end='\n', file=classnames_file)
  if verbose:
  logger.info("Saved class names to '%s'" % classnames_fname)
  elif verbose:
- logger.info("WARNING: No class names found")
+ logger.info('WARNING: No class names found')
  # Save zeroshot templates
- if hasattr(dataset, "templates") and dataset.templates:
- templates_fname = os.path.join(
- output_folder, "zeroshot_classification_templates.txt"
- )
- with open(templates_fname, "w") as templates_file:
- logger.info(*dataset.templates, sep="\n", end="\n", file=templates_file)
+ if hasattr(dataset, 'templates') and dataset.templates:
+ templates_fname = os.path.join(output_folder, 'zeroshot_classification_templates.txt')
+ with open(templates_fname, 'w') as templates_file:
+ logger.info(*dataset.templates, sep='\n', end='\n', file=templates_file)
  if verbose:
  logger.info("Saved class names to '%s'" % templates_fname)
  elif verbose:
- logger.info("WARNING: No zeroshot classification templates found")
+ logger.info('WARNING: No zeroshot classification templates found')
  # Save dataset type
  if multilabel:
- type_fname = os.path.join(output_folder, "dataset_type.txt")
- with open(type_fname, "w") as type_file:
- logger.info("multilabel", end="\n", file=type_file)
+ type_fname = os.path.join(output_folder, 'dataset_type.txt')
+ with open(type_fname, 'w') as type_file:
+ logger.info('multilabel', end='\n', file=type_file)
  if verbose:
  logger.info("Saved dataset type to '%s'" % type_fname)
  # Write to TAR files
- data_fname = os.path.join(output_folder, split, r"%d.tar")
+ data_fname = os.path.join(output_folder, split, r'%d.tar')
  sink = webdataset.ShardWriter(data_fname, maxcount=max_count, maxsize=max_size)
  nsamples = 0
- label_type = "npy" if multilabel else "cls"
- for index, (input, output) in enumerate(tqdm(dataloader, desc="Converting")):
+ label_type = 'npy' if multilabel else 'cls'
+ for index, (input, output) in enumerate(tqdm(dataloader, desc='Converting')):
  nsamples += 1
  if isinstance(input, str) and transform is path_to_bytes:
  # If copying file, determine image format from extension
- extension = (
- os.path.splitext(input)[1]
- .replace(".", "")
- .lower()
- .replace("jpeg", "jpg")
- or image_format
- )
+ extension = (os.path.splitext(input)[1].replace('.', '').lower().replace('jpeg', 'jpg') or image_format)
  else:
  extension = image_format
  # Convert label if necessary
@@ -107,27 +103,22 @@ def convert_dataset(
  else:
  output = output.item()
  # Write example
- sink.write(
- {
- "__key__": "s%07d" % index,
- extension: transform(input) if transform else input,
- label_type: output,
- }
- )
+ sink.write({
+ '__key__': 's%07d' % index,
+ extension: transform(input) if transform else input,
+ label_type: output,
+ })
  num_shards = sink.shard
  sink.close()
  if verbose:
- logger.info(
- "Saved dataset to '%s'"
- % data_fname.replace(r"%d", "{0..%d}" % (num_shards - 1))
- )
+ logger.info("Saved dataset to '%s'" % data_fname.replace(r'%d', '{0..%d}' % (num_shards - 1)))
  # Save number of shards
- nshards_fname = os.path.join(output_folder, split, "nshards.txt")
- with open(nshards_fname, "w") as nshards_file:
- logger.info(num_shards, end="\n", file=nshards_file)
+ nshards_fname = os.path.join(output_folder, split, 'nshards.txt')
+ with open(nshards_fname, 'w') as nshards_file:
+ logger.info(num_shards, end='\n', file=nshards_file)
  if verbose:
  logger.info("Saved number of shards = %d to '%s'" % (num_shards, nshards_fname))
- logger.info("Final dataset size:", nsamples)
+ logger.info('Final dataset size:', nsamples)


  def convert_retrieval_dataset(
@@ -136,13 +127,14 @@ def convert_retrieval_dataset(
  output_folder,
  *,
  transform=None,
- image_format="webp",
+ image_format='webp',
  max_count=10_000,
  max_size=1_000_000_000,
  verbose=True,
  ):
  """
- Convert an iterable `dataset` of (image, [caption1, caption2, ...]) pairs to webdataset (.tar) format, and store in `output_folder/split`.
+ Convert an iterable `dataset` of (image, [caption1, caption2, ...]) pairs to webdataset (.tar) format,
+ and store in `output_folder/split`.

  Labels must be lists of strings, with no newlines.

@@ -159,72 +151,59 @@ def convert_retrieval_dataset(
  )
  if verbose:
  try:
- logger.info(f"Dataset size: {len(dataset)}")
+ logger.info(f'Dataset size: {len(dataset)}')
  except TypeError:
- logger.info("IterableDataset has no len()")
+ logger.info('IterableDataset has no len()')
  # No classnames
  # No zeroshot templates
  # Save dataset type
- type_fname = os.path.join(output_folder, "dataset_type.txt")
- with open(type_fname, "w") as type_file:
- logger.info("retrieval", end="\n", file=type_file)
+ type_fname = os.path.join(output_folder, 'dataset_type.txt')
+ with open(type_fname, 'w') as type_file:
+ logger.info('retrieval', end='\n', file=type_file)
  if verbose:
  logger.info("Saved dataset type to '%s'" % type_fname)
  # Write to TAR files
- data_fname = os.path.join(output_folder, split, r"%d.tar")
+ data_fname = os.path.join(output_folder, split, r'%d.tar')
  sink = webdataset.ShardWriter(data_fname, maxcount=max_count, maxsize=max_size)
  nsamples = 0
- for index, (input, output) in enumerate(tqdm(dataloader, desc="Converting")):
+ for index, (input, output) in enumerate(tqdm(dataloader, desc='Converting')):
  nsamples += 1
  if isinstance(input, str) and transform is path_to_bytes:
  # If copying file, determine image format from extension
- extension = (
- os.path.splitext(input)[1]
- .replace(".", "")
- .lower()
- .replace("jpeg", "jpg")
- or image_format
- )
+ extension = (os.path.splitext(input)[1].replace('.', '').lower().replace('jpeg', 'jpg') or image_format)
  else:
  extension = image_format
- sink.write(
- {
- "__key__": "s%07d" % index,
- extension: transform(input) if transform else input,
- "txt": "\n".join(caption.replace("\n", r"\n") for caption in output),
- }
- )
+ sink.write({
+ '__key__': 's%07d' % index,
+ extension: transform(input) if transform else input,
+ 'txt': '\n'.join(caption.replace('\n', r'\n') for caption in output),
+ })
  num_shards = sink.shard
  sink.close()
  if verbose:
- logger.info(
- "Saved dataset to '%s'"
- % data_fname.replace(r"%d", "{0..%d}" % (num_shards - 1))
- )
+ logger.info("Saved dataset to '%s'" % data_fname.replace(r'%d', '{0..%d}' % (num_shards - 1)))
  # Save number of shards
- nshards_fname = os.path.join(output_folder, split, "nshards.txt")
- with open(nshards_fname, "w") as nshards_file:
- logger.info(num_shards, end="\n", file=nshards_file)
+ nshards_fname = os.path.join(output_folder, split, 'nshards.txt')
+ with open(nshards_fname, 'w') as nshards_file:
+ logger.info(num_shards, end='\n', file=nshards_file)
  if verbose:
  logger.info("Saved number of shards = %d to '%s'" % (num_shards, nshards_fname))
- logger.info("Final dataset size:", nsamples)
+ logger.info('Final dataset size:', nsamples)


- if __name__ == "__main__":
+ if __name__ == '__main__':
  from modelscope.msdatasets import MsDataset

- splits = ["train", "validation"]
+ splits = ['train', 'validation']
  for split in splits:
- ds = MsDataset.load("modelscope/muge", split=split)
+ ds = MsDataset.load('modelscope/muge', split=split)
  hf_dataset = ds.to_hf_dataset()
- pytorch_dataset = DatasetWrapper(
- hf_dataset, image_key="image", text_key="query"
- )
+ pytorch_dataset = DatasetWrapper(hf_dataset, image_key='image', text_key='query')
  convert_retrieval_dataset(
  pytorch_dataset,
  split,
- "data/muge",
- transform=PIL_to_bytes("jpg"),
- image_format="jpg",
+ 'data/muge',
+ transform=PIL_to_bytes('jpg'),
+ image_format='jpg',
  max_count=50_000,
  )
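
Not part of the diff, but for orientation: a minimal sketch of reading back the shards that convert_retrieval_dataset writes, using the webdataset library's standard reading pipeline. It assumes the MUGE example in the __main__ block above has been run and that at least one shard (0.tar) exists under data/muge/train; the shard path is illustrative.

import webdataset as wds

# The converter writes data/muge/train/0.tar, 1.tar, ... plus nshards.txt.
ds = wds.WebDataset('data/muge/train/0.tar').decode('pil').to_tuple('jpg', 'txt')
for image, captions in ds:
    print(image.size, captions.split('\n'))  # captions were '\n'-joined on write
    break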
evalscope/backend/rag_eval/cmteb/__init__.py

@@ -1,4 +1,4 @@
- from evalscope.backend.rag_eval.cmteb.tasks import *
+ from evalscope.backend.rag_eval.cmteb.arguments import EvalArguments, ModelArguments
  from evalscope.backend.rag_eval.cmteb.base import *
- from evalscope.backend.rag_eval.cmteb.arguments import ModelArguments, EvalArguments
- from evalscope.backend.rag_eval.cmteb.task_template import one_stage_eval, two_stage_eval
+ from evalscope.backend.rag_eval.cmteb.task_template import one_stage_eval, two_stage_eval
+ from evalscope.backend.rag_eval.cmteb.tasks import *
evalscope/backend/rag_eval/cmteb/arguments.py

@@ -1,38 +1,36 @@
  from dataclasses import dataclass, field
- from typing import List, Optional, Union, Dict, Any
+ from typing import Any, Dict, List, Optional, Union


  @dataclass
  class ModelArguments:
  # Arguments for embeding model: sentence transformer or cross encoder
- model_name_or_path: str = "" # model name or path
+ model_name_or_path: str = '' # model name or path
  is_cross_encoder: bool = False # whether the model is a cross encoder
  # pooling mode: Either “cls”, “lasttoken”, “max”, “mean”, “mean_sqrt_len_tokens”, or “weightedmean”.
  pooling_mode: Optional[str] = None
  max_seq_length: int = 512 # max sequence length
  # prompt for llm based model
- prompt: str = ""
+ prompt: str = ''
  # model kwargs
  model_kwargs: dict = field(default_factory=dict)
  # config kwargs
  config_kwargs: Dict[str, Any] = field(default_factory=dict)
  # encode kwargs
- encode_kwargs: dict = field(
- default_factory=lambda: {"show_progress_bar": True, "batch_size": 32}
- )
- hub: str = "modelscope" # modelscope or huggingface
+ encode_kwargs: dict = field(default_factory=lambda: {'show_progress_bar': True, 'batch_size': 32})
+ hub: str = 'modelscope' # modelscope or huggingface

  def to_dict(self) -> Dict[str, Any]:
  return {
- "model_name_or_path": self.model_name_or_path,
- "is_cross_encoder": self.is_cross_encoder,
- "pooling_mode": self.pooling_mode,
- "max_seq_length": self.max_seq_length,
- "prompt": self.prompt,
- "model_kwargs": self.model_kwargs,
- "config_kwargs": self.config_kwargs,
- "encode_kwargs": self.encode_kwargs,
- "hub": self.hub,
+ 'model_name_or_path': self.model_name_or_path,
+ 'is_cross_encoder': self.is_cross_encoder,
+ 'pooling_mode': self.pooling_mode,
+ 'max_seq_length': self.max_seq_length,
+ 'prompt': self.prompt,
+ 'model_kwargs': self.model_kwargs,
+ 'config_kwargs': self.config_kwargs,
+ 'encode_kwargs': self.encode_kwargs,
+ 'hub': self.hub,
  }


@@ -42,20 +40,20 @@ class EvalArguments:
  tasks: List[str] = field(default_factory=list) # task names
  dataset_path: Optional[str] = None # custom dataset path
  verbosity: int = 2 # verbosity level 0-3
- output_folder: str = "outputs" # output folder
+ output_folder: str = 'outputs' # output folder
  overwrite_results: bool = True # overwrite results
  limits: Optional[int] = None # limit number of samples
- hub: str = "modelscope" # modelscope or huggingface
- top_k: int = 5 # top k for reranking
+ hub: str = 'modelscope' # modelscope or huggingface
+ top_k: int = 5 # top k for reranking

  def to_dict(self) -> Dict[str, Any]:
  return {
- "tasks": self.tasks,
- "dataset_path": self.dataset_path,
- "verbosity": self.verbosity,
- "output_folder": self.output_folder,
- "overwrite_results": self.overwrite_results,
- "limits": self.limits,
- "hub": self.hub,
- "top_k": self.top_k,
+ 'tasks': self.tasks,
+ 'dataset_path': self.dataset_path,
+ 'verbosity': self.verbosity,
+ 'output_folder': self.output_folder,
+ 'overwrite_results': self.overwrite_results,
+ 'limits': self.limits,
+ 'hub': self.hub,
+ 'top_k': self.top_k,
  }
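
Not part of the diff: a quick sketch of constructing the two dataclasses as they stand after this change. The model path and task name below are placeholders; the to_dict() output matches the dict shape that the task-template functions further down unpack as keyword arguments.

from evalscope.backend.rag_eval.cmteb.arguments import EvalArguments, ModelArguments

model_args = ModelArguments(
    model_name_or_path='/path/to/embedding-model',  # placeholder path
    max_seq_length=512,
    hub='modelscope',  # or 'huggingface'
)
eval_args = EvalArguments(
    tasks=['TNewsClassification'],  # placeholder task name
    output_folder='outputs',
    limits=100,  # cap the number of samples per split
)

# Both dataclasses expose to_dict(); downstream code unpacks these as kwargs.
print(model_args.to_dict())
print(eval_args.to_dict())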
evalscope/backend/rag_eval/cmteb/base.py

@@ -1,12 +1,14 @@
+ import datasets
  from collections import defaultdict
- from typing import List
- from mteb import AbsTask
  from datasets import DatasetDict
  from modelscope import MsDataset
- import datasets
- from evalscope.backend.rag_eval.cmteb.tasks import CLS_DICT, CLS_RETRIEVAL, CLS_CUSTOM
+ from mteb import AbsTask
+ from typing import List
+
+ from evalscope.backend.rag_eval.cmteb.tasks import CLS_CUSTOM, CLS_DICT, CLS_RETRIEVAL
+ from evalscope.constants import HubType

- __all__ = ["TaskBase"]
+ __all__ = ['TaskBase']


  class TaskBase:
@@ -28,7 +30,7 @@ class TaskBase:
  from mteb.overview import TASKS_REGISTRY

  task_cls = TASKS_REGISTRY[task_name]
- if task_cls.metadata.type != "Retrieval":
+ if task_cls.metadata.type != 'Retrieval':
  task_cls.load_data = load_data

  # init task instance
@@ -41,33 +43,30 @@ def load_data(self, **kwargs):
  if self.data_loaded:
  return

- limits = kwargs.get("limits", None)
- hub = kwargs.get("hub", "modelscope")
- name = self.metadata_dict.get("name")
- path = self.metadata_dict["dataset"].get("path")
+ limits = kwargs.get('limits', None)
+ hub = kwargs.get('hub', HubType.MODELSCOPE)
+ name = self.metadata_dict.get('name')
+ path = self.metadata_dict['dataset'].get('path')

- assert path is not None, "Path must be specified in dataset"
+ assert path is not None, 'Path must be specified in dataset'

  # Loading the dataset based on the source hub
- if hub == "modelscope":
+ if hub == HubType.MODELSCOPE:
  import re

- path = re.sub(r"^mteb/", "MTEB/", path)
+ path = re.sub(r'^mteb/', 'MTEB/', path)
  dataset = MsDataset.load(path)
  else:
- dataset = datasets.load_dataset(**self.metadata_dict["dataset"]) # type: ignore
+ dataset = datasets.load_dataset(**self.metadata_dict['dataset']) # type: ignore

  if limits is not None:
- dataset = {
- split: dataset[split].select(range(min(limits, len(dataset[split]))))
- for split in dataset.keys()
- }
+ dataset = {split: dataset[split].select(range(min(limits, len(dataset[split])))) for split in dataset.keys()}

  if name in CLS_RETRIEVAL:
  self.corpus, self.queries, self.relevant_docs = load_retrieval_data(
  dataset,
  path,
- self.metadata_dict["eval_splits"],
+ self.metadata_dict['eval_splits'],
  )

  self.dataset = dataset
@@ -77,13 +76,13 @@ def load_data(self, **kwargs):

  def load_retrieval_data(dataset, dataset_name: str, eval_splits: list) -> tuple:
  eval_split = eval_splits[0]
- qrels = MsDataset.load(dataset_name + "-qrels")[eval_split]
+ qrels = MsDataset.load(dataset_name + '-qrels')[eval_split]

- corpus = {e["id"]: {"text": e["text"]} for e in dataset["corpus"]}
- queries = {e["id"]: e["text"] for e in dataset["queries"]}
+ corpus = {e['id']: {'text': e['text']} for e in dataset['corpus']}
+ queries = {e['id']: e['text'] for e in dataset['queries']}
  relevant_docs = defaultdict(dict)
  for e in qrels:
- relevant_docs[e["qid"]][e["pid"]] = e["score"]
+ relevant_docs[e['qid']][e['pid']] = e['score']

  corpus = DatasetDict({eval_split: corpus})
  queries = DatasetDict({eval_split: queries})
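
Not part of the diff: a sketch of exercising the patched load_data shown above directly. The task name is a placeholder registry key, and HubType.MODELSCOPE is assumed to equal the string 'modelscope' (the hunk only swaps the literal for the constant).

from evalscope.backend.rag_eval.cmteb.base import TaskBase

# get_tasks resolves names via mteb's TASKS_REGISTRY and, for non-Retrieval
# task types, swaps in the load_data shown above.
tasks = TaskBase.get_tasks(task_names=['TNewsClassification'])  # placeholder name

# load_data honours `hub` (ModelScope vs. Hugging Face) and `limits` (truncate
# each split); with the ModelScope hub an 'mteb/...' dataset path is rewritten
# to 'MTEB/...' and loaded through MsDataset.
tasks[0].load_data(limits=100, hub='modelscope')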
evalscope/backend/rag_eval/cmteb/task_template.py

@@ -1,8 +1,8 @@
- import os
  import mteb
- from evalscope.backend.rag_eval import EmbeddingModel
- from evalscope.backend.rag_eval import cmteb
+ import os
  from mteb.task_selection import results_to_dataframe
+
+ from evalscope.backend.rag_eval import EmbeddingModel, cmteb
  from evalscope.utils.logger import get_logger

  logger = get_logger()
@@ -19,8 +19,8 @@ def show_results(output_folder, model, results):
  model_name,
  revision,
  )
- logger.info(f"Evaluation results:\n{results_df.to_markdown()}")
- logger.info(f"Evaluation results saved in {os.path.abspath(save_path)}")
+ logger.info(f'Evaluation results:\n{results_df.to_markdown()}')
+ logger.info(f'Evaluation results saved in {os.path.abspath(save_path)}')


  def one_stage_eval(
@@ -29,18 +29,16 @@ def one_stage_eval(
  ) -> None:
  # load model
  model = EmbeddingModel.load(**model_args)
- custom_dataset_path = eval_args.pop("dataset_path", None)
+ custom_dataset_path = eval_args.pop('dataset_path', None)
  # load task first to update instructions
- tasks = cmteb.TaskBase.get_tasks(
- task_names=eval_args["tasks"], dataset_path=custom_dataset_path
- )
+ tasks = cmteb.TaskBase.get_tasks(task_names=eval_args['tasks'], dataset_path=custom_dataset_path)
  evaluation = mteb.MTEB(tasks=tasks)

  # run evaluation
  results = evaluation.run(model, **eval_args)

  # save and log results
- show_results(eval_args["output_folder"], model, results)
+ show_results(eval_args['output_folder'], model, results)


  def two_stage_eval(
@@ -56,7 +54,7 @@ def two_stage_eval(
  first_stage_path = f"{eval_args['output_folder']}/stage1"
  second_stage_path = f"{eval_args['output_folder']}/stage2"

- tasks = cmteb.TaskBase.get_tasks(task_names=eval_args["tasks"])
+ tasks = cmteb.TaskBase.get_tasks(task_names=eval_args['tasks'])
  for task in tasks:
  evaluation = mteb.MTEB(tasks=[task])

@@ -66,19 +64,19 @@ def two_stage_eval(
  save_predictions=True,
  output_folder=first_stage_path,
  overwrite_results=True,
- hub=eval_args["hub"],
- limits=eval_args["limits"],
+ hub=eval_args['hub'],
+ limits=eval_args['limits'],
  )
  # stage 2: run cross encoder
  results = evaluation.run(
  cross_encoder,
- top_k=eval_args["top_k"],
+ top_k=eval_args['top_k'],
  save_predictions=True,
  output_folder=second_stage_path,
- previous_results=f"{first_stage_path}/{task.metadata.name}_default_predictions.json",
+ previous_results=f'{first_stage_path}/{task.metadata.name}_default_predictions.json',
  overwrite_results=True,
- hub=eval_args["hub"],
- limits=eval_args["limits"],
+ hub=eval_args['hub'],
+ limits=eval_args['limits'],
  )

  # save and log results
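
Not part of the diff: a sketch of driving the refactored one-stage evaluation end to end. It assumes one_stage_eval takes the model-args dict and the eval-args dict positionally in that order (only the function body is visible above); the model path and task name are placeholders.

from evalscope.backend.rag_eval.cmteb import EvalArguments, ModelArguments, one_stage_eval

model_args = ModelArguments(model_name_or_path='/path/to/embedding-model').to_dict()
eval_args = EvalArguments(tasks=['TNewsClassification'], output_folder='outputs').to_dict()

# one_stage_eval loads the embedding model, resolves tasks via
# cmteb.TaskBase.get_tasks(), runs mteb.MTEB, and logs the results table.
one_stage_eval(model_args, eval_args)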