evalscope 0.9.0__py3-none-any.whl → 0.10.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of evalscope has been flagged as potentially problematic.

Files changed (69)
  1. evalscope/arguments.py +1 -0
  2. evalscope/benchmarks/arc/arc_adapter.py +3 -5
  3. evalscope/benchmarks/bbh/bbh_adapter.py +3 -3
  4. evalscope/benchmarks/benchmark.py +1 -1
  5. evalscope/benchmarks/ceval/ceval_adapter.py +5 -82
  6. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +5 -79
  7. evalscope/benchmarks/competition_math/competition_math_adapter.py +4 -4
  8. evalscope/benchmarks/data_adapter.py +69 -70
  9. evalscope/benchmarks/general_qa/general_qa_adapter.py +10 -63
  10. evalscope/benchmarks/gpqa/__init__.py +0 -0
  11. evalscope/benchmarks/gpqa/chain_of_thought.txt +81 -0
  12. evalscope/benchmarks/gpqa/gpqa_adapter.py +103 -0
  13. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +4 -5
  14. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +12 -6
  15. evalscope/benchmarks/humaneval/humaneval_adapter.py +3 -4
  16. evalscope/benchmarks/ifeval/__init__.py +0 -0
  17. evalscope/benchmarks/ifeval/ifeval_adapter.py +56 -0
  18. evalscope/benchmarks/ifeval/instructions.py +1477 -0
  19. evalscope/benchmarks/ifeval/instructions_registry.py +188 -0
  20. evalscope/benchmarks/ifeval/instructions_util.py +1670 -0
  21. evalscope/benchmarks/ifeval/utils.py +134 -0
  22. evalscope/benchmarks/iquiz/__init__.py +0 -0
  23. evalscope/benchmarks/iquiz/iquiz_adapter.py +63 -0
  24. evalscope/benchmarks/mmlu/mmlu_adapter.py +8 -84
  25. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +2 -2
  26. evalscope/benchmarks/race/race_adapter.py +4 -73
  27. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +3 -6
  28. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +8 -57
  29. evalscope/cli/cli.py +2 -0
  30. evalscope/cli/start_app.py +30 -0
  31. evalscope/collections/evaluator.py +82 -62
  32. evalscope/collections/sampler.py +47 -41
  33. evalscope/collections/schema.py +14 -10
  34. evalscope/constants.py +4 -0
  35. evalscope/evaluator/evaluator.py +22 -13
  36. evalscope/metrics/__init__.py +2 -5
  37. evalscope/metrics/metrics.py +11 -2
  38. evalscope/metrics/named_metrics.py +17 -0
  39. evalscope/models/chat_adapter.py +2 -0
  40. evalscope/models/server_adapter.py +11 -4
  41. evalscope/perf/__init__.py +1 -0
  42. evalscope/perf/main.py +0 -1
  43. evalscope/perf/plugin/api/custom_api.py +1 -1
  44. evalscope/perf/plugin/api/openai_api.py +1 -1
  45. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  46. evalscope/perf/plugin/datasets/longalpaca.py +1 -1
  47. evalscope/report/__init__.py +5 -0
  48. evalscope/report/app.py +693 -0
  49. evalscope/report/combinator.py +73 -0
  50. evalscope/report/generator.py +80 -0
  51. evalscope/report/utils.py +133 -0
  52. evalscope/run.py +16 -11
  53. evalscope/summarizer.py +1 -1
  54. evalscope/utils/chat_service.py +1 -1
  55. evalscope/utils/logger.py +1 -0
  56. evalscope/utils/model_utils.py +5 -2
  57. evalscope/version.py +2 -2
  58. {evalscope-0.9.0.dist-info → evalscope-0.10.1.dist-info}/METADATA +84 -7
  59. {evalscope-0.9.0.dist-info → evalscope-0.10.1.dist-info}/RECORD +66 -51
  60. tests/cli/test_collection.py +11 -7
  61. tests/cli/test_run.py +13 -4
  62. evalscope/tools/__init__.py +0 -1
  63. evalscope/tools/combine_reports.py +0 -133
  64. evalscope/tools/gen_mmlu_subject_mapping.py +0 -90
  65. /evalscope/{tools/rewrite_eval_results.py → models/custom/dummy_model.py} +0 -0
  66. {evalscope-0.9.0.dist-info → evalscope-0.10.1.dist-info}/LICENSE +0 -0
  67. {evalscope-0.9.0.dist-info → evalscope-0.10.1.dist-info}/WHEEL +0 -0
  68. {evalscope-0.9.0.dist-info → evalscope-0.10.1.dist-info}/entry_points.txt +0 -0
  69. {evalscope-0.9.0.dist-info → evalscope-0.10.1.dist-info}/top_level.txt +0 -0
evalscope/perf/main.py CHANGED
@@ -1,5 +1,4 @@
 import asyncio
-import logging
 import os
 import platform
 from argparse import Namespace
evalscope/perf/plugin/api/custom_api.py CHANGED
@@ -1,5 +1,4 @@
 import json
-from transformers import AutoTokenizer
 from typing import Any, Dict, Iterator, List

 from evalscope.perf.arguments import Arguments
@@ -25,6 +24,7 @@ class CustomPlugin(ApiPluginBase):
         """
         super().__init__(model_path=mode_path)
         if mode_path is not None:
+            from transformers import AutoTokenizer
             self.tokenizer = AutoTokenizer.from_pretrained(mode_path)
         else:
             self.tokenizer = None
evalscope/perf/plugin/api/openai_api.py CHANGED
@@ -1,6 +1,5 @@
 import json
 import os
-from transformers import AutoTokenizer
 from typing import Any, Dict, Iterator, List, Union

 from evalscope.perf.arguments import Arguments
@@ -25,6 +24,7 @@ class OpenaiPlugin(ApiPluginBase):
         """
         super().__init__(model_path=mode_path)
         if mode_path is not None:
+            from transformers import AutoTokenizer
             self.tokenizer = AutoTokenizer.from_pretrained(mode_path)
         else:
             self.tokenizer = None
evalscope/perf/plugin/datasets/flickr8k.py CHANGED
@@ -1,6 +1,5 @@
 import base64
 from io import BytesIO
-from modelscope.msdatasets import MsDataset
 from PIL import Image
 from typing import Any, Dict, Iterator, List

@@ -26,6 +25,7 @@ class FlickrDatasetPlugin(DatasetPluginBase):
         super().__init__(query_parameters)

     def build_messages(self) -> Iterator[List[Dict]]:
+        from modelscope.msdatasets import MsDataset
         dataset = MsDataset.load('clip-benchmark/wds_flickr8k', split='test')

         for item in dataset:
evalscope/perf/plugin/datasets/longalpaca.py CHANGED
@@ -1,4 +1,3 @@
-from modelscope import MsDataset
 from typing import Any, Dict, Iterator, List

 from evalscope.perf.arguments import Arguments
@@ -17,6 +16,7 @@ class LongAlpacaDatasetPlugin(DatasetPluginBase):

     def build_messages(self) -> Iterator[List[Dict]]:
         if not self.query_parameters.dataset_path:
+            from modelscope import MsDataset
             ds = MsDataset.load('AI-ModelScope/LongAlpaca-12k', subset_name='default', split='train')
         else:
             ds = self.dataset_json_list(self.query_parameters.dataset_path)
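
The four perf plugin diffs above (custom_api.py, openai_api.py, flickr8k.py, longalpaca.py) apply the same change: the heavyweight optional dependencies transformers and modelscope are no longer imported at module load time, only inside the branch that actually needs them. A minimal sketch of that deferred-import pattern, using a hypothetical ExamplePlugin class for illustration (only the import placement mirrors the real change):

    from typing import Optional

    class ExamplePlugin:

        def __init__(self, model_path: Optional[str] = None):
            if model_path is not None:
                # Import transformers only when a local tokenizer is actually
                # requested, so the module stays importable without transformers.
                from transformers import AutoTokenizer
                self.tokenizer = AutoTokenizer.from_pretrained(model_path)
            else:
                self.tokenizer = None

The trade-off is that the perf plugins stay importable (and faster to load) when transformers or modelscope is not installed, while any ImportError is deferred to the first call that needs the dependency.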
evalscope/report/__init__.py ADDED
@@ -0,0 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+from evalscope.report.combinator import gen_table, get_data_frame, get_report_list
+from evalscope.report.generator import ReportGenerator
+from evalscope.report.utils import Category, Report, ReportKey, Subset
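
The new evalscope/report/__init__.py re-exports the report API at the package level, so callers can import these names directly from evalscope.report rather than from the individual submodules. For illustration only (the names come from the hunk above; call signatures are not part of this diff):

    from evalscope.report import Report, ReportGenerator, gen_table, get_report_list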