evalscope 0.17.1__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (273)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/__init__.py +0 -0
  3. evalscope/api/benchmark/__init__.py +3 -0
  4. evalscope/api/benchmark/adapters/__init__.py +3 -0
  5. evalscope/api/benchmark/adapters/default_data_adapter.py +683 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
  7. evalscope/api/benchmark/adapters/text2image_adapter.py +155 -0
  8. evalscope/api/benchmark/benchmark.py +321 -0
  9. evalscope/api/benchmark/meta.py +115 -0
  10. evalscope/api/dataset/__init__.py +2 -0
  11. evalscope/api/dataset/dataset.py +349 -0
  12. evalscope/api/dataset/loader.py +261 -0
  13. evalscope/api/dataset/utils.py +143 -0
  14. evalscope/api/evaluator/__init__.py +3 -0
  15. evalscope/api/evaluator/cache.py +355 -0
  16. evalscope/api/evaluator/evaluator.py +56 -0
  17. evalscope/api/evaluator/state.py +264 -0
  18. evalscope/api/filter/__init__.py +1 -0
  19. evalscope/api/filter/filter.py +72 -0
  20. evalscope/api/messages/__init__.py +11 -0
  21. evalscope/api/messages/chat_message.py +198 -0
  22. evalscope/api/messages/content.py +102 -0
  23. evalscope/api/messages/utils.py +35 -0
  24. evalscope/api/metric/__init__.py +2 -0
  25. evalscope/api/metric/metric.py +55 -0
  26. evalscope/api/metric/scorer.py +105 -0
  27. evalscope/api/mixin/__init__.py +2 -0
  28. evalscope/api/mixin/dataset_mixin.py +105 -0
  29. evalscope/api/mixin/llm_judge_mixin.py +168 -0
  30. evalscope/api/model/__init__.py +12 -0
  31. evalscope/api/model/generate_config.py +157 -0
  32. evalscope/api/model/model.py +383 -0
  33. evalscope/api/model/model_output.py +285 -0
  34. evalscope/api/registry.py +182 -0
  35. evalscope/api/tool/__init__.py +3 -0
  36. evalscope/api/tool/tool_call.py +101 -0
  37. evalscope/api/tool/tool_info.py +173 -0
  38. evalscope/api/tool/utils.py +64 -0
  39. evalscope/app/ui/app_ui.py +2 -1
  40. evalscope/app/ui/multi_model.py +50 -25
  41. evalscope/app/ui/single_model.py +23 -11
  42. evalscope/app/utils/data_utils.py +42 -26
  43. evalscope/app/utils/text_utils.py +0 -2
  44. evalscope/app/utils/visualization.py +9 -4
  45. evalscope/arguments.py +6 -7
  46. evalscope/backend/opencompass/api_meta_template.py +2 -1
  47. evalscope/backend/opencompass/backend_manager.py +6 -3
  48. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
  49. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  50. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  51. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  52. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  53. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
  54. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
  55. evalscope/backend/rag_eval/utils/embedding.py +2 -1
  56. evalscope/backend/rag_eval/utils/llm.py +13 -12
  57. evalscope/benchmarks/__init__.py +0 -2
  58. evalscope/benchmarks/aigc/i2i/__init__.py +0 -0
  59. evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +44 -0
  60. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +53 -55
  61. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +41 -46
  62. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +29 -45
  63. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +34 -44
  64. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +16 -27
  65. evalscope/benchmarks/aime/aime24_adapter.py +38 -40
  66. evalscope/benchmarks/aime/aime25_adapter.py +34 -40
  67. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
  68. evalscope/benchmarks/arc/arc_adapter.py +34 -147
  69. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
  70. evalscope/benchmarks/arena_hard/utils.py +37 -1
  71. evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
  72. evalscope/benchmarks/bfcl/bfcl_adapter.py +181 -160
  73. evalscope/benchmarks/bfcl/generation.py +222 -0
  74. evalscope/benchmarks/ceval/ceval_adapter.py +94 -162
  75. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
  76. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
  77. evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
  78. evalscope/benchmarks/data_collection/data_collection_adapter.py +183 -45
  79. evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
  80. evalscope/benchmarks/docmath/utils.py +4 -5
  81. evalscope/benchmarks/drop/drop_adapter.py +88 -40
  82. evalscope/benchmarks/frames/frames_adapter.py +135 -52
  83. evalscope/benchmarks/general_arena/general_arena_adapter.py +136 -98
  84. evalscope/benchmarks/general_arena/utils.py +23 -27
  85. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
  86. evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
  87. evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
  88. evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
  89. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
  90. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
  91. evalscope/benchmarks/hle/hle_adapter.py +127 -93
  92. evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
  93. evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
  94. evalscope/benchmarks/ifeval/instructions.py +109 -64
  95. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  96. evalscope/benchmarks/ifeval/utils.py +6 -7
  97. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
  98. evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
  99. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
  100. evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
  101. evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
  102. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
  103. evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
  104. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
  105. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
  106. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
  107. evalscope/benchmarks/musr/musr_adapter.py +33 -64
  108. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +192 -152
  109. evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
  110. evalscope/benchmarks/race/race_adapter.py +33 -119
  111. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
  112. evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
  113. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
  114. evalscope/benchmarks/super_gpqa/utils.py +2 -1
  115. evalscope/benchmarks/tau_bench/generation.py +147 -0
  116. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +112 -54
  117. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
  118. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
  119. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -265
  120. evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
  121. evalscope/cli/cli.py +2 -0
  122. evalscope/cli/start_server.py +6 -3
  123. evalscope/collections/__init__.py +2 -10
  124. evalscope/collections/sampler.py +10 -10
  125. evalscope/collections/schema.py +13 -11
  126. evalscope/config.py +95 -54
  127. evalscope/constants.py +29 -61
  128. evalscope/evaluator/__init__.py +1 -1
  129. evalscope/evaluator/evaluator.py +277 -423
  130. evalscope/filters/__init__.py +2 -0
  131. evalscope/filters/extraction.py +126 -0
  132. evalscope/filters/selection.py +57 -0
  133. evalscope/metrics/__init__.py +13 -13
  134. evalscope/metrics/llm_judge.py +32 -30
  135. evalscope/metrics/math_parser.py +27 -22
  136. evalscope/metrics/metric.py +307 -0
  137. evalscope/metrics/metrics.py +22 -18
  138. evalscope/metrics/t2v_metrics/__init__.py +0 -52
  139. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
  140. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
  141. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
  142. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
  143. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
  144. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
  145. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
  146. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
  147. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
  148. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
  149. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
  150. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
  151. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
  152. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
  153. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
  154. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
  155. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
  156. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
  157. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
  158. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
  159. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
  160. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
  161. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
  162. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
  163. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
  164. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
  165. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
  166. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
  167. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
  168. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
  169. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
  170. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
  171. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
  172. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
  173. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
  174. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
  175. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
  176. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
  177. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
  178. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
  179. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
  180. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
  181. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
  182. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
  183. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
  184. evalscope/models/__init__.py +6 -29
  185. evalscope/models/mockllm.py +65 -0
  186. evalscope/models/model_apis.py +47 -0
  187. evalscope/models/modelscope.py +455 -0
  188. evalscope/models/openai_compatible.py +123 -0
  189. evalscope/models/text2image_model.py +124 -0
  190. evalscope/models/utils/openai.py +698 -0
  191. evalscope/perf/benchmark.py +2 -1
  192. evalscope/perf/http_client.py +4 -2
  193. evalscope/perf/plugin/api/custom_api.py +5 -4
  194. evalscope/perf/plugin/api/openai_api.py +11 -9
  195. evalscope/perf/plugin/datasets/custom.py +2 -1
  196. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  197. evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
  198. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  199. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  200. evalscope/perf/plugin/datasets/openqa.py +4 -2
  201. evalscope/perf/utils/benchmark_util.py +7 -5
  202. evalscope/perf/utils/db_util.py +9 -6
  203. evalscope/perf/utils/local_server.py +8 -3
  204. evalscope/perf/utils/rich_display.py +16 -10
  205. evalscope/report/__init__.py +2 -2
  206. evalscope/report/combinator.py +18 -12
  207. evalscope/report/generator.py +101 -6
  208. evalscope/report/{utils.py → report.py} +8 -6
  209. evalscope/run.py +26 -44
  210. evalscope/summarizer.py +1 -1
  211. evalscope/utils/__init__.py +21 -2
  212. evalscope/utils/chat_service.py +2 -1
  213. evalscope/utils/deprecation_utils.py +12 -1
  214. evalscope/utils/function_utils.py +29 -0
  215. evalscope/utils/io_utils.py +100 -5
  216. evalscope/utils/json_schema.py +208 -0
  217. evalscope/utils/logger.py +51 -12
  218. evalscope/utils/model_utils.py +10 -7
  219. evalscope/utils/multi_choices.py +271 -0
  220. evalscope/utils/url_utils.py +65 -0
  221. evalscope/version.py +2 -2
  222. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/METADATA +98 -49
  223. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/RECORD +234 -216
  224. tests/aigc/test_t2i.py +22 -4
  225. tests/benchmark/__init__.py +1 -0
  226. tests/benchmark/test_eval.py +386 -0
  227. tests/cli/test_all.py +3 -5
  228. tests/cli/test_collection.py +13 -4
  229. tests/cli/test_custom.py +22 -15
  230. tests/rag/test_clip_benchmark.py +1 -0
  231. evalscope/benchmarks/aigc/t2i/base.py +0 -56
  232. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  233. evalscope/benchmarks/benchmark.py +0 -81
  234. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  235. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  236. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  237. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  238. evalscope/benchmarks/data_adapter.py +0 -528
  239. evalscope/benchmarks/filters.py +0 -59
  240. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  241. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  242. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  243. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  244. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  245. evalscope/benchmarks/process_bench/critique_template.txt +0 -13
  246. evalscope/benchmarks/race/race.py +0 -104
  247. evalscope/benchmarks/race/samples.jsonl +0 -5
  248. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
  249. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  250. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  251. evalscope/benchmarks/utils.py +0 -60
  252. evalscope/collections/evaluator.py +0 -375
  253. evalscope/metrics/completion_parsers.py +0 -227
  254. evalscope/metrics/named_metrics.py +0 -55
  255. evalscope/models/adapters/__init__.py +0 -14
  256. evalscope/models/adapters/base_adapter.py +0 -84
  257. evalscope/models/adapters/bfcl_adapter.py +0 -246
  258. evalscope/models/adapters/chat_adapter.py +0 -207
  259. evalscope/models/adapters/choice_adapter.py +0 -222
  260. evalscope/models/adapters/custom_adapter.py +0 -71
  261. evalscope/models/adapters/server_adapter.py +0 -236
  262. evalscope/models/adapters/t2i_adapter.py +0 -79
  263. evalscope/models/adapters/tau_bench_adapter.py +0 -189
  264. evalscope/models/custom/__init__.py +0 -4
  265. evalscope/models/custom/custom_model.py +0 -50
  266. evalscope/models/custom/dummy_model.py +0 -99
  267. evalscope/models/local_model.py +0 -128
  268. evalscope/models/register.py +0 -41
  269. tests/cli/test_run.py +0 -489
  270. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/LICENSE +0 -0
  271. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/WHEEL +0 -0
  272. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/entry_points.txt +0 -0
  273. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/top_level.txt +0 -0
evalscope/api/model/model.py (new file)
@@ -0,0 +1,383 @@
+ import abc
+ from pydantic_core import to_jsonable_python
+ from typing import TYPE_CHECKING, Any, Dict, Generator, List, Literal, Optional, Sequence, Union
+
+ from evalscope.api.messages import ChatMessage, ChatMessageAssistant, ChatMessageSystem, ChatMessageUser
+ from evalscope.api.registry import get_model_api
+ from evalscope.api.tool import ToolChoice, ToolFunction, ToolInfo
+ from evalscope.utils import get_logger
+ from evalscope.utils.function_utils import thread_safe
+ from .generate_config import GenerateConfig
+ from .model_output import ModelOutput
+
+ if TYPE_CHECKING:
+     from evalscope.config import TaskConfig
+
+ logger = get_logger()
+
+
+ class ModelAPI(abc.ABC):
+     """Model API provider."""
+
+     def __init__(
+         self,
+         model_name: str,
+         base_url: Optional[str] = None,
+         api_key: Optional[str] = None,
+         config: GenerateConfig = GenerateConfig(),
+         **kwargs
+     ) -> None:
+         """Create a model API provider.
+
+         Args:
+             model_name (str): Model name.
+             base_url (str | None): Alternate base URL for model.
+             api_key (str | None): API key for model.
+             config (GenerateConfig): Model configuration.
+         """
+         self.model_name = model_name
+         self.base_url = base_url
+         self.api_key = api_key
+         self.config = config
+
+     @abc.abstractmethod
+     def generate(
+         self,
+         input: List[ChatMessage],
+         tools: List[ToolInfo],
+         tool_choice: ToolChoice,
+         config: GenerateConfig,
+     ) -> ModelOutput:
+         """Generate output from the model.
+
+         Args:
+             input (list[ChatMessage]): Chat message input.
+             tools (list[ToolInfo]): Tools available for the
+                 model to call.
+             tool_choice (ToolChoice): Directives to the model
+                 as to which tools to prefer.
+             config (GenerateConfig): Model configuration.
+
+         Returns:
+             ModelOutput
+         """
+         ...
+
+     def batch_generate(
+         self,
+         inputs: List[List[ChatMessage]],
+         tools: List[List[ToolInfo]],
+         tool_choices: List[ToolChoice],
+         configs: List[GenerateConfig],
+     ) -> Generator[ModelOutput, None, None]:
+         """Default batch implementation using individual generate calls.
+
+         ModelAPI implementations can override this for optimized batch processing.
+
+         Args:
+             inputs: List of preprocessed chat message inputs.
+             tools: List of tools for each input.
+             tool_choices: List of tool choices for each input.
+             configs: List of configs for each input.
+
+         Returns:
+             Generator yielding ModelOutput for each input.
+         """
+         from concurrent.futures import ThreadPoolExecutor
+
+         def single_generate(args):
+             input_msgs, input_tools, tool_choice, config = args
+             return self.generate(input_msgs, input_tools, tool_choice, config)
+
+         with ThreadPoolExecutor(max_workers=self.config.batch_size) as executor:
+             futures = []
+             for input_msgs, input_tools, tool_choice, config in zip(inputs, tools, tool_choices, configs):
+                 future = executor.submit(single_generate, (input_msgs, input_tools, tool_choice, config))
+                 futures.append(future)
+
+             for future in futures:
+                 yield future.result()
+
+     def supports_batch(self) -> bool:
+         """Whether this ModelAPI supports optimized batch processing."""
+         return False
+
+     def max_tokens(self) -> Optional[int]:
+         """Default max_tokens."""
+         return None
+
+     def max_tokens_for_config(self, config: GenerateConfig) -> Optional[int]:
+         """Default max_tokens for a given config.
+
+         Args:
+             config: Generation config.
+
+         Returns:
+             Default maximum tokens for specified configuration.
+         """
+         return None
+
+     def tools_required(self) -> bool:
+         """Any tool use in a message stream means that tools must be passed."""
+         return False
+
+     def tool_result_images(self) -> bool:
+         """Tool results can contain images."""
+         return False
+
+
+ class Model:
+     """Model interface.
+
+     Use `get_model()` to get an instance of a model.
+     """
+
+     api: ModelAPI
+     """Model API."""
+
+     config: GenerateConfig
+     """Generation config."""
+
+     def __init__(self, api: ModelAPI, config: GenerateConfig, model_args: Dict[str, Any] = {}) -> None:
+         """Create a model.
+
+         Args:
+             api: Model API provider.
+             config: Model configuration.
+             model_args: Optional model args.
+         """
+         self.api = api
+         self.config = config
+         self.model_args = model_args
+         self._role: Optional[str] = None
+
+     @property
+     def name(self) -> str:
+         """Model name or path to model."""
+         return self.api.model_name
+
+     @property
+     def role(self) -> Optional[str]:
+         """Model role."""
+         return self._role
+
+     @role.setter
+     def role(self, role: str) -> None:
+         self._role = role
+
+     def __str__(self) -> str:
+         return f'Model(name={self.name}, role={self.role})'
+
+     def generate(
+         self,
+         input: Union[str, List[ChatMessage]],
+         tools: Optional[Sequence[ToolInfo]] = None,
+         tool_choice: Optional[ToolChoice] = None,
+         config: Optional[GenerateConfig] = None,
+     ) -> ModelOutput:
+         """Generate output from the model.
+
+         Args:
+             input: Chat message input (if a `str` is passed it is converted
+                 to a `ChatMessageUser`).
+             tools: Tools available for the model to call.
+             tool_choice: Directives to the model as to which tools to prefer.
+             config: Model configuration.
+
+         Returns:
+             ModelOutput
+         """
+         processed_input, processed_tools, processed_tool_choice, processed_config = self._preprocess_input(
+             input, tools, tool_choice, config
+         )
+
+         # Call the model's generate method
+         output = self.api.generate(
+             input=processed_input,
+             tools=processed_tools,
+             tool_choice=processed_tool_choice,
+             config=processed_config,
+         )
+
+         return output
+
+     def batch_generate(
+         self,
+         inputs: List[List[ChatMessage]],
+         tools: List[List[ToolInfo]],
+         tool_choices: List[ToolChoice],
+         configs: List[GenerateConfig],
+     ) -> Generator[ModelOutput, None, None]:
+         """Generate output from the model for a batch of inputs.
+
+         Args:
+             inputs (List[List[ChatMessage]]): Batch of chat message inputs.
+             tools (List[List[ToolInfo]]): Batch of tools for each input.
+             tool_choices (List[ToolChoice]): Batch of tool choices for each input.
+             configs (List[GenerateConfig]): Batch of configs for each input.
+         """
+         preprocessed_data = []
+
+         for input_item, input_tools, input_tool_choice, input_config in zip(inputs, tools, tool_choices, configs):
+             processed_input, processed_tools, processed_tool_choice, processed_config = self._preprocess_input(
+                 input=input_item, tools=input_tools, tool_choice=input_tool_choice, config=input_config
+             )
+             preprocessed_data.append((processed_input, processed_tools, processed_tool_choice, processed_config))
+
+         # check if ModelAPI supports batch processing
+         if self.api.supports_batch() and len(preprocessed_data) > 1:
+             # use the batch_generate method of the ModelAPI
+             inputs, tools, tool_choices, configs = zip(*preprocessed_data)
+             batch_results = self.api.batch_generate(
+                 inputs=list(inputs), tools=list(tools), tool_choices=list(tool_choices), configs=list(configs)
+             )
+             for result in batch_results:
+                 yield result
+         else:
+             # fall back to processing each input individually
+             for input_msgs, input_tools, tool_choice, config in preprocessed_data:
+                 result = self.api.generate(input_msgs, input_tools, tool_choice, config)
+                 yield result
+
+     def _preprocess_input(
+         self,
+         input: Union[str, List[ChatMessage]],
+         tools: Optional[Sequence[ToolInfo]] = None,
+         tool_choice: Optional[ToolChoice] = None,
+         config: Optional[GenerateConfig] = None,
+     ) -> tuple[List[ChatMessage], List[ToolInfo], ToolChoice, GenerateConfig]:
+         """Preprocess input for generate."""
+
+         # merge passed config
+         if config is not None:
+             config = self.config.merge(config)
+         else:
+             config = self.config.model_copy(deep=True)
+
+         # provide max_tokens from the model api if required
+         if config.max_tokens is None:
+             config.max_tokens = self.api.max_tokens_for_config(config)
+             if config.max_tokens is None:
+                 config.max_tokens = self.api.max_tokens()
+
+         # normalize input to chat
+         if isinstance(input, str):
+             input = [ChatMessageUser(content=input)]
+
+         # handle tools and tool_choice
+         tool_choice = tool_choice if tool_choice is not None else 'auto'
+         tools_info = list(tools) if tools is not None else []
+
+         if isinstance(tool_choice, ToolFunction):
+             tools_info = [tool for tool in tools_info if tool.name == tool_choice.name]
+
+         if tool_choice == 'none' or len(tools_info) == 0:
+             if not self.api.tools_required():
+                 tools_info = []
+             tool_choice = 'none'
+
+         return input, tools_info, tool_choice, config
+
+
+ class ModelCache:
+     _models: Dict[str, 'Model'] = {}
+
+     @classmethod
+     def get(cls, key: str) -> Optional['Model']:
+         return cls._models.get(key, None)
+
+     @classmethod
+     def set(cls, key: str, model: 'Model') -> None:
+         cls._models[key] = model
+
+
+ def get_model_with_task_config(task_config: 'TaskConfig') -> Model:
+     """Get an instance of a model with the specified task configuration.
+
+     Args:
+         task_config (TaskConfig): Task configuration.
+
+     Returns:
+         Model: An instance of the model.
+     """
+     model = task_config.model
+     eval_type = task_config.eval_type
+     base_url = task_config.api_url
+     api_key = task_config.api_key
+     config = task_config.generation_config
+     model_args = task_config.model_args or {}
+
+     return get_model(
+         model=model, eval_type=eval_type, base_url=base_url, api_key=api_key, config=config, model_args=model_args
+     )
+
+
+ @thread_safe
+ def get_model(
+     model: str,
+     eval_type: str,
+     base_url: Optional[str] = None,
+     api_key: Optional[str] = None,
+     config: GenerateConfig = GenerateConfig(),
+     model_args: dict = {},
+     role: Optional[str] = None,
+     memoize: bool = True,
+ ) -> Model:
+     """Get an instance of a model.
+
+     Calls to get_model() are memoized (i.e. a call with the same arguments
+     will return an existing instance of the model rather than creating a
+     new one). You can disable this with `memoize=False`.
+
+     Args:
+         model (str): Model name.
+         eval_type (str): Eval type used to look up the `ModelAPI` in the registry.
+         base_url (str | None): Alternate base URL for the model.
+         api_key (str | None): API key for the model.
+         config (GenerateConfig): Model configuration.
+         model_args (dict): Additional args passed to the `ModelAPI` constructor.
+         role (str | None): Optional role to assign to the model.
+         memoize (bool): Whether to memoize the model instance.
+
+     Returns:
+         Model instance.
+     """
+
+     # start with seeing if a model was passed
+     if isinstance(model, Model):
+         return model
+
+     # see if we can return a memoized model instance
+     # (exclude mockllm since custom_outputs is an infinite generator)
+     model_cache_key: str = ''
+     if eval_type.startswith('mock_llm'):
+         memoize = False
+     if memoize:
+         model_cache_key = (
+             model + str(role) + config.model_dump_json(exclude_none=True) + str(base_url) + str(api_key)
+             + str(to_jsonable_python(model_args, fallback=lambda _: None))
+         )
+         cached = ModelCache.get(model_cache_key)
+         if cached is not None:
+             return cached
+
+     logger.info(
+         f'Creating model {model} with eval_type={eval_type} '
+         f'base_url={base_url}, api_key={api_key}, config={config}, model_args={model_args}'
+     )
+
+     # find a matching model type
+     modelapi_type = get_model_api(eval_type)
+
+     modelapi_instance = modelapi_type(
+         model_name=model,
+         base_url=base_url,
+         api_key=api_key,
+         config=config,
+         **model_args,
+     )
+     m = Model(modelapi_instance, config, model_args)
+     if role is not None:
+         m.role = role
+     if memoize:
+         ModelCache.set(model_cache_key, m)
+     return m
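For orientation, here is a minimal usage sketch of the new model API. It assumes that `get_model` and `GenerateConfig` are re-exported from `evalscope.api.model` (plausible given the new `evalscope/api/model/__init__.py` above) and that an OpenAI-compatible provider is registered under the eval type 'openai_api'; neither the actual registry keys nor the `GenerateConfig` fields are shown in this diff.

    # Hedged sketch: 'openai_api', the base_url, and the re-export path are
    # assumptions, not confirmed by this diff.
    from evalscope.api.model import GenerateConfig, get_model

    model = get_model(
        model='qwen2.5-7b-instruct',
        eval_type='openai_api',  # assumed registry key for an OpenAI-compatible ModelAPI
        base_url='http://localhost:8000/v1',
        api_key='EMPTY',
        config=GenerateConfig(),
    )

    # A plain string is normalized to [ChatMessageUser(content=...)] by
    # Model._preprocess_input before reaching ModelAPI.generate.
    output = model.generate('What is 2 + 2?')
    print(output.completion)

    # get_model is memoized: calling again with identical arguments
    # returns the cached Model instance rather than building a new one.
    same = get_model(
        model='qwen2.5-7b-instruct',
        eval_type='openai_api',
        base_url='http://localhost:8000/v1',
        api_key='EMPTY',
        config=GenerateConfig(),
    )
    assert same is model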
evalscope/api/model/model_output.py (new file)
@@ -0,0 +1,285 @@
+ import uuid
+ from pydantic import BaseModel, Field, JsonValue, model_validator
+ from typing import Any, Dict, List, Literal, Optional, Type, Union
+
+ from evalscope.api.messages import ChatMessageAssistant, Content
+ from evalscope.api.tool import ToolCall, ToolFunction
+
+
+ class ModelUsage(BaseModel):
+     """Token usage for completion."""
+
+     input_tokens: int = Field(default=0)
+     """Total input tokens used."""
+
+     output_tokens: int = Field(default=0)
+     """Total output tokens used."""
+
+     total_tokens: int = Field(default=0)
+     """Total tokens used."""
+
+     input_tokens_cache_write: Optional[int] = Field(default=None)
+     """Number of tokens written to the cache."""
+
+     input_tokens_cache_read: Optional[int] = Field(default=None)
+     """Number of tokens retrieved from the cache."""
+
+     reasoning_tokens: Optional[int] = Field(default=None)
+     """Number of tokens used for reasoning."""
+
+     def __add__(self, other: 'ModelUsage') -> 'ModelUsage':
+
+         def optional_sum(a: Optional[int], b: Optional[int]) -> Optional[int]:
+             if a is not None and b is not None:
+                 return a + b
+             if a is not None:
+                 return a
+             if b is not None:
+                 return b
+             return None
+
+         return ModelUsage(
+             input_tokens=self.input_tokens + other.input_tokens,
+             output_tokens=self.output_tokens + other.output_tokens,
+             total_tokens=self.total_tokens + other.total_tokens,
+             input_tokens_cache_write=optional_sum(self.input_tokens_cache_write, other.input_tokens_cache_write),
+             input_tokens_cache_read=optional_sum(self.input_tokens_cache_read, other.input_tokens_cache_read),
+             reasoning_tokens=optional_sum(self.reasoning_tokens, other.reasoning_tokens),
+         )
+
+
+ StopReason = Literal[
+     'stop',
+     'max_tokens',
+     'model_length',
+     'tool_calls',
+     'content_filter',
+     'unknown',
+ ]
+ """Reason that the model stopped or failed to generate."""
+
+
+ class TopLogprob(BaseModel):
+     """List of the most likely tokens and their log probability, at this token position."""
+
+     token: str
+     """The top-kth token represented as a string."""
+
+     logprob: float
+     """The log probability value of the model for the top-kth token."""
+
+     bytes: Optional[List[int]] = Field(default=None)
+     """The top-kth token represented as a byte array (a list of integers)."""
+
+
+ class Logprob(BaseModel):
+     """Log probability for a token."""
+
+     token: str
+     """The predicted token represented as a string."""
+
+     logprob: float
+     """The log probability value of the model for the predicted token."""
+
+     bytes: Optional[List[int]] = Field(default=None)
+     """The predicted token represented as a byte array (a list of integers)."""
+
+     top_logprobs: Optional[List[TopLogprob]] = Field(default=None)
+     """If the `top_logprobs` argument is greater than 0, this will contain an ordered list of the top K most likely tokens and their log probabilities."""  # noqa: E501
+
+
+ class Logprobs(BaseModel):
+     """Log probability information for a completion choice."""
+
+     content: List[Logprob]
+     """A (num_generated_tokens,) length list containing the individual log probabilities for each generated token."""
+
+
+ class ChatCompletionChoice(BaseModel):
+     """Choice generated for completion."""
+
+     message: ChatMessageAssistant
+     """Assistant message."""
+
+     stop_reason: StopReason = Field(default='unknown')
+     """Reason that the model stopped generating."""
+
+     logprobs: Optional[Logprobs] = Field(default=None)
+     """Logprobs."""
+
+     @model_validator(mode='before')
+     @classmethod
+     def migrate_stop_reason(cls: Type['ChatCompletionChoice'], values: Dict[str, Any]) -> Dict[str, Any]:
+         if 'stop_reason' in values:
+             stop_reason = values['stop_reason']
+             if stop_reason == 'length':
+                 values['stop_reason'] = 'max_tokens'
+
+         return values
+
+     @classmethod
+     def from_content(cls, content: Union[str, List[Content]]) -> 'ChatCompletionChoice':
+         """Create a ChatCompletionChoice from content string."""
+         return cls(
+             message=ChatMessageAssistant(content=content),
+             stop_reason='stop',
+         )
+
+
+ class ModelOutput(BaseModel):
+     """Output from model generation."""
+
+     model: str = Field(default_factory=str)
+     """Model used for generation."""
+
+     choices: List[ChatCompletionChoice] = Field(default=[])
+     """Completion choices."""
+
+     usage: Optional[ModelUsage] = Field(default=None)
+     """Model token usage."""
+
+     time: Optional[float] = Field(default=None)
+     """Time elapsed (in seconds) for call to generate."""
+
+     metadata: Optional[Dict[str, Any]] = Field(default=None)
+     """Additional metadata associated with model output."""
+
+     error: Optional[str] = Field(default=None)
+     """Error message in the case of content moderation refusals."""
+
+     @property
+     def empty(self) -> bool:
+         return len(self.choices) == 0
+
+     @property
+     def stop_reason(self) -> StopReason:
+         """First message stop reason."""
+         return self.choices[0].stop_reason
+
+     @property
+     def message(self) -> ChatMessageAssistant:
+         """First message choice."""
+         return self.choices[0].message
+
+     @property
+     def completion(self) -> str:
+         """Text of the first message choice."""
+         if len(self.choices) > 0:
+             return self.choices[0].message.text
+         else:
+             return '\n'.join(choice.message.text for choice in self.choices)
+
+     @completion.setter
+     def completion(self, completion: str) -> None:
+         """Set the text of the first message choice.
+
+         Args:
+             completion (str): Text for first message.
+         """
+         if len(self.choices) > 0:
+             self.choices[0].message.text = completion
+         else:
+             self.choices.append(
+                 ChatCompletionChoice(
+                     message=ChatMessageAssistant(content=completion, model=self.model),
+                     stop_reason='stop',
+                 )
+             )
+
+     @property
+     def completions(self) -> List[str]:
+         """List of all message choices text."""
+         return [choice.message.text for choice in self.choices]
+
+     @staticmethod
+     def from_content(
+         model: str,
+         content: Union[str, List[Content]],
+         stop_reason: StopReason = 'stop',
+         error: Optional[str] = None,
+     ) -> 'ModelOutput':
+         """Create ModelOutput from simple text content.
+
+         Args:
+             model: Model name.
+             content: Text content from generation.
+             stop_reason: Stop reason for generation.
+             error: Error message.
+         """
+         return ModelOutput(
+             model=model,
+             choices=[
+                 ChatCompletionChoice(
+                     message=ChatMessageAssistant(content=content, model=model, source='generate'),
+                     stop_reason=stop_reason,
+                 )
+             ],
+             error=error,
+         )
+
+     @staticmethod
+     def for_tool_call(
+         model: str,
+         tool_name: str,
+         tool_arguments: Dict[str, Any],
+         internal: Optional[JsonValue] = None,
+         tool_call_id: Optional[str] = None,
+         content: Optional[str] = None,
+     ) -> 'ModelOutput':
+         """
+         Returns a ModelOutput for requesting a tool call.
+
+         Args:
+             model: Model name.
+             tool_name: The name of the tool.
+             tool_arguments: The arguments passed to the tool.
+             internal: The model's internal info for the tool (if any).
+             tool_call_id: Optional ID for the tool call. Defaults to a random UUID.
+             content: Optional content to include in the message. Defaults to "tool call for tool {tool_name}".
+
+         Returns:
+             A ModelOutput corresponding to the tool call.
+         """
+         if content is None:
+             content = f'tool call for tool {tool_name}'
+
+         if tool_call_id is None:
+             tool_call_id = f'for_tool_call_{uuid.uuid4()}'
+
+         return ModelOutput(
+             model=model,
+             choices=[
+                 ChatCompletionChoice(
+                     message=ChatMessageAssistant(
+                         content=content,
+                         model=model,
+                         source='generate',
+                         tool_calls=[
+                             ToolCall(
+                                 id=tool_call_id,
+                                 internal=internal,
+                                 function=ToolFunction(
+                                     name=tool_name,
+                                     arguments=tool_arguments,
+                                 )
+                             )
+                         ],
+                     ),
+                     stop_reason='tool_calls',
+                 )
+             ],
+         )
+
+
+ def as_stop_reason(reason: Optional[str]) -> StopReason:
+     """Encode common reason strings into standard StopReason."""
+     if reason in ['stop', 'eos']:
+         return 'stop'
+     elif reason == 'length':
+         return 'max_tokens'
+     elif reason in ['tool_calls', 'function_call']:
+         return 'tool_calls'
+     elif reason in ['content_filter', 'model_length', 'max_tokens']:
+         return reason
+     else:
+         return 'unknown'
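A short sketch of how these output types compose, based only on the definitions above. `ChatMessageAssistant` comes from `evalscope/api/messages` (also added in this release); its `.text` and `.tool_calls` accessors are assumed from how this module uses them.

    from evalscope.api.model.model_output import ModelOutput, ModelUsage, as_stop_reason

    # Plain-text output: stop_reason defaults to 'stop' in from_content.
    output = ModelOutput.from_content(model='my-model', content='4')
    print(output.completion)   # '4' (text of the first choice)
    print(output.stop_reason)  # 'stop'

    # Output that requests a tool call (stop_reason becomes 'tool_calls').
    tool_output = ModelOutput.for_tool_call(
        model='my-model',
        tool_name='add',
        tool_arguments={'a': 2, 'b': 2},
    )
    print(tool_output.message.tool_calls[0].function.name)  # 'add'

    # ModelUsage supports '+'; optional fields are summed only when present.
    total = (
        ModelUsage(input_tokens=10, output_tokens=5, total_tokens=15)
        + ModelUsage(input_tokens=3, output_tokens=2, total_tokens=5, reasoning_tokens=1)
    )
    print(total.total_tokens)      # 20
    print(total.reasoning_tokens)  # 1 (None is skipped rather than treated as 0)

    # Provider-specific finish reasons normalize through as_stop_reason.
    print(as_stop_reason('length'))         # 'max_tokens'
    print(as_stop_reason('function_call'))  # 'tool_calls'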