crfm-helm 0.5.5__py3-none-any.whl → 0.5.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (268)
  1. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/METADATA +74 -53
  2. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/RECORD +262 -182
  3. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +3 -3
  5. helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
  6. helm/benchmark/annotation/air_bench_annotator.py +2 -2
  7. helm/benchmark/annotation/bigcodebench_annotator.py +3 -3
  8. helm/benchmark/annotation/bird_sql_annotator.py +2 -2
  9. helm/benchmark/annotation/chw_care_plan_annotator.py +7 -12
  10. helm/benchmark/annotation/ehr_sql_annotator.py +2 -2
  11. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +7 -7
  12. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  13. helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
  14. helm/benchmark/annotation/model_as_judge.py +12 -16
  15. helm/benchmark/annotation/omni_math_annotator.py +13 -14
  16. helm/benchmark/annotation/wildbench_annotator.py +9 -9
  17. helm/benchmark/executor.py +11 -12
  18. helm/benchmark/metrics/aci_bench_metrics.py +9 -29
  19. helm/benchmark/metrics/bias_word_lists.py +1 -1
  20. helm/benchmark/metrics/chw_care_plan_metrics.py +10 -30
  21. helm/benchmark/metrics/classification_metrics.py +3 -3
  22. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  23. helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
  24. helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
  25. helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
  26. helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
  27. helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
  28. helm/benchmark/metrics/comet_metric.py +1 -1
  29. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +2 -2
  30. helm/benchmark/metrics/copyright_metrics.py +1 -1
  31. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
  32. helm/benchmark/metrics/dischargeme_metrics.py +9 -29
  33. helm/benchmark/metrics/efficiency_metrics.py +3 -3
  34. helm/benchmark/metrics/evaluate_reference_metrics.py +1 -1
  35. helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
  36. helm/benchmark/metrics/ifeval_metrics.py +2 -2
  37. helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
  38. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
  39. helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
  40. helm/benchmark/metrics/llm_jury_metrics.py +46 -0
  41. helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
  42. helm/benchmark/metrics/lmkt_metrics.py +47 -0
  43. helm/benchmark/metrics/med_dialog_metrics.py +9 -29
  44. helm/benchmark/metrics/medalign_metrics.py +9 -29
  45. helm/benchmark/metrics/medi_qa_metrics.py +9 -29
  46. helm/benchmark/metrics/medication_qa_metrics.py +10 -30
  47. helm/benchmark/metrics/melt_bias_metric.py +234 -0
  48. helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
  49. helm/benchmark/metrics/melt_metric_specs.py +43 -0
  50. helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
  51. helm/benchmark/metrics/mental_health_metrics.py +9 -29
  52. helm/benchmark/metrics/metric_service.py +11 -11
  53. helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
  54. helm/benchmark/metrics/mimic_rrs_metrics.py +9 -29
  55. helm/benchmark/metrics/mtsamples_procedures_metrics.py +9 -29
  56. helm/benchmark/metrics/mtsamples_replicate_metrics.py +9 -29
  57. helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
  58. helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
  59. helm/benchmark/metrics/starr_patient_instructions_metrics.py +9 -29
  60. helm/benchmark/metrics/summac/model_summac.py +2 -3
  61. helm/benchmark/metrics/summarization_metrics.py +2 -1
  62. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +2 -2
  63. helm/benchmark/metrics/toxicity_metrics.py +2 -2
  64. helm/benchmark/metrics/unitxt_metrics.py +3 -4
  65. helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
  66. helm/benchmark/metrics/vision_language/image_utils.py +2 -2
  67. helm/benchmark/model_deployment_registry.py +16 -26
  68. helm/benchmark/presentation/contamination.py +3 -3
  69. helm/benchmark/presentation/create_plots.py +43 -13
  70. helm/benchmark/presentation/run_display.py +13 -0
  71. helm/benchmark/presentation/schema.py +7 -1
  72. helm/benchmark/presentation/summarize.py +84 -61
  73. helm/benchmark/presentation/test_create_plots.py +4 -1
  74. helm/benchmark/reeval_run.py +3 -4
  75. helm/benchmark/reeval_runner.py +3 -3
  76. helm/benchmark/run.py +84 -73
  77. helm/benchmark/run_expander.py +12 -1
  78. helm/benchmark/run_spec_factory.py +7 -6
  79. helm/benchmark/run_specs/arabic_run_specs.py +73 -0
  80. helm/benchmark/run_specs/audio_run_specs.py +52 -8
  81. helm/benchmark/run_specs/bluex_run_specs.py +40 -0
  82. helm/benchmark/run_specs/classic_run_specs.py +0 -53
  83. helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
  84. helm/benchmark/run_specs/enterprise_run_specs.py +20 -0
  85. helm/benchmark/run_specs/experimental_run_specs.py +31 -1
  86. helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
  87. helm/benchmark/run_specs/heim_run_specs.py +3 -1
  88. helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
  89. helm/benchmark/run_specs/long_context_run_specs.py +114 -15
  90. helm/benchmark/run_specs/medhelm_run_specs.py +146 -41
  91. helm/benchmark/run_specs/melt_run_specs.py +783 -0
  92. helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
  93. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +163 -0
  94. helm/benchmark/run_specs/vlm_run_specs.py +28 -0
  95. helm/benchmark/runner.py +5 -5
  96. helm/benchmark/scenarios/aci_bench_scenario.py +7 -1
  97. helm/benchmark/scenarios/alghafa_scenario.py +126 -0
  98. helm/benchmark/scenarios/arabic_mmlu_scenario.py +78 -0
  99. helm/benchmark/scenarios/aratrust_scenario.py +76 -0
  100. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +3 -1
  101. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +5 -5
  102. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
  103. helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
  104. helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
  105. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +104 -0
  106. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +99 -0
  107. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +118 -0
  108. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +86 -0
  109. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +117 -0
  110. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +15 -1
  111. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +1 -2
  112. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +2 -2
  113. helm/benchmark/scenarios/bluex_scenario.py +66 -0
  114. helm/benchmark/scenarios/chw_care_plan_scenario.py +14 -13
  115. helm/benchmark/scenarios/clear_scenario.py +11 -7
  116. helm/benchmark/scenarios/cleva_scenario.py +1 -1
  117. helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
  118. helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
  119. helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
  120. helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
  121. helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
  122. helm/benchmark/scenarios/dischargeme_scenario.py +36 -21
  123. helm/benchmark/scenarios/ehr_sql_scenario.py +7 -1
  124. helm/benchmark/scenarios/ehrshot_scenario.py +28 -55
  125. helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
  126. helm/benchmark/scenarios/grammar.py +2 -2
  127. helm/benchmark/scenarios/headqa_scenario.py +6 -1
  128. helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
  129. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +90 -0
  130. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
  131. helm/benchmark/scenarios/{infinite_bench_sum_scenario.py → infinite_bench_en_sum_scenario.py} +10 -13
  132. helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
  133. helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
  134. helm/benchmark/scenarios/math_scenario.py +21 -20
  135. helm/benchmark/scenarios/med_dialog_scenario.py +6 -1
  136. helm/benchmark/scenarios/medalign_scenario.py +9 -3
  137. helm/benchmark/scenarios/medalign_scenario_helper.py +27 -130
  138. helm/benchmark/scenarios/medbullets_scenario.py +7 -2
  139. helm/benchmark/scenarios/medcalc_bench_scenario.py +4 -2
  140. helm/benchmark/scenarios/medec_scenario.py +6 -1
  141. helm/benchmark/scenarios/medhallu_scenario.py +7 -1
  142. helm/benchmark/scenarios/medi_qa_scenario.py +10 -4
  143. helm/benchmark/scenarios/medication_qa_scenario.py +7 -1
  144. helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
  145. helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
  146. helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
  147. helm/benchmark/scenarios/melt_scenarios.py +793 -0
  148. helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
  149. helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
  150. helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
  151. helm/benchmark/scenarios/mental_health_scenario.py +16 -5
  152. helm/benchmark/scenarios/mimic_bhc_scenario.py +13 -8
  153. helm/benchmark/scenarios/mimic_rrs_scenario.py +17 -8
  154. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +14 -8
  155. helm/benchmark/scenarios/mmlu_pro_scenario.py +1 -1
  156. helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
  157. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +5 -2
  158. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +3 -2
  159. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +11 -5
  160. helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
  161. helm/benchmark/scenarios/pubmed_qa_scenario.py +6 -1
  162. helm/benchmark/scenarios/race_based_med_scenario.py +18 -8
  163. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +2 -2
  164. helm/benchmark/scenarios/ruler_qa_scenarios.py +2 -2
  165. helm/benchmark/scenarios/seahelm_scenario.py +2 -2
  166. helm/benchmark/scenarios/shc_bmt_scenario.py +12 -6
  167. helm/benchmark/scenarios/shc_cdi_scenario.py +11 -6
  168. helm/benchmark/scenarios/shc_conf_scenario.py +12 -6
  169. helm/benchmark/scenarios/shc_ent_scenario.py +11 -6
  170. helm/benchmark/scenarios/shc_gip_scenario.py +13 -5
  171. helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
  172. helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
  173. helm/benchmark/scenarios/shc_ptbm_scenario.py +12 -7
  174. helm/benchmark/scenarios/shc_sei_scenario.py +12 -7
  175. helm/benchmark/scenarios/shc_sequoia_scenario.py +13 -5
  176. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +15 -8
  177. helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
  178. helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
  179. helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
  180. helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
  181. helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
  182. helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
  183. helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
  184. helm/benchmark/scenarios/truthful_qa_scenario.py +2 -1
  185. helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
  186. helm/benchmark/server.py +2 -1
  187. helm/benchmark/slurm_jobs.py +1 -2
  188. helm/benchmark/slurm_runner.py +8 -1
  189. helm/benchmark/static/schema_arabic.yaml +228 -0
  190. helm/benchmark/static/schema_audio.yaml +60 -49
  191. helm/benchmark/static/schema_classic.yaml +0 -17
  192. helm/benchmark/static/schema_enterprise.yaml +21 -0
  193. helm/benchmark/static/schema_long_context.yaml +81 -20
  194. helm/benchmark/static/schema_medhelm.yaml +272 -213
  195. helm/benchmark/static/schema_melt.yaml +1257 -0
  196. helm/benchmark/static/schema_slphelm.yaml +162 -0
  197. helm/benchmark/static/schema_vhelm.yaml +26 -26
  198. helm/benchmark/static/schema_video.yaml +219 -0
  199. helm/benchmark/static_build/assets/index-b9779128.css +1 -0
  200. helm/benchmark/static_build/assets/index-e439d5e1.js +10 -0
  201. helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
  202. helm/benchmark/static_build/assets/{tremor-9cefc3c5.js → tremor-38a10867.js} +1 -1
  203. helm/benchmark/static_build/index.html +4 -4
  204. helm/benchmark/window_services/encoder_decoder_window_service.py +3 -3
  205. helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
  206. helm/benchmark/window_services/test_utils.py +3 -4
  207. helm/benchmark/window_services/tokenizer_service.py +7 -8
  208. helm/clients/anthropic_client.py +69 -29
  209. helm/clients/audio_language/diva_llama_client.py +4 -2
  210. helm/clients/audio_language/qwen2_5_omni_client.py +209 -0
  211. helm/clients/audio_language/qwen2_audiolm_client.py +8 -6
  212. helm/clients/audio_language/qwen_audiolm_client.py +4 -2
  213. helm/clients/audio_language/test.py +62 -0
  214. helm/clients/bedrock_client.py +3 -1
  215. helm/clients/client.py +7 -7
  216. helm/clients/grok_client.py +36 -0
  217. helm/clients/huggingface_client.py +42 -3
  218. helm/clients/huggingface_pipeline_client.py +138 -0
  219. helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
  220. helm/clients/image_generation/dalle_mini/model/modeling.py +1 -1
  221. helm/clients/image_generation/dalle_mini/model/processor.py +1 -1
  222. helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
  223. helm/clients/openai_client.py +102 -55
  224. helm/clients/openai_responses_client.py +176 -0
  225. helm/clients/palmyra_client.py +2 -5
  226. helm/clients/reka_client.py +2 -2
  227. helm/clients/test_huggingface_client.py +3 -3
  228. helm/clients/together_client.py +31 -6
  229. helm/clients/vertexai_client.py +17 -9
  230. helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
  231. helm/clients/vision_language/huggingface_vlm_client.py +2 -2
  232. helm/clients/vision_language/idefics_client.py +6 -2
  233. helm/clients/vision_language/paligemma_client.py +2 -2
  234. helm/clients/vision_language/qwen2_vlm_client.py +66 -53
  235. helm/clients/vision_language/qwen_vlm_client.py +7 -5
  236. helm/clients/vllm_client.py +43 -7
  237. helm/clients/vllm_granite_thinking_client.py +56 -0
  238. helm/clients/writer_client.py +102 -0
  239. helm/common/context.py +80 -0
  240. helm/common/credentials_utils.py +5 -5
  241. helm/common/critique_request.py +0 -1
  242. helm/common/general.py +9 -2
  243. helm/common/hierarchical_logger.py +104 -12
  244. helm/common/local_context.py +140 -0
  245. helm/common/object_spec.py +23 -8
  246. helm/common/remote_context.py +61 -0
  247. helm/common/request.py +8 -0
  248. helm/common/test_logging.py +94 -0
  249. helm/config/model_deployments.yaml +995 -45
  250. helm/config/model_metadata.yaml +780 -59
  251. helm/config/tokenizer_configs.yaml +224 -3
  252. helm/proxy/cli.py +4 -2
  253. helm/proxy/critique/mechanical_turk_utils.py +1 -1
  254. helm/proxy/retry.py +5 -0
  255. helm/proxy/services/server_service.py +21 -85
  256. helm/tokenizers/grok_tokenizer.py +55 -0
  257. helm/tokenizers/huggingface_tokenizer.py +1 -1
  258. helm/tokenizers/test_grok_tokenizer.py +33 -0
  259. helm/benchmark/metrics/numeracy_metrics.py +0 -72
  260. helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
  261. helm/benchmark/scenarios/numeracy_scenario.py +0 -793
  262. helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +0 -46
  263. helm/benchmark/static_build/assets/index-262903c1.js +0 -10
  264. helm/benchmark/static_build/assets/index-42060d71.css +0 -1
  265. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/entry_points.txt +0 -0
  266. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/licenses/LICENSE +0 -0
  267. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/top_level.txt +0 -0
  268. /helm/benchmark/static_build/assets/{medhelm-overview-3ddfcd65.png → medhelm-v1-overview-3ddfcd65.png} +0 -0
--- a/helm/clients/openai_client.py
+++ b/helm/clients/openai_client.py
@@ -1,13 +1,16 @@
 # mypy: check_untyped_defs = False
 from dataclasses import replace
+import re
 from typing import Any, Dict, List, Optional, cast, Union, Callable

+from openai import OpenAIError
+
 from helm.benchmark.model_metadata_registry import is_vlm
 from helm.common import multimodal_request_utils
 from helm.common.cache import CacheConfig
-from helm.common.media_object import TEXT_TYPE, MultimediaObject
-from helm.common.request import ErrorFlags, wrap_request_time, Request, RequestResult, GeneratedOutput, Token
-from helm.common.hierarchical_logger import hlog
+from helm.common.media_object import TEXT_TYPE, MultimediaObject, MediaObject
+from helm.common.request import ErrorFlags, Thinking, wrap_request_time, Request, RequestResult, GeneratedOutput, Token
+from helm.common.hierarchical_logger import hlog, hwarn
 from helm.common.object_spec import get_class_by_name
 from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.common.tokenization_request import (
@@ -24,8 +27,13 @@ except ModuleNotFoundError as e:
     handle_module_not_found_error(e, ["openai"])


-class OpenAIClient(CachingClient):
-    END_OF_TEXT: str = "<|endoftext|>"
+class OpenAIClientUtils:
+    """Methods used by both the chat completions client and the responses API client"""
+
+    @classmethod
+    def is_reasoning_model(cls, model_engine: str) -> bool:
+        # All OpenAI reasoning models start "o[somenumber]", so we regexp for that to future proof things
+        return bool(re.match(r"^o\d+", model_engine))

     # Error OpenAI throws when the image in the prompt violates their content policy
     INAPPROPRIATE_IMAGE_ERROR: str = "Your input image may contain content that is not allowed by our safety system"
@@ -49,6 +57,56 @@ class OpenAIClient(CachingClient):
         "See https://labs.openai.com/policies/content-policy for more information."
     )

+    @classmethod
+    def handle_openai_error(cls, e: OpenAIError, request: Request):
+        if cls.INAPPROPRIATE_IMAGE_ERROR in str(e) or cls.INAPPROPRIATE_PROMPT_ERROR in str(e):
+            hwarn(f"Failed safety check: {str(request)}")
+            empty_completion = GeneratedOutput(
+                text="",
+                logprob=0,
+                tokens=[],
+                finish_reason={"reason": cls.CONTENT_POLICY_VIOLATED_FINISH_REASON},
+            )
+            return RequestResult(
+                success=True,
+                cached=False,
+                request_time=0,
+                completions=[empty_completion] * request.num_completions,
+                embedding=[],
+            )
+        elif cls.OPENAI_SERVER_ERROR in str(e):
+            # Handle these errors by returning an empty completion to unblock
+            hwarn(f"OpenAI server error for request: {str(request)}")
+            empty_completion = GeneratedOutput(
+                text="",
+                logprob=0,
+                tokens=[],
+                finish_reason={"reason": cls.OPENAI_SERVER_ERROR},
+            )
+            return RequestResult(
+                success=True,
+                cached=False,
+                request_time=0,
+                completions=[empty_completion] * request.num_completions,
+                embedding=[],
+            )
+        elif cls.INAPPROPRIATE_PROMPT_AZURE_ERROR in str(e) or cls.INAPPROPRIATE_PROMPT_MICROSOFT_ERROR in str(e):
+            return RequestResult(
+                success=False,
+                cached=False,
+                error="Content blocked by Azure's content management filter",
+                completions=[],
+                embedding=[],
+                error_flags=ErrorFlags(is_retriable=False, is_fatal=False),
+            )
+
+        error: str = f"OpenAI error: {e}"
+        return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
+
+
+class OpenAIClient(CachingClient):
+    END_OF_TEXT: str = "<|endoftext|>"
+
     def __init__(
         self,
         tokenizer: Tokenizer,
@@ -60,11 +118,12 @@ class OpenAIClient(CachingClient):
         reasoning_effort: Optional[str] = None,
         openai_model_name: Optional[str] = None,
         output_processor: Optional[str] = None,
+        **kwargs,
     ):
         super().__init__(cache_config=cache_config)
         self.tokenizer = tokenizer
         self.tokenizer_name = tokenizer_name
-        self.client = OpenAI(api_key=api_key, organization=org_id, base_url=base_url)
+        self.client = OpenAI(api_key=api_key, organization=org_id, base_url=base_url, **kwargs)
         self.reasoning_effort = reasoning_effort
         self.openai_model_name = openai_model_name
         self.output_processor: Optional[Callable[[str], str]] = (
@@ -118,7 +177,7 @@ class OpenAIClient(CachingClient):
             embedding=embedding,
         )

-    def _make_chat_request(self, request: Request) -> RequestResult:
+    def _make_chat_raw_request(self, request: Request) -> Dict[str, Any]:
         messages: Optional[List[Dict[str, Union[str, Any]]]] = request.messages
         if (
             (request.prompt and request.messages)
@@ -137,7 +196,7 @@ class OpenAIClient(CachingClient):
             if request.messages[-1]["role"] != "user":
                 raise ValueError("Last message must have role 'user'")
             if request.prompt != "":
-                hlog("WARNING: Since message is set, prompt will be ignored")
+                hwarn("Since message is set, prompt will be ignored")
         else:
             # Convert prompt into a single message
             # For now, put the whole prompt in a single user message, and expect the response
@@ -223,7 +282,7 @@ class OpenAIClient(CachingClient):
         # Refer to the "Reasoning models" documentation further discussion of o1 model limitations:
         # https://platform.openai.com/docs/guides/reasoning
         model_engine: str = request.model_engine
-        if model_engine.startswith("o1") or model_engine.startswith("o3"):
+        if OpenAIClientUtils.is_reasoning_model(model_engine):
             # Avoid error:
             # "Unsupported parameter: 'max_tokens' is not supported with this model. Use 'max_completion_tokens' instead."  # noqa: E501
             # Note that openai>=1.45 is needed for this
@@ -241,8 +300,13 @@ class OpenAIClient(CachingClient):
             # 'code': 'unsupported_parameter'}}"
             raw_request.pop("temperature", None)

+            # The following parameters also happen to be unsupported by the o-series (code unsupported_parameter)
+            raw_request.pop("top_p", None)
+            raw_request.pop("frequency_penalty", None)
+            raw_request.pop("presence_penalty", None)
+
             if self.reasoning_effort:
-                raw_request["reasoning_effort"] = "self.reasoning_effort"
+                raw_request["reasoning_effort"] = self.reasoning_effort
         elif is_vlm(request.model):
             # Avoid error:
             # "Invalid type for 'stop': expected an unsupported value, but got null instead."
@@ -258,6 +322,10 @@ class OpenAIClient(CachingClient):
             # OpenAI error: Error code: 400 - {'error': {'message': "[{'type': 'string_type', 'loc': ('body', 'stop', 'str'), 'msg': 'Input should be a valid string', 'input': None}, {'type': 'list_type', 'loc': ('body', 'stop', 'list[str]'), 'msg': 'Input should be a valid list', 'input': None}, {'type': 'list_type', 'loc': ('body', 'stop', 'list[list[int]]'), 'msg': 'Input should be a valid list', 'input': None}]", 'type': 'invalid_request_error', 'param': None, 'code': None}} # noqa: 3501
             if raw_request["stop"] is None:
                 raw_request.pop("stop")
+        return raw_request
+
+    def _make_chat_request(self, request: Request) -> RequestResult:
+        raw_request = self._make_chat_raw_request(request)

         def do_it() -> Dict[str, Any]:
             return self.client.chat.completions.create(**raw_request).model_dump(mode="json")
@@ -266,49 +334,7 @@ class OpenAIClient(CachingClient):
             cache_key = self._get_cache_key(raw_request, request)
             response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
         except openai.OpenAIError as e:
-            if self.INAPPROPRIATE_IMAGE_ERROR in str(e) or self.INAPPROPRIATE_PROMPT_ERROR in str(e):
-                hlog(f"Failed safety check: {str(request)}")
-                empty_completion = GeneratedOutput(
-                    text="",
-                    logprob=0,
-                    tokens=[],
-                    finish_reason={"reason": self.CONTENT_POLICY_VIOLATED_FINISH_REASON},
-                )
-                return RequestResult(
-                    success=True,
-                    cached=False,
-                    request_time=0,
-                    completions=[empty_completion] * request.num_completions,
-                    embedding=[],
-                )
-            elif self.OPENAI_SERVER_ERROR in str(e):
-                # Handle these errors by returning an empty completion to unblock
-                hlog(f"OpenAI server error for request: {str(request)}")
-                empty_completion = GeneratedOutput(
-                    text="",
-                    logprob=0,
-                    tokens=[],
-                    finish_reason={"reason": self.OPENAI_SERVER_ERROR},
-                )
-                return RequestResult(
-                    success=True,
-                    cached=False,
-                    request_time=0,
-                    completions=[empty_completion] * request.num_completions,
-                    embedding=[],
-                )
-            elif self.INAPPROPRIATE_PROMPT_AZURE_ERROR in str(e) or self.INAPPROPRIATE_PROMPT_MICROSOFT_ERROR in str(e):
-                return RequestResult(
-                    success=False,
-                    cached=False,
-                    error="Content blocked by Azure's content management filter",
-                    completions=[],
-                    embedding=[],
-                    error_flags=ErrorFlags(is_retriable=False, is_fatal=False),
-                )
-
-            error: str = f"OpenAI error: {e}"
-            return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
+            return OpenAIClientUtils.handle_openai_error(e, request)

         completions: List[GeneratedOutput] = []
         for raw_completion in response["choices"]:
@@ -338,11 +364,20 @@ class OpenAIClient(CachingClient):
                 tokens: List[Token] = [
                     Token(text=cast(str, raw_token), logprob=0) for raw_token in tokenization_result.raw_tokens
                 ]
+            # vLLM has a optional `reasoning_content` field in the message
+            # that is not in the standard OpenAI API.
+            # This field is also used by some model providers such as Grok.
+            thinking = (
+                Thinking(text=raw_completion["message"]["reasoning_content"])
+                if "reasoning_content" in raw_completion["message"]
+                else None
+            )
             completion = GeneratedOutput(
                 text=text,
                 logprob=0,  # OpenAI does not provide logprobs
                 tokens=tokens,
                 finish_reason={"reason": raw_completion["finish_reason"]},
+                thinking=thinking,
             )
             completions.append(truncate_sequence(completion, request))  # Truncate the text by stop sequences

@@ -459,7 +494,7 @@ class OpenAIClient(CachingClient):
     def make_request(self, request: Request) -> RequestResult:
         if request.embedding:
             return self._make_embedding_request(request)
-        elif "whisper" in request.model_engine:
+        elif "whisper" in request.model_engine or "transcribe" in request.model_engine:
             return self._make_transcription_request(request)
         else:
             return self._make_chat_request(request)
@@ -536,6 +571,18 @@ class OpenAITranscriptionThenCompletionClient(Client):
         # Now make the request to the completion model with just a text-only prompt and no audio
         # Use the same decoding parameters as the original request
         # Ensure to set multimodal_prompt to None so the request is treated as text-only.
-        return self._openai_client.make_request(
+        request_result: RequestResult = self._openai_client.make_request(
             replace(request, prompt=text_prompt, model=f"openai/{completion_model}", multimodal_prompt=None)
         )
+
+        # Also include the generated transcript to the request result
+        completions_with_transcript: List[GeneratedOutput] = [
+            replace(
+                completion,
+                multimodal_content=MultimediaObject(
+                    media_objects=[MediaObject(text=text_prompt, content_type="text/plain")]
+                ),
+            )
+            for completion in request_result.completions
+        ]
+        return replace(request_result, completions=completions_with_transcript)
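
For reference, a minimal standalone sketch (outside HELM, with real OpenAI engine names used purely as examples) of how the new OpenAIClientUtils.is_reasoning_model check behaves: the ^o\d+ pattern catches o-series engines without enumerating them, while names that merely contain an "o" are not matched.

import re

# Re-implementation of the check above, for illustration only.
def is_reasoning_model(model_engine: str) -> bool:
    # Matches engines that *start* with "o" followed by digits, e.g. "o1", "o3-mini".
    return bool(re.match(r"^o\d+", model_engine))

assert is_reasoning_model("o1-preview")
assert is_reasoning_model("o3-mini")
assert not is_reasoning_model("gpt-4o")  # the "o" is not at the start, so not a reasoning model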
--- /dev/null
+++ b/helm/clients/openai_responses_client.py
@@ -0,0 +1,176 @@
+# mypy: check_untyped_defs = False
+import dataclasses
+from typing import Any, Dict, List, Optional, Union
+
+
+from helm.clients.openai_client import OpenAIClientUtils
+from helm.common.cache import CacheConfig
+from helm.common.media_object import TEXT_TYPE
+from helm.common.request import (
+    Thinking,
+    wrap_request_time,
+    Request,
+    RequestResult,
+    GeneratedOutput,
+)
+from helm.common.optional_dependencies import handle_module_not_found_error
+from helm.clients.client import (
+    CachingClient,
+    truncate_and_tokenize_response_text,
+    generate_uid_for_multimodal_prompt,
+)
+from helm.tokenizers.tokenizer import Tokenizer
+
+try:
+    import openai
+    from openai import OpenAI
+except ModuleNotFoundError as e:
+    handle_module_not_found_error(e, ["openai"])
+
+
+class OpenAIResponseClient(CachingClient):
+    def __init__(
+        self,
+        tokenizer: Tokenizer,
+        tokenizer_name: str,
+        cache_config: CacheConfig,
+        api_key: Optional[str] = None,
+        org_id: Optional[str] = None,
+        base_url: Optional[str] = None,
+        reasoning_effort: Optional[str] = None,
+        openai_model_name: Optional[str] = None,
+    ):
+        super().__init__(cache_config=cache_config)
+        self.tokenizer = tokenizer
+        self.tokenizer_name = tokenizer_name
+        self.client = OpenAI(
+            api_key=api_key,
+            organization=org_id,
+            base_url=base_url,
+        )
+        self.reasoning_effort = reasoning_effort
+        self.openai_model_name = openai_model_name
+
+    def _get_cache_key(self, raw_request: Dict, request: Request):
+        cache_key = CachingClient.make_cache_key(raw_request, request)
+        if request.multimodal_prompt:
+            prompt_key: str = generate_uid_for_multimodal_prompt(request.multimodal_prompt)
+            cache_key = {**cache_key, "multimodal_prompt": prompt_key}
+        return cache_key
+
+    def _make_raw_request(self, request: Request) -> dict[str, Any]:
+        input: Union[str, List[Dict[str, Any]]]
+        if request.multimodal_prompt is not None:
+            content = []
+            request.validate()
+            for media_object in request.multimodal_prompt.media_objects:
+                if media_object.is_type("image") and media_object.location:
+                    from helm.common.images_utils import encode_base64
+
+                    base64_image: str = encode_base64(media_object.location)
+                    content.append(
+                        {
+                            "type": "input_image",
+                            "image_url": f"data:image/jpeg;base64,{base64_image}",
+                        }
+                    )
+                elif media_object.is_type(TEXT_TYPE):
+                    assert media_object.text is not None
+                    content.append({"type": "input_text", "text": media_object.text})
+                else:
+                    raise ValueError(f"Unrecognized MediaObject type {media_object.type}")
+            input = [{"role": "user", "content": content}]
+        else:
+            input = request.prompt
+
+        raw_request: Dict[str, Any] = {
+            "model": self._get_model_for_request(request),
+            "input": input,
+            "top_p": request.top_p,
+            # API errors if max_output_tokens is less than 16
+            # (Error you get: "Invalid 'max_output_tokens': integer below minimum value.
+            # Expected a value >= 16, but got 5 instead.")
+            "max_output_tokens": max(16, request.max_tokens),
+            "temperature": request.temperature,
+            # Don't store responses for later retrieval
+            "store": False,
+        }
+        if self.reasoning_effort:
+            raw_request["reasoning"] = {"effort": self.reasoning_effort}
+        # If o-series model, get reasoning summaries
+        # Plus other changes
+        model_engine: str = request.model_engine
+        if OpenAIClientUtils.is_reasoning_model(model_engine):
+            raw_request["reasoning"]["summary"] = "detailed"
+            # Avoid error:
+            # "Error code: 400 - {'error': {'message': "Unsupported parameter: 'temperature' is
+            # not supported with this model.", 'type': 'invalid_request_error', 'param': 'temperature',
+            # 'code': 'unsupported_parameter'}}"
+            raw_request.pop("temperature", None)
+
+            # The following parameters also happen to be unsupported by the o-series (code unsupported_parameter)
+            raw_request.pop("top_p", None)
+
+        return raw_request
+
+    def _get_model_for_request(self, request: Request) -> str:
+        return self.openai_model_name or request.model_engine
+
+    def make_request(self, request: Request) -> RequestResult:
+        # Content can either be text or a list of multimodal content made up of text and images:
+        # https://platform.openai.com/docs/api-reference/responses/create
+        raw_request = self._make_raw_request(request)
+
+        # The responses API does not support a "num_completions" parameter,
+        # so we need to handle it ourselves with a simple loop
+        completions: list[GeneratedOutput] = []
+        for _ in range(request.num_completions):
+
+            def do_it() -> Dict[str, Any]:
+                raw_response = self.client.responses.create(**raw_request).model_dump(mode="json")
+                assert not raw_response.get("error", None), f"Error in response: {raw_response}"
+                return raw_response
+
+            try:
+                cache_key = self._get_cache_key(raw_request, request)
+                response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
+            except openai.OpenAIError as e:
+                return OpenAIClientUtils.handle_openai_error(e, request)
+
+            # We can only return one completition really,
+            # but we get an array of messages back, so we need to contact them
+            reasoning_output = ""
+            text_output = ""
+
+            if request.echo_prompt:
+                text_output += request.prompt
+            for output in response["output"]:
+                output_type = output[
+                    "type"
+                ]  # one of "message" or "reasoning" from API observation, but can also include tool calls
+
+                if output_type == "reasoning":
+                    reasoning_output += "\n".join([raw_output["text"] for raw_output in output["summary"]])
+                elif output_type == "message":
+                    text_output += "\n".join([raw_output["text"] for raw_output in output["content"]])
+                # (Other output types are ignored)
+
+            completion = truncate_and_tokenize_response_text(
+                text_output,
+                request,
+                self.tokenizer,
+                self.tokenizer_name,
+                original_finish_reason="",
+            )
+            if reasoning_output:
+                completion = dataclasses.replace(completion, thinking=Thinking(text=reasoning_output))
+            completions.append(completion)
+
+        return RequestResult(
+            success=True,
+            cached=cached,
+            request_time=response["request_time"],
+            request_datetime=response.get("request_datetime"),
+            completions=completions,
+            embedding=[],
+        )
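
The output loop in make_request above concatenates Responses API output items by type. A small sketch with a hand-built response dict shaped like the one the client iterates over (the payload values are invented): "reasoning" summaries become the thinking text and "message" content becomes the completion text.

# Hypothetical response payload, shaped like the dict the client walks through.
response = {
    "output": [
        {"type": "reasoning", "summary": [{"text": "Restate the question, then compute."}]},
        {"type": "message", "content": [{"text": "The answer is 42."}]},
    ]
}

reasoning_output = ""
text_output = ""
for output in response["output"]:
    if output["type"] == "reasoning":
        reasoning_output += "\n".join(raw_output["text"] for raw_output in output["summary"])
    elif output["type"] == "message":
        text_output += "\n".join(raw_output["text"] for raw_output in output["content"])
    # (Other output types, e.g. tool calls, are ignored)

assert reasoning_output == "Restate the question, then compute."
assert text_output == "The answer is 42."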
--- a/helm/clients/palmyra_client.py
+++ b/helm/clients/palmyra_client.py
@@ -5,7 +5,7 @@ from typing import Any, Dict, List

 from helm.clients.openai_client import OpenAIClient
 from helm.common.cache import CacheConfig
-from helm.common.hierarchical_logger import hlog
+from helm.common.hierarchical_logger import hwarn
 from helm.common.request import wrap_request_time, Request, RequestResult, GeneratedOutput, Token, ErrorFlags
 from helm.common.tokenization_request import (
     TokenizationRequest,
@@ -103,10 +103,7 @@ class PalmyraClient(CachingClient):
             return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])

         if _is_content_moderation_failure(response):
-            hlog(
-                f"WARNING: Returning empty request for {request.model_deployment} "
-                "due to content moderation filter"
-            )
+            hwarn(f"Returning empty request for {request.model_deployment} " "due to content moderation filter")
             return RequestResult(
                 success=False,
                 cached=False,
--- a/helm/clients/reka_client.py
+++ b/helm/clients/reka_client.py
@@ -6,7 +6,7 @@ from helm.proxy.retry import NonRetriableException
 from helm.common.cache import CacheConfig
 from helm.common.media_object import TEXT_TYPE
 from helm.common.request import wrap_request_time, Request, RequestResult, GeneratedOutput
-from helm.common.hierarchical_logger import hlog
+from helm.common.hierarchical_logger import hwarn
 from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.tokenizers.tokenizer import Tokenizer
 from helm.clients.client import CachingClient, truncate_and_tokenize_response_text
@@ -121,7 +121,7 @@ class RekaClient(CachingClient):
             if messages[-1]["role"] != "user":
                 raise ValueError("Last message must have role 'user'")
             if request.prompt != "":
-                hlog("WARNING: Since message is set, prompt will be ignored")
+                hwarn("Since message is set, prompt will be ignored")
             reka_chat_history = self._convert_messages_to_reka_chat_history(messages)
         else:
             current_chat_history: Dict[str, Any] = {
--- a/helm/clients/test_huggingface_client.py
+++ b/helm/clients/test_huggingface_client.py
@@ -9,7 +9,7 @@ from helm.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer
 class TestHuggingFaceClient:
     def test_gpt2(self):
         tokenizer = HuggingFaceTokenizer(
-            BlackHoleCacheConfig(), "huggingface/gpt2", pretrained_model_name_or_path="openai/gpt2"
+            BlackHoleCacheConfig(), "huggingface/gpt2", pretrained_model_name_or_path="openai-community/gpt2"
         )
         client = HuggingFaceClient(
             cache_config=BlackHoleCacheConfig(),
@@ -36,7 +36,7 @@ class TestHuggingFaceClient:
     @pytest.mark.skip(reason="GPT-J 6B is 22 GB and extremely slow without a GPU.")
     def test_gptj_6b(self):
         tokenizer = HuggingFaceTokenizer(
-            BlackHoleCacheConfig(), "huggingface/gpt2", pretrained_model_name_or_path="openai/gpt2"
+            BlackHoleCacheConfig(), "huggingface/gpt2", pretrained_model_name_or_path="openai-community/gpt2"
         )
         client = HuggingFaceClient(
             cache_config=BlackHoleCacheConfig(),
@@ -57,7 +57,7 @@ class TestHuggingFaceClient:

     def test_logprob(self):
         tokenizer = HuggingFaceTokenizer(
-            BlackHoleCacheConfig(), "huggingface/gpt2", pretrained_model_name_or_path="openai/gpt2"
+            BlackHoleCacheConfig(), "huggingface/gpt2", pretrained_model_name_or_path="openai-community/gpt2"
         )
         client = HuggingFaceClient(
             cache_config=BlackHoleCacheConfig(),
--- a/helm/clients/together_client.py
+++ b/helm/clients/together_client.py
@@ -1,7 +1,8 @@
 from copy import deepcopy
 from itertools import zip_longest
+import re
 import threading
-from typing import Callable, List, Dict, Any, Mapping, Optional, TypedDict, Union
+from typing import Callable, List, Dict, Any, Mapping, Optional, Tuple, TypedDict, Union
 from typing_extensions import NotRequired

 import requests
@@ -11,7 +12,7 @@ from helm.common.cache import CacheConfig
 from helm.common.media_object import IMAGE_TYPE, TEXT_TYPE
 from helm.common.object_spec import get_class_by_name
 from helm.common.optional_dependencies import handle_module_not_found_error
-from helm.common.request import wrap_request_time, Request, RequestResult, GeneratedOutput, Token
+from helm.common.request import Thinking, wrap_request_time, Request, RequestResult, GeneratedOutput, Token
 from helm.clients.client import CachingClient, truncate_sequence, cleanup_str

 try:
@@ -24,8 +25,6 @@ except ModuleNotFoundError as e:
 class _RewriteRequestTags:
     """Tags that indicate that the request for the model must be rewritten before sending to Together."""

-    # TODO: Convert to StrEnum after upgrading to Python 3.11
-
     ADD_EOS_TOKEN_AS_STOP_SEQUENCE = "ADD_EOS_TOKEN_AS_STOP_SEQUENCE"
     """Indicates that the EOS token should be added as an extra stop sequence.

@@ -100,6 +99,19 @@ class JobNotFinishedError(TogetherClientError):
     pass


+def _parse_thinking(input: str) -> Tuple[str, str]:
+    """Return a tuple of thinking text and output text."""
+    match = re.match(r"<think>\n(.*)\n</think>\n{0,2}(.*)", input, re.DOTALL)
+    if match:
+        return (match.group(1), match.group(2))
+
+    match = re.match(r"<think>\n?(.*)", input, re.DOTALL)
+    if match:
+        return (match.group(1), "")
+
+    return (input, "")
+
+
 class TogetherClient(CachingClient):
     """
     Client for the models where we evaluate offline. Since the queries are handled offline, the `TogetherClient` just
@@ -328,12 +340,14 @@ class TogetherChatClient(CachingClient):
         together_model: Optional[str] = None,
         disable_logprobs: Optional[bool] = None,
         output_processor: Optional[str] = None,
+        parse_thinking: Optional[bool] = None,
     ):
         super().__init__(cache_config=cache_config)
         self._client = Together(api_key=api_key)
         self._together_model = together_model
         self._disable_logprobs = bool(disable_logprobs)
         # self.output_processor is actually a function, not a class
+        self._parse_thinking = bool(parse_thinking)

         self.output_processor: Optional[Callable[[str], str]] = (
             get_class_by_name(output_processor) if output_processor else None
@@ -424,11 +438,21 @@ class TogetherChatClient(CachingClient):
                 if token_text is None:
                     break
                 tokens.append(Token(text=token_text, logprob=token_logprob or 0.0))
+            logprob = sum([token.logprob for token in tokens]) if tokens else 0.0
             assert choice.message.role == "assistant"
             output_text = choice.message.content
             if self.output_processor:
                 output_text = self.output_processor(output_text)
-            generated_outputs.append(GeneratedOutput(text=output_text, logprob=0.0, tokens=tokens))
+
+            if self._parse_thinking:
+                thinking_text, output_text = _parse_thinking(output_text)
+                generated_outputs.append(
+                    GeneratedOutput(
+                        text=output_text, logprob=logprob, tokens=tokens, thinking=Thinking(text=thinking_text)
+                    )
+                )
+            else:
+                generated_outputs.append(GeneratedOutput(text=output_text, logprob=logprob, tokens=tokens))
         return RequestResult(
             success=True,
             cached=cached,
@@ -521,8 +545,9 @@ class TogetherCompletionClient(CachingClient):
                 if token_text is None:
                     break
                 tokens.append(Token(text=token_text, logprob=token_logprob or 0.0))
+            logprob = sum([token.logprob for token in tokens]) if tokens else 0.0
             assert choice.text
-            generated_outputs.append(GeneratedOutput(text=choice.text, logprob=0.0, tokens=tokens))
+            generated_outputs.append(GeneratedOutput(text=choice.text, logprob=logprob, tokens=tokens))
         return RequestResult(
             success=True,
             cached=cached,
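
The parse_thinking option added to TogetherChatClient splits <think> blocks (the convention used by some reasoning models to wrap their chain of thought) out of the completion text. A quick sketch of _parse_thinking on made-up strings: a closed block yields (thinking, answer), an unterminated block is treated as all thinking, and text without tags passes through unchanged.

import re
from typing import Tuple

# Same logic as the _parse_thinking helper in the diff above, repeated here for illustration.
def _parse_thinking(input: str) -> Tuple[str, str]:
    """Return a tuple of thinking text and output text."""
    match = re.match(r"<think>\n(.*)\n</think>\n{0,2}(.*)", input, re.DOTALL)
    if match:
        return (match.group(1), match.group(2))
    match = re.match(r"<think>\n?(.*)", input, re.DOTALL)
    if match:
        return (match.group(1), "")
    return (input, "")

assert _parse_thinking("<think>\nAdd 2 and 2.\n</think>\nThe answer is 4.") == ("Add 2 and 2.", "The answer is 4.")
assert _parse_thinking("<think>\nStill thinking") == ("Still thinking", "")
assert _parse_thinking("No think tags here.") == ("No think tags here.", "")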
--- a/helm/clients/vertexai_client.py
+++ b/helm/clients/vertexai_client.py
@@ -1,7 +1,7 @@
 import requests
 from abc import ABC, abstractmethod
 from threading import Lock
-from typing import Any, Dict, Mapping, Optional, List, Union
+from typing import Any, Dict, Mapping, Optional, List, Union, cast

 from helm.common.cache import CacheConfig
 from helm.common.multimodal_request_utils import get_contents_as_bytes
@@ -107,7 +107,7 @@ class VertexAITextClient(VertexAIClient):

     def make_request(self, request: Request) -> RequestResult:
         """Make a request"""
-        parameters = {
+        parameters: Dict[str, Any] = {
             "temperature": request.temperature,
             "max_output_tokens": request.max_tokens,
             "top_k": request.top_k_per_token,
@@ -207,21 +207,23 @@ class VertexAIChatClient(VertexAIClient):

     def make_request(self, request: Request) -> RequestResult:
         """Make a request"""
-        contents = [request.prompt]
+        # mypy is unhappy without this cast
+        contents: Union[List[Union[str, Image, Part]], List[Content]] = cast(
+            List[Union[str, Image, Part]], [request.prompt]
+        )

         # For the multimodal case, build up the content with the media objects of `request.multimodal_prompt`
         if request.multimodal_prompt is not None:
             return self._make_multimodal_request(request)

         if request.messages is not None:
-            contents = []
             role_mapping = {"user": "user", "assistant": "model"}
-            for msg in request.messages:
-                contents.append(
-                    Content(role=role_mapping.get(msg["role"], "user"), parts=[Part.from_text(msg["content"])])
-                )
+            contents = [
+                Content(role=role_mapping.get(msg["role"], "user"), parts=[Part.from_text(msg["content"])])
+                for msg in request.messages
+            ]

-        parameters = {
+        parameters: Dict[str, Any] = {
             "temperature": request.temperature,
             "max_output_tokens": request.max_tokens,
             "top_k": request.top_k_per_token,
@@ -360,6 +362,12 @@ class VertexAIChatClient(VertexAIClient):
         for media_object in request.multimodal_prompt.media_objects:
             if media_object.is_type("image") and media_object.location:
                 contents.append(Part.from_image(Image.load_from_file(media_object.location)))
+            elif media_object.is_type("video") and media_object.location:
+                # Following this example
+                # https://cloud.google.com/vertex-ai/generative-ai/docs/samples/googlegenaisdk-textgen-with-local-video
+                with open(media_object.location, "rb") as fp:
+                    video_content = fp.read()
+                contents.append(Part.from_data(data=video_content, mime_type=media_object.content_type))
             elif media_object.is_type("audio") and media_object.location:
                 contents.append(
                     Part.from_data(get_contents_as_bytes(media_object.location), mime_type=media_object.content_type)