crfm-helm 0.5.5__py3-none-any.whl → 0.5.7__py3-none-any.whl
This diff shows the content changes between two package versions that were publicly released to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in that registry.
Note: this version of crfm-helm was flagged as a potentially problematic release.
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/METADATA +74 -53
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/RECORD +262 -182
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +3 -3
- helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
- helm/benchmark/annotation/air_bench_annotator.py +2 -2
- helm/benchmark/annotation/bigcodebench_annotator.py +3 -3
- helm/benchmark/annotation/bird_sql_annotator.py +2 -2
- helm/benchmark/annotation/chw_care_plan_annotator.py +7 -12
- helm/benchmark/annotation/ehr_sql_annotator.py +2 -2
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +7 -7
- helm/benchmark/annotation/live_qa_annotator.py +1 -1
- helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
- helm/benchmark/annotation/model_as_judge.py +12 -16
- helm/benchmark/annotation/omni_math_annotator.py +13 -14
- helm/benchmark/annotation/wildbench_annotator.py +9 -9
- helm/benchmark/executor.py +11 -12
- helm/benchmark/metrics/aci_bench_metrics.py +9 -29
- helm/benchmark/metrics/bias_word_lists.py +1 -1
- helm/benchmark/metrics/chw_care_plan_metrics.py +10 -30
- helm/benchmark/metrics/classification_metrics.py +3 -3
- helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
- helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
- helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
- helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
- helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
- helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
- helm/benchmark/metrics/comet_metric.py +1 -1
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +2 -2
- helm/benchmark/metrics/copyright_metrics.py +1 -1
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
- helm/benchmark/metrics/dischargeme_metrics.py +9 -29
- helm/benchmark/metrics/efficiency_metrics.py +3 -3
- helm/benchmark/metrics/evaluate_reference_metrics.py +1 -1
- helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
- helm/benchmark/metrics/ifeval_metrics.py +2 -2
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
- helm/benchmark/metrics/llm_jury_metrics.py +46 -0
- helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
- helm/benchmark/metrics/lmkt_metrics.py +47 -0
- helm/benchmark/metrics/med_dialog_metrics.py +9 -29
- helm/benchmark/metrics/medalign_metrics.py +9 -29
- helm/benchmark/metrics/medi_qa_metrics.py +9 -29
- helm/benchmark/metrics/medication_qa_metrics.py +10 -30
- helm/benchmark/metrics/melt_bias_metric.py +234 -0
- helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
- helm/benchmark/metrics/melt_metric_specs.py +43 -0
- helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
- helm/benchmark/metrics/mental_health_metrics.py +9 -29
- helm/benchmark/metrics/metric_service.py +11 -11
- helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
- helm/benchmark/metrics/mimic_rrs_metrics.py +9 -29
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +9 -29
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +9 -29
- helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
- helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +9 -29
- helm/benchmark/metrics/summac/model_summac.py +2 -3
- helm/benchmark/metrics/summarization_metrics.py +2 -1
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +2 -2
- helm/benchmark/metrics/toxicity_metrics.py +2 -2
- helm/benchmark/metrics/unitxt_metrics.py +3 -4
- helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
- helm/benchmark/metrics/vision_language/image_utils.py +2 -2
- helm/benchmark/model_deployment_registry.py +16 -26
- helm/benchmark/presentation/contamination.py +3 -3
- helm/benchmark/presentation/create_plots.py +43 -13
- helm/benchmark/presentation/run_display.py +13 -0
- helm/benchmark/presentation/schema.py +7 -1
- helm/benchmark/presentation/summarize.py +84 -61
- helm/benchmark/presentation/test_create_plots.py +4 -1
- helm/benchmark/reeval_run.py +3 -4
- helm/benchmark/reeval_runner.py +3 -3
- helm/benchmark/run.py +84 -73
- helm/benchmark/run_expander.py +12 -1
- helm/benchmark/run_spec_factory.py +7 -6
- helm/benchmark/run_specs/arabic_run_specs.py +73 -0
- helm/benchmark/run_specs/audio_run_specs.py +52 -8
- helm/benchmark/run_specs/bluex_run_specs.py +40 -0
- helm/benchmark/run_specs/classic_run_specs.py +0 -53
- helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
- helm/benchmark/run_specs/enterprise_run_specs.py +20 -0
- helm/benchmark/run_specs/experimental_run_specs.py +31 -1
- helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
- helm/benchmark/run_specs/heim_run_specs.py +3 -1
- helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
- helm/benchmark/run_specs/long_context_run_specs.py +114 -15
- helm/benchmark/run_specs/medhelm_run_specs.py +146 -41
- helm/benchmark/run_specs/melt_run_specs.py +783 -0
- helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +163 -0
- helm/benchmark/run_specs/vlm_run_specs.py +28 -0
- helm/benchmark/runner.py +5 -5
- helm/benchmark/scenarios/aci_bench_scenario.py +7 -1
- helm/benchmark/scenarios/alghafa_scenario.py +126 -0
- helm/benchmark/scenarios/arabic_mmlu_scenario.py +78 -0
- helm/benchmark/scenarios/aratrust_scenario.py +76 -0
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +3 -1
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +5 -5
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +104 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +118 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +86 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +117 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +15 -1
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +1 -2
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +2 -2
- helm/benchmark/scenarios/bluex_scenario.py +66 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +14 -13
- helm/benchmark/scenarios/clear_scenario.py +11 -7
- helm/benchmark/scenarios/cleva_scenario.py +1 -1
- helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
- helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
- helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
- helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
- helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
- helm/benchmark/scenarios/dischargeme_scenario.py +36 -21
- helm/benchmark/scenarios/ehr_sql_scenario.py +7 -1
- helm/benchmark/scenarios/ehrshot_scenario.py +28 -55
- helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
- helm/benchmark/scenarios/grammar.py +2 -2
- helm/benchmark/scenarios/headqa_scenario.py +6 -1
- helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
- helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +90 -0
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
- helm/benchmark/scenarios/{infinite_bench_sum_scenario.py → infinite_bench_en_sum_scenario.py} +10 -13
- helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
- helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
- helm/benchmark/scenarios/math_scenario.py +21 -20
- helm/benchmark/scenarios/med_dialog_scenario.py +6 -1
- helm/benchmark/scenarios/medalign_scenario.py +9 -3
- helm/benchmark/scenarios/medalign_scenario_helper.py +27 -130
- helm/benchmark/scenarios/medbullets_scenario.py +7 -2
- helm/benchmark/scenarios/medcalc_bench_scenario.py +4 -2
- helm/benchmark/scenarios/medec_scenario.py +6 -1
- helm/benchmark/scenarios/medhallu_scenario.py +7 -1
- helm/benchmark/scenarios/medi_qa_scenario.py +10 -4
- helm/benchmark/scenarios/medication_qa_scenario.py +7 -1
- helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
- helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
- helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
- helm/benchmark/scenarios/melt_scenarios.py +793 -0
- helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
- helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
- helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
- helm/benchmark/scenarios/mental_health_scenario.py +16 -5
- helm/benchmark/scenarios/mimic_bhc_scenario.py +13 -8
- helm/benchmark/scenarios/mimic_rrs_scenario.py +17 -8
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +14 -8
- helm/benchmark/scenarios/mmlu_pro_scenario.py +1 -1
- helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +5 -2
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +3 -2
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +11 -5
- helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
- helm/benchmark/scenarios/pubmed_qa_scenario.py +6 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +18 -8
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +2 -2
- helm/benchmark/scenarios/ruler_qa_scenarios.py +2 -2
- helm/benchmark/scenarios/seahelm_scenario.py +2 -2
- helm/benchmark/scenarios/shc_bmt_scenario.py +12 -6
- helm/benchmark/scenarios/shc_cdi_scenario.py +11 -6
- helm/benchmark/scenarios/shc_conf_scenario.py +12 -6
- helm/benchmark/scenarios/shc_ent_scenario.py +11 -6
- helm/benchmark/scenarios/shc_gip_scenario.py +13 -5
- helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +12 -7
- helm/benchmark/scenarios/shc_sei_scenario.py +12 -7
- helm/benchmark/scenarios/shc_sequoia_scenario.py +13 -5
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +15 -8
- helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
- helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
- helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
- helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
- helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
- helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
- helm/benchmark/scenarios/truthful_qa_scenario.py +2 -1
- helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
- helm/benchmark/server.py +2 -1
- helm/benchmark/slurm_jobs.py +1 -2
- helm/benchmark/slurm_runner.py +8 -1
- helm/benchmark/static/schema_arabic.yaml +228 -0
- helm/benchmark/static/schema_audio.yaml +60 -49
- helm/benchmark/static/schema_classic.yaml +0 -17
- helm/benchmark/static/schema_enterprise.yaml +21 -0
- helm/benchmark/static/schema_long_context.yaml +81 -20
- helm/benchmark/static/schema_medhelm.yaml +272 -213
- helm/benchmark/static/schema_melt.yaml +1257 -0
- helm/benchmark/static/schema_slphelm.yaml +162 -0
- helm/benchmark/static/schema_vhelm.yaml +26 -26
- helm/benchmark/static/schema_video.yaml +219 -0
- helm/benchmark/static_build/assets/index-b9779128.css +1 -0
- helm/benchmark/static_build/assets/index-e439d5e1.js +10 -0
- helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
- helm/benchmark/static_build/assets/{tremor-9cefc3c5.js → tremor-38a10867.js} +1 -1
- helm/benchmark/static_build/index.html +4 -4
- helm/benchmark/window_services/encoder_decoder_window_service.py +3 -3
- helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
- helm/benchmark/window_services/test_utils.py +3 -4
- helm/benchmark/window_services/tokenizer_service.py +7 -8
- helm/clients/anthropic_client.py +69 -29
- helm/clients/audio_language/diva_llama_client.py +4 -2
- helm/clients/audio_language/qwen2_5_omni_client.py +209 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +8 -6
- helm/clients/audio_language/qwen_audiolm_client.py +4 -2
- helm/clients/audio_language/test.py +62 -0
- helm/clients/bedrock_client.py +3 -1
- helm/clients/client.py +7 -7
- helm/clients/grok_client.py +36 -0
- helm/clients/huggingface_client.py +42 -3
- helm/clients/huggingface_pipeline_client.py +138 -0
- helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
- helm/clients/image_generation/dalle_mini/model/modeling.py +1 -1
- helm/clients/image_generation/dalle_mini/model/processor.py +1 -1
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
- helm/clients/openai_client.py +102 -55
- helm/clients/openai_responses_client.py +176 -0
- helm/clients/palmyra_client.py +2 -5
- helm/clients/reka_client.py +2 -2
- helm/clients/test_huggingface_client.py +3 -3
- helm/clients/together_client.py +31 -6
- helm/clients/vertexai_client.py +17 -9
- helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
- helm/clients/vision_language/huggingface_vlm_client.py +2 -2
- helm/clients/vision_language/idefics_client.py +6 -2
- helm/clients/vision_language/paligemma_client.py +2 -2
- helm/clients/vision_language/qwen2_vlm_client.py +66 -53
- helm/clients/vision_language/qwen_vlm_client.py +7 -5
- helm/clients/vllm_client.py +43 -7
- helm/clients/vllm_granite_thinking_client.py +56 -0
- helm/clients/writer_client.py +102 -0
- helm/common/context.py +80 -0
- helm/common/credentials_utils.py +5 -5
- helm/common/critique_request.py +0 -1
- helm/common/general.py +9 -2
- helm/common/hierarchical_logger.py +104 -12
- helm/common/local_context.py +140 -0
- helm/common/object_spec.py +23 -8
- helm/common/remote_context.py +61 -0
- helm/common/request.py +8 -0
- helm/common/test_logging.py +94 -0
- helm/config/model_deployments.yaml +995 -45
- helm/config/model_metadata.yaml +780 -59
- helm/config/tokenizer_configs.yaml +224 -3
- helm/proxy/cli.py +4 -2
- helm/proxy/critique/mechanical_turk_utils.py +1 -1
- helm/proxy/retry.py +5 -0
- helm/proxy/services/server_service.py +21 -85
- helm/tokenizers/grok_tokenizer.py +55 -0
- helm/tokenizers/huggingface_tokenizer.py +1 -1
- helm/tokenizers/test_grok_tokenizer.py +33 -0
- helm/benchmark/metrics/numeracy_metrics.py +0 -72
- helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
- helm/benchmark/scenarios/numeracy_scenario.py +0 -793
- helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +0 -46
- helm/benchmark/static_build/assets/index-262903c1.js +0 -10
- helm/benchmark/static_build/assets/index-42060d71.css +0 -1
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/licenses/LICENSE +0 -0
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/top_level.txt +0 -0
- /helm/benchmark/static_build/assets/{medhelm-overview-3ddfcd65.png → medhelm-v1-overview-3ddfcd65.png} +0 -0
helm/clients/openai_client.py
CHANGED
@@ -1,13 +1,16 @@
 # mypy: check_untyped_defs = False
 from dataclasses import replace
+import re
 from typing import Any, Dict, List, Optional, cast, Union, Callable
 
+from openai import OpenAIError
+
 from helm.benchmark.model_metadata_registry import is_vlm
 from helm.common import multimodal_request_utils
 from helm.common.cache import CacheConfig
-from helm.common.media_object import TEXT_TYPE, MultimediaObject
-from helm.common.request import ErrorFlags, wrap_request_time, Request, RequestResult, GeneratedOutput, Token
-from helm.common.hierarchical_logger import hlog
+from helm.common.media_object import TEXT_TYPE, MultimediaObject, MediaObject
+from helm.common.request import ErrorFlags, Thinking, wrap_request_time, Request, RequestResult, GeneratedOutput, Token
+from helm.common.hierarchical_logger import hlog, hwarn
 from helm.common.object_spec import get_class_by_name
 from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.common.tokenization_request import (
@@ -24,8 +27,13 @@ except ModuleNotFoundError as e:
     handle_module_not_found_error(e, ["openai"])
 
 
-class OpenAIClient(CachingClient):
-    END_OF_TEXT: str = "<|endoftext|>"
+class OpenAIClientUtils:
+    """Methods used by both the chat completions client and the responses API client"""
+
+    @classmethod
+    def is_reasoning_model(cls, model_engine: str) -> bool:
+        # All OpenAI reasoning models start "o[somenumber]", so we regexp for that to future proof things
+        return bool(re.match(r"^o\d+", model_engine))
 
     # Error OpenAI throws when the image in the prompt violates their content policy
     INAPPROPRIATE_IMAGE_ERROR: str = "Your input image may contain content that is not allowed by our safety system"
@@ -49,6 +57,56 @@ class OpenAIClient(CachingClient):
         "See https://labs.openai.com/policies/content-policy for more information."
     )
 
+    @classmethod
+    def handle_openai_error(cls, e: OpenAIError, request: Request):
+        if cls.INAPPROPRIATE_IMAGE_ERROR in str(e) or cls.INAPPROPRIATE_PROMPT_ERROR in str(e):
+            hwarn(f"Failed safety check: {str(request)}")
+            empty_completion = GeneratedOutput(
+                text="",
+                logprob=0,
+                tokens=[],
+                finish_reason={"reason": cls.CONTENT_POLICY_VIOLATED_FINISH_REASON},
+            )
+            return RequestResult(
+                success=True,
+                cached=False,
+                request_time=0,
+                completions=[empty_completion] * request.num_completions,
+                embedding=[],
+            )
+        elif cls.OPENAI_SERVER_ERROR in str(e):
+            # Handle these errors by returning an empty completion to unblock
+            hwarn(f"OpenAI server error for request: {str(request)}")
+            empty_completion = GeneratedOutput(
+                text="",
+                logprob=0,
+                tokens=[],
+                finish_reason={"reason": cls.OPENAI_SERVER_ERROR},
+            )
+            return RequestResult(
+                success=True,
+                cached=False,
+                request_time=0,
+                completions=[empty_completion] * request.num_completions,
+                embedding=[],
+            )
+        elif cls.INAPPROPRIATE_PROMPT_AZURE_ERROR in str(e) or cls.INAPPROPRIATE_PROMPT_MICROSOFT_ERROR in str(e):
+            return RequestResult(
+                success=False,
+                cached=False,
+                error="Content blocked by Azure's content management filter",
+                completions=[],
+                embedding=[],
+                error_flags=ErrorFlags(is_retriable=False, is_fatal=False),
+            )
+
+        error: str = f"OpenAI error: {e}"
+        return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
+
+
+class OpenAIClient(CachingClient):
+    END_OF_TEXT: str = "<|endoftext|>"
+
     def __init__(
         self,
         tokenizer: Tokenizer,
@@ -60,11 +118,12 @@ class OpenAIClient(CachingClient):
         reasoning_effort: Optional[str] = None,
         openai_model_name: Optional[str] = None,
         output_processor: Optional[str] = None,
+        **kwargs,
     ):
         super().__init__(cache_config=cache_config)
         self.tokenizer = tokenizer
         self.tokenizer_name = tokenizer_name
-        self.client = OpenAI(api_key=api_key, organization=org_id, base_url=base_url)
+        self.client = OpenAI(api_key=api_key, organization=org_id, base_url=base_url, **kwargs)
         self.reasoning_effort = reasoning_effort
         self.openai_model_name = openai_model_name
         self.output_processor: Optional[Callable[[str], str]] = (
@@ -118,7 +177,7 @@ class OpenAIClient(CachingClient):
             embedding=embedding,
         )
 
-    def _make_chat_request(self, request: Request) -> RequestResult:
+    def _make_chat_raw_request(self, request: Request) -> Dict[str, Any]:
         messages: Optional[List[Dict[str, Union[str, Any]]]] = request.messages
         if (
             (request.prompt and request.messages)
@@ -137,7 +196,7 @@ class OpenAIClient(CachingClient):
             if request.messages[-1]["role"] != "user":
                 raise ValueError("Last message must have role 'user'")
             if request.prompt != "":
-                hlog("Since message is set, prompt will be ignored")
+                hwarn("Since message is set, prompt will be ignored")
         else:
             # Convert prompt into a single message
             # For now, put the whole prompt in a single user message, and expect the response
@@ -223,7 +282,7 @@ class OpenAIClient(CachingClient):
         # Refer to the "Reasoning models" documentation further discussion of o1 model limitations:
         # https://platform.openai.com/docs/guides/reasoning
         model_engine: str = request.model_engine
-        if
+        if OpenAIClientUtils.is_reasoning_model(model_engine):
             # Avoid error:
             # "Unsupported parameter: 'max_tokens' is not supported with this model. Use 'max_completion_tokens' instead."  # noqa: E501
             # Note that openai>=1.45 is needed for this
@@ -241,8 +300,13 @@ class OpenAIClient(CachingClient):
            # 'code': 'unsupported_parameter'}}"
             raw_request.pop("temperature", None)
 
+            # The following parameters also happen to be unsupported by the o-series (code unsupported_parameter)
+            raw_request.pop("top_p", None)
+            raw_request.pop("frequency_penalty", None)
+            raw_request.pop("presence_penalty", None)
+
             if self.reasoning_effort:
-                raw_request["reasoning_effort"] =
+                raw_request["reasoning_effort"] = self.reasoning_effort
         elif is_vlm(request.model):
             # Avoid error:
             # "Invalid type for 'stop': expected an unsupported value, but got null instead."
@@ -258,6 +322,10 @@ class OpenAIClient(CachingClient):
             # OpenAI error: Error code: 400 - {'error': {'message': "[{'type': 'string_type', 'loc': ('body', 'stop', 'str'), 'msg': 'Input should be a valid string', 'input': None}, {'type': 'list_type', 'loc': ('body', 'stop', 'list[str]'), 'msg': 'Input should be a valid list', 'input': None}, {'type': 'list_type', 'loc': ('body', 'stop', 'list[list[int]]'), 'msg': 'Input should be a valid list', 'input': None}]", 'type': 'invalid_request_error', 'param': None, 'code': None}}  # noqa: 3501
             if raw_request["stop"] is None:
                 raw_request.pop("stop")
+        return raw_request
+
+    def _make_chat_request(self, request: Request) -> RequestResult:
+        raw_request = self._make_chat_raw_request(request)
 
         def do_it() -> Dict[str, Any]:
             return self.client.chat.completions.create(**raw_request).model_dump(mode="json")
@@ -266,49 +334,7 @@ class OpenAIClient(CachingClient):
             cache_key = self._get_cache_key(raw_request, request)
             response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
         except openai.OpenAIError as e:
-            if self.INAPPROPRIATE_IMAGE_ERROR in str(e) or self.INAPPROPRIATE_PROMPT_ERROR in str(e):
-                hlog(f"Failed safety check: {str(request)}")
-                empty_completion = GeneratedOutput(
-                    text="",
-                    logprob=0,
-                    tokens=[],
-                    finish_reason={"reason": self.CONTENT_POLICY_VIOLATED_FINISH_REASON},
-                )
-                return RequestResult(
-                    success=True,
-                    cached=False,
-                    request_time=0,
-                    completions=[empty_completion] * request.num_completions,
-                    embedding=[],
-                )
-            elif self.OPENAI_SERVER_ERROR in str(e):
-                # Handle these errors by returning an empty completion to unblock
-                hlog(f"OpenAI server error for request: {str(request)}")
-                empty_completion = GeneratedOutput(
-                    text="",
-                    logprob=0,
-                    tokens=[],
-                    finish_reason={"reason": self.OPENAI_SERVER_ERROR},
-                )
-                return RequestResult(
-                    success=True,
-                    cached=False,
-                    request_time=0,
-                    completions=[empty_completion] * request.num_completions,
-                    embedding=[],
-                )
-            elif self.INAPPROPRIATE_PROMPT_AZURE_ERROR in str(e) or self.INAPPROPRIATE_PROMPT_MICROSOFT_ERROR in str(e):
-                return RequestResult(
-                    success=False,
-                    cached=False,
-                    error="Content blocked by Azure's content management filter",
-                    completions=[],
-                    embedding=[],
-                    error_flags=ErrorFlags(is_retriable=False, is_fatal=False),
-                )
-
-            error: str = f"OpenAI error: {e}"
-            return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
+            return OpenAIClientUtils.handle_openai_error(e, request)
 
         completions: List[GeneratedOutput] = []
         for raw_completion in response["choices"]:
@@ -338,11 +364,20 @@ class OpenAIClient(CachingClient):
             tokens: List[Token] = [
                 Token(text=cast(str, raw_token), logprob=0) for raw_token in tokenization_result.raw_tokens
             ]
+            # vLLM has a optional `reasoning_content` field in the message
+            # that is not in the standard OpenAI API.
+            # This field is also used by some model providers such as Grok.
+            thinking = (
+                Thinking(text=raw_completion["message"]["reasoning_content"])
+                if "reasoning_content" in raw_completion["message"]
+                else None
+            )
             completion = GeneratedOutput(
                 text=text,
                 logprob=0,  # OpenAI does not provide logprobs
                 tokens=tokens,
                 finish_reason={"reason": raw_completion["finish_reason"]},
+                thinking=thinking,
             )
             completions.append(truncate_sequence(completion, request))  # Truncate the text by stop sequences
 
@@ -459,7 +494,7 @@ class OpenAIClient(CachingClient):
     def make_request(self, request: Request) -> RequestResult:
         if request.embedding:
             return self._make_embedding_request(request)
-        elif "whisper" in request.model_engine:
+        elif "whisper" in request.model_engine or "transcribe" in request.model_engine:
             return self._make_transcription_request(request)
         else:
             return self._make_chat_request(request)
@@ -536,6 +571,18 @@ class OpenAITranscriptionThenCompletionClient(Client):
         # Now make the request to the completion model with just a text-only prompt and no audio
         # Use the same decoding parameters as the original request
         # Ensure to set multimodal_prompt to None so the request is treated as text-only.
-        return self._openai_client.make_request(
+        request_result: RequestResult = self._openai_client.make_request(
             replace(request, prompt=text_prompt, model=f"openai/{completion_model}", multimodal_prompt=None)
         )
+
+        # Also include the generated transcript to the request result
+        completions_with_transcript: List[GeneratedOutput] = [
+            replace(
+                completion,
+                multimodal_content=MultimediaObject(
+                    media_objects=[MediaObject(text=text_prompt, content_type="text/plain")]
+                ),
+            )
+            for completion in request_result.completions
+        ]
+        return replace(request_result, completions=completions_with_transcript)
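For orientation, the new OpenAIClientUtils.is_reasoning_model helper classifies model engines purely by name. A minimal standalone sketch of its behavior, using the same regular expression as the diff above (the model names are illustrative examples, not a list taken from the package):

import re

def is_reasoning_model(model_engine: str) -> bool:
    # Mirrors the helper added above: engine names starting with "o" followed by a
    # digit (o1, o3-mini, ...) are treated as reasoning models.
    return bool(re.match(r"^o\d+", model_engine))

assert is_reasoning_model("o1-preview")
assert is_reasoning_model("o3-mini")
assert not is_reasoning_model("gpt-4o")  # the digit does not directly follow a leading "o"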
helm/clients/openai_responses_client.py
ADDED
@@ -0,0 +1,176 @@
+# mypy: check_untyped_defs = False
+import dataclasses
+from typing import Any, Dict, List, Optional, Union
+
+
+from helm.clients.openai_client import OpenAIClientUtils
+from helm.common.cache import CacheConfig
+from helm.common.media_object import TEXT_TYPE
+from helm.common.request import (
+    Thinking,
+    wrap_request_time,
+    Request,
+    RequestResult,
+    GeneratedOutput,
+)
+from helm.common.optional_dependencies import handle_module_not_found_error
+from helm.clients.client import (
+    CachingClient,
+    truncate_and_tokenize_response_text,
+    generate_uid_for_multimodal_prompt,
+)
+from helm.tokenizers.tokenizer import Tokenizer
+
+try:
+    import openai
+    from openai import OpenAI
+except ModuleNotFoundError as e:
+    handle_module_not_found_error(e, ["openai"])
+
+
+class OpenAIResponseClient(CachingClient):
+    def __init__(
+        self,
+        tokenizer: Tokenizer,
+        tokenizer_name: str,
+        cache_config: CacheConfig,
+        api_key: Optional[str] = None,
+        org_id: Optional[str] = None,
+        base_url: Optional[str] = None,
+        reasoning_effort: Optional[str] = None,
+        openai_model_name: Optional[str] = None,
+    ):
+        super().__init__(cache_config=cache_config)
+        self.tokenizer = tokenizer
+        self.tokenizer_name = tokenizer_name
+        self.client = OpenAI(
+            api_key=api_key,
+            organization=org_id,
+            base_url=base_url,
+        )
+        self.reasoning_effort = reasoning_effort
+        self.openai_model_name = openai_model_name
+
+    def _get_cache_key(self, raw_request: Dict, request: Request):
+        cache_key = CachingClient.make_cache_key(raw_request, request)
+        if request.multimodal_prompt:
+            prompt_key: str = generate_uid_for_multimodal_prompt(request.multimodal_prompt)
+            cache_key = {**cache_key, "multimodal_prompt": prompt_key}
+        return cache_key
+
+    def _make_raw_request(self, request: Request) -> dict[str, Any]:
+        input: Union[str, List[Dict[str, Any]]]
+        if request.multimodal_prompt is not None:
+            content = []
+            request.validate()
+            for media_object in request.multimodal_prompt.media_objects:
+                if media_object.is_type("image") and media_object.location:
+                    from helm.common.images_utils import encode_base64
+
+                    base64_image: str = encode_base64(media_object.location)
+                    content.append(
+                        {
+                            "type": "input_image",
+                            "image_url": f"data:image/jpeg;base64,{base64_image}",
+                        }
+                    )
+                elif media_object.is_type(TEXT_TYPE):
+                    assert media_object.text is not None
+                    content.append({"type": "input_text", "text": media_object.text})
+                else:
+                    raise ValueError(f"Unrecognized MediaObject type {media_object.type}")
+            input = [{"role": "user", "content": content}]
+        else:
+            input = request.prompt
+
+        raw_request: Dict[str, Any] = {
+            "model": self._get_model_for_request(request),
+            "input": input,
+            "top_p": request.top_p,
+            # API errors if max_output_tokens is less than 16
+            # (Error you get: "Invalid 'max_output_tokens': integer below minimum value.
+            # Expected a value >= 16, but got 5 instead.")
+            "max_output_tokens": max(16, request.max_tokens),
+            "temperature": request.temperature,
+            # Don't store responses for later retrieval
+            "store": False,
+        }
+        if self.reasoning_effort:
+            raw_request["reasoning"] = {"effort": self.reasoning_effort}
+        # If o-series model, get reasoning summaries
+        # Plus other changes
+        model_engine: str = request.model_engine
+        if OpenAIClientUtils.is_reasoning_model(model_engine):
+            raw_request["reasoning"]["summary"] = "detailed"
+            # Avoid error:
+            # "Error code: 400 - {'error': {'message': "Unsupported parameter: 'temperature' is
+            # not supported with this model.", 'type': 'invalid_request_error', 'param': 'temperature',
+            # 'code': 'unsupported_parameter'}}"
+            raw_request.pop("temperature", None)
+
+            # The following parameters also happen to be unsupported by the o-series (code unsupported_parameter)
+            raw_request.pop("top_p", None)
+
+        return raw_request
+
+    def _get_model_for_request(self, request: Request) -> str:
+        return self.openai_model_name or request.model_engine
+
+    def make_request(self, request: Request) -> RequestResult:
+        # Content can either be text or a list of multimodal content made up of text and images:
+        # https://platform.openai.com/docs/api-reference/responses/create
+        raw_request = self._make_raw_request(request)
+
+        # The responses API does not support a "num_completions" parameter,
+        # so we need to handle it ourselves with a simple loop
+        completions: list[GeneratedOutput] = []
+        for _ in range(request.num_completions):
+
+            def do_it() -> Dict[str, Any]:
+                raw_response = self.client.responses.create(**raw_request).model_dump(mode="json")
+                assert not raw_response.get("error", None), f"Error in response: {raw_response}"
+                return raw_response
+
+            try:
+                cache_key = self._get_cache_key(raw_request, request)
+                response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
+            except openai.OpenAIError as e:
+                return OpenAIClientUtils.handle_openai_error(e, request)
+
+            # We can only return one completition really,
+            # but we get an array of messages back, so we need to contact them
+            reasoning_output = ""
+            text_output = ""
+
+            if request.echo_prompt:
+                text_output += request.prompt
+            for output in response["output"]:
+                output_type = output[
+                    "type"
+                ]  # one of "message" or "reasoning" from API observation, but can also include tool calls
+
+                if output_type == "reasoning":
+                    reasoning_output += "\n".join([raw_output["text"] for raw_output in output["summary"]])
+                elif output_type == "message":
+                    text_output += "\n".join([raw_output["text"] for raw_output in output["content"]])
+                # (Other output types are ignored)
+
+            completion = truncate_and_tokenize_response_text(
+                text_output,
+                request,
+                self.tokenizer,
+                self.tokenizer_name,
+                original_finish_reason="",
+            )
+            if reasoning_output:
+                completion = dataclasses.replace(completion, thinking=Thinking(text=reasoning_output))
+            completions.append(completion)
+
+        return RequestResult(
+            success=True,
+            cached=cached,
+            request_time=response["request_time"],
+            request_datetime=response.get("request_datetime"),
+            completions=completions,
+            embedding=[],
+        )
helm/clients/palmyra_client.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Any, Dict, List
 
 from helm.clients.openai_client import OpenAIClient
 from helm.common.cache import CacheConfig
-from helm.common.hierarchical_logger import hlog
+from helm.common.hierarchical_logger import hwarn
 from helm.common.request import wrap_request_time, Request, RequestResult, GeneratedOutput, Token, ErrorFlags
 from helm.common.tokenization_request import (
     TokenizationRequest,
@@ -103,10 +103,7 @@ class PalmyraClient(CachingClient):
             return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
 
         if _is_content_moderation_failure(response):
-            hlog(
-                f"WARNING: Returning empty request for {request.model_deployment} "
-                "due to content moderation filter"
-            )
+            hwarn(f"Returning empty request for {request.model_deployment} " "due to content moderation filter")
             return RequestResult(
                 success=False,
                 cached=False,
helm/clients/reka_client.py
CHANGED
@@ -6,7 +6,7 @@ from helm.proxy.retry import NonRetriableException
 from helm.common.cache import CacheConfig
 from helm.common.media_object import TEXT_TYPE
 from helm.common.request import wrap_request_time, Request, RequestResult, GeneratedOutput
-from helm.common.hierarchical_logger import hlog
+from helm.common.hierarchical_logger import hwarn
 from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.tokenizers.tokenizer import Tokenizer
 from helm.clients.client import CachingClient, truncate_and_tokenize_response_text
@@ -121,7 +121,7 @@ class RekaClient(CachingClient):
             if messages[-1]["role"] != "user":
                 raise ValueError("Last message must have role 'user'")
             if request.prompt != "":
-                hlog("Since message is set, prompt will be ignored")
+                hwarn("Since message is set, prompt will be ignored")
             reka_chat_history = self._convert_messages_to_reka_chat_history(messages)
         else:
             current_chat_history: Dict[str, Any] = {
helm/clients/test_huggingface_client.py
CHANGED
@@ -9,7 +9,7 @@ from helm.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer
 class TestHuggingFaceClient:
     def test_gpt2(self):
         tokenizer = HuggingFaceTokenizer(
-            BlackHoleCacheConfig(), "huggingface/gpt2", pretrained_model_name_or_path="openai/gpt2"
+            BlackHoleCacheConfig(), "huggingface/gpt2", pretrained_model_name_or_path="openai-community/gpt2"
         )
         client = HuggingFaceClient(
             cache_config=BlackHoleCacheConfig(),
@@ -36,7 +36,7 @@ class TestHuggingFaceClient:
     @pytest.mark.skip(reason="GPT-J 6B is 22 GB and extremely slow without a GPU.")
     def test_gptj_6b(self):
         tokenizer = HuggingFaceTokenizer(
-            BlackHoleCacheConfig(), "huggingface/gpt2", pretrained_model_name_or_path="openai/gpt2"
+            BlackHoleCacheConfig(), "huggingface/gpt2", pretrained_model_name_or_path="openai-community/gpt2"
         )
         client = HuggingFaceClient(
             cache_config=BlackHoleCacheConfig(),
@@ -57,7 +57,7 @@ class TestHuggingFaceClient:
 
     def test_logprob(self):
         tokenizer = HuggingFaceTokenizer(
-            BlackHoleCacheConfig(), "huggingface/gpt2", pretrained_model_name_or_path="openai/gpt2"
+            BlackHoleCacheConfig(), "huggingface/gpt2", pretrained_model_name_or_path="openai-community/gpt2"
         )
         client = HuggingFaceClient(
             cache_config=BlackHoleCacheConfig(),
helm/clients/together_client.py
CHANGED
@@ -1,7 +1,8 @@
 from copy import deepcopy
 from itertools import zip_longest
+import re
 import threading
-from typing import Callable, List, Dict, Any, Mapping, Optional, TypedDict, Union
+from typing import Callable, List, Dict, Any, Mapping, Optional, Tuple, TypedDict, Union
 from typing_extensions import NotRequired
 
 import requests
@@ -11,7 +12,7 @@ from helm.common.cache import CacheConfig
 from helm.common.media_object import IMAGE_TYPE, TEXT_TYPE
 from helm.common.object_spec import get_class_by_name
 from helm.common.optional_dependencies import handle_module_not_found_error
-from helm.common.request import wrap_request_time, Request, RequestResult, GeneratedOutput, Token
+from helm.common.request import Thinking, wrap_request_time, Request, RequestResult, GeneratedOutput, Token
 from helm.clients.client import CachingClient, truncate_sequence, cleanup_str
 
 try:
@@ -24,8 +25,6 @@ except ModuleNotFoundError as e:
 class _RewriteRequestTags:
     """Tags that indicate that the request for the model must be rewritten before sending to Together."""
 
-    # TODO: Convert to StrEnum after upgrading to Python 3.11
-
     ADD_EOS_TOKEN_AS_STOP_SEQUENCE = "ADD_EOS_TOKEN_AS_STOP_SEQUENCE"
     """Indicates that the EOS token should be added as an extra stop sequence.
 
@@ -100,6 +99,19 @@ class JobNotFinishedError(TogetherClientError):
     pass
 
 
+def _parse_thinking(input: str) -> Tuple[str, str]:
+    """Return a tuple of thinking text and output text."""
+    match = re.match(r"<think>\n(.*)\n</think>\n{0,2}(.*)", input, re.DOTALL)
+    if match:
+        return (match.group(1), match.group(2))
+
+    match = re.match(r"<think>\n?(.*)", input, re.DOTALL)
+    if match:
+        return (match.group(1), "")
+
+    return (input, "")
+
+
 class TogetherClient(CachingClient):
     """
     Client for the models where we evaluate offline. Since the queries are handled offline, the `TogetherClient` just
@@ -328,12 +340,14 @@ class TogetherChatClient(CachingClient):
         together_model: Optional[str] = None,
         disable_logprobs: Optional[bool] = None,
         output_processor: Optional[str] = None,
+        parse_thinking: Optional[bool] = None,
     ):
         super().__init__(cache_config=cache_config)
         self._client = Together(api_key=api_key)
         self._together_model = together_model
         self._disable_logprobs = bool(disable_logprobs)
         # self.output_processor is actually a function, not a class
+        self._parse_thinking = bool(parse_thinking)
 
         self.output_processor: Optional[Callable[[str], str]] = (
             get_class_by_name(output_processor) if output_processor else None
@@ -424,11 +438,21 @@ class TogetherChatClient(CachingClient):
                 if token_text is None:
                     break
                 tokens.append(Token(text=token_text, logprob=token_logprob or 0.0))
+            logprob = sum([token.logprob for token in tokens]) if tokens else 0.0
             assert choice.message.role == "assistant"
             output_text = choice.message.content
             if self.output_processor:
                 output_text = self.output_processor(output_text)
-
+
+            if self._parse_thinking:
+                thinking_text, output_text = _parse_thinking(output_text)
+                generated_outputs.append(
+                    GeneratedOutput(
+                        text=output_text, logprob=logprob, tokens=tokens, thinking=Thinking(text=thinking_text)
+                    )
+                )
+            else:
+                generated_outputs.append(GeneratedOutput(text=output_text, logprob=logprob, tokens=tokens))
         return RequestResult(
             success=True,
             cached=cached,
@@ -521,8 +545,9 @@ class TogetherCompletionClient(CachingClient):
                 if token_text is None:
                     break
                 tokens.append(Token(text=token_text, logprob=token_logprob or 0.0))
+            logprob = sum([token.logprob for token in tokens]) if tokens else 0.0
             assert choice.text
-            generated_outputs.append(GeneratedOutput(text=choice.text, logprob=
+            generated_outputs.append(GeneratedOutput(text=choice.text, logprob=logprob, tokens=tokens))
         return RequestResult(
             success=True,
             cached=cached,
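The new _parse_thinking helper splits chat output that wraps its chain of thought in <think> tags into a (thinking, answer) pair. A minimal standalone sketch of its behavior, using the same regexes as the diff above on illustrative strings:

import re
from typing import Tuple

def _parse_thinking(input: str) -> Tuple[str, str]:
    """Return a tuple of thinking text and output text (same regexes as the helper above)."""
    match = re.match(r"<think>\n(.*)\n</think>\n{0,2}(.*)", input, re.DOTALL)
    if match:
        return (match.group(1), match.group(2))
    match = re.match(r"<think>\n?(.*)", input, re.DOTALL)
    if match:
        return (match.group(1), "")
    return (input, "")

# Closed tag: thinking and answer are separated.
assert _parse_thinking("<think>\nstep by step\n</think>\n\nfinal answer") == ("step by step", "final answer")
# Unclosed tag (e.g. truncated generation): everything after <think> counts as thinking.
assert _parse_thinking("<think>\nstill reasoning") == ("still reasoning", "")
# No tag: the whole string is treated as the answer.
assert _parse_thinking("plain answer") == ("plain answer", "")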
helm/clients/vertexai_client.py
CHANGED
@@ -1,7 +1,7 @@
 import requests
 from abc import ABC, abstractmethod
 from threading import Lock
-from typing import Any, Dict, Mapping, Optional, List, Union
+from typing import Any, Dict, Mapping, Optional, List, Union, cast
 
 from helm.common.cache import CacheConfig
 from helm.common.multimodal_request_utils import get_contents_as_bytes
@@ -107,7 +107,7 @@ class VertexAITextClient(VertexAIClient):
 
     def make_request(self, request: Request) -> RequestResult:
         """Make a request"""
-        parameters = {
+        parameters: Dict[str, Any] = {
             "temperature": request.temperature,
             "max_output_tokens": request.max_tokens,
             "top_k": request.top_k_per_token,
@@ -207,21 +207,23 @@ class VertexAIChatClient(VertexAIClient):
 
     def make_request(self, request: Request) -> RequestResult:
         """Make a request"""
-
+        # mypy is unhappy without this cast
+        contents: Union[List[Union[str, Image, Part]], List[Content]] = cast(
+            List[Union[str, Image, Part]], [request.prompt]
+        )
 
         # For the multimodal case, build up the content with the media objects of `request.multimodal_prompt`
         if request.multimodal_prompt is not None:
             return self._make_multimodal_request(request)
 
         if request.messages is not None:
-            contents = []
             role_mapping = {"user": "user", "assistant": "model"}
-
-
-
-
+            contents = [
+                Content(role=role_mapping.get(msg["role"], "user"), parts=[Part.from_text(msg["content"])])
+                for msg in request.messages
+            ]
 
-        parameters = {
+        parameters: Dict[str, Any] = {
             "temperature": request.temperature,
             "max_output_tokens": request.max_tokens,
             "top_k": request.top_k_per_token,
@@ -360,6 +362,12 @@ class VertexAIChatClient(VertexAIClient):
         for media_object in request.multimodal_prompt.media_objects:
             if media_object.is_type("image") and media_object.location:
                 contents.append(Part.from_image(Image.load_from_file(media_object.location)))
+            elif media_object.is_type("video") and media_object.location:
+                # Following this example
+                # https://cloud.google.com/vertex-ai/generative-ai/docs/samples/googlegenaisdk-textgen-with-local-video
+                with open(media_object.location, "rb") as fp:
+                    video_content = fp.read()
+                contents.append(Part.from_data(data=video_content, mime_type=media_object.content_type))
             elif media_object.is_type("audio") and media_object.location:
                 contents.append(
                     Part.from_data(get_contents_as_bytes(media_object.location), mime_type=media_object.content_type)