crfm-helm 0.5.5__py3-none-any.whl → 0.5.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/METADATA +74 -53
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/RECORD +262 -182
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +3 -3
- helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
- helm/benchmark/annotation/air_bench_annotator.py +2 -2
- helm/benchmark/annotation/bigcodebench_annotator.py +3 -3
- helm/benchmark/annotation/bird_sql_annotator.py +2 -2
- helm/benchmark/annotation/chw_care_plan_annotator.py +7 -12
- helm/benchmark/annotation/ehr_sql_annotator.py +2 -2
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +7 -7
- helm/benchmark/annotation/live_qa_annotator.py +1 -1
- helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
- helm/benchmark/annotation/model_as_judge.py +12 -16
- helm/benchmark/annotation/omni_math_annotator.py +13 -14
- helm/benchmark/annotation/wildbench_annotator.py +9 -9
- helm/benchmark/executor.py +11 -12
- helm/benchmark/metrics/aci_bench_metrics.py +9 -29
- helm/benchmark/metrics/bias_word_lists.py +1 -1
- helm/benchmark/metrics/chw_care_plan_metrics.py +10 -30
- helm/benchmark/metrics/classification_metrics.py +3 -3
- helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
- helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
- helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
- helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
- helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
- helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
- helm/benchmark/metrics/comet_metric.py +1 -1
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +2 -2
- helm/benchmark/metrics/copyright_metrics.py +1 -1
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
- helm/benchmark/metrics/dischargeme_metrics.py +9 -29
- helm/benchmark/metrics/efficiency_metrics.py +3 -3
- helm/benchmark/metrics/evaluate_reference_metrics.py +1 -1
- helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
- helm/benchmark/metrics/ifeval_metrics.py +2 -2
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
- helm/benchmark/metrics/llm_jury_metrics.py +46 -0
- helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
- helm/benchmark/metrics/lmkt_metrics.py +47 -0
- helm/benchmark/metrics/med_dialog_metrics.py +9 -29
- helm/benchmark/metrics/medalign_metrics.py +9 -29
- helm/benchmark/metrics/medi_qa_metrics.py +9 -29
- helm/benchmark/metrics/medication_qa_metrics.py +10 -30
- helm/benchmark/metrics/melt_bias_metric.py +234 -0
- helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
- helm/benchmark/metrics/melt_metric_specs.py +43 -0
- helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
- helm/benchmark/metrics/mental_health_metrics.py +9 -29
- helm/benchmark/metrics/metric_service.py +11 -11
- helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
- helm/benchmark/metrics/mimic_rrs_metrics.py +9 -29
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +9 -29
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +9 -29
- helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
- helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +9 -29
- helm/benchmark/metrics/summac/model_summac.py +2 -3
- helm/benchmark/metrics/summarization_metrics.py +2 -1
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +2 -2
- helm/benchmark/metrics/toxicity_metrics.py +2 -2
- helm/benchmark/metrics/unitxt_metrics.py +3 -4
- helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
- helm/benchmark/metrics/vision_language/image_utils.py +2 -2
- helm/benchmark/model_deployment_registry.py +16 -26
- helm/benchmark/presentation/contamination.py +3 -3
- helm/benchmark/presentation/create_plots.py +43 -13
- helm/benchmark/presentation/run_display.py +13 -0
- helm/benchmark/presentation/schema.py +7 -1
- helm/benchmark/presentation/summarize.py +84 -61
- helm/benchmark/presentation/test_create_plots.py +4 -1
- helm/benchmark/reeval_run.py +3 -4
- helm/benchmark/reeval_runner.py +3 -3
- helm/benchmark/run.py +84 -73
- helm/benchmark/run_expander.py +12 -1
- helm/benchmark/run_spec_factory.py +7 -6
- helm/benchmark/run_specs/arabic_run_specs.py +73 -0
- helm/benchmark/run_specs/audio_run_specs.py +52 -8
- helm/benchmark/run_specs/bluex_run_specs.py +40 -0
- helm/benchmark/run_specs/classic_run_specs.py +0 -53
- helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
- helm/benchmark/run_specs/enterprise_run_specs.py +20 -0
- helm/benchmark/run_specs/experimental_run_specs.py +31 -1
- helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
- helm/benchmark/run_specs/heim_run_specs.py +3 -1
- helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
- helm/benchmark/run_specs/long_context_run_specs.py +114 -15
- helm/benchmark/run_specs/medhelm_run_specs.py +146 -41
- helm/benchmark/run_specs/melt_run_specs.py +783 -0
- helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +163 -0
- helm/benchmark/run_specs/vlm_run_specs.py +28 -0
- helm/benchmark/runner.py +5 -5
- helm/benchmark/scenarios/aci_bench_scenario.py +7 -1
- helm/benchmark/scenarios/alghafa_scenario.py +126 -0
- helm/benchmark/scenarios/arabic_mmlu_scenario.py +78 -0
- helm/benchmark/scenarios/aratrust_scenario.py +76 -0
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +3 -1
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +5 -5
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +104 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +118 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +86 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +117 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +15 -1
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +1 -2
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +2 -2
- helm/benchmark/scenarios/bluex_scenario.py +66 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +14 -13
- helm/benchmark/scenarios/clear_scenario.py +11 -7
- helm/benchmark/scenarios/cleva_scenario.py +1 -1
- helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
- helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
- helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
- helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
- helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
- helm/benchmark/scenarios/dischargeme_scenario.py +36 -21
- helm/benchmark/scenarios/ehr_sql_scenario.py +7 -1
- helm/benchmark/scenarios/ehrshot_scenario.py +28 -55
- helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
- helm/benchmark/scenarios/grammar.py +2 -2
- helm/benchmark/scenarios/headqa_scenario.py +6 -1
- helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
- helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +90 -0
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
- helm/benchmark/scenarios/{infinite_bench_sum_scenario.py → infinite_bench_en_sum_scenario.py} +10 -13
- helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
- helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
- helm/benchmark/scenarios/math_scenario.py +21 -20
- helm/benchmark/scenarios/med_dialog_scenario.py +6 -1
- helm/benchmark/scenarios/medalign_scenario.py +9 -3
- helm/benchmark/scenarios/medalign_scenario_helper.py +27 -130
- helm/benchmark/scenarios/medbullets_scenario.py +7 -2
- helm/benchmark/scenarios/medcalc_bench_scenario.py +4 -2
- helm/benchmark/scenarios/medec_scenario.py +6 -1
- helm/benchmark/scenarios/medhallu_scenario.py +7 -1
- helm/benchmark/scenarios/medi_qa_scenario.py +10 -4
- helm/benchmark/scenarios/medication_qa_scenario.py +7 -1
- helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
- helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
- helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
- helm/benchmark/scenarios/melt_scenarios.py +793 -0
- helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
- helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
- helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
- helm/benchmark/scenarios/mental_health_scenario.py +16 -5
- helm/benchmark/scenarios/mimic_bhc_scenario.py +13 -8
- helm/benchmark/scenarios/mimic_rrs_scenario.py +17 -8
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +14 -8
- helm/benchmark/scenarios/mmlu_pro_scenario.py +1 -1
- helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +5 -2
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +3 -2
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +11 -5
- helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
- helm/benchmark/scenarios/pubmed_qa_scenario.py +6 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +18 -8
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +2 -2
- helm/benchmark/scenarios/ruler_qa_scenarios.py +2 -2
- helm/benchmark/scenarios/seahelm_scenario.py +2 -2
- helm/benchmark/scenarios/shc_bmt_scenario.py +12 -6
- helm/benchmark/scenarios/shc_cdi_scenario.py +11 -6
- helm/benchmark/scenarios/shc_conf_scenario.py +12 -6
- helm/benchmark/scenarios/shc_ent_scenario.py +11 -6
- helm/benchmark/scenarios/shc_gip_scenario.py +13 -5
- helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +12 -7
- helm/benchmark/scenarios/shc_sei_scenario.py +12 -7
- helm/benchmark/scenarios/shc_sequoia_scenario.py +13 -5
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +15 -8
- helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
- helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
- helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
- helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
- helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
- helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
- helm/benchmark/scenarios/truthful_qa_scenario.py +2 -1
- helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
- helm/benchmark/server.py +2 -1
- helm/benchmark/slurm_jobs.py +1 -2
- helm/benchmark/slurm_runner.py +8 -1
- helm/benchmark/static/schema_arabic.yaml +228 -0
- helm/benchmark/static/schema_audio.yaml +60 -49
- helm/benchmark/static/schema_classic.yaml +0 -17
- helm/benchmark/static/schema_enterprise.yaml +21 -0
- helm/benchmark/static/schema_long_context.yaml +81 -20
- helm/benchmark/static/schema_medhelm.yaml +272 -213
- helm/benchmark/static/schema_melt.yaml +1257 -0
- helm/benchmark/static/schema_slphelm.yaml +162 -0
- helm/benchmark/static/schema_vhelm.yaml +26 -26
- helm/benchmark/static/schema_video.yaml +219 -0
- helm/benchmark/static_build/assets/index-b9779128.css +1 -0
- helm/benchmark/static_build/assets/index-e439d5e1.js +10 -0
- helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
- helm/benchmark/static_build/assets/{tremor-9cefc3c5.js → tremor-38a10867.js} +1 -1
- helm/benchmark/static_build/index.html +4 -4
- helm/benchmark/window_services/encoder_decoder_window_service.py +3 -3
- helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
- helm/benchmark/window_services/test_utils.py +3 -4
- helm/benchmark/window_services/tokenizer_service.py +7 -8
- helm/clients/anthropic_client.py +69 -29
- helm/clients/audio_language/diva_llama_client.py +4 -2
- helm/clients/audio_language/qwen2_5_omni_client.py +209 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +8 -6
- helm/clients/audio_language/qwen_audiolm_client.py +4 -2
- helm/clients/audio_language/test.py +62 -0
- helm/clients/bedrock_client.py +3 -1
- helm/clients/client.py +7 -7
- helm/clients/grok_client.py +36 -0
- helm/clients/huggingface_client.py +42 -3
- helm/clients/huggingface_pipeline_client.py +138 -0
- helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
- helm/clients/image_generation/dalle_mini/model/modeling.py +1 -1
- helm/clients/image_generation/dalle_mini/model/processor.py +1 -1
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
- helm/clients/openai_client.py +102 -55
- helm/clients/openai_responses_client.py +176 -0
- helm/clients/palmyra_client.py +2 -5
- helm/clients/reka_client.py +2 -2
- helm/clients/test_huggingface_client.py +3 -3
- helm/clients/together_client.py +31 -6
- helm/clients/vertexai_client.py +17 -9
- helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
- helm/clients/vision_language/huggingface_vlm_client.py +2 -2
- helm/clients/vision_language/idefics_client.py +6 -2
- helm/clients/vision_language/paligemma_client.py +2 -2
- helm/clients/vision_language/qwen2_vlm_client.py +66 -53
- helm/clients/vision_language/qwen_vlm_client.py +7 -5
- helm/clients/vllm_client.py +43 -7
- helm/clients/vllm_granite_thinking_client.py +56 -0
- helm/clients/writer_client.py +102 -0
- helm/common/context.py +80 -0
- helm/common/credentials_utils.py +5 -5
- helm/common/critique_request.py +0 -1
- helm/common/general.py +9 -2
- helm/common/hierarchical_logger.py +104 -12
- helm/common/local_context.py +140 -0
- helm/common/object_spec.py +23 -8
- helm/common/remote_context.py +61 -0
- helm/common/request.py +8 -0
- helm/common/test_logging.py +94 -0
- helm/config/model_deployments.yaml +995 -45
- helm/config/model_metadata.yaml +780 -59
- helm/config/tokenizer_configs.yaml +224 -3
- helm/proxy/cli.py +4 -2
- helm/proxy/critique/mechanical_turk_utils.py +1 -1
- helm/proxy/retry.py +5 -0
- helm/proxy/services/server_service.py +21 -85
- helm/tokenizers/grok_tokenizer.py +55 -0
- helm/tokenizers/huggingface_tokenizer.py +1 -1
- helm/tokenizers/test_grok_tokenizer.py +33 -0
- helm/benchmark/metrics/numeracy_metrics.py +0 -72
- helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
- helm/benchmark/scenarios/numeracy_scenario.py +0 -793
- helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +0 -46
- helm/benchmark/static_build/assets/index-262903c1.js +0 -10
- helm/benchmark/static_build/assets/index-42060d71.css +0 -1
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/licenses/LICENSE +0 -0
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/top_level.txt +0 -0
- /helm/benchmark/static_build/assets/{medhelm-overview-3ddfcd65.png → medhelm-v1-overview-3ddfcd65.png} +0 -0
helm/clients/audio_language/test.py
ADDED
@@ -0,0 +1,62 @@
+import soundfile as sf
+
+from transformers import Qwen2_5OmniModel, Qwen2_5OmniProcessor  # type: ignore
+from qwen_omni_utils import process_mm_info
+
+# default: Load the model on the available device(s)
+model = Qwen2_5OmniModel.from_pretrained("Qwen/Qwen2.5-Omni-7B", torch_dtype="auto", device_map="auto")
+
+# We recommend enabling flash_attention_2 for better acceleration and memory saving.
+# model = Qwen2_5OmniModel.from_pretrained(
+#     "Qwen/Qwen2.5-Omni-7B",
+#     torch_dtype="auto",
+#     device_map="auto",
+#     attn_implementation="flash_attention_2",
+# )
+
+processor = Qwen2_5OmniProcessor.from_pretrained("Qwen/Qwen2.5-Omni-7B")
+
+conversation = [
+    {
+        "role": "system",
+        "content": (
+            "You are Qwen, a virtual human developed by the Qwen Team,"
+            " Alibaba Group, capable of perceiving auditory and visual"
+            " inputs, as well as generating text and speech."
+        ),
+    },
+    {
+        "role": "user",
+        "content": [
+            {"type": "video", "video": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-Omni/draw.mp4"},
+        ],
+    },
+]
+
+# set use audio in video
+USE_AUDIO_IN_VIDEO = True
+
+# Preparation for inference
+text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
+audios, images, videos = process_mm_info(conversation, use_audio_in_video=USE_AUDIO_IN_VIDEO)
+inputs = processor(
+    text=text,
+    audios=audios,
+    images=images,
+    videos=videos,
+    return_tensors="pt",
+    padding=True,
+    use_audio_in_video=USE_AUDIO_IN_VIDEO,
+)
+inputs = inputs.to(model.device).to(model.dtype)
+
+# Inference: Generation of the output text and audio
+text_ids, audio = model.generate(**inputs, use_audio_in_video=USE_AUDIO_IN_VIDEO)
+
+text = processor.batch_decode(text_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
+print(text)
+sf.write(
+    "output.wav",
+    audio.reshape(-1).detach().cpu().numpy(),
+    samplerate=24000,
+)
helm/clients/bedrock_client.py
CHANGED
@@ -117,10 +117,12 @@ class BedrockNovaClient(CachingClient):
         tokenizer_name: str,
         assumed_role: Optional[str] = None,
         region: Optional[str] = None,
+        bedrock_model_id: Optional[str] = None,
     ):
         super().__init__(cache_config=cache_config)
         self.tokenizer = tokenizer
         self.tokenizer_name = tokenizer_name
+        self.bedrock_model_id = bedrock_model_id
         self.bedrock_client = get_bedrock_client_v1(
             assumed_role=assumed_role or os.environ.get("BEDROCK_ASSUME_ROLE", None),
             region=region,
@@ -144,7 +146,7 @@ class BedrockNovaClient(CachingClient):
         messages = self._get_messages_from_request(request)

         return {
-            "modelId": model_id,
+            "modelId": self.bedrock_model_id or model_id,
             "inferenceConfig": {
                 "temperature": request.temperature,
                 "maxTokens": request.max_tokens,
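The new `bedrock_model_id` argument lets a model deployment pin the exact `modelId` sent in the request body instead of the ID derived from the HELM model name. A minimal sketch of the resulting precedence, with hypothetical IDs (the helper below is illustrative, not part of the package):

from typing import Optional


def resolve_model_id(bedrock_model_id: Optional[str], derived_model_id: str) -> str:
    # Mirrors the `self.bedrock_model_id or model_id` fallback added in the hunk above.
    return bedrock_model_id or derived_model_id


# Hypothetical IDs: an explicitly configured ID wins over the derived one.
assert resolve_model_id(None, "amazon.nova-lite-v1:0") == "amazon.nova-lite-v1:0"
assert resolve_model_id("us.amazon.nova-lite-v1:0", "amazon.nova-lite-v1:0") == "us.amazon.nova-lite-v1:0"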
helm/clients/client.py
CHANGED
@@ -2,7 +2,7 @@ import json
 from abc import ABC, abstractmethod
 from typing import List, Mapping, Optional, cast

-from helm.common.hierarchical_logger import hlog
+from helm.common.hierarchical_logger import hwarn
 from helm.common.media_object import MultimediaObject, TEXT_TYPE
 from helm.common.request import Request, RequestResult, GeneratedOutput, Token
 from helm.common.cache import Cache, CacheConfig
@@ -65,7 +65,7 @@ def truncate_sequence(
     # where max_tokens = 0, so there's nothing to truncate.
     if request.echo_prompt:
         if request.max_tokens != 0:
-            hlog("WARNING: don't know how to handle echo_prompt and max_tokens > 0, not truncating")
+            hwarn("don't know how to handle echo_prompt and max_tokens > 0, not truncating")
         return sequence

     if end_of_text_token:
@@ -90,8 +90,8 @@ def truncate_sequence(
             new_tokens.append(token)

         if len(new_text) < len(sequence.text) and len(new_tokens) == len(sequence.tokens):
-            hlog(
-                f"WARNING: Stripped characters from text ({len(sequence.text)} -> {len(new_text)}), "
+            hwarn(
+                f"Stripped characters from text ({len(sequence.text)} -> {len(new_text)}), "
                 f"but wasn't able to strip the tokens"
             )

@@ -99,14 +99,14 @@ def truncate_sequence(
         new_logprob = sum(token.logprob for token in new_tokens)

         if print_warning:
-            hlog(f"WARNING: truncate_sequence needs to strip {json.dumps(stop)}")
+            hwarn(f"truncate_sequence needs to strip {json.dumps(stop)}")

         sequence = GeneratedOutput(text=new_text, logprob=new_logprob, tokens=new_tokens)

     # Truncate based on the max number of tokens.
     if len(sequence.tokens) > request.max_tokens:
         if print_warning:
-            hlog(f"WARNING: truncate_sequence needs to truncate {len(sequence.tokens)} down to {request.max_tokens}")
+            hwarn(f"truncate_sequence needs to truncate {len(sequence.tokens)} down to {request.max_tokens}")
         new_tokens = sequence.tokens[: request.max_tokens]

         # This is imperfect stitching together of tokens, so just to make sure this is okay
@@ -114,7 +114,7 @@ def truncate_sequence(
         # Usually, in our benchmark, max_tokens is active when it's 1, so hopefully this isn't an issue.
         new_text = "".join(token.text for token in new_tokens)
         if not sequence.text.startswith(new_text):
-            hlog(f"WARNING: {json.dumps(sequence.text)} does not start with truncated text {json.dumps(new_text)}")
+            hwarn(f"{json.dumps(sequence.text)} does not start with truncated text {json.dumps(new_text)}")

         new_logprob = sum(token.logprob for token in new_tokens)

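Several hunks in this release (here and in huggingface_client.py below) swap warning-prefixed `hlog` calls for the new `hwarn` helper exported by `helm.common.hierarchical_logger` (expanded in the `hierarchical_logger.py +104 -12` entry above). A rough sketch of the intended usage, assuming `hwarn` simply emits the same message with the warning prefix handled for the caller:

from helm.common.hierarchical_logger import hlog, hwarn

# Old style, as removed in the hunks above: the caller writes the prefix by hand.
hlog("WARNING: truncate_sequence needs to strip the stop sequence")

# New style, as added in the hunks above: the helper owns the warning formatting.
hwarn("truncate_sequence needs to strip the stop sequence")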
helm/clients/grok_client.py
ADDED
@@ -0,0 +1,36 @@
+from typing import Any, Dict, Optional
+
+from helm.clients.openai_client import OpenAIClient
+from helm.common.cache import CacheConfig
+from helm.common.request import Request
+from helm.tokenizers.tokenizer import Tokenizer
+
+
+class GrokChatClient(OpenAIClient):
+
+    BASE_URL = "https://api.x.ai/v1"
+
+    _UNSUPPORTED_ARGUMENTS = ["presence_penalty", "frequency_penalty"]
+
+    def __init__(
+        self,
+        tokenizer: Tokenizer,
+        tokenizer_name: str,
+        cache_config: CacheConfig,
+        api_key: Optional[str] = None,
+    ):
+        super().__init__(
+            tokenizer=tokenizer,
+            tokenizer_name=tokenizer_name,
+            cache_config=cache_config,
+            api_key=api_key,
+            org_id=None,
+            base_url="https://api.x.ai/v1",
+        )
+
+    def _make_chat_raw_request(self, request: Request) -> Dict[str, Any]:
+        raw_request = super()._make_chat_raw_request(request)
+        for unsupported_argument in self._UNSUPPORTED_ARGUMENTS:
+            if unsupported_argument in raw_request:
+                del raw_request[unsupported_argument]
+        return raw_request
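GrokChatClient is a thin subclass of the existing OpenAIClient: it points the base URL at xAI's OpenAI-compatible endpoint and strips the `presence_penalty` and `frequency_penalty` arguments before sending a chat request. A hedged sketch of the same idea using the `openai` SDK directly; the model name and environment variable below are illustrative assumptions, not taken from this diff:

import os

from openai import OpenAI

# xAI serves an OpenAI-compatible chat API at the base URL hard-coded in GrokChatClient.
client = OpenAI(base_url="https://api.x.ai/v1", api_key=os.environ["XAI_API_KEY"])  # hypothetical env var

response = client.chat.completions.create(
    model="grok-3",  # illustrative model name
    messages=[{"role": "user", "content": "Say hello in one sentence."}],
    # presence_penalty / frequency_penalty are deliberately omitted, matching the
    # arguments GrokChatClient deletes from the raw request.
)
print(response.choices[0].message.content)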
helm/clients/huggingface_client.py
CHANGED
@@ -8,7 +8,7 @@ from transformers.generation.stopping_criteria import (
 from typing import Any, Dict, List, Optional, TypedDict

 from helm.common.cache import CacheConfig
-from helm.common.hierarchical_logger import htrack_block, hlog
+from helm.common.hierarchical_logger import htrack_block, hlog, hwarn
 from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.common.request import (
     wrap_request_time,
@@ -18,6 +18,7 @@ from helm.common.request import (
     GeneratedOutput,
     Token,
 )
+from helm.proxy.retry import NonRetriableException
 from helm.tokenizers.tokenizer import Tokenizer
 from helm.clients.client import CachingClient, truncate_sequence
 from helm.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer, WrappedPreTrainedTokenizer
@@ -256,6 +257,7 @@ class HuggingFaceClient(CachingClient):
         tokenizer: Tokenizer,
         pretrained_model_name_or_path: Optional[str] = None,
         end_of_text_token: Optional[str] = None,
+        apply_chat_template: Optional[bool] = None,
         **kwargs,
     ):
         super().__init__(cache_config=cache_config)
@@ -266,9 +268,46 @@ class HuggingFaceClient(CachingClient):
                 "but instead it is {tokenizer}"
             )
         self._wrapped_tokenizer: WrappedPreTrainedTokenizer = tokenizer.get_wrapped_tokenizer()
-        self._tokenizer = tokenizer
         self._kwargs = _process_huggingface_client_kwargs(kwargs)
         self._end_of_text_token = end_of_text_token
+        # If the user did not explicitly configure whether the model is a chat model with `apply_chat_template` arg,
+        # auto-infer if the model is a chat model based on whether the tokenizer has a chat template.
+        # Note: Auto-inference is incorrect for some non-chat models that still have chat templates
+        # e.g. Qwen2, Qwen 2.5.
+        # For these models, the `apply_chat_template` arg should be explicitly set to false.
+        if apply_chat_template is not None:
+            self._apply_chat_template = apply_chat_template
+        else:
+            with self._wrapped_tokenizer as hf_tokenizer:
+                self._apply_chat_template = bool(hf_tokenizer.chat_template)
+            hwarn(
+                f"Automatically set `apply_chat_template` to {self._apply_chat_template} based on "
+                "whether the tokenizer has a chat template. "
+                "If this is incorrect, please explicitly set `apply_chat_template`."
+            )
+
+    def get_prompt(self, request: Request) -> str:
+        if request.prompt and request.messages:
+            raise NonRetriableException(f"More than one of `prompt` and `messages` was set in request: {request}")
+        # Chat model expects a list of messages as input
+        if self._apply_chat_template:
+            with self._wrapped_tokenizer as tokenizer:
+                if request.messages:
+                    prompt = tokenizer.apply_chat_template(request.messages, tokenize=False, add_generation_prompt=True)
+                    assert isinstance(prompt, str)
+                    return prompt
+                else:
+                    prompt = tokenizer.apply_chat_template(
+                        [{"role": "user", "content": request.prompt}], tokenize=False, add_generation_prompt=True
+                    )
+                    assert isinstance(prompt, str)
+                    return prompt
+        # Base non-chat model expects a string as input
+        else:
+            if request.messages:
+                raise NonRetriableException("Chat mesages not supported by non-chat model")
+            else:
+                return request.prompt

     def make_request(self, request: Request) -> RequestResult:
         # Embedding not supported for this model
@@ -277,7 +316,7 @@ class HuggingFaceClient(CachingClient):

         raw_request: HuggingFaceRequest = {
             "engine": request.model_engine,
-            "prompt": request.prompt,
+            "prompt": self.get_prompt(request),
             "temperature": 1e-7 if request.temperature == 0 else request.temperature,
             "num_return_sequences": request.num_completions,
             "max_new_tokens": request.max_tokens,
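The new `get_prompt` method leans on the tokenizer's chat template, and the constructor auto-sets `apply_chat_template` by checking whether one exists. A small sketch of the underlying `transformers` behavior this relies on; the model name is illustrative, and any model whose tokenizer ships a chat template behaves the same way:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")  # illustrative chat model

# The signal HuggingFaceClient now uses to auto-set `apply_chat_template`:
print(bool(tokenizer.chat_template))  # True for chat models, typically False for base models

# How a plain prompt gets wrapped before generation, mirroring get_prompt above:
prompt = tokenizer.apply_chat_template(
    [{"role": "user", "content": "What is 2 + 2?"}],
    tokenize=False,
    add_generation_prompt=True,
)
print(prompt)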
helm/clients/huggingface_pipeline_client.py
ADDED
@@ -0,0 +1,138 @@
+from threading import Lock
+from typing import Any, Dict, List, Optional, Union
+
+import transformers
+
+from helm.clients.client import CachingClient
+from helm.common.cache import CacheConfig
+from helm.common.hierarchical_logger import htrack_block, hwarn
+from helm.common.request import GeneratedOutput, Request, RequestResult, wrap_request_time
+from helm.proxy.retry import NonRetriableException
+
+
+_pipelines: Dict[str, transformers.Pipeline] = {}
+_pipelines_lock: Lock = Lock()
+
+
+def _get_pipeline(
+    helm_model_name: str,
+    pipeline_kwargs: Dict[str, Any],
+) -> Any:
+    """
+    Checks if the desired HuggingFaceModel is cached. Creates the HuggingFaceModel if it's not cached.
+    Returns the HuggingFaceModel.
+    """
+    global _pipelines
+    global _pipelines_lock
+    with _pipelines_lock:
+        if helm_model_name not in _pipelines:
+            huggingface_model_name = pipeline_kwargs["model"]
+            with htrack_block(
+                f"Loading HuggingFace model {huggingface_model_name} (kwargs={pipeline_kwargs}) "
+                f"for HELM model {helm_model_name} with transformers.pipeline"
+            ):
+                _pipelines[helm_model_name] = transformers.pipeline(**pipeline_kwargs)
+
+    return _pipelines[helm_model_name]
+
+
+class HuggingFacePipelineClient(CachingClient):
+    def __init__(
+        self,
+        cache_config: CacheConfig,
+        model_name: str,
+        pretrained_model_name_or_path: Optional[str] = None,
+        apply_chat_template: Optional[bool] = None,
+        **kwargs,
+    ):
+        # Include `pretrained_model_name_or_path` parameter so that model deployments can use
+        # the `pretrained_model_name_or_path` arg to override `model_name`
+        super().__init__(cache_config=cache_config)
+        self._helm_model_name = model_name
+        self._pipeline_kwargs = {
+            "model": pretrained_model_name_or_path or self._helm_model_name,
+            "task": "text-generation",
+            **kwargs,
+        }
+        self._pipeline = _get_pipeline(self._helm_model_name, self._pipeline_kwargs)
+        if apply_chat_template is not None:
+            self._apply_chat_template = apply_chat_template
+        else:
+            # If the user did not explicitly configure whether the model is a chat model with `apply_chat_template` arg,
+            # auto-infer if the model is a chat model based on whether the tokenizer has a chat template.
+            # Note: Auto-inference is incorrect for some non-chat models that still have chat templates
+            # e.g. Qwen2, Qwen 2.5.
+            # For these models, the `apply_chat_template` arg should be explicitly set to false.
+            self._apply_chat_template = bool(self._pipeline.tokenizer.chat_template)
+            hwarn(
+                f"Automatically set `apply_chat_template` to {self._apply_chat_template} based on "
+                "whether the tokenizer has a chat template. "
+                "If this is incorrect, please explicitly set `apply_chat_template`."
+            )
+
+    def make_text_inputs(self, request: Request) -> Union[str, List[Dict[str, str]]]:
+        if request.prompt and request.messages:
+            raise NonRetriableException(f"More than one of `prompt` and `messages` was set in request: {request}")
+        # Chat model expects a list of messages as input
+        if self._apply_chat_template:
+            if request.messages:
+                return request.messages
+            else:
+                return [{"role": "user", "content": request.prompt}]
+        # Base non-chat model expects a string as input
+        else:
+            if request.messages:
+                raise NonRetriableException("Chat mesages not supported by non-chat model")
+            else:
+                return request.prompt
+
+    def make_request(self, request: Request) -> RequestResult:
+        """Make a request"""
+        if request.model != self._helm_model_name:
+            raise NonRetriableException(
+                f"This instance of HuggingFacePipelineClient has loaded model {self._helm_model_name} but the request was for model {request.model}"  # noqa: E501
+            )
+        completions: List[GeneratedOutput] = []
+        do_sample = request.temperature > 0.0
+        raw_request = {
+            "text_inputs": self.make_text_inputs(request),
+            "return_full_text": request.echo_prompt,
+            "temperature": request.temperature if do_sample else None,
+            "num_return_sequences": request.num_completions,
+            "max_new_tokens": request.max_tokens,
+            "top_p": request.top_p,
+            "top_k": request.top_k_per_token if do_sample else None,
+            "do_sample": do_sample,
+            "return_dict_in_generate": True,
+        }
+        if request.stop_sequences:
+            stop_sequence_ids = self._pipeline.tokenizer(
+                request.stop_sequences, return_token_type_ids=False, add_special_tokens=False
+            )
+            if len(stop_sequence_ids.input_ids) == 1 and len(stop_sequence_ids.input_ids[0]) == 1:
+                raw_request["eos_token_id"] = stop_sequence_ids.input_ids[0][0]
+            else:
+                raise NonRetriableException(
+                    "Multiple stop sequences and stop sequences of multiple tokens, are not yet supported by HuggingFacePipelineClient"  # noqa: E501
+                )
+
+        def do_it() -> Dict[str, Any]:
+            pipeline_outputs = self._pipeline(**raw_request)
+            return {"outputs": pipeline_outputs}
+
+        cache_key = CachingClient.make_cache_key(
+            {"pipeline_kwargs": self._pipeline_kwargs, **raw_request},
+            request,
+        )
+
+        response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
+        for raw_output in response["outputs"]:
+            completions.append(GeneratedOutput(text=raw_output["generated_text"], logprob=0, tokens=[]))
+        return RequestResult(
+            success=True,
+            cached=cached,
+            request_time=response["request_time"],
+            request_datetime=response["request_datetime"],
+            completions=completions,
+            embedding=[],
+        )
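HuggingFacePipelineClient delegates generation to `transformers.pipeline("text-generation", ...)`, passing either a plain string (base models) or a list of chat messages (chat models) as `text_inputs`. A minimal sketch of that underlying call; the model name and generation settings are illustrative, not values used by the client:

import transformers

pipe = transformers.pipeline(task="text-generation", model="Qwen/Qwen2.5-0.5B-Instruct")  # illustrative model

# Chat-style input, the shape make_text_inputs returns when apply_chat_template is True.
outputs = pipe(
    [{"role": "user", "content": "Name one planet."}],
    max_new_tokens=16,
    do_sample=False,
    return_full_text=False,
)
print(outputs[0]["generated_text"])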
helm/clients/image_generation/dalle_mini/model/configuration.py
CHANGED
@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""
+"""DalleBart model configuration"""
 import warnings

 from transformers.configuration_utils import PretrainedConfig
helm/clients/image_generation/dalle_mini/model/modeling.py
CHANGED
@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""
+"""DalleBart model."""

 import math
 from functools import partial