crfm-helm 0.5.6__py3-none-any.whl → 0.5.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of crfm-helm might be problematic.
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/METADATA +60 -125
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/RECORD +293 -229
- helm/benchmark/adaptation/adapter_spec.py +5 -0
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
- helm/benchmark/annotation/aci_bench_annotator.py +11 -22
- helm/benchmark/annotation/air_bench_annotator.py +1 -1
- helm/benchmark/annotation/alrage_annotator.py +90 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
- helm/benchmark/annotation/dischargeme_annotator.py +11 -22
- helm/benchmark/annotation/live_qa_annotator.py +1 -1
- helm/benchmark/annotation/med_dialog_annotator.py +11 -22
- helm/benchmark/annotation/medalign_annotator.py +11 -22
- helm/benchmark/annotation/medi_qa_annotator.py +11 -22
- helm/benchmark/annotation/medication_qa_annotator.py +11 -22
- helm/benchmark/annotation/mental_health_annotator.py +11 -22
- helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
- helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
- helm/benchmark/annotation/model_as_judge.py +23 -18
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
- helm/benchmark/metrics/air_bench_metrics.py +3157 -1
- helm/benchmark/metrics/alrage_metric.py +35 -0
- helm/benchmark/metrics/basic_metrics.py +267 -2
- helm/benchmark/metrics/classification_metrics.py +19 -1
- helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
- helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
- helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
- helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
- helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
- helm/benchmark/metrics/comet_metric.py +1 -1
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
- helm/benchmark/metrics/copyright_metrics.py +1 -1
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
- helm/benchmark/metrics/dry_run_metrics.py +30 -1
- helm/benchmark/metrics/efficiency_metrics.py +74 -0
- helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
- helm/benchmark/metrics/evaluate_reference_metrics.py +300 -1
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
- helm/benchmark/metrics/ifeval_metrics.py +13 -1
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
- helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
- helm/benchmark/metrics/language_modeling_metrics.py +13 -1
- helm/benchmark/metrics/live_qa_metrics.py +13 -1
- helm/benchmark/metrics/llm_jury_metrics.py +13 -1
- helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
- helm/benchmark/metrics/lmkt_metrics.py +47 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
- helm/benchmark/metrics/medec_metrics.py +25 -2
- helm/benchmark/metrics/melt_toxicity_metric.py +1 -1
- helm/benchmark/metrics/metric.py +25 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
- helm/benchmark/metrics/omni_math_metrics.py +13 -1
- helm/benchmark/metrics/seahelm_metrics.py +14 -1
- helm/benchmark/metrics/summac/model_summac.py +3 -3
- helm/benchmark/metrics/summarization_metrics.py +129 -1
- helm/benchmark/metrics/toxicity_metrics.py +31 -1
- helm/benchmark/metrics/wildbench_metrics.py +21 -1
- helm/benchmark/model_deployment_registry.py +11 -19
- helm/benchmark/presentation/create_plots.py +11 -2
- helm/benchmark/presentation/schema.py +10 -22
- helm/benchmark/presentation/summarize.py +189 -14
- helm/benchmark/presentation/taxonomy_info.py +20 -0
- helm/benchmark/presentation/test_create_plots.py +4 -1
- helm/benchmark/run.py +7 -1
- helm/benchmark/run_expander.py +4 -0
- helm/benchmark/run_specs/arabic_run_specs.py +191 -0
- helm/benchmark/run_specs/bluex_run_specs.py +40 -0
- helm/benchmark/run_specs/classic_run_specs.py +2 -55
- helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
- helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
- helm/benchmark/run_specs/heim_run_specs.py +3 -1
- helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
- helm/benchmark/run_specs/long_context_run_specs.py +48 -1
- helm/benchmark/run_specs/medhelm/__init__.py +0 -0
- helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +360 -50
- helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +5 -11
- helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
- helm/benchmark/scenarios/air_bench_scenario.py +21 -0
- helm/benchmark/scenarios/alghafa_scenario.py +126 -0
- helm/benchmark/scenarios/alrage_scenario.py +54 -0
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
- helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
- helm/benchmark/scenarios/arabic_mmlu_scenario.py +82 -0
- helm/benchmark/scenarios/aratrust_scenario.py +95 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/{ultra_suite_asr_classification.py → ultra_suite_asr_classification_scenario.py} +9 -8
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +13 -5
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +13 -5
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +13 -5
- helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
- helm/benchmark/scenarios/bbq_scenario.py +15 -0
- helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
- helm/benchmark/scenarios/bluex_scenario.py +70 -0
- helm/benchmark/scenarios/bold_scenario.py +15 -0
- helm/benchmark/scenarios/boolq_scenario.py +20 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
- helm/benchmark/scenarios/clear_scenario.py +23 -0
- helm/benchmark/scenarios/cleva_scenario.py +480 -1
- helm/benchmark/scenarios/code_scenario.py +28 -0
- helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
- helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
- helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
- helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
- helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
- helm/benchmark/scenarios/commonsense_scenario.py +26 -0
- helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
- helm/benchmark/scenarios/copyright_scenario.py +35 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
- helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
- helm/benchmark/scenarios/disinformation_scenario.py +22 -0
- helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
- helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
- helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
- helm/benchmark/scenarios/gpqa_scenario.py +18 -0
- helm/benchmark/scenarios/grammar_scenario.py +20 -1
- helm/benchmark/scenarios/gsm_scenario.py +15 -0
- helm/benchmark/scenarios/headqa_scenario.py +22 -0
- helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
- helm/benchmark/scenarios/ice_scenario.py +21 -1
- helm/benchmark/scenarios/ifeval_scenario.py +18 -0
- helm/benchmark/scenarios/imdb_scenario.py +15 -0
- helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +90 -0
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +1 -1
- helm/benchmark/scenarios/koala_scenario.py +21 -1
- helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
- helm/benchmark/scenarios/legal_support_scenario.py +13 -0
- helm/benchmark/scenarios/legalbench_scenario.py +20 -0
- helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
- helm/benchmark/scenarios/lextreme_scenario.py +11 -0
- helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
- helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
- helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
- helm/benchmark/scenarios/math_scenario.py +47 -20
- helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
- helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
- helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
- helm/benchmark/scenarios/med_qa_scenario.py +14 -0
- helm/benchmark/scenarios/medalign_scenario.py +23 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +19 -125
- helm/benchmark/scenarios/medbullets_scenario.py +22 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
- helm/benchmark/scenarios/medec_scenario.py +23 -0
- helm/benchmark/scenarios/medhallu_scenario.py +23 -0
- helm/benchmark/scenarios/medhelm/__init__.py +0 -0
- helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
- helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +23 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
- helm/benchmark/scenarios/melt_scenarios.py +2 -2
- helm/benchmark/scenarios/mental_health_scenario.py +23 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +25 -1
- helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
- helm/benchmark/scenarios/mmlu_scenario.py +15 -0
- helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
- helm/benchmark/scenarios/msmarco_scenario.py +30 -0
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +20 -0
- helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
- helm/benchmark/scenarios/omni_math_scenario.py +18 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
- helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
- helm/benchmark/scenarios/quac_scenario.py +14 -0
- helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
- helm/benchmark/scenarios/raft_scenario.py +15 -0
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
- helm/benchmark/scenarios/scenario.py +31 -0
- helm/benchmark/scenarios/seahelm_scenario.py +350 -2
- helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
- helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +22 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
- helm/benchmark/scenarios/situation_prompts.yaml +49 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
- helm/benchmark/scenarios/summarization_scenario.py +37 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
- helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
- helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
- helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
- helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
- helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
- helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
- helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
- helm/benchmark/scenarios/the_pile_scenario.py +13 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
- helm/benchmark/scenarios/vicuna_scenario.py +21 -1
- helm/benchmark/scenarios/wikifact_scenario.py +20 -0
- helm/benchmark/scenarios/wildbench_scenario.py +18 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +12 -0
- helm/benchmark/slurm_jobs.py +1 -2
- helm/benchmark/slurm_runner.py +8 -1
- helm/benchmark/static/schema_arabic.yaml +271 -0
- helm/benchmark/static/schema_classic.yaml +0 -17
- helm/benchmark/static/schema_long_context.yaml +24 -6
- helm/benchmark/static/schema_medhelm.yaml +36 -0
- helm/benchmark/static/schema_slp.yaml +219 -0
- helm/benchmark/static_build/assets/index-671a5e06.js +10 -0
- helm/benchmark/static_build/assets/index-9352595e.css +1 -0
- helm/benchmark/static_build/index.html +2 -2
- helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
- helm/clients/audio_language/llama_omni/arguments.py +61 -0
- helm/clients/audio_language/llama_omni/constants.py +9 -0
- helm/clients/audio_language/llama_omni/conversation.py +213 -0
- helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
- helm/clients/audio_language/llama_omni/model/builder.py +88 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
- helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
- helm/clients/audio_language/llama_omni/preprocess.py +295 -0
- helm/clients/audio_language/llama_omni/utils.py +202 -0
- helm/clients/audio_language/qwen2_5_omni_client.py +19 -7
- helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
- helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
- helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
- helm/clients/huggingface_client.py +2 -2
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
- helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
- helm/clients/openai_client.py +33 -20
- helm/clients/openai_responses_client.py +34 -8
- helm/clients/openrouter_client.py +31 -0
- helm/clients/test_huggingface_client.py +3 -3
- helm/clients/test_openrouter_client.py +69 -0
- helm/clients/together_client.py +48 -13
- helm/clients/vertexai_client.py +19 -11
- helm/clients/vllm_client.py +43 -7
- helm/clients/vllm_granite_thinking_client.py +56 -0
- helm/common/critique_request.py +0 -1
- helm/common/hierarchical_logger.py +83 -34
- helm/common/object_spec.py +23 -8
- helm/common/test_logging.py +94 -0
- helm/config/model_deployments.yaml +525 -172
- helm/config/model_metadata.yaml +185 -10
- helm/config/tokenizer_configs.yaml +100 -2
- helm/proxy/cli.py +1 -1
- helm/proxy/example_queries.py +8 -8
- helm/proxy/retry.py +5 -0
- helm/proxy/server.py +2 -1
- helm/proxy/static/index.css +4 -0
- helm/proxy/static/index.js +7 -1
- helm/tokenizers/grok_tokenizer.py +2 -0
- helm/benchmark/metrics/aci_bench_metrics.py +0 -14
- helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
- helm/benchmark/metrics/dischargeme_metrics.py +0 -14
- helm/benchmark/metrics/med_dialog_metrics.py +0 -14
- helm/benchmark/metrics/medalign_metrics.py +0 -14
- helm/benchmark/metrics/medi_qa_metrics.py +0 -14
- helm/benchmark/metrics/medication_qa_metrics.py +0 -14
- helm/benchmark/metrics/mental_health_metrics.py +0 -14
- helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
- helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
- helm/benchmark/metrics/numeracy_metrics.py +0 -72
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
- helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
- helm/benchmark/scenarios/numeracy_scenario.py +0 -794
- helm/benchmark/static_build/assets/index-94295e78.js +0 -10
- helm/benchmark/static_build/assets/index-b9779128.css +0 -1
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/WHEEL +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/licenses/LICENSE +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/top_level.txt +0 -0
helm/clients/test_openrouter_client.py
ADDED

@@ -0,0 +1,69 @@
+import os
+import pytest
+import tempfile
+
+from helm.common.cache import BlackHoleCacheConfig, SqliteCacheConfig
+from helm.common.request import Request
+from helm.clients.openrouter_client import OpenRouterClient
+
+from helm.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer
+
+
+class TestOpenRouterClient:
+    def setup_method(self, method):
+        cache_file = tempfile.NamedTemporaryFile(delete=False)
+        self.cache_path: str = cache_file.name
+        self.tokenizer_name = "mistralai/Mistral-7B-v0.1"
+        self.tokenizer = HuggingFaceTokenizer(
+            cache_config=BlackHoleCacheConfig(),
+            tokenizer_name=self.tokenizer_name,
+        )
+
+    def teardown_method(self, method):
+        os.remove(self.cache_path)
+
+    @pytest.mark.parametrize(
+        "model_name,test_input,expected_model",
+        [
+            (
+                "mistralai/mistral-medium-3.1",
+                Request(
+                    model="mistralai/mistral-medium-3.1",
+                    model_deployment="openrouter/mistral-medium-3.1",
+                ),
+                "mistralai/mistral-medium-3.1",
+            ),
+            (
+                None,
+                Request(model="openai/gpt-oss-20b:free", model_deployment="openrouter/gpt-oss-20b:free"),
+                "openai/gpt-oss-20b:free",
+            ),
+        ],
+    )
+    def test_get_model_for_request(self, model_name, test_input, expected_model):
+        client = OpenRouterClient(
+            tokenizer_name=self.tokenizer_name,
+            tokenizer=self.tokenizer,
+            cache_config=SqliteCacheConfig(self.cache_path),
+            model_name=model_name,
+            api_key="test_key",
+        )
+        assert client._get_model_for_request(test_input) == expected_model
+
+    def test_api_key_env_var(self, monkeypatch):
+        monkeypatch.setenv("OPENROUTER_API_KEY", "test_key")
+        client = OpenRouterClient(
+            tokenizer_name=self.tokenizer_name,
+            tokenizer=self.tokenizer,
+            cache_config=SqliteCacheConfig(self.cache_path),
+        )
+        assert client.api_key == "test_key"
+
+    def test_api_key_argument(self):
+        client = OpenRouterClient(
+            tokenizer_name=self.tokenizer_name,
+            tokenizer=self.tokenizer,
+            cache_config=BlackHoleCacheConfig(),
+            api_key="explicit_key",
+        )
+        assert client.api_key == "explicit_key"
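For orientation, a minimal sketch of how the new OpenRouterClient might be driven outside the test suite. This is an assumption-laden example, not shipped documentation: it presumes crfm-helm 0.5.8 is installed, OPENROUTER_API_KEY is exported, and the model/deployment names (borrowed from the test above) are still valid on OpenRouter.

# Sketch only: names are illustrative; omitting api_key falls back to OPENROUTER_API_KEY.
from helm.common.cache import BlackHoleCacheConfig
from helm.common.request import Request
from helm.clients.openrouter_client import OpenRouterClient
from helm.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer

tokenizer_name = "mistralai/Mistral-7B-v0.1"
client = OpenRouterClient(
    tokenizer_name=tokenizer_name,
    tokenizer=HuggingFaceTokenizer(cache_config=BlackHoleCacheConfig(), tokenizer_name=tokenizer_name),
    cache_config=BlackHoleCacheConfig(),
)
request = Request(
    model="openai/gpt-oss-20b:free",
    model_deployment="openrouter/gpt-oss-20b:free",
    prompt="Say hello.",
)
result = client.make_request(request)  # RequestResult with completions on success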
helm/clients/together_client.py
CHANGED

@@ -25,8 +25,6 @@ except ModuleNotFoundError as e:
 class _RewriteRequestTags:
     """Tags that indicate that the request for the model must be rewritten before sending to Together."""
 
-    # TODO: Convert to StrEnum after upgrading to Python 3.11
-
     ADD_EOS_TOKEN_AS_STOP_SEQUENCE = "ADD_EOS_TOKEN_AS_STOP_SEQUENCE"
     """Indicates that the EOS token should be added as an extra stop sequence.
 

@@ -101,7 +99,20 @@ class JobNotFinishedError(TogetherClientError):
     pass
 
 
-def _parse_thinking(input: str) -> Tuple[str, str]:
+def _parse_thinking_deepseek_r1(input: str) -> Tuple[str, str]:
+    """Return a tuple of thinking text and output text."""
+    match = re.match(r"<think>\n(.*)\n</think>\n{0,2}(.*)", input, re.DOTALL)
+    if match:
+        return (match.group(1), match.group(2))
+
+    match = re.match(r"<think>\n?(.*)", input, re.DOTALL)
+    if match:
+        return (match.group(1), "")
+
+    return (input, "")
+
+
+def _parse_thinking_qwen3(input: str) -> Tuple[str, str]:
     """Return a tuple of thinking text and output text."""
     match = re.match(r"<think>\n(.*)\n</think>\n{0,2}(.*)", input, re.DOTALL)
     if match:

@@ -114,6 +125,31 @@ def _parse_thinking(input: str) -> Tuple[str, str]:
     return (input, "")
 
 
+def _parse_thinking_glm_4_5(input: str) -> Tuple[str, str]:
+    """Return a tuple of thinking text and output text."""
+    match = re.match(r"\n<think>(.*)</think>(.*)", input, re.DOTALL)
+    if match:
+        return (match.group(1), match.group(2))
+
+    match = re.match(r"\n<think>(.*)", input, re.DOTALL)
+    if match:
+        return (match.group(1), "")
+
+    return (input, "")
+
+
+def _parse_thinking(input: str, model_name: str) -> Tuple[str, str]:
+    # TODO: Come up with a more sustainable extensible way of doing this.
+    if "deepseek-r1" in model_name:
+        return _parse_thinking_deepseek_r1(input)
+    elif "qwen3" in model_name:
+        return _parse_thinking_qwen3(input)
+    elif "glm-4.5" in model_name:
+        return _parse_thinking_glm_4_5(input)
+    else:
+        raise Exception(f"No thinking parser available for model {model_name}")
+
+
 class TogetherClient(CachingClient):
     """
     Client for the models where we evaluate offline. Since the queries are handled offline, the `TogetherClient` just

@@ -348,9 +384,8 @@ class TogetherChatClient(CachingClient):
         self._client = Together(api_key=api_key)
         self._together_model = together_model
         self._disable_logprobs = bool(disable_logprobs)
-        # self.output_processor is actually a function, not a class
         self._parse_thinking = bool(parse_thinking)
-
+        # self.output_processor is actually a function, not a class
         self.output_processor: Optional[Callable[[str], str]] = (
             get_class_by_name(output_processor) if output_processor else None
         )

@@ -446,15 +481,15 @@ class TogetherChatClient(CachingClient):
             if self.output_processor:
                 output_text = self.output_processor(output_text)
 
+            thinking: Optional[Thinking] = None
             if self._parse_thinking:
-                thinking_text, output_text = _parse_thinking(output_text)
-                generated_outputs.append(
-                    GeneratedOutput(
-                        text=output_text, logprob=logprob, tokens=tokens, thinking=Thinking(text=thinking_text)
-                    )
-                )
-            else:
-                generated_outputs.append(GeneratedOutput(text=output_text, logprob=logprob, tokens=tokens))
+                thinking_text, output_text = _parse_thinking(output_text, request.model)
+                thinking = Thinking(text=thinking_text)
+            elif hasattr(choice.message, "reasoning_content"):
+                thinking = Thinking(text=choice.message.reasoning_content)
+            generated_outputs.append(
+                GeneratedOutput(text=output_text, logprob=logprob, tokens=tokens, thinking=thinking)
+            )
         return RequestResult(
             success=True,
             cached=cached,
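To make the intent of the per-model parsers concrete, a standalone illustration of the <think> splitting (standard library only; the sample completion text is invented, and this mirrors rather than imports the package code):

import re

def split_thinking(raw: str) -> tuple:
    # Same pattern as the DeepSeek-R1/Qwen3 parsers above: thinking text sits
    # between <think> tags, the answer follows the closing tag.
    match = re.match(r"<think>\n(.*)\n</think>\n{0,2}(.*)", raw, re.DOTALL)
    if match:
        return (match.group(1), match.group(2))
    return (raw, "")  # the clients above also fall back when no tags are found

thinking, answer = split_thinking("<think>\nAdd 2 and 2.\n</think>\n4")
print(thinking)  # Add 2 and 2.
print(answer)    # 4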
helm/clients/vertexai_client.py
CHANGED

@@ -1,7 +1,7 @@
 import requests
 from abc import ABC, abstractmethod
 from threading import Lock
-from typing import Any, Dict, Mapping, Optional, List, Union
+from typing import Any, Dict, Mapping, Optional, List, Union, cast
 
 from helm.common.cache import CacheConfig
 from helm.common.multimodal_request_utils import get_contents_as_bytes

@@ -107,7 +107,7 @@ class VertexAITextClient(VertexAIClient):
 
     def make_request(self, request: Request) -> RequestResult:
         """Make a request"""
-        parameters = {
+        parameters: Dict[str, Any] = {
             "temperature": request.temperature,
             "max_output_tokens": request.max_tokens,
             "top_k": request.top_k_per_token,

@@ -207,21 +207,23 @@ class VertexAIChatClient(VertexAIClient):
 
     def make_request(self, request: Request) -> RequestResult:
         """Make a request"""
-
+        # mypy is unhappy without this cast
+        contents: Union[List[Union[str, Image, Part]], List[Content]] = cast(
+            List[Union[str, Image, Part]], [request.prompt]
+        )
 
         # For the multimodal case, build up the content with the media objects of `request.multimodal_prompt`
         if request.multimodal_prompt is not None:
             return self._make_multimodal_request(request)
 
         if request.messages is not None:
-            contents = []
             role_mapping = {"user": "user", "assistant": "model"}
-            for message in request.messages:
-                contents.append(
-                    Content(role=role_mapping.get(message["role"], "user"), parts=[Part.from_text(message["content"])])
-                )
+            contents = [
+                Content(role=role_mapping.get(msg["role"], "user"), parts=[Part.from_text(msg["content"])])
+                for msg in request.messages
+            ]
 
-        parameters = {
+        parameters: Dict[str, Any] = {
             "temperature": request.temperature,
             "max_output_tokens": request.max_tokens,
             "top_k": request.top_k_per_token,

@@ -274,8 +276,14 @@ class VertexAIChatClient(VertexAIClient):
                 if not candidate.content:
                     raise VertexAIContentBlockedError(f"No content in candidate: {candidate}")
                 if not candidate.content.parts:
-                    raise VertexAIContentBlockedError(f"No content parts in candidate: {candidate}")
-                predictions.append({"text": candidate.content.text})
+                    if candidate.finish_reason == 2:  # MAX_TOKENS
+                        # This means that there is no text output because the maximum number of tokens were
+                        # reached during thinking.
+                        predictions.append({"text": ""})
+                    else:
+                        raise VertexAIContentBlockedError(f"No content parts in candidate: {candidate}")
+                else:
+                    predictions.append({"text": candidate.content.text})
             # TODO: Extract more information from the response
             return {"predictions": predictions}
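The new candidate handling distinguishes blocked content from a thinking budget that ran out. A minimal stand-in sketch of that branching (the real candidate object comes from the Vertex AI SDK, where finish_reason 2 is MAX_TOKENS; this fake class exists only for illustration):

from dataclasses import dataclass
from typing import Optional

@dataclass
class FakeCandidate:  # stand-in for the Vertex AI SDK candidate
    text: Optional[str]  # None models "no content parts"
    finish_reason: int   # 2 corresponds to MAX_TOKENS

def to_prediction(candidate: FakeCandidate) -> dict:
    if candidate.text is None:
        if candidate.finish_reason == 2:
            return {"text": ""}  # token budget exhausted during thinking
        raise ValueError(f"No content parts in candidate: {candidate}")
    return {"text": candidate.text}

print(to_prediction(FakeCandidate(text=None, finish_reason=2)))  # {'text': ''}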
helm/clients/vllm_client.py
CHANGED

@@ -2,7 +2,7 @@ from typing import Any, Dict, Optional
 
 from helm.common.cache import CacheConfig
 from helm.common.request import Request
-from helm.clients.openai_client import OpenAILegacyCompletionsClient
+from helm.clients.openai_client import OpenAIClient, OpenAILegacyCompletionsClient
 from helm.tokenizers.tokenizer import Tokenizer
 
 

@@ -19,6 +19,8 @@ class VLLMClient(OpenAILegacyCompletionsClient):
         tokenizer_name: str,
         cache_config: CacheConfig,
         base_url: Optional[str] = None,
+        vllm_model_name: Optional[str] = None,
+        **kwargs,
     ):
         super().__init__(
             tokenizer=tokenizer,

@@ -27,18 +29,52 @@ class VLLMClient(OpenAILegacyCompletionsClient):
             api_key="EMPTY",
             org_id=None,
             base_url=base_url,
+            openai_model_name=vllm_model_name,
+            **kwargs,
         )
         self.tokenizer = tokenizer
         self.tokenizer_name = tokenizer_name
-
-    def _get_model_for_request(self, request: Request) -> str:
-        # The `model` parameter for vLLM should be the whole model name including the creator organization,
-        # unlike OpenAI which only uses the model engine.
-        return request.model
+        self.vllm_model_name = vllm_model_name
 
     def _to_raw_completion_request(self, request: Request) -> Dict[str, Any]:
         raw_request = super()._to_raw_completion_request(request)
         # This avoids the error: best_of must be 1 when using greedy sampling
-        if raw_request["temperature"] == 0.0 and raw_request["best_of"] > 1:
+        if (
+            "temperature" in raw_request
+            and raw_request["temperature"] == 0.0
+            and "best_of" in raw_request
+            and raw_request["best_of"] > 1
+        ):
             raw_request["best_of"] = 1
         return raw_request
+
+
+class VLLMChatClient(OpenAIClient):
+    """Sends request to a vLLM server using the OpenAI-compatible API.
+
+    Only uses the Chat Completions API.
+
+    See: https://docs.vllm.ai/en/latest/getting_started/quickstart.html#openai-compatible-server"""
+
+    def __init__(
+        self,
+        tokenizer: Tokenizer,
+        tokenizer_name: str,
+        cache_config: CacheConfig,
+        base_url: Optional[str] = None,
+        vllm_model_name: Optional[str] = None,
+        **kwargs,
+    ):
+        super().__init__(
+            tokenizer=tokenizer,
+            tokenizer_name=tokenizer_name,
+            cache_config=cache_config,
+            api_key="EMPTY",
+            org_id=None,
+            base_url=base_url,
+            openai_model_name=vllm_model_name,
+            **kwargs,
+        )
+        self.tokenizer = tokenizer
+        self.tokenizer_name = tokenizer_name
+        self.vllm_model_name = vllm_model_name
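A usage sketch for the new VLLMChatClient, under the assumption that a vLLM OpenAI-compatible server is already listening locally; the served model, port, and tokenizer below are placeholders, not defaults shipped with the package:

# Assumes something like `vllm serve Qwen/Qwen2.5-7B-Instruct` is running on port 8000.
from helm.common.cache import BlackHoleCacheConfig
from helm.common.request import Request
from helm.clients.vllm_client import VLLMChatClient
from helm.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer

tokenizer_name = "Qwen/Qwen2.5-7B-Instruct"
client = VLLMChatClient(
    tokenizer=HuggingFaceTokenizer(cache_config=BlackHoleCacheConfig(), tokenizer_name=tokenizer_name),
    tokenizer_name=tokenizer_name,
    cache_config=BlackHoleCacheConfig(),
    base_url="http://localhost:8000/v1",
    vllm_model_name=tokenizer_name,  # forwarded as openai_model_name, i.e. the served model id
)
result = client.make_request(Request(prompt="Say hello."))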
helm/clients/vllm_granite_thinking_client.py
ADDED

@@ -0,0 +1,56 @@
+from dataclasses import replace
+import re
+from typing import Any, Dict, List, Tuple
+
+from helm.clients.vllm_client import VLLMChatClient
+from helm.common.request import GeneratedOutput, Request, RequestResult, Thinking
+
+
+class VLLMGraniteThinkingClient(VLLMChatClient):
+    """Sends request to a Granite model on vLLM server with thinking enabled.
+
+    From vLLM documentation at
+    https://docs.vllm.ai/en/v0.9.1/features/reasoning_outputs.html
+
+    IBM Granite 3.2 reasoning is disabled by default;
+    to enable it, you must also pass thinking=True in your chat_template_kwargs.
+    """
+
+    def _make_chat_raw_request(self, request: Request) -> Dict[str, Any]:
+        raw_request = super()._make_chat_raw_request(request)
+        raw_request["extra_body"] = {"chat_template_kwargs": {"thinking": True}}
+        return raw_request
+
+    def _parse_thinking(self, input: str) -> Tuple[str, str]:
+        """Return a tuple of thinking text and output text."""
+        match = re.match(r"<think>(.*)</think>\s*<response>(.*)</response>", input, re.DOTALL)
+        if match:
+            return (match.group(1), match.group(2))
+
+        match = re.match(r"<think>(.*)</think>\s*<response>(.*)", input, re.DOTALL)
+        if match:
+            return (match.group(1), match.group(2))
+
+        match = re.match(r"<think>(.*)</think>\s*", input, re.DOTALL)
+        if match:
+            return (match.group(1), "")
+
+        match = re.match(r"<think>(.*)", input, re.DOTALL)
+        if match:
+            return (match.group(1), "")
+
+        return (input, "")
+
+    def _make_chat_request(self, request: Request) -> RequestResult:
+        request_result = super()._make_chat_request(request)
+        modified_completions: List[GeneratedOutput] = []
+        for completion in request_result.completions:
+            thinking, modified_text = self._parse_thinking(completion.text)
+            modified_completions.append(
+                replace(
+                    completion,
+                    text=modified_text,
+                    thinking=Thinking(text=thinking),
+                )
+            )
+        return replace(request_result, completions=modified_completions)
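For reference, what the Granite parser extracts from a well-formed completion; a standard-library-only demo with an invented sample string in the <think>/<response> format the client expects:

import re

sample = "<think>Check the units first.</think> <response>42 km</response>"
match = re.match(r"<think>(.*)</think>\s*<response>(.*)</response>", sample, re.DOTALL)
assert match is not None
print(match.group(1))  # Check the units first.
print(match.group(2))  # 42 km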
helm/common/critique_request.py
CHANGED

@@ -6,7 +6,6 @@ from helm.common.media_object import MediaObject
 class QuestionType:
     """String enum of question types."""
 
-    # TODO: Make this a StrEnum after upgrading to Python 3.11
     MULTIPLE_CHOICE: str = "multiple_choice"
     CHECKBOX: str = "checkbox"
     FREE_RESPONSE: str = "free_response"
helm/common/hierarchical_logger.py
CHANGED

@@ -1,4 +1,7 @@
 import logging
+import logging.config
+import yaml
+import os
 import sys
 import time
 from typing import Any, Callable, List, Optional

@@ -34,22 +37,31 @@ class HierarchicalLogger(object):
     def indent(self) -> str:
         return " " * len(self.start_times)
 
-    def track_begin(self, x: Any) -> None:
-        self.logger.info(self.indent() + str(x) + " {")
+    def track_begin(self, x: Any, **kwargs) -> None:
+        kwargs["stacklevel"] = kwargs.get("stacklevel", 1) + 1
+        self.logger.info(self.indent() + str(x) + " {", **kwargs)
         sys.stdout.flush()
         self.start_times.append(time.time())
 
-    def track_end(self) -> None:
+    def track_end(self, **kwargs) -> None:
+        kwargs["stacklevel"] = kwargs.get("stacklevel", 1) + 1
         t = time.time() - self.start_times.pop()
-        self.logger.info(self.indent() + "} [%s]" % (format_time(t)))
+        self.logger.info(self.indent() + "} [%s]" % (format_time(t)), **kwargs)
         sys.stdout.flush()
 
-    def log(self, x: Any) -> None:
-        self.logger.info(self.indent() + str(x))
+    def log(self, x: Any, **kwargs) -> None:
+        kwargs["stacklevel"] = kwargs.get("stacklevel", 1) + 1
+        self.logger.info(self.indent() + str(x), **kwargs)
         sys.stdout.flush()
 
-    def warn(self, x: Any) -> None:
-        self.logger.warning(self.indent() + str(x))
+    def debug(self, x: Any, **kwargs) -> None:
+        kwargs["stacklevel"] = kwargs.get("stacklevel", 1) + 1
+        self.logger.debug(self.indent() + str(x), **kwargs)
+        sys.stdout.flush()
+
+    def warn(self, x: Any, **kwargs) -> None:
+        kwargs["stacklevel"] = kwargs.get("stacklevel", 1) + 1
+        self.logger.warning(self.indent() + str(x), **kwargs)
         sys.stdout.flush()

@@ -69,23 +81,31 @@ singleton = HierarchicalLogger()
 # Exposed public methods
 
 
-def hlog(x: Any) -> None:
-    singleton.log(x)
+def hdebug(x: Any, **kwargs) -> None:
+    kwargs["stacklevel"] = kwargs.get("stacklevel", 1) + 1
+    singleton.debug(x, **kwargs)
+
+
+def hlog(x: Any, **kwargs) -> None:
+    kwargs["stacklevel"] = kwargs.get("stacklevel", 1) + 1
+    singleton.log(x, **kwargs)
 
 
-def hwarn(x: Any) -> None:
-    singleton.warn(x)
+def hwarn(x: Any, **kwargs) -> None:
+    kwargs["stacklevel"] = kwargs.get("stacklevel", 1) + 1
+    singleton.warn(x, **kwargs)
 
 
 class htrack_block:
-    def __init__(self, x: Any) -> None:
+    def __init__(self, x: Any, stacklevel=1) -> None:
+        self._stacklevel = stacklevel + 1
         self.x = x
 
     def __enter__(self) -> None:
-        singleton.track_begin(self.x)
+        singleton.track_begin(self.x, stacklevel=self._stacklevel)
 
     def __exit__(self, tpe: Any, value: Any, callback: Any) -> None:
-        singleton.track_end()
+        singleton.track_end(stacklevel=self._stacklevel)
 
 
 class htrack:

@@ -116,34 +136,63 @@ class htrack:
             description = description.replace("$" + k, str(v))
         else:
             description = ""
-        with htrack_block(parent + fn.__name__ + description):
+        with htrack_block(parent + fn.__name__ + description, stacklevel=2):
             return fn(*args, **kwargs)
 
         return wrapper
 
 
-def setup_default_logging():
+def setup_default_logging(config_path: Optional[str] = None):
     """
-    Setup
-    """
-    formatter = ColoredFormatter(
-        "%(bold_black)s%(asctime)s%(reset)s %(log_color)s%(levelname)-8s%(reset)s %(message)s",
-        datefmt="%Y-%m-%dT%H:%M:%S",
-        reset=True,
-        log_colors={
-            "DEBUG": "cyan",
-            "INFO": "green",
-            "WARNING": "yellow",
-            "ERROR": "red",
-            "CRITICAL": "red,bg_white",
-        },
-        secondary_log_colors={},
-        style="%",
-    )
+    Setup Python logging for HELM
 
+    Priority:
+    1. External config file (YAML or JSON).
+    2. ENV var LOG_LEVEL.
+    3. a default logger to STDOUT
+    """
     logger = logging.getLogger("helm")
-    logger.setLevel(logging.INFO)
     logger.propagate = False
+
+    if config_path and os.path.exists(config_path):
+        with open(config_path, "r") as f:
+            config = yaml.safe_load(f)
+        logging.config.dictConfig(config)
+        hdebug("setup custom HELM logging")
+        return
+
+    log_level = (os.getenv("HELM_LOG_LEVEL") or os.getenv("LOG_LEVEL") or "INFO").upper()
+    try:
+        logger.setLevel(getattr(logging, log_level))
+    except AttributeError:
+        logger.setLevel(logging.INFO)
+
+    # Set formatter
+    formatter: Optional[logging.Formatter] = None
+    if sys.stdout.isatty():
+        try:
+            formatter = ColoredFormatter(
+                "%(bold_black)s%(asctime)s%(reset)s %(log_color)s%(levelname)-8s%(reset)s %(message)s",
+                datefmt="%Y-%m-%dT%H:%M:%S",
+                reset=True,
+                log_colors={
+                    "DEBUG": "cyan",
+                    "INFO": "green",
+                    "WARNING": "yellow",
+                    "ERROR": "red",
+                    "CRITICAL": "red,bg_white",
+                },
+                style="%",
+            )
+        except ImportError:
+            pass
+
+    if formatter is None:
+        # fallback
+        formatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
+
+    # Add default stdout handler
     handler = logging.StreamHandler(sys.stdout)
     handler.setFormatter(formatter)
     logger.addHandler(handler)
+    hdebug("setup default HELM logging")
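Since setup_default_logging now feeds an external YAML file through yaml.safe_load into logging.config.dictConfig, the file's contents follow the standard library dictConfig schema. A sketch of an equivalent configuration expressed directly as a Python dict (the handler and format values are illustrative, not shipped defaults):

import logging
import logging.config

LOGGING_CONFIG = {
    "version": 1,
    "disable_existing_loggers": False,
    "formatters": {"plain": {"format": "%(asctime)s %(levelname)-8s %(message)s"}},
    "handlers": {
        "stdout": {"class": "logging.StreamHandler", "stream": "ext://sys.stdout", "formatter": "plain"},
    },
    "loggers": {"helm": {"level": "DEBUG", "handlers": ["stdout"], "propagate": False}},
}

logging.config.dictConfig(LOGGING_CONFIG)
logging.getLogger("helm").info("hello from the helm logger")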
helm/common/object_spec.py
CHANGED

@@ -55,14 +55,23 @@ def inject_object_spec_args(
     This is loosely based on instance (constant) bindings and provider bindings in Guice dependency injection.
 
     Example:
-        ...
+        >>> from helm.common.object_spec import *  # NOQA
+        >>> import sys, types
+        >>> # Given a custom class with hashable arguments
+        >>> class MyClass:
+        ...     def __init__(a: int, b: int, c: int, d: int = 0):
+        ...         pass
+        >>> #
+        >>> # <boilerplate>: make a dummy module for MyClass to make this doctest exectuable
+        >>> sys.modules["my_module"] = type("MyModule", (types.ModuleType,), {"MyClass": MyClass})("my_module")
+        >>> # </boilerplate>
+        >>> #
+        >>> # Define new style and old style object specs
+        >>> old_object_spec = ObjectSpec(class_name="my_module.MyClass", args={"a": 11})
+        >>> new_object_spec = inject_object_spec_args(old_object_spec, {"b": 12}, {"c": lambda: 13})
+        >>> # new_object_spec is now
+        >>> print(new_object_spec)
+        ObjectSpec(class_name='my_module.MyClass', args={'a': 11, 'b': 12, 'c': 13})
     """
     cls = get_class_by_name(spec.class_name)
     init_signature = inspect.signature(cls.__init__)

@@ -93,6 +102,12 @@ def parse_object_spec(description: str) -> ObjectSpec:
         <class_name>:<key>=<value>,<key>=<value>
     Usually, the description is something that's succinct and can be typed on the command-line.
     Here, value defaults to string.
+
+    Example:
+        >>> from helm.common.object_spec import *  # NOQA
+        >>> description = 'mscoco:model=huggingface_stable-diffusion-v1-4'
+        >>> parse_object_spec(description)
+        ObjectSpec(class_name='mscoco', args={'model': 'huggingface_stable-diffusion-v1-4'})
     """
 
     def parse_arg(arg: str) -> Tuple[str, Any]:
|