PyPI - crfm-helm - Versions diffs - 0.5.5__py3-none-any.whl → 0.5.7__py3-none-any.whl - Mend

crfm-helm 0.5.5__py3-none-any.whl → 0.5.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of crfm-helm might be problematic. Click here for more details.

Files changed (268) hide show

{crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/METADATA +74 -53
{crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/RECORD +262 -182
{crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/WHEEL +1 -1
helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +3 -3
helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
helm/benchmark/annotation/air_bench_annotator.py +2 -2
helm/benchmark/annotation/bigcodebench_annotator.py +3 -3
helm/benchmark/annotation/bird_sql_annotator.py +2 -2
helm/benchmark/annotation/chw_care_plan_annotator.py +7 -12
helm/benchmark/annotation/ehr_sql_annotator.py +2 -2
helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +7 -7
helm/benchmark/annotation/live_qa_annotator.py +1 -1
helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
helm/benchmark/annotation/model_as_judge.py +12 -16
helm/benchmark/annotation/omni_math_annotator.py +13 -14
helm/benchmark/annotation/wildbench_annotator.py +9 -9
helm/benchmark/executor.py +11 -12
helm/benchmark/metrics/aci_bench_metrics.py +9 -29
helm/benchmark/metrics/bias_word_lists.py +1 -1
helm/benchmark/metrics/chw_care_plan_metrics.py +10 -30
helm/benchmark/metrics/classification_metrics.py +3 -3
helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
helm/benchmark/metrics/comet_metric.py +1 -1
helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +2 -2
helm/benchmark/metrics/copyright_metrics.py +1 -1
helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
helm/benchmark/metrics/dischargeme_metrics.py +9 -29
helm/benchmark/metrics/efficiency_metrics.py +3 -3
helm/benchmark/metrics/evaluate_reference_metrics.py +1 -1
helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
helm/benchmark/metrics/ifeval_metrics.py +2 -2
helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
helm/benchmark/metrics/llm_jury_metrics.py +46 -0
helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
helm/benchmark/metrics/lmkt_metrics.py +47 -0
helm/benchmark/metrics/med_dialog_metrics.py +9 -29
helm/benchmark/metrics/medalign_metrics.py +9 -29
helm/benchmark/metrics/medi_qa_metrics.py +9 -29
helm/benchmark/metrics/medication_qa_metrics.py +10 -30
helm/benchmark/metrics/melt_bias_metric.py +234 -0
helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
helm/benchmark/metrics/melt_metric_specs.py +43 -0
helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
helm/benchmark/metrics/mental_health_metrics.py +9 -29
helm/benchmark/metrics/metric_service.py +11 -11
helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
helm/benchmark/metrics/mimic_rrs_metrics.py +9 -29
helm/benchmark/metrics/mtsamples_procedures_metrics.py +9 -29
helm/benchmark/metrics/mtsamples_replicate_metrics.py +9 -29
helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
helm/benchmark/metrics/starr_patient_instructions_metrics.py +9 -29
helm/benchmark/metrics/summac/model_summac.py +2 -3
helm/benchmark/metrics/summarization_metrics.py +2 -1
helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +2 -2
helm/benchmark/metrics/toxicity_metrics.py +2 -2
helm/benchmark/metrics/unitxt_metrics.py +3 -4
helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
helm/benchmark/metrics/vision_language/image_utils.py +2 -2
helm/benchmark/model_deployment_registry.py +16 -26
helm/benchmark/presentation/contamination.py +3 -3
helm/benchmark/presentation/create_plots.py +43 -13
helm/benchmark/presentation/run_display.py +13 -0
helm/benchmark/presentation/schema.py +7 -1
helm/benchmark/presentation/summarize.py +84 -61
helm/benchmark/presentation/test_create_plots.py +4 -1
helm/benchmark/reeval_run.py +3 -4
helm/benchmark/reeval_runner.py +3 -3
helm/benchmark/run.py +84 -73
helm/benchmark/run_expander.py +12 -1
helm/benchmark/run_spec_factory.py +7 -6
helm/benchmark/run_specs/arabic_run_specs.py +73 -0
helm/benchmark/run_specs/audio_run_specs.py +52 -8
helm/benchmark/run_specs/bluex_run_specs.py +40 -0
helm/benchmark/run_specs/classic_run_specs.py +0 -53
helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
helm/benchmark/run_specs/enterprise_run_specs.py +20 -0
helm/benchmark/run_specs/experimental_run_specs.py +31 -1
helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
helm/benchmark/run_specs/heim_run_specs.py +3 -1
helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
helm/benchmark/run_specs/long_context_run_specs.py +114 -15
helm/benchmark/run_specs/medhelm_run_specs.py +146 -41
helm/benchmark/run_specs/melt_run_specs.py +783 -0
helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +163 -0
helm/benchmark/run_specs/vlm_run_specs.py +28 -0
helm/benchmark/runner.py +5 -5
helm/benchmark/scenarios/aci_bench_scenario.py +7 -1
helm/benchmark/scenarios/alghafa_scenario.py +126 -0
helm/benchmark/scenarios/arabic_mmlu_scenario.py +78 -0
helm/benchmark/scenarios/aratrust_scenario.py +76 -0
helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +3 -1
helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +5 -5
helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +104 -0
helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +99 -0
helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +118 -0
helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +86 -0
helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +117 -0
helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +15 -1
helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +1 -2
helm/benchmark/scenarios/autobencher_capabilities_scenario.py +2 -2
helm/benchmark/scenarios/bluex_scenario.py +66 -0
helm/benchmark/scenarios/chw_care_plan_scenario.py +14 -13
helm/benchmark/scenarios/clear_scenario.py +11 -7
helm/benchmark/scenarios/cleva_scenario.py +1 -1
helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
helm/benchmark/scenarios/dischargeme_scenario.py +36 -21
helm/benchmark/scenarios/ehr_sql_scenario.py +7 -1
helm/benchmark/scenarios/ehrshot_scenario.py +28 -55
helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
helm/benchmark/scenarios/grammar.py +2 -2
helm/benchmark/scenarios/headqa_scenario.py +6 -1
helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +90 -0
helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
helm/benchmark/scenarios/{infinite_bench_sum_scenario.py → infinite_bench_en_sum_scenario.py} +10 -13
helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
helm/benchmark/scenarios/math_scenario.py +21 -20
helm/benchmark/scenarios/med_dialog_scenario.py +6 -1
helm/benchmark/scenarios/medalign_scenario.py +9 -3
helm/benchmark/scenarios/medalign_scenario_helper.py +27 -130
helm/benchmark/scenarios/medbullets_scenario.py +7 -2
helm/benchmark/scenarios/medcalc_bench_scenario.py +4 -2
helm/benchmark/scenarios/medec_scenario.py +6 -1
helm/benchmark/scenarios/medhallu_scenario.py +7 -1
helm/benchmark/scenarios/medi_qa_scenario.py +10 -4
helm/benchmark/scenarios/medication_qa_scenario.py +7 -1
helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
helm/benchmark/scenarios/melt_scenarios.py +793 -0
helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
helm/benchmark/scenarios/mental_health_scenario.py +16 -5
helm/benchmark/scenarios/mimic_bhc_scenario.py +13 -8
helm/benchmark/scenarios/mimic_rrs_scenario.py +17 -8
helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +14 -8
helm/benchmark/scenarios/mmlu_pro_scenario.py +1 -1
helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
helm/benchmark/scenarios/mtsamples_procedures_scenario.py +5 -2
helm/benchmark/scenarios/mtsamples_replicate_scenario.py +3 -2
helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +11 -5
helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
helm/benchmark/scenarios/pubmed_qa_scenario.py +6 -1
helm/benchmark/scenarios/race_based_med_scenario.py +18 -8
helm/benchmark/scenarios/ruler_qa_scenario_helper.py +2 -2
helm/benchmark/scenarios/ruler_qa_scenarios.py +2 -2
helm/benchmark/scenarios/seahelm_scenario.py +2 -2
helm/benchmark/scenarios/shc_bmt_scenario.py +12 -6
helm/benchmark/scenarios/shc_cdi_scenario.py +11 -6
helm/benchmark/scenarios/shc_conf_scenario.py +12 -6
helm/benchmark/scenarios/shc_ent_scenario.py +11 -6
helm/benchmark/scenarios/shc_gip_scenario.py +13 -5
helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
helm/benchmark/scenarios/shc_ptbm_scenario.py +12 -7
helm/benchmark/scenarios/shc_sei_scenario.py +12 -7
helm/benchmark/scenarios/shc_sequoia_scenario.py +13 -5
helm/benchmark/scenarios/starr_patient_instructions_scenario.py +15 -8
helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
helm/benchmark/scenarios/truthful_qa_scenario.py +2 -1
helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
helm/benchmark/server.py +2 -1
helm/benchmark/slurm_jobs.py +1 -2
helm/benchmark/slurm_runner.py +8 -1
helm/benchmark/static/schema_arabic.yaml +228 -0
helm/benchmark/static/schema_audio.yaml +60 -49
helm/benchmark/static/schema_classic.yaml +0 -17
helm/benchmark/static/schema_enterprise.yaml +21 -0
helm/benchmark/static/schema_long_context.yaml +81 -20
helm/benchmark/static/schema_medhelm.yaml +272 -213
helm/benchmark/static/schema_melt.yaml +1257 -0
helm/benchmark/static/schema_slphelm.yaml +162 -0
helm/benchmark/static/schema_vhelm.yaml +26 -26
helm/benchmark/static/schema_video.yaml +219 -0
helm/benchmark/static_build/assets/index-b9779128.css +1 -0
helm/benchmark/static_build/assets/index-e439d5e1.js +10 -0
helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
helm/benchmark/static_build/assets/{tremor-9cefc3c5.js → tremor-38a10867.js} +1 -1
helm/benchmark/static_build/index.html +4 -4
helm/benchmark/window_services/encoder_decoder_window_service.py +3 -3
helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
helm/benchmark/window_services/test_utils.py +3 -4
helm/benchmark/window_services/tokenizer_service.py +7 -8
helm/clients/anthropic_client.py +69 -29
helm/clients/audio_language/diva_llama_client.py +4 -2
helm/clients/audio_language/qwen2_5_omni_client.py +209 -0
helm/clients/audio_language/qwen2_audiolm_client.py +8 -6
helm/clients/audio_language/qwen_audiolm_client.py +4 -2
helm/clients/audio_language/test.py +62 -0
helm/clients/bedrock_client.py +3 -1
helm/clients/client.py +7 -7
helm/clients/grok_client.py +36 -0
helm/clients/huggingface_client.py +42 -3
helm/clients/huggingface_pipeline_client.py +138 -0
helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
helm/clients/image_generation/dalle_mini/model/modeling.py +1 -1
helm/clients/image_generation/dalle_mini/model/processor.py +1 -1
helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
helm/clients/openai_client.py +102 -55
helm/clients/openai_responses_client.py +176 -0
helm/clients/palmyra_client.py +2 -5
helm/clients/reka_client.py +2 -2
helm/clients/test_huggingface_client.py +3 -3
helm/clients/together_client.py +31 -6
helm/clients/vertexai_client.py +17 -9
helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
helm/clients/vision_language/huggingface_vlm_client.py +2 -2
helm/clients/vision_language/idefics_client.py +6 -2
helm/clients/vision_language/paligemma_client.py +2 -2
helm/clients/vision_language/qwen2_vlm_client.py +66 -53
helm/clients/vision_language/qwen_vlm_client.py +7 -5
helm/clients/vllm_client.py +43 -7
helm/clients/vllm_granite_thinking_client.py +56 -0
helm/clients/writer_client.py +102 -0
helm/common/context.py +80 -0
helm/common/credentials_utils.py +5 -5
helm/common/critique_request.py +0 -1
helm/common/general.py +9 -2
helm/common/hierarchical_logger.py +104 -12
helm/common/local_context.py +140 -0
helm/common/object_spec.py +23 -8
helm/common/remote_context.py +61 -0
helm/common/request.py +8 -0
helm/common/test_logging.py +94 -0
helm/config/model_deployments.yaml +995 -45
helm/config/model_metadata.yaml +780 -59
helm/config/tokenizer_configs.yaml +224 -3
helm/proxy/cli.py +4 -2
helm/proxy/critique/mechanical_turk_utils.py +1 -1
helm/proxy/retry.py +5 -0
helm/proxy/services/server_service.py +21 -85
helm/tokenizers/grok_tokenizer.py +55 -0
helm/tokenizers/huggingface_tokenizer.py +1 -1
helm/tokenizers/test_grok_tokenizer.py +33 -0
helm/benchmark/metrics/numeracy_metrics.py +0 -72
helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
helm/benchmark/scenarios/numeracy_scenario.py +0 -793
helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +0 -46
helm/benchmark/static_build/assets/index-262903c1.js +0 -10
helm/benchmark/static_build/assets/index-42060d71.css +0 -1
{crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/entry_points.txt +0 -0
{crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/licenses/LICENSE +0 -0
{crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/top_level.txt +0 -0
/helm/benchmark/static_build/assets/{medhelm-overview-3ddfcd65.png → medhelm-v1-overview-3ddfcd65.png} +0 -0

helm/common/general.py CHANGED Viewed

@@ -42,6 +42,13 @@ def ensure_directory_exists(path: str):
     os.makedirs(path, exist_ok=True)
+def check_file_exists(path: str, msg: Optional[str] = None):
+    """Checks that `path` exists, raises FileNotFoundError if it doesn't.
+    The check uses `os.path.exists`, so an existing directory also passes.
+    Args:
+        path: Filesystem path to check.
+        msg: Optional custom error message. When it is None or empty, the
+            default message "Required file not found: <path>" is used.
+    Raises:
+        FileNotFoundError: If `os.path.exists(path)` returns False.
+    """
+    if not os.path.exists(path):
+        # An empty-string `msg` is falsy, so it also falls back to the default message.
+        error_msg = msg if msg else f"Required file not found: {path}"
+        raise FileNotFoundError(error_msg)
 def parse_hocon(text: str):
     """Parse `text` (in HOCON format) into a dict-like object."""
     return pyhocon.ConfigFactory.parse_string(text)
@@ -156,7 +163,7 @@ def format_split(split: str) -> str:
 def asdict_without_nones(obj: Any) -> Dict[str, Any]:
-    if not is_dataclass(obj):
+    if not is_dataclass(obj) or isinstance(obj, type):
         raise ValueError(f"Expected dataclass, got '{obj}'")
     return asdict(obj, dict_factory=lambda x: {k: v for (k, v) in x if v is not None})
@@ -178,7 +185,7 @@ def binarize_dict(d: Dict[str, int]) -> Dict[str, int]:
 def serialize(obj: Any) -> List[str]:
     """Takes in a dataclass and outputs all of its fields and values in a list."""
-    if not is_dataclass(obj):
+    if not is_dataclass(obj) or isinstance(obj, type):
         raise ValueError(f"Expected dataclass, got '{obj}'")
     return [f"{key}: {json.dumps(value)}" for key, value in asdict(obj).items()]

helm/common/hierarchical_logger.py CHANGED Viewed

@@ -1,6 +1,11 @@
+import logging
+import logging.config
+import yaml
+import os
 import sys
 import time
 from typing import Any, Callable, List, Optional
+from colorlog import ColoredFormatter
 class HierarchicalLogger(object):
@@ -20,24 +25,43 @@ class HierarchicalLogger(object):
         } [0s]
     """
+    # Rewriting every call to hlog to go through the logging module directly
+    # would be far too much effort, and inspecting the stack on every hlog call
+    # to figure out the caller would be a terrible idea,
+    # so just log everything under "helm".
+    logger = logging.getLogger("helm")
     def __init__(self) -> None:
         self.start_times: List[float] = []
     def indent(self) -> str:
         return "  " * len(self.start_times)
-    def track_begin(self, x: Any) -> None:
-        print(self.indent() + str(x) + " {")
+    def track_begin(self, x: Any, **kwargs) -> None:
+        kwargs["stacklevel"] = kwargs.get("stacklevel", 1) + 1
+        self.logger.info(self.indent() + str(x) + " {", **kwargs)
         sys.stdout.flush()
         self.start_times.append(time.time())
-    def track_end(self) -> None:
+    def track_end(self, **kwargs) -> None:
+        kwargs["stacklevel"] = kwargs.get("stacklevel", 1) + 1
         t = time.time() - self.start_times.pop()
-        print(self.indent() + "} [%s]" % (format_time(t)))
+        self.logger.info(self.indent() + "} [%s]" % (format_time(t)), **kwargs)
+        sys.stdout.flush()
+    def log(self, x: Any, **kwargs) -> None:
+        kwargs["stacklevel"] = kwargs.get("stacklevel", 1) + 1
+        self.logger.info(self.indent() + str(x), **kwargs)
         sys.stdout.flush()
-    def log(self, x: Any) -> None:
-        print(self.indent() + str(x))
+    def debug(self, x: Any, **kwargs) -> None:
+        kwargs["stacklevel"] = kwargs.get("stacklevel", 1) + 1
+        self.logger.debug(self.indent() + str(x), **kwargs)
+        sys.stdout.flush()
+    def warn(self, x: Any, **kwargs) -> None:
+        kwargs["stacklevel"] = kwargs.get("stacklevel", 1) + 1
+        self.logger.warning(self.indent() + str(x), **kwargs)
         sys.stdout.flush()
@@ -57,19 +81,31 @@ singleton = HierarchicalLogger()
 # Exposed public methods
-def hlog(x: Any) -> None:
-    singleton.log(x)
+def hdebug(x: Any, **kwargs) -> None:
+    kwargs["stacklevel"] = kwargs.get("stacklevel", 1) + 1
+    singleton.debug(x, **kwargs)
+def hlog(x: Any, **kwargs) -> None:
+    kwargs["stacklevel"] = kwargs.get("stacklevel", 1) + 1
+    singleton.log(x, **kwargs)
+def hwarn(x: Any, **kwargs) -> None:
+    kwargs["stacklevel"] = kwargs.get("stacklevel", 1) + 1
+    singleton.warn(x, **kwargs)
 class htrack_block:
-    def __init__(self, x: Any) -> None:
+    def __init__(self, x: Any, stacklevel=1) -> None:
+        self._stacklevel = stacklevel + 1
         self.x = x
     def __enter__(self) -> None:
-        singleton.track_begin(self.x)
+        singleton.track_begin(self.x, stacklevel=self._stacklevel)
     def __exit__(self, tpe: Any, value: Any, callback: Any) -> None:
-        singleton.track_end()
+        singleton.track_end(stacklevel=self._stacklevel)
 class htrack:
@@ -100,7 +136,63 @@ class htrack:
                     description = description.replace("$" + k, str(v))
             else:
                 description = ""
-            with htrack_block(parent + fn.__name__ + description):
+            with htrack_block(parent + fn.__name__ + description, stacklevel=2):
                 return fn(*args, **kwargs)
         return wrapper
+def setup_default_logging(config_path: Optional[str] = None):
+    """
+    Setup Python logging for HELM
+    Priority:
+    1. External config file (YAML or JSON), loaded with `yaml.safe_load` and
+       applied with `logging.config.dictConfig`.
+    2. ENV var HELM_LOG_LEVEL, then ENV var LOG_LEVEL (falling back to INFO).
+    3. a default logger to STDOUT
+    Args:
+        config_path: Optional path to a logging config file. It is only used
+            when it is non-empty and the path exists; otherwise the default
+            setup below is applied.
+    """
+    logger = logging.getLogger("helm")
+    # Stop "helm" records from also being handed to handlers of ancestor loggers.
+    logger.propagate = False
+    if config_path and os.path.exists(config_path):
+        with open(config_path, "r") as f:
+            config = yaml.safe_load(f)
+        logging.config.dictConfig(config)
+        hdebug("setup custom HELM logging")
+        # A custom config fully replaces the default level/formatter/handler setup.
+        return
+    # HELM_LOG_LEVEL takes precedence over LOG_LEVEL; an unset or empty value falls through.
+    log_level = (os.getenv("HELM_LOG_LEVEL") or os.getenv("LOG_LEVEL") or "INFO").upper()
+    try:
+        logger.setLevel(getattr(logging, log_level))
+    except AttributeError:
+        # Unknown level name: fall back to INFO rather than failing.
+        # NOTE(review): a name that is a `logging` attribute but not a level is not
+        # caught here and would presumably fail inside setLevel - confirm.
+        logger.setLevel(logging.INFO)
+    # Set formatter
+    formatter: Optional[logging.Formatter] = None
+    # Colored output is only used when stdout is an interactive terminal.
+    if sys.stdout.isatty():
+        try:
+            formatter = ColoredFormatter(
+                "%(bold_black)s%(asctime)s%(reset)s %(log_color)s%(levelname)-8s%(reset)s %(message)s",
+                datefmt="%Y-%m-%dT%H:%M:%S",
+                reset=True,
+                log_colors={
+                    "DEBUG": "cyan",
+                    "INFO": "green",
+                    "WARNING": "yellow",
+                    "ERROR": "red",
+                    "CRITICAL": "red,bg_white",
+                },
+                style="%",
+            )
+        except ImportError:
+            # NOTE(review): ColoredFormatter is imported at module level, so this
+            # handler looks unreachable from the constructor call - confirm.
+            pass
+    if formatter is None:
+        # fallback
+        formatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
+    # Add default stdout handler
+    # NOTE(review): a new handler is added on every call; calling this function
+    # more than once would presumably duplicate each log line - confirm callers.
+    handler = logging.StreamHandler(sys.stdout)
+    handler.setFormatter(formatter)
+    logger.addHandler(handler)
+    hdebug("setup default HELM logging")

helm/common/local_context.py ADDED Viewed

@@ -0,0 +1,140 @@
+import dataclasses
+import os
+from typing import Optional
+from helm.common.context import Context
+from helm.common.cache import CacheConfig
+from helm.common.cache_backend_config import CacheBackendConfig, BlackHoleCacheBackendConfig
+from helm.common.critique_request import CritiqueRequest, CritiqueRequestResult
+from helm.common.moderations_api_request import ModerationAPIRequest, ModerationAPIRequestResult
+from helm.common.clip_score_request import CLIPScoreRequest, CLIPScoreResult
+from helm.common.nudity_check_request import NudityCheckRequest, NudityCheckResult
+from helm.common.file_upload_request import FileUploadRequest, FileUploadResult
+from helm.common.general import ensure_directory_exists, parse_hocon, get_credentials
+from helm.common.perspective_api_request import PerspectiveAPIRequest, PerspectiveAPIRequestResult
+from helm.common.tokenization_request import (
+    TokenizationRequest,
+    TokenizationRequestResult,
+    DecodeRequest,
+    DecodeRequestResult,
+)
+from helm.common.request import Request, RequestResult
+from helm.clients.auto_client import AutoClient
+from helm.clients.moderation_api_client import ModerationAPIClient
+from helm.clients.image_generation.nudity_check_client import NudityCheckClient
+from helm.clients.gcs_client import GCSClient
+from helm.clients.clip_score_client import CLIPScoreClient
+from helm.clients.toxicity_classifier_client import ToxicityClassifierClient
+from helm.proxy.example_queries import example_queries
+from helm.benchmark.model_metadata_registry import ALL_MODELS_METADATA
+from helm.proxy.query import Query, QueryResult
+from helm.proxy.retry import retry_request
+from helm.tokenizers.auto_tokenizer import AutoTokenizer
+from helm.proxy.services.service import (
+    CACHE_DIR,
+    GeneralInfo,
+    VERSION,
+    expand_environments,
+    synthesize_request,
+)
+class LocalContext(Context):
+    """
+    Main class that supports various functionality for the server.
+    Requests are handled in-process through an `AutoClient` and an
+    `AutoTokenizer`; the auxiliary clients (moderation, toxicity, nudity check,
+    CLIP score, GCS) are created on first use.
+    """
+    def __init__(
+        self,
+        base_path: str = "prod_env",
+        cache_backend_config: CacheBackendConfig = BlackHoleCacheBackendConfig(),
+    ):
+        """
+        Args:
+            base_path: Directory passed to `get_credentials` and used as the
+                parent of the client file storage directory; created if missing.
+            cache_backend_config: Cache backend configuration handed to the
+                client and the tokenizer.
+        """
+        # NOTE(review): the default `BlackHoleCacheBackendConfig()` is evaluated once,
+        # when the function is defined, so all default-constructed contexts share it.
+        ensure_directory_exists(base_path)
+        client_file_storage_path = os.path.join(base_path, CACHE_DIR)
+        ensure_directory_exists(client_file_storage_path)
+        credentials = get_credentials(base_path)
+        self.cache_backend_config = cache_backend_config
+        self.client = AutoClient(credentials, client_file_storage_path, cache_backend_config)
+        self.tokenizer = AutoTokenizer(credentials, cache_backend_config)
+        # Lazily instantiate the following clients
+        self.moderation_api_client: Optional[ModerationAPIClient] = None
+        self.toxicity_classifier_client: Optional[ToxicityClassifierClient] = None
+        # NOTE(review): `perspective_api_client` is never read or assigned by any
+        # method of this class - confirm whether it is still needed.
+        self.perspective_api_client: Optional[ToxicityClassifierClient] = None
+        self.nudity_check_client: Optional[NudityCheckClient] = None
+        self.clip_score_client: Optional[CLIPScoreClient] = None
+        self.gcs_client: Optional[GCSClient] = None
+    def get_general_info(self) -> GeneralInfo:
+        """Return the version, the example queries and the metadata of all models."""
+        # Can't send release_dates in ModelMetadata because dates cannot be round-tripped to and from JSON easily.
+        # TODO(#2158): Either fix this or delete get_general_info.
+        all_models = [dataclasses.replace(model_metadata, release_date=None) for model_metadata in ALL_MODELS_METADATA]
+        return GeneralInfo(version=VERSION, example_queries=example_queries, all_models=all_models)
+    def expand_query(self, query: Query) -> QueryResult:
+        """Turn the `query` into requests."""
+        prompt = query.prompt
+        settings = query.settings
+        # `query.environments` is HOCON text; one request is synthesized per expanded environment.
+        environments = parse_hocon(query.environments)
+        requests = []
+        for environment in expand_environments(environments):
+            request = synthesize_request(prompt, settings, environment)
+            requests.append(request)
+        return QueryResult(requests=requests)
+    def make_request(self, request: Request) -> RequestResult:
+        """Actually make a request to an API."""
+        return self.client.make_request(request)
+    def tokenize(self, request: TokenizationRequest) -> TokenizationRequestResult:
+        """Tokenize via the local `AutoTokenizer`."""
+        return self.tokenizer.tokenize(request)
+    def decode(self, request: DecodeRequest) -> DecodeRequestResult:
+        """Decode via the local `AutoTokenizer`."""
+        return self.tokenizer.decode(request)
+    def upload(self, request: FileUploadRequest) -> FileUploadResult:
+        """Upload a file through the GCS client, creating the client on first use."""
+        if not self.gcs_client:
+            self.gcs_client = self.client.get_gcs_client()
+        assert self.gcs_client
+        return self.gcs_client.upload(request)
+    def check_nudity(self, request: NudityCheckRequest) -> NudityCheckResult:
+        """Run the nudity check, creating the nudity check client on first use."""
+        if not self.nudity_check_client:
+            self.nudity_check_client = self.client.get_nudity_check_client()
+        assert self.nudity_check_client
+        return self.nudity_check_client.check_nudity(request)
+    def compute_clip_score(self, request: CLIPScoreRequest) -> CLIPScoreResult:
+        """Compute the CLIP score, creating the CLIP score client on first use."""
+        if not self.clip_score_client:
+            self.clip_score_client = self.client.get_clip_score_client()
+        assert self.clip_score_client
+        return self.clip_score_client.compute_score(request)
+    def get_toxicity_scores(self, request: PerspectiveAPIRequest) -> PerspectiveAPIRequestResult:
+        """Get toxicity scores through a `retry_request`-wrapped call to the toxicity classifier client."""
+        @retry_request
+        def get_toxicity_scores_with_retry(request: PerspectiveAPIRequest) -> PerspectiveAPIRequestResult:
+            # The client is created inside the retried function, on first use.
+            if not self.toxicity_classifier_client:
+                self.toxicity_classifier_client = self.client.get_toxicity_classifier_client()
+            return self.toxicity_classifier_client.get_toxicity_scores(request)
+        return get_toxicity_scores_with_retry(request)
+    def get_moderation_results(self, request: ModerationAPIRequest) -> ModerationAPIRequestResult:
+        """Get moderation results through a `retry_request`-wrapped call to the moderation API client."""
+        @retry_request
+        def get_moderation_results_with_retry(request: ModerationAPIRequest) -> ModerationAPIRequestResult:
+            # The client is created inside the retried function, on first use.
+            if not self.moderation_api_client:
+                self.moderation_api_client = self.client.get_moderation_api_client()
+            return self.moderation_api_client.get_moderation_results(request)
+        return get_moderation_results_with_retry(request)
+    def make_critique_request(self, request: CritiqueRequest) -> CritiqueRequestResult:
+        """Forward the critique request to the critique client obtained from `self.client` on each call."""
+        return self.client.get_critique_client().make_critique_request(request)
+    def get_cache_config(self, shard_name: str) -> CacheConfig:
+        """Return the cache config for `shard_name` from the configured cache backend."""
+        return self.cache_backend_config.get_cache_config(shard_name)

helm/common/object_spec.py CHANGED Viewed

@@ -55,14 +55,23 @@ def inject_object_spec_args(
     This is loosely based on instance (constant) bindings and provider bindings in Guice dependency injection.
     Example:
-    class MyClass:
-        def __init__(a: int, b: int, c: int, d: int = 0):
-            pass
-    old_object_spec = ObjectSpec(class_name="MyClass", args={"a": 11})
-    new_object_spec = inject_object_spec_args(old_object_spec, {"b": 12}, {"c": lambda: 13})
-    # new_object_spec is now ObjectSpec(class_name="MyClass", args={"a": 11, "b": 12, "c": 13})
+        >>> from helm.common.object_spec import *  # NOQA
+        >>> import sys, types
+        >>> # Given a custom class with hashable arguments
+        >>> class MyClass:
+        ...     def __init__(a: int, b: int, c: int, d: int = 0):
+        ...         pass
+        >>> #
+        >>> # <boilerplate>: make a dummy module for MyClass to make this doctest executable
+        >>> sys.modules["my_module"] = type("MyModule", (types.ModuleType,), {"MyClass": MyClass})("my_module")
+        >>> # </boilerplate>
+        >>> #
+        >>> # Define new style and old style object specs
+        >>> old_object_spec = ObjectSpec(class_name="my_module.MyClass", args={"a": 11})
+        >>> new_object_spec = inject_object_spec_args(old_object_spec, {"b": 12}, {"c": lambda: 13})
+        >>> # new_object_spec is now
+        >>> print(new_object_spec)
+        ObjectSpec(class_name='my_module.MyClass', args={'a': 11, 'b': 12, 'c': 13})
     """
     cls = get_class_by_name(spec.class_name)
     init_signature = inspect.signature(cls.__init__)
@@ -93,6 +102,12 @@ def parse_object_spec(description: str) -> ObjectSpec:
         <class_name>:<key>=<value>,<key>=<value>
     Usually, the description is something that's succinct and can be typed on the command-line.
     Here, value defaults to string.
+    Example:
+        >>> from helm.common.object_spec import *  # NOQA
+        >>> description = 'mscoco:model=huggingface_stable-diffusion-v1-4'
+        >>> parse_object_spec(description)
+        ObjectSpec(class_name='mscoco', args={'model': 'huggingface_stable-diffusion-v1-4'})
     """
     def parse_arg(arg: str) -> Tuple[str, Any]:

helm/common/remote_context.py ADDED Viewed

@@ -0,0 +1,61 @@
+from helm.common.context import Context
+from helm.common.cache import CacheConfig
+from helm.common.authentication import Authentication
+from helm.common.moderations_api_request import ModerationAPIRequest, ModerationAPIRequestResult
+from helm.common.critique_request import CritiqueRequest, CritiqueRequestResult
+from helm.common.nudity_check_request import NudityCheckRequest, NudityCheckResult
+from helm.common.file_upload_request import FileUploadRequest, FileUploadResult
+from helm.common.perspective_api_request import PerspectiveAPIRequest, PerspectiveAPIRequestResult
+from helm.common.clip_score_request import CLIPScoreRequest, CLIPScoreResult
+from helm.common.tokenization_request import (
+    TokenizationRequest,
+    TokenizationRequestResult,
+    DecodeRequestResult,
+    DecodeRequest,
+)
+from helm.common.request import Request, RequestResult
+from helm.proxy.query import Query, QueryResult
+from helm.proxy.services.remote_service import RemoteService
+from helm.proxy.services.service import GeneralInfo, Service
+class RemoteContext(Context):
+    """Context subclass that forwards every call to a RemoteService built from `base_url`.
+    The stored `auth` is passed as the first argument on the calls that take it;
+    get_general_info, expand_query and get_cache_config are forwarded without it.
+    """
+    def __init__(self, base_url: str, auth: Authentication):
+        # Annotated as the general Service type even though a RemoteService is constructed.
+        self.service: Service = RemoteService(base_url)
+        self.auth = auth
+    # The next two calls are forwarded without `self.auth`.
+    def get_general_info(self) -> GeneralInfo:
+        return self.service.get_general_info()
+    def expand_query(self, query: Query) -> QueryResult:
+        return self.service.expand_query(query)
+    # From here through make_critique_request, `self.auth` is passed ahead of the request.
+    def make_request(self, request: Request) -> RequestResult:
+        return self.service.make_request(self.auth, request)
+    def tokenize(self, request: TokenizationRequest) -> TokenizationRequestResult:
+        return self.service.tokenize(self.auth, request)
+    def decode(self, request: DecodeRequest) -> DecodeRequestResult:
+        return self.service.decode(self.auth, request)
+    def upload(self, request: FileUploadRequest) -> FileUploadResult:
+        return self.service.upload(self.auth, request)
+    def check_nudity(self, request: NudityCheckRequest) -> NudityCheckResult:
+        return self.service.check_nudity(self.auth, request)
+    def compute_clip_score(self, request: CLIPScoreRequest) -> CLIPScoreResult:
+        return self.service.compute_clip_score(self.auth, request)
+    def get_toxicity_scores(self, request: PerspectiveAPIRequest) -> PerspectiveAPIRequestResult:
+        return self.service.get_toxicity_scores(self.auth, request)
+    def get_moderation_results(self, request: ModerationAPIRequest) -> ModerationAPIRequestResult:
+        return self.service.get_moderation_results(self.auth, request)
+    def make_critique_request(self, request: CritiqueRequest) -> CritiqueRequestResult:
+        return self.service.make_critique_request(self.auth, request)
+    # Forwarded without `self.auth`; only the shard name is passed.
+    def get_cache_config(self, shard_name: str) -> CacheConfig:
+        return self.service.get_cache_config(shard_name)

helm/common/request.py CHANGED Viewed

@@ -131,6 +131,11 @@ class Token:
         ]
+@dataclass(frozen=True)
+class Thinking:
+    """Frozen dataclass wrapping an optional piece of text; `GeneratedOutput.thinking` holds one."""
+    # Defaults to None when no text is supplied.
+    text: Optional[str] = None
 @dataclass(frozen=True)
 class GeneratedOutput:
     """A `GeneratedOutput` is a single generated output that may contain text or multimodal content."""
@@ -150,6 +155,9 @@ class GeneratedOutput:
     # Could be a sequence made up of multimedia content
     multimodal_content: Optional[MultimediaObject] = None
+    # Could be reasoning
+    thinking: Optional[Thinking] = None
     def __add__(self, other: "GeneratedOutput") -> "GeneratedOutput":
         return GeneratedOutput(self.text + other.text, self.logprob + other.logprob, self.tokens + other.tokens)

helm/common/test_logging.py ADDED Viewed

@@ -0,0 +1,94 @@
+import sys
+import tempfile
+import textwrap
+import pathlib
+from helm.benchmark import run
+from typing import List, Optional
+class ArgvContext:
+    """
+    Context manager that temporarily replaces sys.argv and restores it on exit.
+    Args:
+        argv: Argument list to install while the context is active. None (or
+            an empty list) installs an empty list.
+    """
+    def __init__(self, argv: Optional[List[str]]):
+        self.argv = argv
+        self._original_argv: Optional[List[str]] = None
+    def __enter__(self) -> "ArgvContext":
+        self._original_argv = sys.argv[:]
+        # Install a copy so in-place edits to sys.argv made by the code under
+        # test cannot leak back into self.argv.
+        sys.argv = list(self.argv or [])
+        # Return self so `with ArgvContext(...) as ctx:` binds the manager rather than None.
+        return self
+    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
+        # Always restore; returning None lets any in-flight exception propagate.
+        assert self._original_argv is not None  # Satisfies mypy
+        sys.argv = self._original_argv
+def test_run_with_custom_logging_config():
+    """Dry-run `run.main()` with `--log-config` and check that the configured log file is written."""
+    # Setup temporary directory
+    with tempfile.TemporaryDirectory(prefix="helm_test_") as tmp_dir_str:
+        tmp_dir = pathlib.Path(tmp_dir_str)
+        log_path = tmp_dir / "test.log"
+        log_config_path = tmp_dir / "test_config.yaml"
+        # Write custom YAML log config to file; the file handler points at log_path
+        log_config_text = textwrap.dedent(
+            f"""
+            version: 1
+            disable_existing_loggers: false
+            formatters:
+              simple:
+                datefmt: '%Y-%m-%dT%H:%M:%S'
+                format: '%(asctime)s %(levelname)s %(name)s %(message)s'
+            handlers:
+              file:
+                class: logging.FileHandler
+                filename: {log_path}
+                formatter: simple
+                level: DEBUG
+                mode: w
+            loggers:
+              helm:
+                handlers:
+                - file
+                level: DEBUG
+                propagate: false
+            """
+        ).strip()
+        log_config_path.write_text(log_config_text)
+        # Simulate command-line arguments
+        argv = [
+            "run.py",  # Fake script name
+            "--run-entries",
+            "mmlu:subject=philosophy,model=openai/gpt2",
+            "-m",
+            "1",
+            "--suite",
+            "my-suite",
+            "--dry-run",
+            "--log-config",
+            str(log_config_path),
+        ]
+        # Call main with sys.argv swapped out for the simulated arguments
+        with ArgvContext(argv):
+            run.main()
+        # Check log file contents
+        assert log_path.exists(), "Log file was not created"
+        log_contents = log_path.read_text()
+        # Test that log file was written to disk as requested
+        print("Log Contents")
+        print("------------")
+        print(log_contents)
+        # Include the captured log in the failure message so a failing run is diagnosable.
+        assert (
+            "mscoco" in log_contents or "huggingface" in log_contents or "dry-run" in log_contents
+        ), f"Expected log content not found in log file:\n{log_contents}"
+# Allow running this test directly as a script, without a test runner.
+if __name__ == "__main__":
+    test_run_with_custom_logging_config()

crfm-helm 0.5.5__py3-none-any.whl → 0.5.7__py3-none-any.whl

Potentially problematic release.

crfm-helm 0.5.5py3-none-any.whl → 0.5.7py3-none-any.whl