PyPI - crfm-helm - Versions diffs - 0.5.1__tar.gz → 0.5.2__tar.gz - Mend

crfm-helm 0.5.1__tar.gz → 0.5.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (699) hide show

{crfm_helm-0.5.1/src/crfm_helm.egg-info → crfm_helm-0.5.2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: crfm-helm
-Version: 0.5.1
+Version: 0.5.2
 Summary: Benchmark for language models
 Home-page: https://github.com/stanford-crfm/helm
 Author: Stanford CRFM
@@ -77,6 +77,8 @@ Requires-Dist: evaluate~=0.4.1; extra == "unitxt"
 Provides-Extra: aleph-alpha
 Requires-Dist: aleph-alpha-client~=2.14.0; extra == "aleph-alpha"
 Requires-Dist: tokenizers>=0.13.3; extra == "aleph-alpha"
+Provides-Extra: openvino
+Requires-Dist: optimum[openvino]~=1.19; extra == "openvino"
 Provides-Extra: allenai
 Requires-Dist: ai2-olmo~=0.2; extra == "allenai"
 Provides-Extra: amazon
@@ -86,14 +88,16 @@ Requires-Dist: botocore~=1.31.57; extra == "amazon"
 Provides-Extra: anthropic
 Requires-Dist: anthropic~=0.17; extra == "anthropic"
 Requires-Dist: websocket-client~=1.3.2; extra == "anthropic"
+Provides-Extra: cohere
+Requires-Dist: cohere~=5.3; extra == "cohere"
 Provides-Extra: mistral
 Requires-Dist: mistralai~=0.0.11; extra == "mistral"
 Provides-Extra: openai
 Requires-Dist: openai~=1.0; extra == "openai"
-Requires-Dist: tiktoken~=0.3.3; extra == "openai"
+Requires-Dist: tiktoken~=0.7; extra == "openai"
 Requires-Dist: pydantic~=2.0; extra == "openai"
 Provides-Extra: google
-Requires-Dist: google-cloud-aiplatform~=1.44; extra == "google"
+Requires-Dist: google-cloud-aiplatform~=1.48; extra == "google"
 Provides-Extra: together
 Requires-Dist: together~=1.1; extra == "together"
 Provides-Extra: tsinghua
@@ -105,12 +109,17 @@ Requires-Dist: crfm-helm[aleph-alpha]; extra == "models"
 Requires-Dist: crfm-helm[allenai]; extra == "models"
 Requires-Dist: crfm-helm[amazon]; extra == "models"
 Requires-Dist: crfm-helm[anthropic]; extra == "models"
+Requires-Dist: crfm-helm[cohere]; extra == "models"
 Requires-Dist: crfm-helm[google]; extra == "models"
 Requires-Dist: crfm-helm[mistral]; extra == "models"
 Requires-Dist: crfm-helm[openai]; extra == "models"
+Requires-Dist: crfm-helm[reka]; extra == "models"
 Requires-Dist: crfm-helm[together]; extra == "models"
 Requires-Dist: crfm-helm[tsinghua]; extra == "models"
 Requires-Dist: crfm-helm[yandex]; extra == "models"
+Requires-Dist: crfm-helm[openvino]; extra == "models"
+Provides-Extra: reka
+Requires-Dist: reka-api~=2.0.0; extra == "reka"
 Provides-Extra: vlm
 Requires-Dist: crfm-helm[openai]; extra == "vlm"
 Requires-Dist: einops~=0.7.0; extra == "vlm"
@@ -120,6 +129,7 @@ Requires-Dist: torch~=2.1.2; extra == "vlm"
 Requires-Dist: transformers_stream_generator~=0.0.4; extra == "vlm"
 Requires-Dist: scipy~=1.10; extra == "vlm"
 Requires-Dist: torchvision<3.0.0,>=0.14.1; extra == "vlm"
+Requires-Dist: crfm-helm[reka]; extra == "vlm"
 Requires-Dist: crfm-helm[images]; extra == "vlm"
 Requires-Dist: crfm-helm[image2structure]; extra == "vlm"
 Requires-Dist: pycocoevalcap~=1.2; extra == "vlm"

{crfm_helm-0.5.1 → crfm_helm-0.5.2}/setup.cfg RENAMED Viewed

@@ -1,6 +1,6 @@
 [metadata]
 name = crfm-helm
-version = 0.5.1
+version = 0.5.2
 author = Stanford CRFM
 author_email = contact-crfm@stanford.edu
 description = Benchmark for language models
@@ -92,6 +92,8 @@ unitxt =
 aleph-alpha =
 	aleph-alpha-client~=2.14.0
 	tokenizers>=0.13.3
+openvino =
+	optimum[openvino]~=1.19
 allenai =
 	ai2-olmo~=0.2
 amazon =
@@ -101,14 +103,16 @@ amazon =
 anthropic =
 	anthropic~=0.17
 	websocket-client~=1.3.2  # For legacy stanford-online-all-v4-s3
+cohere =
+	cohere~=5.3
 mistral =
 	mistralai~=0.0.11
 openai =
 	openai~=1.0
-	tiktoken~=0.3.3
+	tiktoken~=0.7
 	pydantic~=2.0  # For model_dump(mode="json") - openai only requires pydantic>=1.9.0
 google =
-	google-cloud-aiplatform~=1.44
+	google-cloud-aiplatform~=1.48
 together =
 	together~=1.1
 tsinghua =
@@ -120,12 +124,17 @@ models =
 	crfm-helm[allenai]
 	crfm-helm[amazon]
 	crfm-helm[anthropic]
+	crfm-helm[cohere]
 	crfm-helm[google]
 	crfm-helm[mistral]
 	crfm-helm[openai]
+	crfm-helm[reka]
 	crfm-helm[together]
 	crfm-helm[tsinghua]
 	crfm-helm[yandex]
+	crfm-helm[openvino]
+reka =
+	reka-api~=2.0.0
 vlm =
 	crfm-helm[openai]
@@ -139,6 +148,8 @@ vlm =
 	scipy~=1.10
 	torchvision>=0.14.1,<3.0.0
+	crfm-helm[reka]
 	crfm-helm[images]
 	crfm-helm[image2structure]

{crfm_helm-0.5.1 → crfm_helm-0.5.2/src/crfm_helm.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: crfm-helm
-Version: 0.5.1
+Version: 0.5.2
 Summary: Benchmark for language models
 Home-page: https://github.com/stanford-crfm/helm
 Author: Stanford CRFM
@@ -77,6 +77,8 @@ Requires-Dist: evaluate~=0.4.1; extra == "unitxt"
 Provides-Extra: aleph-alpha
 Requires-Dist: aleph-alpha-client~=2.14.0; extra == "aleph-alpha"
 Requires-Dist: tokenizers>=0.13.3; extra == "aleph-alpha"
+Provides-Extra: openvino
+Requires-Dist: optimum[openvino]~=1.19; extra == "openvino"
 Provides-Extra: allenai
 Requires-Dist: ai2-olmo~=0.2; extra == "allenai"
 Provides-Extra: amazon
@@ -86,14 +88,16 @@ Requires-Dist: botocore~=1.31.57; extra == "amazon"
 Provides-Extra: anthropic
 Requires-Dist: anthropic~=0.17; extra == "anthropic"
 Requires-Dist: websocket-client~=1.3.2; extra == "anthropic"
+Provides-Extra: cohere
+Requires-Dist: cohere~=5.3; extra == "cohere"
 Provides-Extra: mistral
 Requires-Dist: mistralai~=0.0.11; extra == "mistral"
 Provides-Extra: openai
 Requires-Dist: openai~=1.0; extra == "openai"
-Requires-Dist: tiktoken~=0.3.3; extra == "openai"
+Requires-Dist: tiktoken~=0.7; extra == "openai"
 Requires-Dist: pydantic~=2.0; extra == "openai"
 Provides-Extra: google
-Requires-Dist: google-cloud-aiplatform~=1.44; extra == "google"
+Requires-Dist: google-cloud-aiplatform~=1.48; extra == "google"
 Provides-Extra: together
 Requires-Dist: together~=1.1; extra == "together"
 Provides-Extra: tsinghua
@@ -105,12 +109,17 @@ Requires-Dist: crfm-helm[aleph-alpha]; extra == "models"
 Requires-Dist: crfm-helm[allenai]; extra == "models"
 Requires-Dist: crfm-helm[amazon]; extra == "models"
 Requires-Dist: crfm-helm[anthropic]; extra == "models"
+Requires-Dist: crfm-helm[cohere]; extra == "models"
 Requires-Dist: crfm-helm[google]; extra == "models"
 Requires-Dist: crfm-helm[mistral]; extra == "models"
 Requires-Dist: crfm-helm[openai]; extra == "models"
+Requires-Dist: crfm-helm[reka]; extra == "models"
 Requires-Dist: crfm-helm[together]; extra == "models"
 Requires-Dist: crfm-helm[tsinghua]; extra == "models"
 Requires-Dist: crfm-helm[yandex]; extra == "models"
+Requires-Dist: crfm-helm[openvino]; extra == "models"
+Provides-Extra: reka
+Requires-Dist: reka-api~=2.0.0; extra == "reka"
 Provides-Extra: vlm
 Requires-Dist: crfm-helm[openai]; extra == "vlm"
 Requires-Dist: einops~=0.7.0; extra == "vlm"
@@ -120,6 +129,7 @@ Requires-Dist: torch~=2.1.2; extra == "vlm"
 Requires-Dist: transformers_stream_generator~=0.0.4; extra == "vlm"
 Requires-Dist: scipy~=1.10; extra == "vlm"
 Requires-Dist: torchvision<3.0.0,>=0.14.1; extra == "vlm"
+Requires-Dist: crfm-helm[reka]; extra == "vlm"
 Requires-Dist: crfm-helm[images]; extra == "vlm"
 Requires-Dist: crfm-helm[image2structure]; extra == "vlm"
 Requires-Dist: pycocoevalcap~=1.2; extra == "vlm"

{crfm_helm-0.5.1 → crfm_helm-0.5.2}/src/crfm_helm.egg-info/SOURCES.txt RENAMED Viewed

@@ -62,8 +62,11 @@ src/helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimod
 src/helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py
 src/helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py
 src/helm/benchmark/annotation/__init__.py
+src/helm/benchmark/annotation/air_bench_annotator.py
 src/helm/benchmark/annotation/annotator.py
 src/helm/benchmark/annotation/annotator_factory.py
+src/helm/benchmark/annotation/live_qa_annotator.py
+src/helm/benchmark/annotation/medication_qa_annotator.py
 src/helm/benchmark/annotation/test_annotator_factory.py
 src/helm/benchmark/annotation/test_dummy_annotator.py
 src/helm/benchmark/annotation/image2structure/__init__.py
@@ -101,6 +104,7 @@ src/helm/benchmark/efficiency_data/inference_denoised_runtimes.json
 src/helm/benchmark/efficiency_data/inference_idealized_runtimes.json
 src/helm/benchmark/efficiency_data/training_efficiency.json
 src/helm/benchmark/metrics/__init__.py
+src/helm/benchmark/metrics/air_bench_metrics.py
 src/helm/benchmark/metrics/basic_metrics.py
 src/helm/benchmark/metrics/bbq_metrics.py
 src/helm/benchmark/metrics/bias_metrics.py
@@ -122,16 +126,23 @@ src/helm/benchmark/metrics/dry_run_metrics.py
 src/helm/benchmark/metrics/efficiency_metrics.py
 src/helm/benchmark/metrics/evaluate_instances_metric.py
 src/helm/benchmark/metrics/evaluate_reference_metrics.py
+src/helm/benchmark/metrics/fin_qa_metrics.py
+src/helm/benchmark/metrics/fin_qa_metrics_helper.py
+src/helm/benchmark/metrics/gpt4v_originality_critique_metrics.py
 src/helm/benchmark/metrics/instruction_following_critique_metrics.py
 src/helm/benchmark/metrics/language_modeling_metrics.py
+src/helm/benchmark/metrics/live_qa_metrics.py
 src/helm/benchmark/metrics/machine_translation_metrics.py
+src/helm/benchmark/metrics/medication_qa_metrics.py
 src/helm/benchmark/metrics/metric.py
 src/helm/benchmark/metrics/metric_name.py
 src/helm/benchmark/metrics/metric_service.py
 src/helm/benchmark/metrics/numeracy_metrics.py
 src/helm/benchmark/metrics/paraphrase_generation_metrics.py
+src/helm/benchmark/metrics/prometheus_vision_critique_metrics.py
 src/helm/benchmark/metrics/ranking_metrics.py
 src/helm/benchmark/metrics/reference_metric.py
+src/helm/benchmark/metrics/reka_vibe_critique_metrics.py
 src/helm/benchmark/metrics/statistic.py
 src/helm/benchmark/metrics/summarization_critique_metrics.py
 src/helm/benchmark/metrics/summarization_metrics.py
@@ -207,11 +218,15 @@ src/helm/benchmark/presentation/table.py
 src/helm/benchmark/presentation/test_contamination.py
 src/helm/benchmark/presentation/test_create_plots.py
 src/helm/benchmark/presentation/test_run_entry.py
+src/helm/benchmark/presentation/test_schema.py
 src/helm/benchmark/presentation/test_summarize.py
 src/helm/benchmark/run_specs/__init__.py
+src/helm/benchmark/run_specs/air_bench_run_specs.py
 src/helm/benchmark/run_specs/classic_run_specs.py
 src/helm/benchmark/run_specs/cleva_run_specs.py
 src/helm/benchmark/run_specs/decodingtrust_run_specs.py
+src/helm/benchmark/run_specs/experimental_run_specs.py
+src/helm/benchmark/run_specs/finance_run_specs.py
 src/helm/benchmark/run_specs/heim_run_specs.py
 src/helm/benchmark/run_specs/instruction_following_run_specs.py
 src/helm/benchmark/run_specs/lite_run_specs.py
@@ -219,6 +234,7 @@ src/helm/benchmark/run_specs/simple_run_specs.py
 src/helm/benchmark/run_specs/unitxt_run_specs.py
 src/helm/benchmark/run_specs/vlm_run_specs.py
 src/helm/benchmark/scenarios/__init__.py
+src/helm/benchmark/scenarios/air_bench_scenario.py
 src/helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py
 src/helm/benchmark/scenarios/babi_qa_scenario.py
 src/helm/benchmark/scenarios/bbq_scenario.py
@@ -226,6 +242,7 @@ src/helm/benchmark/scenarios/big_bench_scenario.py
 src/helm/benchmark/scenarios/blimp_scenario.py
 src/helm/benchmark/scenarios/bold_scenario.py
 src/helm/benchmark/scenarios/boolq_scenario.py
+src/helm/benchmark/scenarios/ci_mcqa_scenario.py
 src/helm/benchmark/scenarios/civil_comments_scenario.py
 src/helm/benchmark/scenarios/cleva_scenario.py
 src/helm/benchmark/scenarios/code_scenario.py
@@ -249,6 +266,7 @@ src/helm/benchmark/scenarios/dyck_language_scenario.py
 src/helm/benchmark/scenarios/entity_data_imputation_scenario.py
 src/helm/benchmark/scenarios/entity_matching_scenario.py
 src/helm/benchmark/scenarios/entity_matching_scenario_fixed_random_state.py
+src/helm/benchmark/scenarios/fin_qa_scenario.py
 src/helm/benchmark/scenarios/grammar.py
 src/helm/benchmark/scenarios/grammar_scenario.py
 src/helm/benchmark/scenarios/gsm_scenario.py
@@ -292,6 +310,7 @@ src/helm/benchmark/scenarios/summarization_scenario.py
 src/helm/benchmark/scenarios/synthetic_efficiency_scenario.py
 src/helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py
 src/helm/benchmark/scenarios/synthetic_reasoning_scenario.py
+src/helm/benchmark/scenarios/test_air_bench_scenario.py
 src/helm/benchmark/scenarios/test_grammar.py
 src/helm/benchmark/scenarios/test_math_scenario.py
 src/helm/benchmark/scenarios/test_scenario.py
@@ -346,6 +365,7 @@ src/helm/benchmark/scenarios/vision_language/pairs_scenario.py
 src/helm/benchmark/scenarios/vision_language/pope_scenario.py
 src/helm/benchmark/scenarios/vision_language/seed_bench_scenario.py
 src/helm/benchmark/scenarios/vision_language/unicorn_scenario.py
+src/helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py
 src/helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py
 src/helm/benchmark/scenarios/vision_language/vqa_scenario.py
 src/helm/benchmark/scenarios/vision_language/image2structure/__init__.py
@@ -368,14 +388,19 @@ src/helm/benchmark/static/index.html
 src/helm/benchmark/static/info-icon.png
 src/helm/benchmark/static/json-urls.js
 src/helm/benchmark/static/plot-captions.js
+src/helm/benchmark/static/schema_air_bench.yaml
 src/helm/benchmark/static/schema_classic.yaml
+src/helm/benchmark/static/schema_finance.yaml
 src/helm/benchmark/static/schema_image2structure.yaml
 src/helm/benchmark/static/schema_instruction_following.yaml
 src/helm/benchmark/static/schema_lite.yaml
+src/helm/benchmark/static/schema_medical.yaml
 src/helm/benchmark/static/schema_mmlu.yaml
+src/helm/benchmark/static/schema_tables.yaml
+src/helm/benchmark/static/schema_thai.yaml
 src/helm/benchmark/static/schema_unitxt.yaml
+src/helm/benchmark/static/schema_vhelm.yaml
 src/helm/benchmark/static/schema_vhelm_lite.yaml
-src/helm/benchmark/static/schema_vlm.yaml
 src/helm/benchmark/static/utils.js
 src/helm/benchmark/static/images/crfm-logo.png
 src/helm/benchmark/static/images/helm-logo-simple.png
@@ -400,6 +425,7 @@ src/helm/benchmark/static_build/config.js
 src/helm/benchmark/static_build/index.html
 src/helm/benchmark/static_build/assets/01-694cb9b7.png
 src/helm/benchmark/static_build/assets/ai21-0eb91ec3.png
+src/helm/benchmark/static_build/assets/air-overview-d2e6c49f.png
 src/helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png
 src/helm/benchmark/static_build/assets/anthropic-70d8bc39.png
 src/helm/benchmark/static_build/assets/bigscience-7f0400c0.png
@@ -410,13 +436,15 @@ src/helm/benchmark/static_build/assets/google-06d997ad.png
 src/helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png
 src/helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png
 src/helm/benchmark/static_build/assets/helmhero-28e90f4d.png
-src/helm/benchmark/static_build/assets/index-737eef9e.js
-src/helm/benchmark/static_build/assets/index-878a1094.css
+src/helm/benchmark/static_build/assets/index-30dbceba.js
+src/helm/benchmark/static_build/assets/index-66b02d40.css
 src/helm/benchmark/static_build/assets/meta-5580e9f1.png
 src/helm/benchmark/static_build/assets/microsoft-f5ee5016.png
 src/helm/benchmark/static_build/assets/mistral-18e1be23.png
 src/helm/benchmark/static_build/assets/nvidia-86fa75c1.png
 src/helm/benchmark/static_build/assets/openai-3f8653e4.png
+src/helm/benchmark/static_build/assets/overview-74aea3d8.png
+src/helm/benchmark/static_build/assets/process-flow-bd2eba96.png
 src/helm/benchmark/static_build/assets/react-d4a0b69b.js
 src/helm/benchmark/static_build/assets/recharts-6d337683.js
 src/helm/benchmark/static_build/assets/tii-24de195c.png
@@ -489,6 +517,7 @@ src/helm/clients/open_lm_client.py
 src/helm/clients/openai_client.py
 src/helm/clients/palmyra_client.py
 src/helm/clients/perspective_api_client.py
+src/helm/clients/reka_client.py
 src/helm/clients/simple_client.py
 src/helm/clients/test_auto_client.py
 src/helm/clients/test_client.py
@@ -499,6 +528,7 @@ src/helm/clients/together_client.py
 src/helm/clients/toxicity_classifier_client.py
 src/helm/clients/vertexai_client.py
 src/helm/clients/vllm_client.py
+src/helm/clients/yi_client.py
 src/helm/clients/clip_scorers/__init__.py
 src/helm/clients/clip_scorers/base_clip_scorer.py
 src/helm/clients/clip_scorers/clip_scorer.py
@@ -560,6 +590,8 @@ src/helm/clients/vision_language/huggingface_vision2seq_client.py
 src/helm/clients/vision_language/huggingface_vlm_client.py
 src/helm/clients/vision_language/idefics_client.py
 src/helm/clients/vision_language/open_flamingo_client.py
+src/helm/clients/vision_language/paligemma_client.py
+src/helm/clients/vision_language/palmyra_vision_client.py
 src/helm/clients/vision_language/qwen_vlm_client.py
 src/helm/clients/vision_language/open_flamingo/__init__.py
 src/helm/clients/vision_language/open_flamingo/src/__init__.py
@@ -647,6 +679,7 @@ src/helm/tokenizers/ice_tokenizer.py
 src/helm/tokenizers/lit_gpt_tokenizer.py
 src/helm/tokenizers/simple_tokenizer.py
 src/helm/tokenizers/test_anthropic_tokenizer.py
+src/helm/tokenizers/test_cohere_tokenizer.py
 src/helm/tokenizers/test_huggingface_tokenizer.py
 src/helm/tokenizers/test_ice_tokenizer.py
 src/helm/tokenizers/test_simple_tokenizer.py

{crfm_helm-0.5.1 → crfm_helm-0.5.2}/src/crfm_helm.egg-info/requires.txt RENAMED Viewed

@@ -62,6 +62,9 @@ jieba==0.42.1
 opencc==1.1.6
 langdetect==1.0.9
+[cohere]
+cohere~=5.3
 [decodingtrust]
 fairlearn~=0.9.0
@@ -73,7 +76,7 @@ mypy==1.5.1
 flake8==5.0.4
 [google]
-google-cloud-aiplatform~=1.44
+google-cloud-aiplatform~=1.48
 [heim]
 gdown~=4.4.0
@@ -134,21 +137,27 @@ crfm-helm[aleph-alpha]
 crfm-helm[allenai]
 crfm-helm[amazon]
 crfm-helm[anthropic]
+crfm-helm[cohere]
 crfm-helm[google]
 crfm-helm[mistral]
 crfm-helm[openai]
+crfm-helm[reka]
 crfm-helm[together]
 crfm-helm[tsinghua]
 crfm-helm[yandex]
+crfm-helm[openvino]
 [mongo]
 pymongo~=4.2
 [openai]
 openai~=1.0
-tiktoken~=0.3.3
+tiktoken~=0.7
 pydantic~=2.0
+[openvino]
+optimum[openvino]~=1.19
 [plots]
 colorcet~=3.0.1
 matplotlib~=3.6.0
@@ -157,6 +166,9 @@ seaborn~=0.11.0
 [proxy-server]
 gunicorn~=20.1.0
+[reka]
+reka-api~=2.0.0
 [scenarios]
 gdown~=4.4.0
 sympy~=1.11.1
@@ -186,6 +198,7 @@ torch~=2.1.2
 transformers_stream_generator~=0.0.4
 scipy~=1.10
 torchvision<3.0.0,>=0.14.1
+crfm-helm[reka]
 crfm-helm[images]
 crfm-helm[image2structure]
 pycocoevalcap~=1.2

crfm_helm-0.5.2/src/helm/benchmark/adaptation/adapter_spec.py ADDED Viewed

@@ -0,0 +1,129 @@
+from dataclasses import dataclass, field
+from typing import List, Optional
+from helm.common.image_generation_parameters import ImageGenerationParameters
+# Adaptation methods
+ADAPT_GENERATION: str = "generation"
+ADAPT_LANGUAGE_MODELING: str = "language_modeling"
+ADAPT_MULTIPLE_CHOICE_JOINT: str = "multiple_choice_joint"
+ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL: str = "multiple_choice_separate_original"
+ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED: str = "multiple_choice_separate_calibrated"
+ADAPT_RANKING_BINARY: str = "ranking_binary"
+ADAPT_MULTIPLE_CHOICE_SEPARATE_METHODS: List[str] = [
+    ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL,
+    ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED,
+]
+# Multimodal adaptation methods
+ADAPT_GENERATION_MULTIMODAL: str = "generation_multimodal"
+ADAPT_MULTIPLE_CHOICE_JOINT_MULTIMODAL: str = "multiple_choice_joint_multimodal"
+@dataclass(frozen=True)
+class Substitution:
+    """Represents a regular expression search/replace."""
+    source: str
+    target: str
+@dataclass(frozen=True)
+class AdapterSpec:
+    """
+    Specifies how to take a `Scenario` (a list of `Instance`s) and produce a
+    `ScenarioState` (set of `Request`s ). Instead of having free-form prompt
+    hacking, we try to make the process more declarative and systematic.
+    Note that an `Instance` could produce many `Request`s (e.g., one for each `Reference`).
+    """
+    method: str = ""
+    """The high-level strategy for converting instances into a prompt for the language model."""
+    global_prefix: str = ""
+    """The string that is prepended to the entire prompt."""
+    global_suffix: str = ""
+    """The string that is appended to the entire prompt."""
+    instructions: str = ""
+    """The description of the task that is included at the very beginning of the prompt."""
+    input_prefix: str = "Input: "
+    """The string that is included before each input (e.g., 'Question:')."""
+    input_suffix: str = "\n"
+    """The string that is included after each input (e.g., '\\n')."""
+    reference_prefix: str = "A. "
+    """The string that is included before each reference (for multiple-choice questions)."""
+    reference_suffix: str = "\n"
+    """The string that is included after each reference (for multiple-choice questions)."""
+    output_prefix: str = "Output: "
+    """The string that is included before the correct answer/predicted output (e.g., 'Answer:')."""
+    output_suffix: str = "\n"
+    """The string that is included after the correct answer/predicted output (e.g., '\\n')."""
+    instance_prefix: str = "\n"
+    """The string that is included before each instance (e.g., '\\n\\n')."""
+    substitutions: List[Substitution] = field(default_factory=list, hash=False)
+    """A list of regular expression substitutions (e.g., replacing '\\n' with ';\\n')
+    to perform at the very end on the prompt."""
+    max_train_instances: int = 5
+    """Maximum number of training instances to include in the prompt (currently by randomly sampling)."""
+    max_eval_instances: Optional[int] = None
+    """Maximum number of instances to evaluate on (over all splits - test, valid, etc.)."""
+    num_outputs: int = 5
+    """Maximum number of possible outputs to generate by sampling multiple outputs."""
+    num_train_trials: int = 1
+    """Number of trials, where in each trial we choose an independent, random set of training instances.
+    Used to compute variance."""
+    num_trials: int = 1
+    """Number of trials, where we query the model with the same requests, but different random seeds."""
+    sample_train: bool = True
+    """If true, randomly sample N training examples; if false, select N consecutive training examples"""
+    # Decoding parameters (inherited by `Request`)
+    model_deployment: str = ""
+    """Name of the language model deployment (<host_organization>/<model name>) to send requests to."""
+    model: str = ""
+    """Name of the language model (<creator_organization>/<model name>) to send requests to."""
+    temperature: float = 1
+    """Temperature parameter used in generation."""
+    max_tokens: int = 100
+    """Maximum number of tokens to generate."""
+    # Set hash=False to make `AdapterSpec` hashable
+    stop_sequences: List[str] = field(default_factory=list, hash=False)
+    """List of stop sequences. Output generation will be stopped if any stop sequence is encountered."""
+    # Random string (used concretely to bypass cache / see diverse results)
+    random: Optional[str] = None
+    """Random seed (string), which guarantees reproducibility."""
+    multi_label: bool = False
+    """If true, for instances with multiple correct reference, the gold answer should be considered to be all
+    of the correct references rather than any of the correct references."""
+    image_generation_parameters: Optional[ImageGenerationParameters] = None
+    """Parameters for image generation."""
+    # Set hash=False to make `AdapterSpec` hashable
+    eval_splits: Optional[List[str]] = field(default=None, hash=False)
+    """The splits from which evaluation instances will be drawn."""

crfm_helm-0.5.2/src/helm/benchmark/annotation/air_bench_annotator.py ADDED Viewed

@@ -0,0 +1,64 @@
+import datasets
+import os
+import re
+from typing import Any
+from helm.common.general import ensure_directory_exists
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.annotation.annotator import Annotator
+from helm.clients.auto_client import AutoClient
+from helm.common.request import Request
+class AIRBench2024Annotator(Annotator):
+    """The AIRBench 2024 autograder."""
+    name = "air_bench_2024"
+    def __init__(self, auto_client: AutoClient, file_storage_path: str):
+        self._auto_client = auto_client
+        cache_dir = os.path.join(file_storage_path, "data")
+        ensure_directory_exists(cache_dir)
+        dataset = datasets.load_dataset(
+            "stanford-crfm/air-bench-2024", "judge_prompts", split="test", cache_dir=cache_dir
+        )
+        self._category_id_to_judge_prompt = {row["cate-idx"]: row["judge_prompt"] for row in dataset}
+        # Regex pattern is lenient to allow for typos e.g. extra whitespace
+        self._pattern = re.compile("##\s*short_reasoning\s*:(.*)##\s*the_score\s*:(.*)", re.DOTALL)
+    def annotate(self, request_state: RequestState) -> Any:
+        assert request_state.result
+        assert len(request_state.result.completions) == 1
+        model_input_text = request_state.request.prompt
+        model_output_text = request_state.result.completions[0].text
+        if not model_output_text.strip():
+            return {"prompt_text": "", "reasoning": "BLOCKED_REQUEST_OR_EMPTY_RESPONSE", "score": 0.0}
+        category_id = request_state.instance.references[0].output.text
+        prompt_template = self._category_id_to_judge_prompt[category_id]
+        # Strip to deal with incorrectly formatted input CSV.
+        # TODO: Stop stripping after CSV is fixed.
+        annotator_prompt = prompt_template.replace("{{QUESTION}}", model_input_text).replace(
+            "{{ANSWER}}", model_output_text
+        )
+        annotator_request = Request(
+            model="openai/gpt-4o-2024-05-13",
+            model_deployment="openai/gpt-4o-2024-05-13",
+            prompt=annotator_prompt,
+            temperature=0.0,
+            max_tokens=64,
+        )
+        annotator_response = self._auto_client.make_request(annotator_request)
+        if not annotator_response.success:
+            raise Exception(f"Annotation request failed: {annotator_response.error}")
+        assert len(annotator_response.completions) == 1
+        annotator_response_text = annotator_response.completions[0].text
+        annotator_response_parts = self._pattern.search(annotator_response_text)
+        if not annotator_response_parts:
+            raise Exception(f"Malformed annotator response: {annotator_response_text}")
+        reasoning = annotator_response_parts[1].strip()
+        try:
+            score = float(annotator_response_parts[2].strip())
+        except ValueError as e:
+            raise Exception(f"Malformed annotator response: {annotator_response_text}") from e
+        return {"prompt_text": annotator_prompt, "reasoning": reasoning, "score": score}

{crfm_helm-0.5.1 → crfm_helm-0.5.2}/src/helm/benchmark/annotation/annotator_factory.py RENAMED Viewed

@@ -1,6 +1,7 @@
 import os
 from typing import Any, Dict, Mapping, Optional
+from helm.clients.auto_client import AutoClient
 from helm.common.credentials_utils import provide_api_key
 from helm.common.cache_backend_config import CacheBackendConfig, CacheConfig
 from helm.common.hierarchical_logger import hlog
@@ -46,6 +47,11 @@ class AnnotatorFactory:
             provider_bindings={
                 "api_key": lambda: provide_api_key(self.credentials, annotator_name),
                 "file_storage_path": lambda: self._get_file_storage_path(annotator_name),
+                "auto_client": lambda: AutoClient(
+                    credentials=self.credentials,
+                    file_storage_path=self.file_storage_path,
+                    cache_backend_config=self.cache_backend_config,
+                ),
             },
         )
         annotator = create_object(annotator_spec)

crfm-helm 0.5.1__tar.gz → 0.5.2__tar.gz

crfm-helm 0.5.1__tar.gz → 0.5.2__tar.gz