crfm-helm 0.5.1__py3-none-any.whl → 0.5.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of crfm-helm might be problematic.
- {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/METADATA +41 -57
- {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/RECORD +197 -152
- {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +32 -31
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +12 -5
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +12 -12
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +8 -8
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +77 -9
- helm/benchmark/adaptation/common_adapter_specs.py +2 -0
- helm/benchmark/annotation/air_bench_annotator.py +64 -0
- helm/benchmark/annotation/annotator_factory.py +6 -0
- helm/benchmark/annotation/anthropic_red_team_annotator.py +70 -0
- helm/benchmark/annotation/call_center_annotator.py +247 -0
- helm/benchmark/annotation/financebench_annotator.py +79 -0
- helm/benchmark/annotation/harm_bench_annotator.py +68 -0
- helm/benchmark/annotation/{image2structure → image2struct}/latex_compiler_annotator.py +2 -2
- helm/benchmark/annotation/{image2structure → image2struct}/lilypond_compiler_annotator.py +5 -3
- helm/benchmark/annotation/{image2structure → image2struct}/webpage_compiler_annotator.py +5 -5
- helm/benchmark/annotation/live_qa_annotator.py +71 -0
- helm/benchmark/annotation/medication_qa_annotator.py +68 -0
- helm/benchmark/annotation/model_as_judge.py +45 -0
- helm/benchmark/annotation/simple_safety_tests_annotator.py +64 -0
- helm/benchmark/annotation/xstest_annotator.py +110 -0
- helm/benchmark/augmentations/translate_perturbation.py +1 -0
- helm/benchmark/huggingface_registration.py +16 -6
- helm/benchmark/metrics/air_bench_metrics.py +56 -0
- helm/benchmark/metrics/annotation_metrics.py +108 -0
- helm/benchmark/metrics/bhasa_metrics.py +188 -0
- helm/benchmark/metrics/bhasa_metrics_specs.py +10 -0
- helm/benchmark/metrics/code_metrics_helper.py +11 -1
- helm/benchmark/metrics/fin_qa_metrics.py +60 -0
- helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
- helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
- helm/benchmark/metrics/instruction_following_critique_metrics.py +1 -0
- helm/benchmark/metrics/live_qa_metrics.py +23 -0
- helm/benchmark/metrics/medication_qa_metrics.py +23 -0
- helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
- helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
- helm/benchmark/metrics/safety_metrics.py +57 -0
- helm/benchmark/metrics/summac/model_summac.py +3 -3
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -2
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +4 -4
- helm/benchmark/metrics/unitxt_metrics.py +20 -10
- helm/benchmark/metrics/vision_language/emd_utils.py +4 -0
- helm/benchmark/metrics/vision_language/image_metrics.py +30 -72
- helm/benchmark/metrics/vision_language/image_utils.py +1 -1
- helm/benchmark/model_metadata_registry.py +3 -3
- helm/benchmark/presentation/schema.py +54 -4
- helm/benchmark/presentation/test_run_entry.py +1 -0
- helm/benchmark/presentation/test_schema.py +11 -0
- helm/benchmark/run.py +31 -2
- helm/benchmark/run_expander.py +113 -10
- helm/benchmark/run_spec_factory.py +4 -0
- helm/benchmark/run_specs/air_bench_run_specs.py +40 -0
- helm/benchmark/run_specs/bhasa_run_specs.py +638 -0
- helm/benchmark/run_specs/call_center_run_specs.py +152 -0
- helm/benchmark/run_specs/classic_run_specs.py +15 -11
- helm/benchmark/run_specs/decodingtrust_run_specs.py +11 -9
- helm/benchmark/run_specs/experimental_run_specs.py +85 -0
- helm/benchmark/run_specs/finance_run_specs.py +110 -0
- helm/benchmark/run_specs/safety_run_specs.py +154 -0
- helm/benchmark/run_specs/vlm_run_specs.py +251 -57
- helm/benchmark/scenarios/air_bench_scenario.py +50 -0
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +71 -0
- helm/benchmark/scenarios/banking77_scenario.py +51 -0
- helm/benchmark/scenarios/bhasa_scenario.py +1798 -0
- helm/benchmark/scenarios/call_center_scenario.py +84 -0
- helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +2 -1
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +8 -2
- helm/benchmark/scenarios/ewok_scenario.py +116 -0
- helm/benchmark/scenarios/fin_qa_scenario.py +119 -0
- helm/benchmark/scenarios/financebench_scenario.py +53 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +59 -0
- helm/benchmark/scenarios/scenario.py +1 -1
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +33 -0
- helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
- helm/benchmark/scenarios/test_commonsense_scenario.py +21 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +25 -0
- helm/benchmark/scenarios/test_financebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_gsm_scenario.py +31 -0
- helm/benchmark/scenarios/test_legalbench_scenario.py +30 -0
- helm/benchmark/scenarios/test_math_scenario.py +2 -8
- helm/benchmark/scenarios/test_med_qa_scenario.py +30 -0
- helm/benchmark/scenarios/test_mmlu_scenario.py +33 -0
- helm/benchmark/scenarios/test_narrativeqa_scenario.py +73 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +4 -4
- helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/bingo_scenario.py +5 -5
- helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +2 -1
- helm/benchmark/scenarios/vision_language/exams_v_scenario.py +104 -0
- helm/benchmark/scenarios/vision_language/fair_face_scenario.py +136 -0
- helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/gqa_scenario.py +2 -2
- helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/chart2csv_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure/image2structure_scenario.py → image2struct/image2struct_scenario.py} +13 -2
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/latex_scenario.py +3 -7
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/musicsheet_scenario.py +1 -5
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/utils_latex.py +31 -39
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/driver.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/utils.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage_scenario.py +44 -13
- helm/benchmark/scenarios/vision_language/math_vista_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/mementos_scenario.py +3 -3
- helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +2 -2
- helm/benchmark/scenarios/vision_language/mme_scenario.py +21 -18
- helm/benchmark/scenarios/vision_language/mmmu_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/pairs_scenario.py +7 -6
- helm/benchmark/scenarios/vision_language/pope_scenario.py +2 -1
- helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py +57 -0
- helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +7 -5
- helm/benchmark/scenarios/vision_language/unicorn_scenario.py +5 -5
- helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +98 -0
- helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/vqa_scenario.py +3 -1
- helm/benchmark/scenarios/xstest_scenario.py +35 -0
- helm/benchmark/server.py +1 -6
- helm/benchmark/static/schema_air_bench.yaml +3149 -0
- helm/benchmark/static/schema_bhasa.yaml +709 -0
- helm/benchmark/static/schema_call_center.yaml +232 -0
- helm/benchmark/static/schema_classic.yaml +3 -59
- helm/benchmark/static/schema_cleva.yaml +768 -0
- helm/benchmark/static/schema_decodingtrust.yaml +444 -0
- helm/benchmark/static/schema_ewok.yaml +367 -0
- helm/benchmark/static/schema_finance.yaml +189 -0
- helm/benchmark/static/schema_image2struct.yaml +588 -0
- helm/benchmark/static/schema_instruction_following.yaml +3 -52
- helm/benchmark/static/schema_lite.yaml +3 -61
- helm/benchmark/static/schema_medical.yaml +255 -0
- helm/benchmark/static/schema_mmlu.yaml +3 -61
- helm/benchmark/static/schema_safety.yaml +247 -0
- helm/benchmark/static/schema_tables.yaml +317 -0
- helm/benchmark/static/schema_thai.yaml +244 -0
- helm/benchmark/static/schema_unitxt.yaml +3 -61
- helm/benchmark/static/{schema_vlm.yaml → schema_vhelm.yaml} +304 -298
- helm/benchmark/static/schema_vhelm_lite.yaml +4 -59
- helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
- helm/benchmark/static_build/assets/air-overview-d2e6c49f.png +0 -0
- helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
- helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
- helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
- helm/benchmark/static_build/assets/index-05c76bb1.css +1 -0
- helm/benchmark/static_build/assets/index-58f97dcd.js +10 -0
- helm/benchmark/static_build/assets/overview-74aea3d8.png +0 -0
- helm/benchmark/static_build/assets/process-flow-bd2eba96.png +0 -0
- helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
- helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
- helm/benchmark/static_build/index.html +2 -2
- helm/benchmark/window_services/test_openai_window_service.py +8 -8
- helm/clients/ai21_client.py +71 -1
- helm/clients/anthropic_client.py +50 -28
- helm/clients/auto_client.py +11 -0
- helm/clients/client.py +24 -7
- helm/clients/cohere_client.py +98 -3
- helm/clients/huggingface_client.py +79 -19
- helm/clients/nvidia_nim_client.py +35 -0
- helm/clients/openai_client.py +11 -5
- helm/clients/palmyra_client.py +25 -0
- helm/clients/perspective_api_client.py +11 -6
- helm/clients/reka_client.py +189 -0
- helm/clients/test_client.py +7 -9
- helm/clients/test_huggingface_client.py +19 -3
- helm/clients/test_together_client.py +72 -2
- helm/clients/together_client.py +129 -23
- helm/clients/vertexai_client.py +62 -18
- helm/clients/vision_language/huggingface_vlm_client.py +1 -0
- helm/clients/vision_language/open_flamingo_client.py +1 -2
- helm/clients/vision_language/paligemma_client.py +146 -0
- helm/clients/vision_language/palmyra_vision_client.py +99 -0
- helm/clients/yi_client.py +31 -0
- helm/common/critique_request.py +10 -1
- helm/common/images_utils.py +25 -0
- helm/common/mongo_key_value_store.py +2 -1
- helm/common/request.py +16 -0
- helm/config/model_deployments.yaml +740 -363
- helm/config/model_metadata.yaml +824 -128
- helm/config/tokenizer_configs.yaml +207 -10
- helm/proxy/critique/model_critique_client.py +32 -4
- helm/proxy/example_queries.py +14 -21
- helm/proxy/services/server_service.py +2 -3
- helm/proxy/token_counters/test_auto_token_counter.py +2 -2
- helm/tokenizers/ai21_tokenizer.py +51 -59
- helm/tokenizers/auto_tokenizer.py +1 -1
- helm/tokenizers/cohere_tokenizer.py +29 -62
- helm/tokenizers/huggingface_tokenizer.py +35 -13
- helm/tokenizers/test_ai21_tokenizer.py +48 -0
- helm/tokenizers/test_cohere_tokenizer.py +39 -0
- helm/tokenizers/test_huggingface_tokenizer.py +5 -1
- helm/benchmark/static/benchmarking.css +0 -156
- helm/benchmark/static/benchmarking.js +0 -1705
- helm/benchmark/static/config.js +0 -3
- helm/benchmark/static/general.js +0 -122
- helm/benchmark/static/images/crfm-logo.png +0 -0
- helm/benchmark/static/images/helm-logo-simple.png +0 -0
- helm/benchmark/static/images/helm-logo.png +0 -0
- helm/benchmark/static/images/language-model-helm.png +0 -0
- helm/benchmark/static/images/organizations/ai21.png +0 -0
- helm/benchmark/static/images/organizations/anthropic.png +0 -0
- helm/benchmark/static/images/organizations/bigscience.png +0 -0
- helm/benchmark/static/images/organizations/cohere.png +0 -0
- helm/benchmark/static/images/organizations/eleutherai.png +0 -0
- helm/benchmark/static/images/organizations/google.png +0 -0
- helm/benchmark/static/images/organizations/meta.png +0 -0
- helm/benchmark/static/images/organizations/microsoft.png +0 -0
- helm/benchmark/static/images/organizations/nvidia.png +0 -0
- helm/benchmark/static/images/organizations/openai.png +0 -0
- helm/benchmark/static/images/organizations/together.png +0 -0
- helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
- helm/benchmark/static/images/organizations/yandex.png +0 -0
- helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
- helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
- helm/benchmark/static/index.html +0 -68
- helm/benchmark/static/info-icon.png +0 -0
- helm/benchmark/static/json-urls.js +0 -69
- helm/benchmark/static/plot-captions.js +0 -27
- helm/benchmark/static/schema_image2structure.yaml +0 -304
- helm/benchmark/static/utils.js +0 -285
- helm/benchmark/static_build/assets/index-737eef9e.js +0 -10
- helm/benchmark/static_build/assets/index-878a1094.css +0 -1
- helm/benchmark/window_services/ai21_window_service.py +0 -247
- helm/benchmark/window_services/cohere_window_service.py +0 -101
- helm/benchmark/window_services/test_ai21_window_service.py +0 -163
- helm/benchmark/window_services/test_cohere_window_service.py +0 -75
- helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -8328
- helm/benchmark/window_services/test_ice_window_service.py +0 -327
- helm/tokenizers/ice_tokenizer.py +0 -30
- helm/tokenizers/test_ice_tokenizer.py +0 -57
- {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/LICENSE +0 -0
- {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.3.dist-info}/top_level.txt +0 -0
- /helm/benchmark/annotation/{image2structure → image2struct}/__init__.py +0 -0
- /helm/benchmark/annotation/{image2structure → image2struct}/image_compiler_annotator.py +0 -0
- /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/__init__.py +0 -0
- /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/__init__.py +0 -0
- /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/jekyll_server.py +0 -0
helm/config/tokenizer_configs.yaml
CHANGED

@@ -17,11 +17,31 @@ tokenizer_configs:
     prefix_token: "<s>"
 
   # AI21
-  - name: ai21/
+  - name: ai21/j2-tokenizer
     tokenizer_spec:
-      class_name: "helm.tokenizers.ai21_tokenizer.
-    end_of_text_token: "
-    prefix_token: ""
+      class_name: "helm.tokenizers.ai21_tokenizer.AI21LocalTokenizer"
+    end_of_text_token: "<|endoftext|>"
+    prefix_token: "<|startoftext|>"
+  - name: ai21/jamba-tokenizer
+    tokenizer_spec:
+      class_name: "helm.tokenizers.ai21_tokenizer.AI21LocalTokenizer"
+    end_of_text_token: "<|endoftext|>"
+    prefix_token: "<|startoftext|>"
+  - name: ai21/jamba-instruct-tokenizer
+    tokenizer_spec:
+      class_name: "helm.tokenizers.ai21_tokenizer.AI21LocalTokenizer"
+    end_of_text_token: "<|endoftext|>"
+    prefix_token: "<|startoftext|>"
+  - name: ai21/jamba-1.5-mini-tokenizer
+    tokenizer_spec:
+      class_name: "helm.tokenizers.ai21_tokenizer.AI21LocalTokenizer"
+    end_of_text_token: "<|endoftext|>"
+    prefix_token: "<|startoftext|>"
+  - name: ai21/jamba-1.5-large-tokenizer
+    tokenizer_spec:
+      class_name: "helm.tokenizers.ai21_tokenizer.AI21LocalTokenizer"
+    end_of_text_token: "<|endoftext|>"
+    prefix_token: "<|startoftext|>"
 
   # AlephAlpha
   - name: AlephAlpha/luminous-base

@@ -45,6 +65,24 @@ tokenizer_configs:
     end_of_text_token: ""
     prefix_token: ""
 
+  # Alibaba DAMO Academy
+
+  - name: damo/seallm-7b-v2
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+      args:
+        pretrained_model_name_or_path: SeaLLMs/SeaLLM-7B-v2
+    end_of_text_token: "</s>"
+    prefix_token: "<s>"
+
+  - name: damo/seallm-7b-v2.5
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+      args:
+        pretrained_model_name_or_path: SeaLLMs/SeaLLM-7B-v2.5
+    end_of_text_token: "<eos>"
+    prefix_token: "<bos>"
+
   # Anthropic
   - name: anthropic/claude
     tokenizer_spec:

@@ -69,7 +107,7 @@ tokenizer_configs:
     tokenizer_spec:
       class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
     end_of_text_token: "</s>"
-    prefix_token: "
+    prefix_token: "<s>"
   - name: bigscience/T0pp
     tokenizer_spec:
       class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"

@@ -77,11 +115,45 @@ tokenizer_configs:
     prefix_token: ""
 
   # Cohere
-  - name: cohere/
+  - name: cohere/command
     tokenizer_spec:
-      class_name: "helm.tokenizers.cohere_tokenizer.
-    end_of_text_token: ""
-    prefix_token: "
+      class_name: "helm.tokenizers.cohere_tokenizer.CohereLocalTokenizer"
+    end_of_text_token: "<EOS_TOKEN>"
+    prefix_token: "<BOS_TOKEN>"
+
+  - name: cohere/command-light
+    tokenizer_spec:
+      class_name: "helm.tokenizers.cohere_tokenizer.CohereLocalTokenizer"
+    end_of_text_token: "<EOS_TOKEN>"
+    prefix_token: "<BOS_TOKEN>"
+
+  - name: cohere/command-r
+    tokenizer_spec:
+      class_name: "helm.tokenizers.cohere_tokenizer.CohereLocalTokenizer"
+    end_of_text_token: "<EOS_TOKEN>"
+    prefix_token: "<BOS_TOKEN>"
+
+  - name: cohere/command-r-plus
+    tokenizer_spec:
+      class_name: "helm.tokenizers.cohere_tokenizer.CohereLocalTokenizer"
+    end_of_text_token: "<EOS_TOKEN>"
+    prefix_token: "<BOS_TOKEN>"
+
+  - name: cohere/c4ai-command-r-v01
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+      args:
+        pretrained_model_name_or_path: CohereForAI/c4ai-command-r-v01
+    end_of_text_token: "<EOS_TOKEN>"
+    prefix_token: "<BOS_TOKEN>"
+
+  - name: cohere/c4ai-command-r-plus
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+      args:
+        pretrained_model_name_or_path: CohereForAI/c4ai-command-r-plus
+    end_of_text_token: "<EOS_TOKEN>"
+    prefix_token: "<BOS_TOKEN>"
 
   # Databricks
   - name: databricks/dbrx-instruct

@@ -159,6 +231,11 @@ tokenizer_configs:
       class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
     end_of_text_token: "<eos>"
     prefix_token: "<bos>"
+  - name: google/gemma-2-9b
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+    end_of_text_token: "<eos>"
+    prefix_token: "<bos>"
 
   # Hf-internal-testing
 

@@ -240,6 +317,14 @@ tokenizer_configs:
     prefix_token: "<|begin_of_text|>"
     end_of_text_token: "<|end_of_text|>"
 
+  - name: meta/llama-3.1-8b
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+      args:
+        pretrained_model_name_or_path: meta-llama/Meta-Llama-3.1-8B-Instruct
+    prefix_token: "<|begin_of_text|>"
+    end_of_text_token: "<|end_of_text|>"
+
   # 01-ai
   - name: 01-ai/Yi-6B
     tokenizer_spec:

@@ -247,6 +332,17 @@ tokenizer_configs:
     end_of_text_token: "</s>"
     prefix_token: "<s>"
 
+  # AI Singapore
+  - name: aisingapore/sea-lion-7b
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+      args:
+        trust_remote_code: true
+        use_fast: false
+    end_of_text_token: "<|endoftext|>"
+    prefix_token: ""
+
+
 
   # Allen Institute for AI
   # The allenai/olmo-7b requires Python 3.9 or newer.

@@ -259,6 +355,12 @@ tokenizer_configs:
     end_of_text_token: "<|endoftext|>"
     prefix_token: ""
 
+  - name: allenai/OLMo-1.7-7B-hf
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+    end_of_text_token: "<|endoftext|>"
+    prefix_token: ""
+
 
   # Microsoft
   - name: microsoft/phi-2

@@ -267,6 +369,20 @@ tokenizer_configs:
     end_of_text_token: "<|endoftext|>"
     prefix_token: "<|endoftext|>"
 
+  - name: microsoft/phi-3-small-8k-instruct
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+      args:
+        trust_remote_code: true
+    end_of_text_token: "<|endoftext|>"
+    prefix_token: "<|endoftext|>"
+
+  - name: microsoft/phi-3-medium-4k-instruct
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+    end_of_text_token: "<|endoftext|>"
+    prefix_token: "<s>"
+
   # Mistralai
   - name: mistralai/Mistral-7B-v0.1
     tokenizer_spec:

@@ -274,6 +390,36 @@ tokenizer_configs:
     end_of_text_token: "</s>"
     prefix_token: "<s>"
 
+  - name: mistralai/Mistral-7B-Instruct-v0.1
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+    end_of_text_token: "</s>"
+    prefix_token: "<s>"
+
+  - name: mistralai/Mistral-7B-Instruct-v0.2
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+    end_of_text_token: "</s>"
+    prefix_token: "<s>"
+
+  - name: mistralai/Mistral-7B-Instruct-v0.3
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+    end_of_text_token: "</s>"
+    prefix_token: "<s>"
+
+  - name: mistralai/Mistral-Nemo-Base-2407
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+    end_of_text_token: "</s>"
+    prefix_token: "<s>"
+
+  - name: mistralai/Mistral-Large-Instruct-2407
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+    end_of_text_token: "</s>"
+    prefix_token: "<s>"
+
   # Neurips
   - name: neurips/local
     tokenizer_spec:

@@ -281,19 +427,43 @@ tokenizer_configs:
     end_of_text_token: "<|endoftext|>"
     prefix_token: "<|endoftext|>"
 
-  #
+  # NVIDIA
+  - name: nvidia/nemotron-4-340b-instruct
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+      args:
+        pretrained_model_name_or_path: Xenova/Nemotron-4-340B-Instruct-Tokenizer
+        revision: b7aa0de92cda9f9e722d58d6ca90f46ae17d4701
+    end_of_text_token: "<|endoftext|>"
+    prefix_token: "<|endoftext|>"
+
+  # OpenAI
   - name: openai/cl100k_base
     tokenizer_spec:
       class_name: "helm.tokenizers.tiktoken_tokenizer.TiktokenTokenizer"
     end_of_text_token: "<|endoftext|>"
     prefix_token: "<|endoftext|>"
 
+  - name: openai/o200k_base
+    tokenizer_spec:
+      class_name: "helm.tokenizers.tiktoken_tokenizer.TiktokenTokenizer"
+    end_of_text_token: "<|endoftext|>"
+    prefix_token: "<|endoftext|>"
+
   - name: openai/clip-vit-large-patch14
     tokenizer_spec:
       class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
     end_of_text_token: ""
     prefix_token: ""
 
+  # OpenThaiGPT
+  - name: openthaigpt/openthaigpt-1.0.0-7b-chat
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+    end_of_text_token: "</s>"
+    prefix_token: "<s>"
+
+  # Qwen
   - name: qwen/qwen-7b
     tokenizer_spec:
       class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"

@@ -311,6 +481,14 @@ tokenizer_configs:
     end_of_text_token: "<|endoftext|>"
     prefix_token: ""
 
+  - name: qwen/qwen2-72b-instruct
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+      args:
+        pretrained_model_name_or_path: Qwen/Qwen2-72B-Instruct
+    end_of_text_token: "<|im_end|>"
+    prefix_token: "<|im_start|>"
+
   - name: qwen/qwen-vl
     tokenizer_spec:
       class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"

@@ -331,10 +509,22 @@ tokenizer_configs:
     end_of_text_token: "<|endoftext|>"
     prefix_token: ""
 
+  # SambaLingo
+  - name: sambanova/sambalingo-thai-base
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+      args:
+        pretrained_model_name_or_path: sambanovasystems/SambaLingo-Thai-Base
+    end_of_text_token: "</s>"
+    prefix_token: "<s>"
+
   # Snowflake
   - name: snowflake/snowflake-arctic-instruct
     tokenizer_spec:
       class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+      args:
+        pretrained_model_name_or_path: Snowflake/snowflake-arctic-instruct
+        trust_remote_code: true
     end_of_text_token: "<|im_end|>"
     prefix_token: "<|im_start|>"
 

@@ -352,6 +542,13 @@ tokenizer_configs:
     end_of_text_token: "</s>"
     prefix_token: ""
 
+  # Typhoon
+  - name: scb10x/typhoon-7b
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+    end_of_text_token: "</s>"
+    prefix_token: "<s>"
+
   # Writer
   - name: writer/gpt2
     tokenizer_spec:
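Every entry added above follows the same shape: a tokenizer name, a `tokenizer_spec` whose `class_name` is a fully qualified Python class, and optional `args` forwarded to that class's constructor. A minimal sketch of how such an entry can be resolved, assuming a generic loader (`instantiate_from_spec` and `extra_kwargs` are illustrative names, not HELM's actual API):

```python
import importlib
from typing import Any, Dict


def instantiate_from_spec(spec: Dict[str, Any], **extra_kwargs: Any) -> Any:
    """Resolve `class_name` ("pkg.module.Class") and construct the class.

    YAML `args` become constructor keyword arguments; `extra_kwargs` stands in
    for injected bindings such as `cache_config` (hypothetical here).
    """
    module_name, class_name = spec["class_name"].rsplit(".", 1)
    cls = getattr(importlib.import_module(module_name), class_name)
    return cls(**spec.get("args", {}), **extra_kwargs)
```

This is why entries like `damo/seallm-7b-v2` need only data, not code: the `args` mapping is passed straight through to `HuggingFaceTokenizer`.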
helm/proxy/critique/model_critique_client.py
CHANGED

@@ -15,6 +15,7 @@ from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.common.request import Request, RequestResult, GeneratedOutput
 from helm.clients.client import Client
 from helm.proxy.critique.critique_client import CritiqueClient
+from helm.common.media_object import MultimediaObject, MediaObject
 
 
 class CritiqueParseError(Exception):

@@ -24,6 +25,8 @@ class CritiqueParseError(Exception):
 class ModelCritiqueClient(CritiqueClient):
     """A CritiqueClient that queries a Model to answer CritiqueRequests."""
 
+    VISION_LANGUAGE_MODELS = ["openai/gpt-4-vision", "reka/reka", "huggingface/prometheus-vision"]
+
     def __init__(self, client: Client, model_name):
         self._client = client
         self._model_name = model_name

@@ -31,6 +34,11 @@ class ModelCritiqueClient(CritiqueClient):
             get_default_model_deployment_for_model(model_name, warn_arg_deprecated=False, ignore_deprecated=True)
             or self._model_name
         )
+        self.vision_language = False
+        for vision_language_model_name in self.VISION_LANGUAGE_MODELS:
+            if model_name.startswith(vision_language_model_name):
+                self.vision_language = True
+                break
 
     def _interpolate_fields(self, text: str, fields: Dict[str, str]) -> str:
         for key, value in fields.items():

@@ -58,10 +66,15 @@ class ModelCritiqueClient(CritiqueClient):
 
         requests: List[Request] = []
         for question in task.questions:
-            prompt: str
+            prompt: str
+            if len(question.text) > 0:
+                prompt = base_prompt + "\n\n" + self._question_to_prompt(question, fields)
+            else:
+                # We may don't want to add extra newlines and prompts
+                # if the question text is empty (e.g., the Vibe-Eval evaluator).
+                prompt = base_prompt
             if question.question_type == "free_response":
-
-                max_tokens = 100
+                max_tokens = 100 if task.max_tokens is None else task.max_tokens
             elif question.question_type == "checkbox":
                 # We multiply by 2 because the model will generate a comma after each option.
                 max_tokens = len(question.options) * 2

@@ -78,12 +91,21 @@ class ModelCritiqueClient(CritiqueClient):
 
                 prompt = anthropic.HUMAN_PROMPT + prompt + anthropic.AI_PROMPT
 
+            multimodal_prompt: Optional[MultimediaObject] = None
+            if self.vision_language:
+                assert question.media_object is not None, "Expect media_object for vision-language models"
+                image_media: MediaObject = question.media_object
+                text_media: MediaObject = MediaObject(text=prompt, content_type="text/plain")
+                multimodal_prompt = MultimediaObject(media_objects=[image_media, text_media])
+                prompt = ""  # set to empty string to avoid conflicts with multimodal_prompt
+
             request = Request(
                 model=self._model_name,
                 model_deployment=self._model_deployment_name,
                 prompt=prompt,
                 max_tokens=max_tokens,
                 echo_prompt=False,
+                multimodal_prompt=multimodal_prompt,
             )
             requests.append(request)
         return requests

@@ -124,7 +146,13 @@ class ModelCritiqueClient(CritiqueClient):
                 raise CritiqueParseError(
                     f"Invalid answer: {completion}. Multiple choice questions should have one answer."
                 )
-
+            letter_answer = answers[0]
+            choice_rank = string.ascii_uppercase.index(letter_answer)
+            if choice_rank >= len(question.options):
+                raise CritiqueParseError(
+                    f"Invalid answer: {completion}. The answer is out of range of the options: {question.options}"
+                )
+            return letter_answer
         except CritiqueParseError as e:
             # If there was an error parsing the answer, we assume the user did not answer the question.
             hlog(f"Error parsing answer: {e}. Skipping question (and so the respondent entirely)")
helm/proxy/example_queries.py
CHANGED

@@ -22,7 +22,6 @@ example_queries = [
            temperature: 0.5 # Medium amount of randomness
            stop_sequences: [.] # Stop when you hit a period
            model: openai/gpt-3.5-turbo-0613
-           model_deployment: openai/gpt-3.5-turbo-0613
            """
        ),
        environments="",

@@ -35,24 +34,24 @@
            stop_sequences: [\\n] # Stop when you hit a newline
            num_completions: 5 # Generate many samples
            model: openai/gpt-3.5-turbo-0613
-           model_deployment: openai/gpt-3.5-turbo-0613
            """
        ),
        environments="",
    ),
-   Query(
-       prompt="The quick brown fox jumps over the lazy dog.",
-       settings=dedent(
-           """
-           echo_prompt: true # Analyze the prompt
-           max_tokens: 0 # Don't generate any more
-           top_k_per_token: 5 # Show alternatives for each position
-           model: openai/text-davinci-002
-           model_deployment: openai/text-davinci-002
-           """
-       ),
-       environments=dedent(""),
-   ),
+   # Disabled because `max_tokens: 0` no longer works on the OpenAI API
+   # Query(
+   #     prompt="The quick brown fox jumps over the lazy dog.",
+   #     settings=dedent(
+   #         """
+   #         echo_prompt: true # Analyze the prompt
+   #         max_tokens: 0 # Don't generate any more
+   #         top_k_per_token: 5 # Show alternatives for each position
+   #         model: openai/text-davinci-002
+   #         model_deployment: openai/text-davinci-002
+   #         """
+   #     ),
+   #     environments=dedent(""),
+   # ),
    Query(
        prompt="Odd numbers: 1 -> 3 -> 5",
        settings=dedent(

@@ -60,7 +59,6 @@
            temperature: 0 # Deterministic
            max_tokens: 50
            model: openai/gpt-3.5-turbo-0613
-           model_deployment: openai/gpt-3.5-turbo-0613
            """
        ),
        environments="",

@@ -73,7 +71,6 @@
            stop_sequences: [.]
            # Try out multiple models
            model: ${model}
-           model_deployment: ${model}
            """
        ),
        environments=dedent(

@@ -100,7 +97,6 @@
            num_completions: 5
            # Try out multiple models
            model: ${model}
-           model_deployment: ${model}
            """
        ),
        environments=dedent(

@@ -136,7 +132,6 @@
            top_k_per_token: 4
            # Try out multiple models
            model: ${model}
-           model_deployment: ${model}
            """
        ),
        environments=dedent(

@@ -150,7 +145,6 @@
        settings=dedent(
            """
            model: openai/gpt-3.5-turbo-0613
-           model_deployment: openai/gpt-3.5-turbo-0613
            """
        ),
        environments="",

@@ -163,7 +157,6 @@
            stop_sequences: [\\n]
            # Try out multiple models
            model: ${model}
-           model_deployment: ${model}
            """
        ),
        environments=dedent(
helm/proxy/services/server_service.py
CHANGED

@@ -25,7 +25,6 @@ from helm.common.hierarchical_logger import hlog
 from helm.proxy.accounts import Accounts, Account
 from helm.clients.auto_client import AutoClient
 from helm.clients.moderation_api_client import ModerationAPIClient
-from helm.clients.perspective_api_client import PerspectiveAPIClient
 from helm.clients.image_generation.nudity_check_client import NudityCheckClient
 from helm.clients.gcs_client import GCSClient
 from helm.clients.clip_score_client import CLIPScoreClient

@@ -75,7 +74,7 @@ class ServerService(Service):
         # Lazily instantiate the following clients
         self.moderation_api_client: Optional[ModerationAPIClient] = None
         self.toxicity_classifier_client: Optional[ToxicityClassifierClient] = None
-        self.perspective_api_client: Optional[
+        self.perspective_api_client: Optional[ToxicityClassifierClient] = None
         self.nudity_check_client: Optional[NudityCheckClient] = None
         self.clip_score_client: Optional[CLIPScoreClient] = None
         self.gcs_client: Optional[GCSClient] = None

@@ -119,7 +118,7 @@ class ServerService(Service):
             return "codex"
         elif model_deployment.startswith("openai/dall-e-"):
             return "dall_e"
-        elif model_deployment.startswith("openai/gpt-4
+        elif model_deployment.startswith("openai/gpt-4"):
             return "gpt4"
         else:
             return "gpt3"
helm/proxy/token_counters/test_auto_token_counter.py
CHANGED

@@ -13,8 +13,8 @@ class TestAutoTokenCounter:
         )
         # The following prompt has 51 tokens according to the GPT-2 tokenizer
         request = Request(
-            model="openai/
-            model_deployment="
+            model="openai/gpt2",
+            model_deployment="huggingface/gpt2",
             prompt="The Center for Research on Foundation Models (CRFM) is "
             "an interdisciplinary initiative born out of the Stanford "
             "Institute for Human-Centered Artificial Intelligence (HAI) "
helm/tokenizers/ai21_tokenizer.py
CHANGED

@@ -1,60 +1,52 @@
-
-import
-
-from
-
-from helm.
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        self.
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            # If 'tokens' is not present in the response, assume request failed.
-            if "tokens" not in response:
-                handle_failed_request(api_type="tokenizer", response=response)
-
-            return response
-
-        try:
-            response, cached = self.cache.get(raw_request, do_it)
-        except AI21RequestError:
-            return TokenizationRequestResult(success=False, cached=False, text="", tokens=[])
-
-        # Each token is represented like this in the response:
-        # {'token': '▁Hello', 'textRange': {'start': 0, 'end': 5}}
-        tokens: List[TokenizationToken] = []
-        for token_dict in response["tokens"]:
-            tokens.append(
-                TokenizationToken(value=token_dict["token"], text_range=from_dict(TextRange, token_dict["textRange"]))
+import threading
+from typing import Any, Dict
+
+from helm.common.cache import CacheConfig
+from helm.common.optional_dependencies import handle_module_not_found_error
+from helm.tokenizers.caching_tokenizer import CachingTokenizer
+
+try:
+    from ai21_tokenizer import Tokenizer as SDKTokenizer
+    from ai21_tokenizer.base_tokenizer import BaseTokenizer
+except ModuleNotFoundError as e:
+    handle_module_not_found_error(e, ["ai21"])
+
+
+class AI21LocalTokenizer(CachingTokenizer):
+    """AI21 tokenizer using the AI21 Python library."""
+
+    def __init__(self, cache_config: CacheConfig) -> None:
+        super().__init__(cache_config)
+        self._tokenizers_lock = threading.Lock()
+        self.tokenizers: Dict[str, BaseTokenizer] = {}
+
+    def _get_tokenizer(self, tokenizer_name: str) -> BaseTokenizer:
+        with self._tokenizers_lock:
+            if tokenizer_name not in self.tokenizers:
+                self.tokenizers[tokenizer_name] = SDKTokenizer.get_tokenizer(tokenizer_name)
+        return self.tokenizers[tokenizer_name]
+
+    def _tokenize_do_it(self, request: Dict[str, Any]) -> Dict[str, Any]:
+        tokenizer_name = request["tokenizer"].split("/")[1]
+        tokenizer = self._get_tokenizer(tokenizer_name)
+        if request["truncation"]:
+            token_ids = tokenizer.encode(
+                text=request["text"],
+                truncation=request["truncation"],
+                max_length=request["max_length"],
+                add_special_tokens=False,
             )
-
-
-
-
-
+        else:
+            token_ids = tokenizer.encode(
+                text=request["text"],
+                add_special_tokens=False,
+            )
+        if request["encode"]:
+            return {"tokens": token_ids}
+        else:
+            return {"tokens": tokenizer.convert_ids_to_tokens(token_ids)}
+
+    def _decode_do_it(self, request: Dict[str, Any]) -> Dict[str, Any]:
+        tokenizer_name = request["tokenizer"].split("/")[1]
+        tokenizer = self._get_tokenizer(tokenizer_name)
+        return {"text": tokenizer.decode(request["tokens"])}
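The rewritten `AI21LocalTokenizer` constructs one SDK tokenizer per name, lazily and behind a lock, so concurrent requests never build the same tokenizer twice. The same pattern in isolation (a generic sketch; `LazyRegistry` is not a HELM class):

```python
import threading
from typing import Callable, Dict, Generic, TypeVar

T = TypeVar("T")


class LazyRegistry(Generic[T]):
    """Thread-safe, lazily populated cache keyed by name."""

    def __init__(self, factory: Callable[[str], T]) -> None:
        self._factory = factory
        self._lock = threading.Lock()
        self._items: Dict[str, T] = {}

    def get(self, name: str) -> T:
        # Hold the lock across both the membership check and construction so
        # that only one thread ever builds the entry for a given name.
        with self._lock:
            if name not in self._items:
                self._items[name] = self._factory(name)
            return self._items[name]
```

For instance, `LazyRegistry(SDKTokenizer.get_tokenizer)` would reproduce the caching behavior of `_get_tokenizer` above.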
helm/tokenizers/auto_tokenizer.py
CHANGED

@@ -41,7 +41,7 @@ class AutoTokenizer(Tokenizer):
         if tokenizer_config:
             tokenizer_spec = inject_object_spec_args(
                 tokenizer_config.tokenizer_spec,
-                constant_bindings={"cache_config": cache_config},
+                constant_bindings={"cache_config": cache_config, "tokenizer_name": tokenizer_name},
                 provider_bindings={
                     "api_key": lambda: provide_api_key(self.credentials, organization),
                     "project_id": lambda: self.credentials.get(organization + "ProjectId", None),  # VertexAI
|