PyPI - crfm-helm - Versions diffs - 0.5.6__py3-none-any.whl → 0.5.7__py3-none-any.whl - Mend

crfm-helm 0.5.6__py3-none-any.whl → 0.5.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of crfm-helm might be problematic. Click here for more details.

Files changed (103) hide show

{crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/METADATA +56 -49
{crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/RECORD +99 -66
helm/benchmark/annotation/air_bench_annotator.py +1 -1
helm/benchmark/annotation/live_qa_annotator.py +1 -1
helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
helm/benchmark/metrics/comet_metric.py +1 -1
helm/benchmark/metrics/copyright_metrics.py +1 -1
helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
helm/benchmark/metrics/evaluate_reference_metrics.py +1 -1
helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
helm/benchmark/metrics/lmkt_metrics.py +47 -0
helm/benchmark/metrics/melt_toxicity_metric.py +1 -1
helm/benchmark/metrics/summac/model_summac.py +1 -1
helm/benchmark/model_deployment_registry.py +11 -19
helm/benchmark/presentation/create_plots.py +11 -2
helm/benchmark/presentation/schema.py +5 -0
helm/benchmark/presentation/summarize.py +9 -3
helm/benchmark/presentation/test_create_plots.py +4 -1
helm/benchmark/run.py +7 -1
helm/benchmark/run_specs/arabic_run_specs.py +73 -0
helm/benchmark/run_specs/bluex_run_specs.py +40 -0
helm/benchmark/run_specs/classic_run_specs.py +0 -53
helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
helm/benchmark/run_specs/heim_run_specs.py +3 -1
helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
helm/benchmark/run_specs/long_context_run_specs.py +48 -1
helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +5 -11
helm/benchmark/scenarios/alghafa_scenario.py +126 -0
helm/benchmark/scenarios/arabic_mmlu_scenario.py +78 -0
helm/benchmark/scenarios/aratrust_scenario.py +76 -0
helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
helm/benchmark/scenarios/audio_language/{ultra_suite_asr_classification.py → ultra_suite_asr_classification_scenario.py} +9 -8
helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +99 -0
helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +13 -5
helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +13 -5
helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +13 -5
helm/benchmark/scenarios/bluex_scenario.py +66 -0
helm/benchmark/scenarios/cleva_scenario.py +1 -1
helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +90 -0
helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +1 -1
helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
helm/benchmark/scenarios/math_scenario.py +21 -20
helm/benchmark/scenarios/medalign_scenario_helper.py +19 -125
helm/benchmark/scenarios/melt_scenarios.py +2 -2
helm/benchmark/scenarios/mimic_bhc_scenario.py +1 -1
helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
helm/benchmark/scenarios/seahelm_scenario.py +2 -2
helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
helm/benchmark/slurm_jobs.py +1 -2
helm/benchmark/slurm_runner.py +8 -1
helm/benchmark/static/schema_arabic.yaml +228 -0
helm/benchmark/static/schema_classic.yaml +0 -17
helm/benchmark/static/schema_long_context.yaml +19 -1
helm/benchmark/static_build/assets/index-e439d5e1.js +10 -0
helm/benchmark/static_build/index.html +1 -1
helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
helm/clients/audio_language/qwen2_5_omni_client.py +19 -7
helm/clients/huggingface_client.py +2 -2
helm/clients/openai_client.py +2 -1
helm/clients/openai_responses_client.py +6 -4
helm/clients/test_huggingface_client.py +3 -3
helm/clients/together_client.py +0 -2
helm/clients/vertexai_client.py +11 -9
helm/clients/vllm_client.py +43 -7
helm/clients/vllm_granite_thinking_client.py +56 -0
helm/common/critique_request.py +0 -1
helm/common/hierarchical_logger.py +83 -34
helm/common/object_spec.py +23 -8
helm/common/test_logging.py +94 -0
helm/config/model_deployments.yaml +454 -175
helm/config/model_metadata.yaml +117 -10
helm/config/tokenizer_configs.yaml +81 -1
helm/proxy/cli.py +1 -1
helm/proxy/retry.py +5 -0
helm/tokenizers/grok_tokenizer.py +2 -0
helm/benchmark/metrics/numeracy_metrics.py +0 -72
helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
helm/benchmark/scenarios/numeracy_scenario.py +0 -794
helm/benchmark/static_build/assets/index-94295e78.js +0 -10
{crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/WHEEL +0 -0
{crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/entry_points.txt +0 -0
{crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/licenses/LICENSE +0 -0
{crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/top_level.txt +0 -0

helm/config/model_metadata.yaml CHANGED Viewed

@@ -2624,6 +2624,15 @@ models:
     release_date: 2024-11-18
     tags: [TEXT_MODEL_TAG, VISION_LANGUAGE_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+  # Moonshot AI
+  - name: moonshotai/kimi-k2-instruct
+    display_name: Kimi K2 Instruct
+    description:  Kimi K2 Instruct is a mixture-of-experts (MoE) language model with 32 billion activated parameters and 1 trillion total parameters trained with the Muon optimizer on 15.5T tokens. ([blog](https://moonshotai.github.io/Kimi-K2/))
+    creator_organization_name: Moonshot AI
+    access: open
+    num_parameters: 1029173256720
+    release_date: 2025-07-14  # Blog post has no date, so use the date from this news article https://www.cnbc.com/2025/07/14/alibaba-backed-moonshot-releases-kimi-k2-ai-rivaling-chatgpt-claude.html
+    tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
   # MosaicML
   - name: mosaicml/mpt-7b
@@ -3256,6 +3265,14 @@ models:
     release_date: 2025-04-16
     tags: [TEXT_MODEL_TAG, VISION_LANGUAGE_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+  - name: openai/o3-pro-2025-06-10-high-reasoning-effort
+    display_name: o3-pro (2025-06-10, high reasoning effort)
+    description: o3-pro is an o-series model designed to think longer and provide the most reliable responses. ([blog post](https://help.openai.com/en/articles/9624314-model-release-notes))
+    creator_organization_name: OpenAI
+    access: limited
+    release_date: 2025-06-10
+    tags: [TEXT_MODEL_TAG, VISION_LANGUAGE_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
   ## Codex Models
   # DEPRECATED: Codex models have been shut down on March 23 2023.
@@ -4163,6 +4180,14 @@ models:
     release_date: 2025-04-03  # https://docs.x.ai/docs/release-notes#april-2025
     tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+  - name: xai/grok-4-0709
+    display_name: Grok 4 (0709)
+    description: Grok 4 (0709) is a model that includes native tool use and real-time search integration. ([blog](https://x.ai/news/grok-4))
+    creator_organization_name: xAI
+    access: limited
+    release_date: 2025-07-09
+    tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
   # Yandex
   - name: yandex/yalm
     display_name: YaLM (100B)
@@ -4266,6 +4291,31 @@ models:
     release_date: 2023-11-08
     tags: [TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+  - name: maritaca-ai/sabiazinho-3
+    display_name: Sabiazinho 3
+    description: Sabiazinho-3 is a decoder-only language model designed for Portuguese text generation and understanding tasks. It supports a long context window of up to 128,000 tokens and is offered via API with scalable rate limits. The model is trained on diverse Portuguese corpora with knowledge up to July 2023.
+    creator_organization_name: Maritaca AI
+    access: limited
+    release_date: 2025-02-06
+    tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+  - name: maritaca-ai/sabia-3
+    display_name: Sabiá 3
+    description: Sabiá-3 is a decoder-only language model designed for Portuguese text generation and understanding tasks. It supports a long context window of up to 128,000 tokens and is offered via API with scalable rate limits. The model is trained on diverse Portuguese corpora with knowledge up to July 2023.
+    creator_organization_name: Maritaca AI
+    access: limited
+    release_date: 2024-12-11
+    tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+  - name: maritaca-ai/sabia-3.1-2025-05-08
+    display_name: Sabiá 3.1
+    description: Sabiá-3.1 is a decoder-only language model designed for Portuguese text generation and understanding tasks. It supports a long context window of up to 128,000 tokens and is offered via API with scalable rate limits. The model is trained on diverse Portuguese corpora with knowledge up to August 2024.
+    creator_organization_name: Maritaca AI
+    access: limited
+    release_date: 2025-05-08
+    tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
 # Granite - IBM
 # https://www.ibm.com/granite
 # https://github.com/ibm-granite/granite-3.0-language-models
@@ -4479,21 +4529,23 @@ models:
     tags: [ TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG ]
   - name: ibm/granite-3.3-8b-instruct
-    display_name: Granite 3.3 8B Instruct
-    description: Granite 3.3 8B Instruct is a 8-billion parameter 128K context length language model fine-tuned for improved reasoning and instruction-following capabilities. ([model card](https://huggingface.co/ibm-granite/granite-3.3-8b-instruct))
+    display_name: IBM Granite 3.3 8B Instruct
+    description: IBM Granite 3.3 8B Instruct is an 8-billion parameter 128K context length language model fine-tuned for improved reasoning and instruction-following capabilities. ([model card](https://huggingface.co/ibm-granite/granite-3.3-8b-instruct))
     creator_organization_name: IBM
     access: open
     num_parameters: 8170000000
     release_date: 2025-04-16
     tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, ABLATION_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
-  - name: mistralai/mixtral-8x7b-instruct-v0:1
-    display_name: Mixtral 8x7B Instruct on IBM WatsonX
-    description: A 7B sparse Mixture-of-Experts model with stronger capabilities than Mistral 7B. Uses 12B active parameters out of 45B total. Supports multiple languages, code and 32k context window.
-    creator_organization_name: Mistral
-    access: limited
-    release_date: 2023-12-11
-    tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, ABLATION_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+  - name: ibm/granite-3.3-8b-instruct-with-guardian
+    display_name: IBM Granite 3.3 8B Instruct (with guardian)
+    description: IBM Granite 3.3 8B Instruct is an 8-billion parameter 128K context length language model fine-tuned for improved reasoning and instruction-following capabilities. ([model card](https://huggingface.co/ibm-granite/granite-3.3-8b-instruct)) This model was run with an additional safety filter using [Granite Guardian 3.2](https://www.ibm.com/granite/docs/models/guardian/).
+    creator_organization_name: IBM
+    access: open
+    num_parameters: 8170000000
+    release_date: 2025-04-16
+    # Unfortunately this setup is not easily reproducible, so we mark it with DEPRECATED_MODEL_TAG
+    tags: [DEPRECATED_MODEL_TAG, TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, ABLATION_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
   - name: ura-hcmut/ura-llama-2.1-8b
     display_name: URA-Llama 2.1 (8B)
@@ -4682,4 +4734,59 @@ models:
     access: open
     num_parameters: 4000000000
     release_date: 2024-04-02
-    tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+    tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+  - name: CEIA-UFG/Gemma-3-Gaia-PT-BR-4b-it
+    display_name: Gemma-3 Gaia PT-BR 4b Instruct
+    description: Gemma-3 Gaia PT-BR 4b Instruct is a model trained by CEIA-UFG for understanding and generating Brazilian Portuguese text.
+    creator_organization_name: CEIA-UFG
+    access: open
+    num_parameters: 4000000000
+    release_date: 2025-06-01
+    tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+  - name: recogna-nlp/bode-13b-alpaca-pt-br-no-peft
+    display_name: Bode 13B Alpaca PT-BR
+    description: Bode is a language model (LLM) for Portuguese, based on LLaMA 2 and fine-tuned with the Alpaca dataset translated into Portuguese. Suitable for instruction, text generation, translation and tasks in Portuguese.
+    creator_organization_name: Recogna NLP
+    access: open
+    num_parameters: 13000000000
+    release_date: 2024-01-05
+    tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+  - name: 22h/cabrita_7b_pt_850000
+    display_name: Cabrita PT-BR 7B
+    description: Cabrita is an OpenLLaMA-based model, continuously trained in Portuguese (mC4-pt subset) for 850000 steps with efficient tokenization adapted to the language.
+    creator_organization_name: 22h
+    access: open
+    num_parameters: 7000000000
+    release_date: 2023-08-23
+    tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+  - name: PORTULAN/gervasio-7b-portuguese-ptbr-decoder
+    display_name: Gervásio PT-BR/PT-PT 7B Decoder
+    description: Gervásio PT* is a 7B parameter decoder model, adapted from LLaMA 2 7B, trained for both Brazilian and European Portuguese. Fine-tuned with translated data from benchmarks such as GLUE and SuperGLUE.
+    creator_organization_name: PORTULAN (University of Lisbon NLX)
+    access: open
+    num_parameters: 6740000000
+    release_date: 2024-02-29
+    tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+  - name: TucanoBR/Tucano-2b4
+    display_name: Tucano PT-BR 2b4
+    description: Tucano is a series of decoder models based on LLaMA2, natively pre-trained in Portuguese using the GigaVerbo dataset (200B tokens), with the 2B model trained for 1.96M steps over 845h (515B tokens, 4 epochs).
+    creator_organization_name: TucanoBR (University of Bonn)
+    access: open
+    num_parameters: 2444618240
+    release_date: 2024-12-11
+    tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
+  - name: nicholasKluge/TeenyTinyLlama-460m
+    display_name: TeenyTinyLlama 460M PT-BR
+    description: TeenyTinyLlama-460m is a lightweight and efficient model based on LLaMA2, trained exclusively on Brazilian Portuguese. It uses RoPE embeddings and SwiGLU activations, with a refined SentencePiece tokenizer and a low-resource optimized architecture.
+    creator_organization_name: Nicholas Kluge
+    access: open
+    num_parameters: 460000000
+    release_date: 2024-01-30
+    tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]

helm/config/tokenizer_configs.yaml CHANGED Viewed

@@ -265,6 +265,12 @@ tokenizer_configs:
     end_of_text_token: ""
     prefix_token: ""
+  - name: xai/grok-4-0709
+    tokenizer_spec:
+      class_name: "helm.tokenizers.grok_tokenizer.GrokAPITokenizer"
+    end_of_text_token: ""
+    prefix_token: ""
   # Hf-internal-testing
   # Tokenizer name hf-internal-testing/llama-tokenizer is taken from:
@@ -582,6 +588,17 @@ tokenizer_configs:
     end_of_text_token: "</s>"
     prefix_token: "<s>"
+  # Moonshot AI
+  - name: moonshotai/kimi-k2-instruct
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+      args:
+        pretrained_model_name_or_path: moonshotai/Kimi-K2-Instruct
+        trust_remote_code: true
+        revision: 4f239503ad9d1a042f0a4bacac457931ab972cfc
+    end_of_text_token: "[EOS]"
+    prefix_token: "[BOS]"
   # Nectec
   - name: nectec/OpenThaiLLM-Prebuilt-7B
     tokenizer_spec:
@@ -892,6 +909,7 @@ tokenizer_configs:
     end_of_text_token: ""
     prefix_token: ""
+# Maritaca AI
   - name: maritaca-ai/sabia-7b
     tokenizer_spec:
       class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
@@ -900,6 +918,14 @@ tokenizer_configs:
     end_of_text_token: "</s>"
     prefix_token: "<s>"
+  - name: maritaca-ai/sabia-2-tokenizer-medium
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+      args:
+        pretrained_model_name_or_path: maritaca-ai/sabia-2-tokenizer-medium
+    end_of_text_token: "</s>"
+    prefix_token: "<s>"
 # Granite-3.1-8b-base
   - name: ibm-granite/granite-3.1-8b-base
     tokenizer_spec:
@@ -1104,4 +1130,58 @@ tokenizer_configs:
         args:
             pretrained_model_name_or_path: vinai/PhoGPT-4B-Chat
     end_of_text_token: "</s>"
-    prefix_token: "<s>"
+    prefix_token: "<s>"
+# Gemma-3-Gaia-PT-BR-4b-it
+  - name: CEIA-UFG/Gemma-3-Gaia-PT-BR-4b-it
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+      args:
+        pretrained_model_name_or_path: CEIA-UFG/Gemma-3-Gaia-PT-BR-4b-it
+    end_of_text_token: "<eos>"
+    prefix_token: "<bos>"
+# Bode 13B Alpaca PT-BR
+  - name: recogna-nlp/bode-13b-alpaca-pt-br-no-peft
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+      args:
+        pretrained_model_name_or_path: recogna-nlp/bode-13b-alpaca-pt-br-no-peft
+    end_of_text_token: "</s>"
+    prefix_token: "<s>"
+# Cabrita 7B PT-BR tokenizer
+  - name: 22h/cabrita_7b_pt_850000
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+      args:
+        pretrained_model_name_or_path: 22h/cabrita_7b_pt_850000
+    end_of_text_token: "</s>"
+    prefix_token: "<s>"
+# Gervásio 7B PT‑BR/PT‑PT tokenizer
+  - name: PORTULAN/gervasio-7b-portuguese-ptbr-decoder
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+      args:
+        pretrained_model_name_or_path: PORTULAN/gervasio-7b-portuguese-ptbr-decoder
+    end_of_text_token: "</s>"
+    prefix_token: "<s>"
+# Tucano 2b4 PT-BR tokenizer
+  - name: TucanoBR/Tucano-2b4
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+      args:
+        pretrained_model_name_or_path: TucanoBR/Tucano-2b4
+    end_of_text_token: "</s>"
+    prefix_token: "<s>"
+# TeenyTinyLlama 460M PT-BR tokenizer
+  - name: nicholasKluge/TeenyTinyLlama-460m
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+      args:
+        pretrained_model_name_or_path: nicholasKluge/TeenyTinyLlama-460m
+    end_of_text_token: "</s>"
+    prefix_token: "<s>"

helm/proxy/cli.py CHANGED Viewed

@@ -123,7 +123,7 @@ def do_create_update_command(service: RemoteService, auth: Authentication, args)
     # Update quotas
     for quota_str in args.quotas:
-        m = re.match(f"(\w+)\.(\w+)=(\d+|{UNLIMITED_QUOTA})", quota_str)
+        m = re.match(rf"(\w+)\.(\w+)=(\d+|{UNLIMITED_QUOTA})", quota_str)
         if not m:
             raise Exception(
                 f"Invalid format: {quota_str}, expect <model_group>.<granularity>=<quota> "

helm/proxy/retry.py CHANGED Viewed

@@ -5,6 +5,7 @@ from retrying import Retrying
 from helm.common.request import RequestResult
 from helm.common.tokenization_request import TokenizationRequestResult
 from helm.common.hierarchical_logger import hlog
+import os
 import traceback
 import threading
@@ -19,6 +20,10 @@ Example usage:
         ...
 """
+# TODO: make these configurable at a config / cli level
+HELM_RETRIES = int(os.environ.get("HELM_RETRIES", "5"))
+HELM_TOKENIZER_RETRIES = int(os.environ.get("HELM_TOKENIZER_RETRIES", HELM_RETRIES))
 # The lock is used to prevent multiple threads from printing at the same time.
 # This can cause issues when printing the stack trace.
 # (The stack traces can get mixed up and become unreadable.)

helm/tokenizers/grok_tokenizer.py CHANGED Viewed

@@ -34,6 +34,8 @@ class GrokAPITokenizer(CachingTokenizer):
                 "Set grokApiKey in credentials.conf or set the GROK_API_KEY environment variable"
             )
         text = request["text"]
+        if not text:
+            return {"token_ids": []}
         model = request["tokenizer"].split("/")[-1]
         response = requests.post(
             url="https://api.x.ai/v1/tokenize-text",

helm/benchmark/metrics/numeracy_metrics.py DELETED Viewed

@@ -1,72 +0,0 @@
-from typing import List
-from helm.common.request import RequestResult
-from helm.benchmark.adaptation.request_state import RequestState
-from helm.benchmark.adaptation.adapter_spec import AdapterSpec
-from helm.benchmark.scenarios.numeracy_scenario import (  # noqa
-    NumeracyScenario,
-    Polynomial,
-    RELTYPE_INFO,
-    distance_linear,
-    distance_parabola,
-    distance_plane,
-    distance_paraboloid,
-)
-from helm.benchmark.metrics.metric import Metric
-from helm.benchmark.metrics.metric_name import MetricName
-from helm.benchmark.metrics.metric_service import MetricService
-from helm.benchmark.metrics.statistic import Stat
-class DistanceMetric(Metric):
-    """Returns the minimum geometric distance between the point represented by the completion
-    and the curve or surface specified by `rel_str`.
-    Expects `references.outputs` to be a list containing the following:
-     - val_GT (str): the last coordinate of the point lying on the given curve / surface
-         with first coordinates as given in the input
-     - rel_str (str): the relation
-     - relation_type (str): one of {'linear', 'parabola', 'plane', 'paraboloid'}
-    Returns:
-        The minimum geometric distance from the point to the curve / surface float.
-    """
-    def evaluate_generation(
-        self,
-        adapter_spec: AdapterSpec,
-        request_state: RequestState,
-        metric_service: MetricService,
-        eval_cache_path: str,
-    ) -> List[Stat]:
-        """For given request, compute the following two metrics:
-        1. geometric distance metric in range [0, ∞), calling the appropriate distance method, if possible, and
-        2. percent valid metric in range [0., 1.] of completions that are a valid number, ignoring commas.
-        """
-        references = request_state.instance.references
-        _, rel_str, relation_type = map(lambda _: _.output.text, references)
-        input_text: str = request_state.instance.input.text
-        datapoint_input = input_text.split("\n")[-1]
-        val = list(map(int, datapoint_input.split(NumeracyScenario.delimiter)))
-        distance_func = globals()[f"distance_{relation_type}"]
-        result = 0.0
-        num_valid = 0
-        assert request_state.result is not None
-        request_result: RequestResult = request_state.result
-        for completion_sequence in request_result.completions:
-            completion = completion_sequence.text.strip()
-            try:
-                pred = int(completion.replace(",", ""))  # ignore commas in numbers
-            except Exception:
-                continue
-            point = val + [pred]
-            result += distance_func(point, rel_str)
-            num_valid += 1
-        percent_valid = 1.0 * num_valid / len(request_result.completions)
-        return [
-            Stat(MetricName("distance")).add(result),
-            Stat(MetricName("percent_valid")).add(percent_valid),
-        ]

helm/benchmark/metrics/test_numeracy_metrics.py DELETED Viewed

@@ -1,95 +0,0 @@
-from dataclasses import dataclass
-from typing import List, Callable
-from helm.benchmark.scenarios.numeracy_scenario import (
-    distance_linear,
-    distance_parabola,
-    distance_plane,
-    distance_paraboloid,
-)
-TOL = 1e-5  # note: different from TOL in numeracy_scenario.distance_<...> used for checking if real or complex
-@dataclass(frozen=True)
-class TestCase:
-    rel_str: str
-    point: List[int]
-    dist: float
-def check_test_cases(test_cases: List[TestCase], dist_func: Callable[[List[int], str], float]):
-    for test_case in test_cases:
-        dist = dist_func(test_case.point, test_case.rel_str)
-        dist_gt = test_case.dist
-        assert abs(dist - dist_gt) < TOL, f"{test_case.rel_str} {test_case.point}"
-        # print(f"{test_case.rel_str} {test_case.point} Dist: {dist}\tDist GT: {dist_gt}")
-def test_distance_linear():
-    test_cases = [
-        TestCase(
-            "y = 4x + 4", [59, 201], 9.458889376416986
-        ),  # https://www.wolframalpha.com/input?i=minimize+sqrt%28%28x+-+59%29%5E2+%2B+%284x+%2B+4+-+201%29%5E2%29
-        TestCase("y = x + 3 ", [30, 78], 31.819805153394636),
-        TestCase("y = 5x + 4", [-47, 2], 45.69505948719688),
-        TestCase("y = 4x + 3", [-65, -255], 0.48507125007266594),
-        TestCase("y = 4x + 3", [97, 391], 0.0),
-    ]
-    check_test_cases(test_cases, distance_linear)
-def test_distance_parabola():
-    test_cases = [
-        TestCase("y = 2x^2 + x + 1", [159, 50000], 1.137499072212397),
-        TestCase("y = 2x^2 + 2x + 4", [130, 28390], 11.364547837422966),
-        TestCase("y = 2x^2 + x + 4", [53, 10000], 17.4468675121177),
-        TestCase(
-            "y = 2x^2 + 2x + 2", [35, 1], 34.36171077312826
-        ),  # https://www.wolframalpha.com/input?i=minimize+%28x+-+35%29%5E2+%2B+%282x%5E2+%2B+2x+%2B+2+-+1%29%5E2
-        TestCase("y = x^2 + x + 2", [197, 39008], 0.0),
-    ]
-    check_test_cases(test_cases, distance_parabola)
-def test_distance_plane():
-    test_cases = [
-        TestCase(
-            "z = 4x + 4y + 1", [-4, 9, 1], 3.481553119113957
-        ),  # https://www.wolframalpha.com/input?i=minimize+sqrt%28%28x+%2B+4%29%5E2+%2B+%28y+-+9%29%5E2+%2B+%284x
-        # +%2B+4y+%2B+1+-+1%29%5E2%29
-        TestCase(
-            "z = 3x + 5y + 4", [-10, 4, 3], 1.52127765851133
-        ),  # https://www.wolframalpha.com/input?i=minimize+sqrt%28%28x+%2B+10%29%5E2+%2B+%28y+-+4%29%5E2+%2B+%283
-        # x+%2B+5y+%2B+4+-+3%29%5E2%29
-        TestCase("z = 4x + 3y + 4", [-5, 4, -7], 0.5883484054145521),
-        TestCase("z = 3x + 5y + 2", [-7, 10, 0], 5.239956379316803),
-        TestCase("z = 5x + 2y + 3", [-2, -1, -9], 0.0),
-    ]
-    check_test_cases(test_cases, distance_plane)
-def test_distance_paraboloid():
-    test_cases = [
-        TestCase("z = x^2 + y^2 + 2", [0, 0, 2], 0.0),
-        TestCase(
-            "z = 2x^2 + y^2 + 2", [0, 11, 151], 1.2055445093982982
-        ),  # https://www.wolframalpha.com/input?i=minimize+x%5E2+%2B+%28y+-+11%29%5E2+%2B+%28%282x%5E2+%2B+y%5E2+%2B+2%29+-+151%29%5E2  # noqa
-        TestCase(
-            "z = 2x^2 + 2y^2 + 2", [0, 0, 6], 1.3919410907075054
-        ),  # https://www.wolframalpha.com/input?i=minimize+x%5E2+%2By%5E2+%2B+%28%282x%5E2+%2B+2y%5E2+%2B+2%29+-+6%29%5E2  # noqa
-        TestCase(
-            "z = x^2 + y^2 + 2", [0, 0, 20], 4.2130748865881795
-        ),  # https://www.wolframalpha.com/input?i=x%5E2+%2B+y%5E2+%2B+%28%28x%5E2+%2B+y%5E2+%2B+2%29+-+20%29%5E2
-        TestCase("z = 2x^2 + xy + y^2 + 4", [6, 19, 519], 0.5290904095503263),
-        TestCase("z = 2x^2 + xy + 2y^2 + 3", [0, 14, 380], 0.26248531385619783),
-        TestCase("z = x^2 + 2y^2 + 1", [5, 14, 4], 13.354544558906934),
-        TestCase("z = x^2 + xy + 2y^2 + 4", [3, 20, 1001], 1.4206031238856873),
-        TestCase("z = x^2 + xy + 2y^2 + 4", [0, 0, 55], 51.0),
-        TestCase("z = x^2 + xy + 2y^2 + 4", [0, 9, 55], 3.8558889386410757),
-        TestCase("z = 2x^2 + 2y^2 + 1", [8, 9, 289], 0.04158555512549898),
-        TestCase("z = 2x^2 + 2y^2 + 1", [8, 9, 291], 0.0),
-        TestCase("z = x^2 + 2xy + 5y^2 + 4", [0, 9, 55], 5.7150737847649244),
-    ]
-    check_test_cases(test_cases, distance_paraboloid)

crfm-helm 0.5.6__py3-none-any.whl → 0.5.7__py3-none-any.whl

Potentially problematic release.

crfm-helm 0.5.6__py3-none-any.whl → 0.5.7__py3-none-any.whl