crfm-helm 0.5.6__py3-none-any.whl → 0.5.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crfm-helm might be problematic. Click here for more details.
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/METADATA +56 -49
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/RECORD +99 -66
- helm/benchmark/annotation/air_bench_annotator.py +1 -1
- helm/benchmark/annotation/live_qa_annotator.py +1 -1
- helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
- helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
- helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
- helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
- helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
- helm/benchmark/metrics/comet_metric.py +1 -1
- helm/benchmark/metrics/copyright_metrics.py +1 -1
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
- helm/benchmark/metrics/evaluate_reference_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
- helm/benchmark/metrics/lmkt_metrics.py +47 -0
- helm/benchmark/metrics/melt_toxicity_metric.py +1 -1
- helm/benchmark/metrics/summac/model_summac.py +1 -1
- helm/benchmark/model_deployment_registry.py +11 -19
- helm/benchmark/presentation/create_plots.py +11 -2
- helm/benchmark/presentation/schema.py +5 -0
- helm/benchmark/presentation/summarize.py +9 -3
- helm/benchmark/presentation/test_create_plots.py +4 -1
- helm/benchmark/run.py +7 -1
- helm/benchmark/run_specs/arabic_run_specs.py +73 -0
- helm/benchmark/run_specs/bluex_run_specs.py +40 -0
- helm/benchmark/run_specs/classic_run_specs.py +0 -53
- helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
- helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
- helm/benchmark/run_specs/heim_run_specs.py +3 -1
- helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
- helm/benchmark/run_specs/long_context_run_specs.py +48 -1
- helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +5 -11
- helm/benchmark/scenarios/alghafa_scenario.py +126 -0
- helm/benchmark/scenarios/arabic_mmlu_scenario.py +78 -0
- helm/benchmark/scenarios/aratrust_scenario.py +76 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/{ultra_suite_asr_classification.py → ultra_suite_asr_classification_scenario.py} +9 -8
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +13 -5
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +13 -5
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +13 -5
- helm/benchmark/scenarios/bluex_scenario.py +66 -0
- helm/benchmark/scenarios/cleva_scenario.py +1 -1
- helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
- helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
- helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
- helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
- helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
- helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
- helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
- helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +90 -0
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +1 -1
- helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
- helm/benchmark/scenarios/math_scenario.py +21 -20
- helm/benchmark/scenarios/medalign_scenario_helper.py +19 -125
- helm/benchmark/scenarios/melt_scenarios.py +2 -2
- helm/benchmark/scenarios/mimic_bhc_scenario.py +1 -1
- helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
- helm/benchmark/scenarios/seahelm_scenario.py +2 -2
- helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
- helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
- helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
- helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
- helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
- helm/benchmark/slurm_jobs.py +1 -2
- helm/benchmark/slurm_runner.py +8 -1
- helm/benchmark/static/schema_arabic.yaml +228 -0
- helm/benchmark/static/schema_classic.yaml +0 -17
- helm/benchmark/static/schema_long_context.yaml +19 -1
- helm/benchmark/static_build/assets/index-e439d5e1.js +10 -0
- helm/benchmark/static_build/index.html +1 -1
- helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
- helm/clients/audio_language/qwen2_5_omni_client.py +19 -7
- helm/clients/huggingface_client.py +2 -2
- helm/clients/openai_client.py +2 -1
- helm/clients/openai_responses_client.py +6 -4
- helm/clients/test_huggingface_client.py +3 -3
- helm/clients/together_client.py +0 -2
- helm/clients/vertexai_client.py +11 -9
- helm/clients/vllm_client.py +43 -7
- helm/clients/vllm_granite_thinking_client.py +56 -0
- helm/common/critique_request.py +0 -1
- helm/common/hierarchical_logger.py +83 -34
- helm/common/object_spec.py +23 -8
- helm/common/test_logging.py +94 -0
- helm/config/model_deployments.yaml +454 -175
- helm/config/model_metadata.yaml +117 -10
- helm/config/tokenizer_configs.yaml +81 -1
- helm/proxy/cli.py +1 -1
- helm/proxy/retry.py +5 -0
- helm/tokenizers/grok_tokenizer.py +2 -0
- helm/benchmark/metrics/numeracy_metrics.py +0 -72
- helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
- helm/benchmark/scenarios/numeracy_scenario.py +0 -794
- helm/benchmark/static_build/assets/index-94295e78.js +0 -10
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/WHEEL +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/licenses/LICENSE +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/top_level.txt +0 -0
helm/config/model_metadata.yaml
CHANGED
|
@@ -2624,6 +2624,15 @@ models:
|
|
|
2624
2624
|
release_date: 2024-11-18
|
|
2625
2625
|
tags: [TEXT_MODEL_TAG, VISION_LANGUAGE_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
|
|
2626
2626
|
|
|
2627
|
+
# Moonshot AI
|
|
2628
|
+
- name: moonshotai/kimi-k2-instruct
|
|
2629
|
+
display_name: Kimi K2 Instruct
|
|
2630
|
+
description: Kimi K2 Instruct is a mixture-of-experts (MoE) language model with 32 billion activated parameters and 1 trillion total parameters trained with the Muon optimizer on 15.5T tokens. ([blog](https://moonshotai.github.io/Kimi-K2/))
|
|
2631
|
+
creator_organization_name: Moonshot AI
|
|
2632
|
+
access: open
|
|
2633
|
+
num_parameters: 1029173256720
|
|
2634
|
+
release_date: 2025-07-14 # Blog post has no date, so use the date from this news article: https://www.cnbc.com/2025/07/14/alibaba-backed-moonshot-releases-kimi-k2-ai-rivaling-chatgpt-claude.html
|
|
2635
|
+
tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
|
|
2627
2636
|
|
|
2628
2637
|
# MosaicML
|
|
2629
2638
|
- name: mosaicml/mpt-7b
|
|
@@ -3256,6 +3265,14 @@ models:
|
|
|
3256
3265
|
release_date: 2025-04-16
|
|
3257
3266
|
tags: [TEXT_MODEL_TAG, VISION_LANGUAGE_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
|
|
3258
3267
|
|
|
3268
|
+
- name: openai/o3-pro-2025-06-10-high-reasoning-effort
|
|
3269
|
+
display_name: o3-pro (2025-06-10, high reasoning effort)
|
|
3270
|
+
description: o3-pro is an o-series model designed to think longer and provide the most reliable responses. ([blog post](https://help.openai.com/en/articles/9624314-model-release-notes))
|
|
3271
|
+
creator_organization_name: OpenAI
|
|
3272
|
+
access: limited
|
|
3273
|
+
release_date: 2025-06-10
|
|
3274
|
+
tags: [TEXT_MODEL_TAG, VISION_LANGUAGE_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
|
|
3275
|
+
|
|
3259
3276
|
## Codex Models
|
|
3260
3277
|
# DEPRECATED: Codex models have been shut down on March 23 2023.
|
|
3261
3278
|
|
|
@@ -4163,6 +4180,14 @@ models:
|
|
|
4163
4180
|
release_date: 2025-04-03 # https://docs.x.ai/docs/release-notes#april-2025
|
|
4164
4181
|
tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
|
|
4165
4182
|
|
|
4183
|
+
- name: xai/grok-4-0709
|
|
4184
|
+
display_name: Grok 4 (0709)
|
|
4185
|
+
description: Grok 4 (0709) is a model that includes native tool use and real-time search integration. ([blog](https://x.ai/news/grok-4))
|
|
4186
|
+
creator_organization_name: xAI
|
|
4187
|
+
access: limited
|
|
4188
|
+
release_date: 2025-07-09
|
|
4189
|
+
tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
|
|
4190
|
+
|
|
4166
4191
|
# Yandex
|
|
4167
4192
|
- name: yandex/yalm
|
|
4168
4193
|
display_name: YaLM (100B)
|
|
@@ -4266,6 +4291,31 @@ models:
|
|
|
4266
4291
|
release_date: 2023-11-08
|
|
4267
4292
|
tags: [TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
|
|
4268
4293
|
|
|
4294
|
+
- name: maritaca-ai/sabiazinho-3
|
|
4295
|
+
display_name: Sabiazinho 3
|
|
4296
|
+
description: Sabiazinho-3 is a decoder-only language model designed for Portuguese text generation and understanding tasks. It supports a long context window of up to 128,000 tokens and is offered via API with scalable rate limits. The model is trained on diverse Portuguese corpora with knowledge up to July 2023.
|
|
4297
|
+
creator_organization_name: Maritaca AI
|
|
4298
|
+
access: limited
|
|
4299
|
+
release_date: 2025-02-06
|
|
4300
|
+
tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
|
|
4301
|
+
|
|
4302
|
+
- name: maritaca-ai/sabia-3
|
|
4303
|
+
display_name: Sabiá 3
|
|
4304
|
+
description: Sabiá-3 is a decoder-only language model designed for Portuguese text generation and understanding tasks. It supports a long context window of up to 128,000 tokens and is offered via API with scalable rate limits. The model is trained on diverse Portuguese corpora with knowledge up to July 2023.
|
|
4305
|
+
creator_organization_name: Maritaca AI
|
|
4306
|
+
access: limited
|
|
4307
|
+
release_date: 2024-12-11
|
|
4308
|
+
tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
|
|
4309
|
+
|
|
4310
|
+
- name: maritaca-ai/sabia-3.1-2025-05-08
|
|
4311
|
+
display_name: Sabiá 3.1
|
|
4312
|
+
description: Sabiá-3.1 is a decoder-only language model designed for Portuguese text generation and understanding tasks. It supports a long context window of up to 128,000 tokens and is offered via API with scalable rate limits. The model is trained on diverse Portuguese corpora with knowledge up to August 2024.
|
|
4313
|
+
creator_organization_name: Maritaca AI
|
|
4314
|
+
access: limited
|
|
4315
|
+
release_date: 2025-05-08
|
|
4316
|
+
tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
|
|
4317
|
+
|
|
4318
|
+
|
|
4269
4319
|
# Granite - IBM
|
|
4270
4320
|
# https://www.ibm.com/granite
|
|
4271
4321
|
# https://github.com/ibm-granite/granite-3.0-language-models
|
|
@@ -4479,21 +4529,23 @@ models:
|
|
|
4479
4529
|
tags: [ TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG ]
|
|
4480
4530
|
|
|
4481
4531
|
- name: ibm/granite-3.3-8b-instruct
|
|
4482
|
-
display_name: Granite 3.3 8B Instruct
|
|
4483
|
-
description: Granite 3.3 8B Instruct is a 8-billion parameter 128K context length language model fine-tuned for improved reasoning and instruction-following capabilities. ([model card](https://huggingface.co/ibm-granite/granite-3.3-8b-instruct))
|
|
4532
|
+
display_name: IBM Granite 3.3 8B Instruct
|
|
4533
|
+
description: IBM Granite 3.3 8B Instruct is an 8-billion-parameter, 128K-context-length language model fine-tuned for improved reasoning and instruction-following capabilities. ([model card](https://huggingface.co/ibm-granite/granite-3.3-8b-instruct))
|
|
4484
4534
|
creator_organization_name: IBM
|
|
4485
4535
|
access: open
|
|
4486
4536
|
num_parameters: 8170000000
|
|
4487
4537
|
release_date: 2025-04-16
|
|
4488
4538
|
tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, ABLATION_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
|
|
4489
4539
|
|
|
4490
|
-
- name:
|
|
4491
|
-
display_name:
|
|
4492
|
-
description:
|
|
4493
|
-
creator_organization_name:
|
|
4494
|
-
access:
|
|
4495
|
-
|
|
4496
|
-
|
|
4540
|
+
- name: ibm/granite-3.3-8b-instruct-with-guardian
|
|
4541
|
+
display_name: IBM Granite 3.3 8B Instruct (with guardian)
|
|
4542
|
+
description: IBM Granite 3.3 8B Instruct is an 8-billion-parameter, 128K-context-length language model fine-tuned for improved reasoning and instruction-following capabilities. ([model card](https://huggingface.co/ibm-granite/granite-3.3-8b-instruct)) This model was run with an additional safety filter using [Granite Guardian 3.2](https://www.ibm.com/granite/docs/models/guardian/).
|
|
4543
|
+
creator_organization_name: IBM
|
|
4544
|
+
access: open
|
|
4545
|
+
num_parameters: 8170000000
|
|
4546
|
+
release_date: 2025-04-16
|
|
4547
|
+
# Unfortunately this setup is not easily reproducible, so we mark it with DEPRECATED_MODEL_TAG
|
|
4548
|
+
tags: [DEPRECATED_MODEL_TAG, TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, ABLATION_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
|
|
4497
4549
|
|
|
4498
4550
|
- name: ura-hcmut/ura-llama-2.1-8b
|
|
4499
4551
|
display_name: URA-Llama 2.1 (8B)
|
|
@@ -4682,4 +4734,59 @@ models:
|
|
|
4682
4734
|
access: open
|
|
4683
4735
|
num_parameters: 4000000000
|
|
4684
4736
|
release_date: 2024-04-02
|
|
4685
|
-
tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
|
|
4737
|
+
tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
|
|
4738
|
+
|
|
4739
|
+
- name: CEIA-UFG/Gemma-3-Gaia-PT-BR-4b-it
|
|
4740
|
+
display_name: Gemma-3 Gaia PT-BR 4b Instruct
|
|
4741
|
+
description: Gemma-3 Gaia PT-BR 4b Instruct is a model trained by CEIA-UFG for understanding and generating Brazilian Portuguese text.
|
|
4742
|
+
creator_organization_name: CEIA-UFG
|
|
4743
|
+
access: open
|
|
4744
|
+
num_parameters: 4000000000
|
|
4745
|
+
release_date: 2025-06-01
|
|
4746
|
+
tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
|
|
4747
|
+
|
|
4748
|
+
- name: recogna-nlp/bode-13b-alpaca-pt-br-no-peft
|
|
4749
|
+
display_name: Bode 13B Alpaca PT-BR
|
|
4750
|
+
description: Bode is a large language model (LLM) for Portuguese, based on LLaMA 2 and fine-tuned with the Alpaca dataset translated into Portuguese. Suitable for instruction following, text generation, translation, and other tasks in Portuguese.
|
|
4751
|
+
creator_organization_name: Recogna NLP
|
|
4752
|
+
access: open
|
|
4753
|
+
num_parameters: 13000000000
|
|
4754
|
+
release_date: 2024-01-05
|
|
4755
|
+
tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
|
|
4756
|
+
|
|
4757
|
+
- name: 22h/cabrita_7b_pt_850000
|
|
4758
|
+
display_name: Cabrita PT-BR 7B
|
|
4759
|
+
description: Cabrita is an OpenLLaMA-based model, continuously trained in Portuguese (mC4-pt subset) for 850000 steps with efficient tokenization adapted to the language.
|
|
4760
|
+
creator_organization_name: 22h
|
|
4761
|
+
access: open
|
|
4762
|
+
num_parameters: 7000000000
|
|
4763
|
+
release_date: 2023-08-23
|
|
4764
|
+
tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
|
|
4765
|
+
|
|
4766
|
+
- name: PORTULAN/gervasio-7b-portuguese-ptbr-decoder
|
|
4767
|
+
display_name: Gervásio PT-BR/PT-PT 7B Decoder
|
|
4768
|
+
description: Gervásio PT* is a 7B-parameter decoder model, adapted from LLaMA-2 7B, trained for both Brazilian and European Portuguese. Fine-tuned with translated data from benchmarks such as GLUE and SuperGLUE.
|
|
4769
|
+
creator_organization_name: PORTULAN (University of Lisbon NLX)
|
|
4770
|
+
access: open
|
|
4771
|
+
num_parameters: 6740000000
|
|
4772
|
+
release_date: 2024-02-29
|
|
4773
|
+
tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
|
|
4774
|
+
|
|
4775
|
+
- name: TucanoBR/Tucano-2b4
|
|
4776
|
+
display_name: Tucano PT-BR 2b4
|
|
4777
|
+
description: Tucano is a series of decoder models based on LLaMA2, natively pre-trained in Portuguese using the GigaVerbo dataset (200B tokens), with the 2B model trained for 1.96M steps over 845h (515B tokens, 4 epochs).
|
|
4778
|
+
creator_organization_name: TucanoBR (University of Bonn)
|
|
4779
|
+
access: open
|
|
4780
|
+
num_parameters: 2444618240
|
|
4781
|
+
release_date: 2024-12-11
|
|
4782
|
+
tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
|
|
4783
|
+
|
|
4784
|
+
- name: nicholasKluge/TeenyTinyLlama-460m
|
|
4785
|
+
display_name: TeenyTinyLlama 460M PT-BR
|
|
4786
|
+
description: TeenyTinyLlama-460m is a lightweight and efficient model based on LLaMA2, trained exclusively on Brazilian Portuguese. It uses RoPE embeddings and SwiGLU activations, with a refined SentencePiece tokenizer and a low-resource optimized architecture.
|
|
4787
|
+
creator_organization_name: Nicholas Kluge
|
|
4788
|
+
access: open
|
|
4789
|
+
num_parameters: 460000000
|
|
4790
|
+
release_date: 2024-01-30
|
|
4791
|
+
tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
|
|
4792
|
+
|
|
helm/config/tokenizer_configs.yaml
CHANGED
|
@@ -265,6 +265,12 @@ tokenizer_configs:
|
|
|
265
265
|
end_of_text_token: ""
|
|
266
266
|
prefix_token: ""
|
|
267
267
|
|
|
268
|
+
- name: xai/grok-4-0709
|
|
269
|
+
tokenizer_spec:
|
|
270
|
+
class_name: "helm.tokenizers.grok_tokenizer.GrokAPITokenizer"
|
|
271
|
+
end_of_text_token: ""
|
|
272
|
+
prefix_token: ""
|
|
273
|
+
|
|
268
274
|
# Hf-internal-testing
|
|
269
275
|
|
|
270
276
|
# Tokenizer name hf-internal-testing/llama-tokenizer is taken from:
|
|
@@ -582,6 +588,17 @@ tokenizer_configs:
|
|
|
582
588
|
end_of_text_token: "</s>"
|
|
583
589
|
prefix_token: "<s>"
|
|
584
590
|
|
|
591
|
+
# Moonshot AI
|
|
592
|
+
- name: moonshotai/kimi-k2-instruct
|
|
593
|
+
tokenizer_spec:
|
|
594
|
+
class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
|
|
595
|
+
args:
|
|
596
|
+
pretrained_model_name_or_path: moonshotai/Kimi-K2-Instruct
|
|
597
|
+
trust_remote_code: true
|
|
598
|
+
revision: 4f239503ad9d1a042f0a4bacac457931ab972cfc
|
|
599
|
+
end_of_text_token: "[EOS]"
|
|
600
|
+
prefix_token: "[BOS]"
|
|
601
|
+
|
|
585
602
|
# Nectec
|
|
586
603
|
- name: nectec/OpenThaiLLM-Prebuilt-7B
|
|
587
604
|
tokenizer_spec:
|
|
@@ -892,6 +909,7 @@ tokenizer_configs:
|
|
|
892
909
|
end_of_text_token: ""
|
|
893
910
|
prefix_token: ""
|
|
894
911
|
|
|
912
|
+
# Maritaca AI
|
|
895
913
|
- name: maritaca-ai/sabia-7b
|
|
896
914
|
tokenizer_spec:
|
|
897
915
|
class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
|
|
@@ -900,6 +918,14 @@ tokenizer_configs:
|
|
|
900
918
|
end_of_text_token: "</s>"
|
|
901
919
|
prefix_token: "<s>"
|
|
902
920
|
|
|
921
|
+
- name: maritaca-ai/sabia-2-tokenizer-medium
|
|
922
|
+
tokenizer_spec:
|
|
923
|
+
class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
|
|
924
|
+
args:
|
|
925
|
+
pretrained_model_name_or_path: maritaca-ai/sabia-2-tokenizer-medium
|
|
926
|
+
end_of_text_token: "</s>"
|
|
927
|
+
prefix_token: "<s>"
|
|
928
|
+
|
|
903
929
|
# Granite-3.1-8b-base
|
|
904
930
|
- name: ibm-granite/granite-3.1-8b-base
|
|
905
931
|
tokenizer_spec:
|
|
@@ -1104,4 +1130,58 @@ tokenizer_configs:
|
|
|
1104
1130
|
args:
|
|
1105
1131
|
pretrained_model_name_or_path: vinai/PhoGPT-4B-Chat
|
|
1106
1132
|
end_of_text_token: "</s>"
|
|
1107
|
-
prefix_token: "<s>"
|
|
1133
|
+
prefix_token: "<s>"
|
|
1134
|
+
|
|
1135
|
+
# Gemma-3-Gaia-PT-BR-4b-it
|
|
1136
|
+
- name: CEIA-UFG/Gemma-3-Gaia-PT-BR-4b-it
|
|
1137
|
+
tokenizer_spec:
|
|
1138
|
+
class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
|
|
1139
|
+
args:
|
|
1140
|
+
pretrained_model_name_or_path: CEIA-UFG/Gemma-3-Gaia-PT-BR-4b-it
|
|
1141
|
+
end_of_text_token: "<eos>"
|
|
1142
|
+
prefix_token: "<bos>"
|
|
1143
|
+
|
|
1144
|
+
# Bode 13B Alpaca PT-BR
|
|
1145
|
+
- name: recogna-nlp/bode-13b-alpaca-pt-br-no-peft
|
|
1146
|
+
tokenizer_spec:
|
|
1147
|
+
class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
|
|
1148
|
+
args:
|
|
1149
|
+
pretrained_model_name_or_path: recogna-nlp/bode-13b-alpaca-pt-br-no-peft
|
|
1150
|
+
end_of_text_token: "</s>"
|
|
1151
|
+
prefix_token: "<s>"
|
|
1152
|
+
|
|
1153
|
+
# Cabrita 7B PT-BR tokenizer
|
|
1154
|
+
- name: 22h/cabrita_7b_pt_850000
|
|
1155
|
+
tokenizer_spec:
|
|
1156
|
+
class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
|
|
1157
|
+
args:
|
|
1158
|
+
pretrained_model_name_or_path: 22h/cabrita_7b_pt_850000
|
|
1159
|
+
end_of_text_token: "</s>"
|
|
1160
|
+
prefix_token: "<s>"
|
|
1161
|
+
|
|
1162
|
+
# Gervásio 7B PT‑BR/PT‑PT tokenizer
|
|
1163
|
+
- name: PORTULAN/gervasio-7b-portuguese-ptbr-decoder
|
|
1164
|
+
tokenizer_spec:
|
|
1165
|
+
class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
|
|
1166
|
+
args:
|
|
1167
|
+
pretrained_model_name_or_path: PORTULAN/gervasio-7b-portuguese-ptbr-decoder
|
|
1168
|
+
end_of_text_token: "</s>"
|
|
1169
|
+
prefix_token: "<s>"
|
|
1170
|
+
|
|
1171
|
+
# Tucano 2b4 PT-BR tokenizer
|
|
1172
|
+
- name: TucanoBR/Tucano-2b4
|
|
1173
|
+
tokenizer_spec:
|
|
1174
|
+
class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
|
|
1175
|
+
args:
|
|
1176
|
+
pretrained_model_name_or_path: TucanoBR/Tucano-2b4
|
|
1177
|
+
end_of_text_token: "</s>"
|
|
1178
|
+
prefix_token: "<s>"
|
|
1179
|
+
|
|
1180
|
+
# TeenyTinyLlama 460M PT-BR tokenizer
|
|
1181
|
+
- name: nicholasKluge/TeenyTinyLlama-460m
|
|
1182
|
+
tokenizer_spec:
|
|
1183
|
+
class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
|
|
1184
|
+
args:
|
|
1185
|
+
pretrained_model_name_or_path: nicholasKluge/TeenyTinyLlama-460m
|
|
1186
|
+
end_of_text_token: "</s>"
|
|
1187
|
+
prefix_token: "<s>"
|
helm/proxy/cli.py
CHANGED
|
@@ -123,7 +123,7 @@ def do_create_update_command(service: RemoteService, auth: Authentication, args)
|
|
|
123
123
|
|
|
124
124
|
# Update quotas
|
|
125
125
|
for quota_str in args.quotas:
|
|
126
|
-
m = re.match(
|
|
126
|
+
m = re.match(rf"(\w+)\.(\w+)=(\d+|{UNLIMITED_QUOTA})", quota_str)
|
|
127
127
|
if not m:
|
|
128
128
|
raise Exception(
|
|
129
129
|
f"Invalid format: {quota_str}, expect <model_group>.<granularity>=<quota> "
|
helm/proxy/retry.py
CHANGED
|
@@ -5,6 +5,7 @@ from retrying import Retrying
|
|
|
5
5
|
from helm.common.request import RequestResult
|
|
6
6
|
from helm.common.tokenization_request import TokenizationRequestResult
|
|
7
7
|
from helm.common.hierarchical_logger import hlog
|
|
8
|
+
import os
|
|
8
9
|
import traceback
|
|
9
10
|
import threading
|
|
10
11
|
|
|
@@ -19,6 +20,10 @@ Example usage:
|
|
|
19
20
|
...
|
|
20
21
|
"""
|
|
21
22
|
|
|
23
|
+
# TODO: make these configurable at a config / cli level
|
|
24
|
+
HELM_RETRIES = int(os.environ.get("HELM_RETRIES", "5"))
|
|
25
|
+
HELM_TOKENIZER_RETRIES = int(os.environ.get("HELM_TOKENIZER_RETRIES", HELM_RETRIES))
|
|
26
|
+
|
|
22
27
|
# The lock is used to prevent multiple threads from printing at the same time.
|
|
23
28
|
# This can cause issues when printing the stack trace.
|
|
24
29
|
# (The stack traces can get mixed up and become unreadable.)
|
|
helm/tokenizers/grok_tokenizer.py
CHANGED
|
@@ -34,6 +34,8 @@ class GrokAPITokenizer(CachingTokenizer):
|
|
|
34
34
|
"Set grokApiKey in credentials.conf or set the GROK_API_KEY environment variable"
|
|
35
35
|
)
|
|
36
36
|
text = request["text"]
|
|
37
|
+
if not text:
|
|
38
|
+
return {"token_ids": []}
|
|
37
39
|
model = request["tokenizer"].split("/")[-1]
|
|
38
40
|
response = requests.post(
|
|
39
41
|
url="https://api.x.ai/v1/tokenize-text",
|
|
@@ -1,72 +0,0 @@
|
|
|
1
|
-
from typing import List
|
|
2
|
-
|
|
3
|
-
from helm.common.request import RequestResult
|
|
4
|
-
from helm.benchmark.adaptation.request_state import RequestState
|
|
5
|
-
from helm.benchmark.adaptation.adapter_spec import AdapterSpec
|
|
6
|
-
from helm.benchmark.scenarios.numeracy_scenario import ( # noqa
|
|
7
|
-
NumeracyScenario,
|
|
8
|
-
Polynomial,
|
|
9
|
-
RELTYPE_INFO,
|
|
10
|
-
distance_linear,
|
|
11
|
-
distance_parabola,
|
|
12
|
-
distance_plane,
|
|
13
|
-
distance_paraboloid,
|
|
14
|
-
)
|
|
15
|
-
from helm.benchmark.metrics.metric import Metric
|
|
16
|
-
from helm.benchmark.metrics.metric_name import MetricName
|
|
17
|
-
from helm.benchmark.metrics.metric_service import MetricService
|
|
18
|
-
from helm.benchmark.metrics.statistic import Stat
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
class DistanceMetric(Metric):
|
|
22
|
-
"""Returns the minimum geometric distance between the point represented by the completion
|
|
23
|
-
and the curve or surface specified by `rel_str`.
|
|
24
|
-
|
|
25
|
-
Expects `references.outputs` to be a list containing the following:
|
|
26
|
-
|
|
27
|
-
- val_GT (str): the last coordinate of the point lying on the given curve / surface
|
|
28
|
-
with first coordinates as given in the input
|
|
29
|
-
- rel_str (str): the relation
|
|
30
|
-
- relation_type (str): one of {'linear', 'parabola', 'plane', 'paraboloid'}
|
|
31
|
-
|
|
32
|
-
Returns:
|
|
33
|
-
The minimum geometric distance from the point to the curve / surface float.
|
|
34
|
-
"""
|
|
35
|
-
|
|
36
|
-
def evaluate_generation(
|
|
37
|
-
self,
|
|
38
|
-
adapter_spec: AdapterSpec,
|
|
39
|
-
request_state: RequestState,
|
|
40
|
-
metric_service: MetricService,
|
|
41
|
-
eval_cache_path: str,
|
|
42
|
-
) -> List[Stat]:
|
|
43
|
-
"""For given request, compute the following two metrics:
|
|
44
|
-
1. geometric distance metric in range [0, ∞), calling the appropriate distance method, if possible, and
|
|
45
|
-
2. percent valid metric in range [0., 1.] of completions that are a valid number, ignoring commas.
|
|
46
|
-
"""
|
|
47
|
-
references = request_state.instance.references
|
|
48
|
-
_, rel_str, relation_type = map(lambda _: _.output.text, references)
|
|
49
|
-
input_text: str = request_state.instance.input.text
|
|
50
|
-
datapoint_input = input_text.split("\n")[-1]
|
|
51
|
-
val = list(map(int, datapoint_input.split(NumeracyScenario.delimiter)))
|
|
52
|
-
|
|
53
|
-
distance_func = globals()[f"distance_{relation_type}"]
|
|
54
|
-
result = 0.0
|
|
55
|
-
num_valid = 0
|
|
56
|
-
assert request_state.result is not None
|
|
57
|
-
request_result: RequestResult = request_state.result
|
|
58
|
-
for completion_sequence in request_result.completions:
|
|
59
|
-
completion = completion_sequence.text.strip()
|
|
60
|
-
try:
|
|
61
|
-
pred = int(completion.replace(",", "")) # ignore commas in numbers
|
|
62
|
-
except Exception:
|
|
63
|
-
continue
|
|
64
|
-
point = val + [pred]
|
|
65
|
-
result += distance_func(point, rel_str)
|
|
66
|
-
num_valid += 1
|
|
67
|
-
percent_valid = 1.0 * num_valid / len(request_result.completions)
|
|
68
|
-
|
|
69
|
-
return [
|
|
70
|
-
Stat(MetricName("distance")).add(result),
|
|
71
|
-
Stat(MetricName("percent_valid")).add(percent_valid),
|
|
72
|
-
]
|
|
@@ -1,95 +0,0 @@
|
|
|
1
|
-
from dataclasses import dataclass
|
|
2
|
-
from typing import List, Callable
|
|
3
|
-
|
|
4
|
-
from helm.benchmark.scenarios.numeracy_scenario import (
|
|
5
|
-
distance_linear,
|
|
6
|
-
distance_parabola,
|
|
7
|
-
distance_plane,
|
|
8
|
-
distance_paraboloid,
|
|
9
|
-
)
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
TOL = 1e-5 # note: different from TOL in numeracy_scenario.distance_<...> used for checking if real or complex
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
@dataclass(frozen=True)
|
|
16
|
-
class TestCase:
|
|
17
|
-
rel_str: str
|
|
18
|
-
point: List[int]
|
|
19
|
-
dist: float
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
def check_test_cases(test_cases: List[TestCase], dist_func: Callable[[List[int], str], float]):
|
|
23
|
-
for test_case in test_cases:
|
|
24
|
-
dist = dist_func(test_case.point, test_case.rel_str)
|
|
25
|
-
dist_gt = test_case.dist
|
|
26
|
-
assert abs(dist - dist_gt) < TOL, f"{test_case.rel_str} {test_case.point}"
|
|
27
|
-
# print(f"{test_case.rel_str} {test_case.point} Dist: {dist}\tDist GT: {dist_gt}")
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
def test_distance_linear():
|
|
31
|
-
test_cases = [
|
|
32
|
-
TestCase(
|
|
33
|
-
"y = 4x + 4", [59, 201], 9.458889376416986
|
|
34
|
-
), # https://www.wolframalpha.com/input?i=minimize+sqrt%28%28x+-+59%29%5E2+%2B+%284x+%2B+4+-+201%29%5E2%29
|
|
35
|
-
TestCase("y = x + 3 ", [30, 78], 31.819805153394636),
|
|
36
|
-
TestCase("y = 5x + 4", [-47, 2], 45.69505948719688),
|
|
37
|
-
TestCase("y = 4x + 3", [-65, -255], 0.48507125007266594),
|
|
38
|
-
TestCase("y = 4x + 3", [97, 391], 0.0),
|
|
39
|
-
]
|
|
40
|
-
check_test_cases(test_cases, distance_linear)
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
def test_distance_parabola():
|
|
44
|
-
test_cases = [
|
|
45
|
-
TestCase("y = 2x^2 + x + 1", [159, 50000], 1.137499072212397),
|
|
46
|
-
TestCase("y = 2x^2 + 2x + 4", [130, 28390], 11.364547837422966),
|
|
47
|
-
TestCase("y = 2x^2 + x + 4", [53, 10000], 17.4468675121177),
|
|
48
|
-
TestCase(
|
|
49
|
-
"y = 2x^2 + 2x + 2", [35, 1], 34.36171077312826
|
|
50
|
-
), # https://www.wolframalpha.com/input?i=minimize+%28x+-+35%29%5E2+%2B+%282x%5E2+%2B+2x+%2B+2+-+1%29%5E2
|
|
51
|
-
TestCase("y = x^2 + x + 2", [197, 39008], 0.0),
|
|
52
|
-
]
|
|
53
|
-
check_test_cases(test_cases, distance_parabola)
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
def test_distance_plane():
|
|
57
|
-
test_cases = [
|
|
58
|
-
TestCase(
|
|
59
|
-
"z = 4x + 4y + 1", [-4, 9, 1], 3.481553119113957
|
|
60
|
-
), # https://www.wolframalpha.com/input?i=minimize+sqrt%28%28x+%2B+4%29%5E2+%2B+%28y+-+9%29%5E2+%2B+%284x
|
|
61
|
-
# +%2B+4y+%2B+1+-+1%29%5E2%29
|
|
62
|
-
TestCase(
|
|
63
|
-
"z = 3x + 5y + 4", [-10, 4, 3], 1.52127765851133
|
|
64
|
-
), # https://www.wolframalpha.com/input?i=minimize+sqrt%28%28x+%2B+10%29%5E2+%2B+%28y+-+4%29%5E2+%2B+%283
|
|
65
|
-
# x+%2B+5y+%2B+4+-+3%29%5E2%29
|
|
66
|
-
TestCase("z = 4x + 3y + 4", [-5, 4, -7], 0.5883484054145521),
|
|
67
|
-
TestCase("z = 3x + 5y + 2", [-7, 10, 0], 5.239956379316803),
|
|
68
|
-
TestCase("z = 5x + 2y + 3", [-2, -1, -9], 0.0),
|
|
69
|
-
]
|
|
70
|
-
check_test_cases(test_cases, distance_plane)
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
def test_distance_paraboloid():
|
|
74
|
-
test_cases = [
|
|
75
|
-
TestCase("z = x^2 + y^2 + 2", [0, 0, 2], 0.0),
|
|
76
|
-
TestCase(
|
|
77
|
-
"z = 2x^2 + y^2 + 2", [0, 11, 151], 1.2055445093982982
|
|
78
|
-
), # https://www.wolframalpha.com/input?i=minimize+x%5E2+%2B+%28y+-+11%29%5E2+%2B+%28%282x%5E2+%2B+y%5E2+%2B+2%29+-+151%29%5E2 # noqa
|
|
79
|
-
TestCase(
|
|
80
|
-
"z = 2x^2 + 2y^2 + 2", [0, 0, 6], 1.3919410907075054
|
|
81
|
-
), # https://www.wolframalpha.com/input?i=minimize+x%5E2+%2By%5E2+%2B+%28%282x%5E2+%2B+2y%5E2+%2B+2%29+-+6%29%5E2 # noqa
|
|
82
|
-
TestCase(
|
|
83
|
-
"z = x^2 + y^2 + 2", [0, 0, 20], 4.2130748865881795
|
|
84
|
-
), # https://www.wolframalpha.com/input?i=x%5E2+%2B+y%5E2+%2B+%28%28x%5E2+%2B+y%5E2+%2B+2%29+-+20%29%5E2
|
|
85
|
-
TestCase("z = 2x^2 + xy + y^2 + 4", [6, 19, 519], 0.5290904095503263),
|
|
86
|
-
TestCase("z = 2x^2 + xy + 2y^2 + 3", [0, 14, 380], 0.26248531385619783),
|
|
87
|
-
TestCase("z = x^2 + 2y^2 + 1", [5, 14, 4], 13.354544558906934),
|
|
88
|
-
TestCase("z = x^2 + xy + 2y^2 + 4", [3, 20, 1001], 1.4206031238856873),
|
|
89
|
-
TestCase("z = x^2 + xy + 2y^2 + 4", [0, 0, 55], 51.0),
|
|
90
|
-
TestCase("z = x^2 + xy + 2y^2 + 4", [0, 9, 55], 3.8558889386410757),
|
|
91
|
-
TestCase("z = 2x^2 + 2y^2 + 1", [8, 9, 289], 0.04158555512549898),
|
|
92
|
-
TestCase("z = 2x^2 + 2y^2 + 1", [8, 9, 291], 0.0),
|
|
93
|
-
TestCase("z = x^2 + 2xy + 5y^2 + 4", [0, 9, 55], 5.7150737847649244),
|
|
94
|
-
]
|
|
95
|
-
check_test_cases(test_cases, distance_paraboloid)
|