crfm-helm 0.5.6__py3-none-any.whl → 0.5.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crfm-helm might be problematic. Click here for more details.

Files changed (103) hide show
  1. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/METADATA +56 -49
  2. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/RECORD +99 -66
  3. helm/benchmark/annotation/air_bench_annotator.py +1 -1
  4. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  5. helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
  6. helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
  7. helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
  8. helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
  9. helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
  10. helm/benchmark/metrics/comet_metric.py +1 -1
  11. helm/benchmark/metrics/copyright_metrics.py +1 -1
  12. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
  13. helm/benchmark/metrics/evaluate_reference_metrics.py +1 -1
  14. helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
  15. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
  16. helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
  17. helm/benchmark/metrics/lmkt_metrics.py +47 -0
  18. helm/benchmark/metrics/melt_toxicity_metric.py +1 -1
  19. helm/benchmark/metrics/summac/model_summac.py +1 -1
  20. helm/benchmark/model_deployment_registry.py +11 -19
  21. helm/benchmark/presentation/create_plots.py +11 -2
  22. helm/benchmark/presentation/schema.py +5 -0
  23. helm/benchmark/presentation/summarize.py +9 -3
  24. helm/benchmark/presentation/test_create_plots.py +4 -1
  25. helm/benchmark/run.py +7 -1
  26. helm/benchmark/run_specs/arabic_run_specs.py +73 -0
  27. helm/benchmark/run_specs/bluex_run_specs.py +40 -0
  28. helm/benchmark/run_specs/classic_run_specs.py +0 -53
  29. helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
  30. helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
  31. helm/benchmark/run_specs/heim_run_specs.py +3 -1
  32. helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
  33. helm/benchmark/run_specs/long_context_run_specs.py +48 -1
  34. helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
  35. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +5 -11
  36. helm/benchmark/scenarios/alghafa_scenario.py +126 -0
  37. helm/benchmark/scenarios/arabic_mmlu_scenario.py +78 -0
  38. helm/benchmark/scenarios/aratrust_scenario.py +76 -0
  39. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
  40. helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
  41. helm/benchmark/scenarios/audio_language/{ultra_suite_asr_classification.py → ultra_suite_asr_classification_scenario.py} +9 -8
  42. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +99 -0
  43. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +13 -5
  44. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +13 -5
  45. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +13 -5
  46. helm/benchmark/scenarios/bluex_scenario.py +66 -0
  47. helm/benchmark/scenarios/cleva_scenario.py +1 -1
  48. helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
  49. helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
  50. helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
  51. helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
  52. helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
  53. helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
  54. helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
  55. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +90 -0
  56. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +1 -1
  57. helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
  58. helm/benchmark/scenarios/math_scenario.py +21 -20
  59. helm/benchmark/scenarios/medalign_scenario_helper.py +19 -125
  60. helm/benchmark/scenarios/melt_scenarios.py +2 -2
  61. helm/benchmark/scenarios/mimic_bhc_scenario.py +1 -1
  62. helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
  63. helm/benchmark/scenarios/seahelm_scenario.py +2 -2
  64. helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
  65. helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
  66. helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
  67. helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
  68. helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
  69. helm/benchmark/slurm_jobs.py +1 -2
  70. helm/benchmark/slurm_runner.py +8 -1
  71. helm/benchmark/static/schema_arabic.yaml +228 -0
  72. helm/benchmark/static/schema_classic.yaml +0 -17
  73. helm/benchmark/static/schema_long_context.yaml +19 -1
  74. helm/benchmark/static_build/assets/index-e439d5e1.js +10 -0
  75. helm/benchmark/static_build/index.html +1 -1
  76. helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
  77. helm/clients/audio_language/qwen2_5_omni_client.py +19 -7
  78. helm/clients/huggingface_client.py +2 -2
  79. helm/clients/openai_client.py +2 -1
  80. helm/clients/openai_responses_client.py +6 -4
  81. helm/clients/test_huggingface_client.py +3 -3
  82. helm/clients/together_client.py +0 -2
  83. helm/clients/vertexai_client.py +11 -9
  84. helm/clients/vllm_client.py +43 -7
  85. helm/clients/vllm_granite_thinking_client.py +56 -0
  86. helm/common/critique_request.py +0 -1
  87. helm/common/hierarchical_logger.py +83 -34
  88. helm/common/object_spec.py +23 -8
  89. helm/common/test_logging.py +94 -0
  90. helm/config/model_deployments.yaml +454 -175
  91. helm/config/model_metadata.yaml +117 -10
  92. helm/config/tokenizer_configs.yaml +81 -1
  93. helm/proxy/cli.py +1 -1
  94. helm/proxy/retry.py +5 -0
  95. helm/tokenizers/grok_tokenizer.py +2 -0
  96. helm/benchmark/metrics/numeracy_metrics.py +0 -72
  97. helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
  98. helm/benchmark/scenarios/numeracy_scenario.py +0 -794
  99. helm/benchmark/static_build/assets/index-94295e78.js +0 -10
  100. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/WHEEL +0 -0
  101. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/entry_points.txt +0 -0
  102. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/licenses/LICENSE +0 -0
  103. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/top_level.txt +0 -0
@@ -2624,6 +2624,15 @@ models:
2624
2624
  release_date: 2024-11-18
2625
2625
  tags: [TEXT_MODEL_TAG, VISION_LANGUAGE_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
2626
2626
 
2627
+ # Moonshot AI
2628
+ - name: moonshotai/kimi-k2-instruct
2629
+ display_name: Kimi K2 Instruct
2630
+ description: Kimi K2 Instruct is a mixture-of-experts (MoE) language model with 32 billion activated parameters and 1 trillion total parameters trained with the Muon optimizer on 15.5T tokens. ([blog](https://moonshotai.github.io/Kimi-K2/))
2631
+ creator_organization_name: Moonshot AI
2632
+ access: open
2633
+ num_parameters: 1029173256720
2634
+ release_date: 2025-07-14 # Blog post has no date, so use the date from this news article https://www.cnbc.com/2025/07/14/alibaba-backed-moonshot-releases-kimi-k2-ai-rivaling-chatgpt-claude.html
2635
+ tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
2627
2636
 
2628
2637
  # MosaicML
2629
2638
  - name: mosaicml/mpt-7b
@@ -3256,6 +3265,14 @@ models:
3256
3265
  release_date: 2025-04-16
3257
3266
  tags: [TEXT_MODEL_TAG, VISION_LANGUAGE_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
3258
3267
 
3268
+ - name: openai/o3-pro-2025-06-10-high-reasoning-effort
3269
+ display_name: o3-pro (2025-06-10, high reasoning effort)
3270
+ description: o3-pro is an o-series model designed to think longer and provide the most reliable responses. ([blog post](https://help.openai.com/en/articles/9624314-model-release-notes))
3271
+ creator_organization_name: OpenAI
3272
+ access: limited
3273
+ release_date: 2025-06-10
3274
+ tags: [TEXT_MODEL_TAG, VISION_LANGUAGE_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
3275
+
3259
3276
  ## Codex Models
3260
3277
  # DEPRECATED: Codex models have been shut down on March 23 2023.
3261
3278
 
@@ -4163,6 +4180,14 @@ models:
4163
4180
  release_date: 2025-04-03 # https://docs.x.ai/docs/release-notes#april-2025
4164
4181
  tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
4165
4182
 
4183
+ - name: xai/grok-4-0709
4184
+ display_name: Grok 4 (0709)
4185
+ description: Grok 4 (0709) is a model that includes native tool use and real-time search integration. ([blog](https://x.ai/news/grok-4))
4186
+ creator_organization_name: xAI
4187
+ access: limited
4188
+ release_date: 2025-07-09
4189
+ tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
4190
+
4166
4191
  # Yandex
4167
4192
  - name: yandex/yalm
4168
4193
  display_name: YaLM (100B)
@@ -4266,6 +4291,31 @@ models:
4266
4291
  release_date: 2023-11-08
4267
4292
  tags: [TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
4268
4293
 
4294
+ - name: maritaca-ai/sabiazinho-3
4295
+ display_name: Sabiazinho 3
4296
+ description: Sabiazinho-3 is a decoder-only language model designed for Portuguese text generation and understanding tasks. It supports a long context window of up to 128,000 tokens and is offered via API with scalable rate limits. The model is trained on diverse Portuguese corpora with knowledge up to July 2023.
4297
+ creator_organization_name: Maritaca AI
4298
+ access: limited
4299
+ release_date: 2025-02-06
4300
+ tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
4301
+
4302
+ - name: maritaca-ai/sabia-3
4303
+ display_name: Sabiá 3
4304
+ description: Sabiá-3 is a decoder-only language model designed for Portuguese text generation and understanding tasks. It supports a long context window of up to 128,000 tokens and is offered via API with scalable rate limits. The model is trained on diverse Portuguese corpora with knowledge up to July 2023.
4305
+ creator_organization_name: Maritaca AI
4306
+ access: limited
4307
+ release_date: 2024-12-11
4308
+ tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
4309
+
4310
+ - name: maritaca-ai/sabia-3.1-2025-05-08
4311
+ display_name: Sabiá 3.1
4312
+ description: Sabiá-3.1 is a decoder-only language model designed for Portuguese text generation and understanding tasks. It supports a long context window of up to 128,000 tokens and is offered via API with scalable rate limits. The model is trained on diverse Portuguese corpora with knowledge up to August 2024.
4313
+ creator_organization_name: Maritaca AI
4314
+ access: limited
4315
+ release_date: 2025-05-08
4316
+ tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
4317
+
4318
+
4269
4319
  # Granite - IBM
4270
4320
  # https://www.ibm.com/granite
4271
4321
  # https://github.com/ibm-granite/granite-3.0-language-models
@@ -4479,21 +4529,23 @@ models:
4479
4529
  tags: [ TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG ]
4480
4530
 
4481
4531
  - name: ibm/granite-3.3-8b-instruct
4482
- display_name: Granite 3.3 8B Instruct
4483
- description: Granite 3.3 8B Instruct is a 8-billion parameter 128K context length language model fine-tuned for improved reasoning and instruction-following capabilities. ([model card](https://huggingface.co/ibm-granite/granite-3.3-8b-instruct))
4532
+ display_name: IBM Granite 3.3 8B Instruct
4533
+ description: IBM Granite 3.3 8B Instruct is an 8-billion parameter 128K context length language model fine-tuned for improved reasoning and instruction-following capabilities. ([model card](https://huggingface.co/ibm-granite/granite-3.3-8b-instruct))
4484
4534
  creator_organization_name: IBM
4485
4535
  access: open
4486
4536
  num_parameters: 8170000000
4487
4537
  release_date: 2025-04-16
4488
4538
  tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, ABLATION_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
4489
4539
 
4490
- - name: mistralai/mixtral-8x7b-instruct-v0:1
4491
- display_name: Mixtral 8x7B Instruct on IBM WatsonX
4492
- description: A 7B sparse Mixture-of-Experts model with stronger capabilities than Mistral 7B. Uses 12B active parameters out of 45B total. Supports multiple languages, code and 32k context window.
4493
- creator_organization_name: Mistral
4494
- access: limited
4495
- release_date: 2023-12-11
4496
- tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, ABLATION_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
4540
+ - name: ibm/granite-3.3-8b-instruct-with-guardian
4541
+ display_name: IBM Granite 3.3 8B Instruct (with guardian)
4542
+ description: IBM Granite 3.3 8B Instruct is an 8-billion parameter 128K context length language model fine-tuned for improved reasoning and instruction-following capabilities. ([model card](https://huggingface.co/ibm-granite/granite-3.3-8b-instruct)) This model was run with an additional safety filter using [Granite Guardian 3.2](https://www.ibm.com/granite/docs/models/guardian/).
4543
+ creator_organization_name: IBM
4544
+ access: open
4545
+ num_parameters: 8170000000
4546
+ release_date: 2025-04-16
4547
+ # Unfortunately this setup is not easily reproducible, so we mark it with DEPRECATED_MODEL_TAG
4548
+ tags: [DEPRECATED_MODEL_TAG, TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, ABLATION_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
4497
4549
 
4498
4550
  - name: ura-hcmut/ura-llama-2.1-8b
4499
4551
  display_name: URA-Llama 2.1 (8B)
@@ -4682,4 +4734,59 @@ models:
4682
4734
  access: open
4683
4735
  num_parameters: 4000000000
4684
4736
  release_date: 2024-04-02
4685
- tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
4737
+ tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
4738
+
4739
+ - name: CEIA-UFG/Gemma-3-Gaia-PT-BR-4b-it
4740
+ display_name: Gemma-3 Gaia PT-BR 4b Instruct
4741
+ description: Gemma-3 Gaia PT-BR 4b Instruct is a model trained by CEIA-UFG for understanding and generating Brazilian Portuguese text.
4742
+ creator_organization_name: CEIA-UFG
4743
+ access: open
4744
+ num_parameters: 4000000000
4745
+ release_date: 2025-06-01
4746
+ tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
4747
+
4748
+ - name: recogna-nlp/bode-13b-alpaca-pt-br-no-peft
4749
+ display_name: Bode 13B Alpaca PT-BR
4750
+ description: Bode is a language model (LLM) for Portuguese, based on LLaMA 2 and fine-tuned with the Alpaca dataset translated into Portuguese. Suitable for instruction, text generation, translation and tasks in Portuguese.
4751
+ creator_organization_name: Recogna NLP
4752
+ access: open
4753
+ num_parameters: 13000000000
4754
+ release_date: 2024-01-05
4755
+ tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
4756
+
4757
+ - name: 22h/cabrita_7b_pt_850000
4758
+ display_name: Cabrita PT-BR 7B
4759
+ description: Cabrita is an OpenLLaMA-based model, continuously trained in Portuguese (mC4-pt subset) for 850000 steps with efficient tokenization adapted to the language.
4760
+ creator_organization_name: 22h
4761
+ access: open
4762
+ num_parameters: 7000000000
4763
+ release_date: 2023-08-23
4764
+ tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
4765
+
4766
+ - name: PORTULAN/gervasio-7b-portuguese-ptbr-decoder
4767
+ display_name: Gervásio PT-BR/PT-PT 7B Decoder
4768
+ description: Gervásio PT* is a 7B parameter decoder model, adapted from LLaMA-2 7B, trained for both Brazilian and European Portuguese. Fine-tuned with translated data from benchmarks such as GLUE and SuperGLUE.
4769
+ creator_organization_name: PORTULAN (University of Lisbon NLX)
4770
+ access: open
4771
+ num_parameters: 6740000000
4772
+ release_date: 2024-02-29
4773
+ tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
4774
+
4775
+ - name: TucanoBR/Tucano-2b4
4776
+ display_name: Tucano PT-BR 2b4
4777
+ description: Tucano is a series of decoder models based on LLaMA2, natively pre-trained in Portuguese using the GigaVerbo dataset (200B tokens), with the 2B model trained for 1.96M steps over 845h (515B tokens, 4 epochs).
4778
+ creator_organization_name: TucanoBR (University of Bonn)
4779
+ access: open
4780
+ num_parameters: 2444618240
4781
+ release_date: 2024-12-11
4782
+ tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
4783
+
4784
+ - name: nicholasKluge/TeenyTinyLlama-460m
4785
+ display_name: TeenyTinyLlama 460M PT-BR
4786
+ description: TeenyTinyLlama-460m is a lightweight and efficient model based on LLaMA2, trained exclusively on Brazilian Portuguese. It uses RoPE embeddings and SwiGLU activations, with a refined SentencePiece tokenizer and a low-resource optimized architecture.
4787
+ creator_organization_name: Nicholas Kluge
4788
+ access: open
4789
+ num_parameters: 460000000
4790
+ release_date: 2024-01-30
4791
+ tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
4792
+
@@ -265,6 +265,12 @@ tokenizer_configs:
265
265
  end_of_text_token: ""
266
266
  prefix_token: ""
267
267
 
268
+ - name: xai/grok-4-0709
269
+ tokenizer_spec:
270
+ class_name: "helm.tokenizers.grok_tokenizer.GrokAPITokenizer"
271
+ end_of_text_token: ""
272
+ prefix_token: ""
273
+
268
274
  # Hf-internal-testing
269
275
 
270
276
  # Tokenizer name hf-internal-testing/llama-tokenizer is taken from:
@@ -582,6 +588,17 @@ tokenizer_configs:
582
588
  end_of_text_token: "</s>"
583
589
  prefix_token: "<s>"
584
590
 
591
+ # Moonshot AI
592
+ - name: moonshotai/kimi-k2-instruct
593
+ tokenizer_spec:
594
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
595
+ args:
596
+ pretrained_model_name_or_path: moonshotai/Kimi-K2-Instruct
597
+ trust_remote_code: true
598
+ revision: 4f239503ad9d1a042f0a4bacac457931ab972cfc
599
+ end_of_text_token: "[EOS]"
600
+ prefix_token: "[BOS]"
601
+
585
602
  # Nectec
586
603
  - name: nectec/OpenThaiLLM-Prebuilt-7B
587
604
  tokenizer_spec:
@@ -892,6 +909,7 @@ tokenizer_configs:
892
909
  end_of_text_token: ""
893
910
  prefix_token: ""
894
911
 
912
+ # Maritaca AI
895
913
  - name: maritaca-ai/sabia-7b
896
914
  tokenizer_spec:
897
915
  class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
@@ -900,6 +918,14 @@ tokenizer_configs:
900
918
  end_of_text_token: "</s>"
901
919
  prefix_token: "<s>"
902
920
 
921
+ - name: maritaca-ai/sabia-2-tokenizer-medium
922
+ tokenizer_spec:
923
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
924
+ args:
925
+ pretrained_model_name_or_path: maritaca-ai/sabia-2-tokenizer-medium
926
+ end_of_text_token: "</s>"
927
+ prefix_token: "<s>"
928
+
903
929
  # Granite-3.1-8b-base
904
930
  - name: ibm-granite/granite-3.1-8b-base
905
931
  tokenizer_spec:
@@ -1104,4 +1130,58 @@ tokenizer_configs:
1104
1130
  args:
1105
1131
  pretrained_model_name_or_path: vinai/PhoGPT-4B-Chat
1106
1132
  end_of_text_token: "</s>"
1107
- prefix_token: "<s>"
1133
+ prefix_token: "<s>"
1134
+
1135
+ # Gemma-3-Gaia-PT-BR-4b-it
1136
+ - name: CEIA-UFG/Gemma-3-Gaia-PT-BR-4b-it
1137
+ tokenizer_spec:
1138
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
1139
+ args:
1140
+ pretrained_model_name_or_path: CEIA-UFG/Gemma-3-Gaia-PT-BR-4b-it
1141
+ end_of_text_token: "<eos>"
1142
+ prefix_token: "<bos>"
1143
+
1144
+ # Bode 13B Alpaca PT-BR
1145
+ - name: recogna-nlp/bode-13b-alpaca-pt-br-no-peft
1146
+ tokenizer_spec:
1147
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
1148
+ args:
1149
+ pretrained_model_name_or_path: recogna-nlp/bode-13b-alpaca-pt-br-no-peft
1150
+ end_of_text_token: "</s>"
1151
+ prefix_token: "<s>"
1152
+
1153
+ # Cabrita 7B PT-BR tokenizer
1154
+ - name: 22h/cabrita_7b_pt_850000
1155
+ tokenizer_spec:
1156
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
1157
+ args:
1158
+ pretrained_model_name_or_path: 22h/cabrita_7b_pt_850000
1159
+ end_of_text_token: "</s>"
1160
+ prefix_token: "<s>"
1161
+
1162
+ # Gervásio 7B PT‑BR/PT‑PT tokenizer
1163
+ - name: PORTULAN/gervasio-7b-portuguese-ptbr-decoder
1164
+ tokenizer_spec:
1165
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
1166
+ args:
1167
+ pretrained_model_name_or_path: PORTULAN/gervasio-7b-portuguese-ptbr-decoder
1168
+ end_of_text_token: "</s>"
1169
+ prefix_token: "<s>"
1170
+
1171
+ # Tucano 2b4 PT-BR tokenizer
1172
+ - name: TucanoBR/Tucano-2b4
1173
+ tokenizer_spec:
1174
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
1175
+ args:
1176
+ pretrained_model_name_or_path: TucanoBR/Tucano-2b4
1177
+ end_of_text_token: "</s>"
1178
+ prefix_token: "<s>"
1179
+
1180
+ # TeenyTinyLlama 460M PT-BR tokenizer
1181
+ - name: nicholasKluge/TeenyTinyLlama-460m
1182
+ tokenizer_spec:
1183
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
1184
+ args:
1185
+ pretrained_model_name_or_path: nicholasKluge/TeenyTinyLlama-460m
1186
+ end_of_text_token: "</s>"
1187
+ prefix_token: "<s>"
helm/proxy/cli.py CHANGED
@@ -123,7 +123,7 @@ def do_create_update_command(service: RemoteService, auth: Authentication, args)
123
123
 
124
124
  # Update quotas
125
125
  for quota_str in args.quotas:
126
- m = re.match(f"(\w+)\.(\w+)=(\d+|{UNLIMITED_QUOTA})", quota_str)
126
+ m = re.match(rf"(\w+)\.(\w+)=(\d+|{UNLIMITED_QUOTA})", quota_str)
127
127
  if not m:
128
128
  raise Exception(
129
129
  f"Invalid format: {quota_str}, expect <model_group>.<granularity>=<quota> "
helm/proxy/retry.py CHANGED
@@ -5,6 +5,7 @@ from retrying import Retrying
5
5
  from helm.common.request import RequestResult
6
6
  from helm.common.tokenization_request import TokenizationRequestResult
7
7
  from helm.common.hierarchical_logger import hlog
8
+ import os
8
9
  import traceback
9
10
  import threading
10
11
 
@@ -19,6 +20,10 @@ Example usage:
19
20
  ...
20
21
  """
21
22
 
23
+ # TODO: make these configurable at a config / cli level
24
+ HELM_RETRIES = int(os.environ.get("HELM_RETRIES", "5"))
25
+ HELM_TOKENIZER_RETRIES = int(os.environ.get("HELM_TOKENIZER_RETRIES", HELM_RETRIES))
26
+
22
27
  # The lock is used to prevent multiple threads from printing at the same time.
23
28
  # This can cause issues when printing the stack trace.
24
29
  # (The stack traces can get mixed up and become unreadable.)
@@ -34,6 +34,8 @@ class GrokAPITokenizer(CachingTokenizer):
34
34
  "Set grokApiKey in credentials.conf or set the GROK_API_KEY environment variable"
35
35
  )
36
36
  text = request["text"]
37
+ if not text:
38
+ return {"token_ids": []}
37
39
  model = request["tokenizer"].split("/")[-1]
38
40
  response = requests.post(
39
41
  url="https://api.x.ai/v1/tokenize-text",
@@ -1,72 +0,0 @@
1
- from typing import List
2
-
3
- from helm.common.request import RequestResult
4
- from helm.benchmark.adaptation.request_state import RequestState
5
- from helm.benchmark.adaptation.adapter_spec import AdapterSpec
6
- from helm.benchmark.scenarios.numeracy_scenario import ( # noqa
7
- NumeracyScenario,
8
- Polynomial,
9
- RELTYPE_INFO,
10
- distance_linear,
11
- distance_parabola,
12
- distance_plane,
13
- distance_paraboloid,
14
- )
15
- from helm.benchmark.metrics.metric import Metric
16
- from helm.benchmark.metrics.metric_name import MetricName
17
- from helm.benchmark.metrics.metric_service import MetricService
18
- from helm.benchmark.metrics.statistic import Stat
19
-
20
-
21
- class DistanceMetric(Metric):
22
- """Returns the minimum geometric distance between the point represented by the completion
23
- and the curve or surface specified by `rel_str`.
24
-
25
- Expects `references.outputs` to be a list containing the following:
26
-
27
- - val_GT (str): the last coordinate of the point lying on the given curve / surface
28
- with first coordinates as given in the input
29
- - rel_str (str): the relation
30
- - relation_type (str): one of {'linear', 'parabola', 'plane', 'paraboloid'}
31
-
32
- Returns:
33
- The minimum geometric distance from the point to the curve / surface float.
34
- """
35
-
36
- def evaluate_generation(
37
- self,
38
- adapter_spec: AdapterSpec,
39
- request_state: RequestState,
40
- metric_service: MetricService,
41
- eval_cache_path: str,
42
- ) -> List[Stat]:
43
- """For given request, compute the following two metrics:
44
- 1. geometric distance metric in range [0, ∞), calling the appropriate distance method, if possible, and
45
- 2. percent valid metric in range [0., 1.] of completions that are a valid number, ignoring commas.
46
- """
47
- references = request_state.instance.references
48
- _, rel_str, relation_type = map(lambda _: _.output.text, references)
49
- input_text: str = request_state.instance.input.text
50
- datapoint_input = input_text.split("\n")[-1]
51
- val = list(map(int, datapoint_input.split(NumeracyScenario.delimiter)))
52
-
53
- distance_func = globals()[f"distance_{relation_type}"]
54
- result = 0.0
55
- num_valid = 0
56
- assert request_state.result is not None
57
- request_result: RequestResult = request_state.result
58
- for completion_sequence in request_result.completions:
59
- completion = completion_sequence.text.strip()
60
- try:
61
- pred = int(completion.replace(",", "")) # ignore commas in numbers
62
- except Exception:
63
- continue
64
- point = val + [pred]
65
- result += distance_func(point, rel_str)
66
- num_valid += 1
67
- percent_valid = 1.0 * num_valid / len(request_result.completions)
68
-
69
- return [
70
- Stat(MetricName("distance")).add(result),
71
- Stat(MetricName("percent_valid")).add(percent_valid),
72
- ]
@@ -1,95 +0,0 @@
1
- from dataclasses import dataclass
2
- from typing import List, Callable
3
-
4
- from helm.benchmark.scenarios.numeracy_scenario import (
5
- distance_linear,
6
- distance_parabola,
7
- distance_plane,
8
- distance_paraboloid,
9
- )
10
-
11
-
12
- TOL = 1e-5 # note: different from TOL in numeracy_scenario.distance_<...> used for checking if real or complex
13
-
14
-
15
- @dataclass(frozen=True)
16
- class TestCase:
17
- rel_str: str
18
- point: List[int]
19
- dist: float
20
-
21
-
22
- def check_test_cases(test_cases: List[TestCase], dist_func: Callable[[List[int], str], float]):
23
- for test_case in test_cases:
24
- dist = dist_func(test_case.point, test_case.rel_str)
25
- dist_gt = test_case.dist
26
- assert abs(dist - dist_gt) < TOL, f"{test_case.rel_str} {test_case.point}"
27
- # print(f"{test_case.rel_str} {test_case.point} Dist: {dist}\tDist GT: {dist_gt}")
28
-
29
-
30
- def test_distance_linear():
31
- test_cases = [
32
- TestCase(
33
- "y = 4x + 4", [59, 201], 9.458889376416986
34
- ), # https://www.wolframalpha.com/input?i=minimize+sqrt%28%28x+-+59%29%5E2+%2B+%284x+%2B+4+-+201%29%5E2%29
35
- TestCase("y = x + 3 ", [30, 78], 31.819805153394636),
36
- TestCase("y = 5x + 4", [-47, 2], 45.69505948719688),
37
- TestCase("y = 4x + 3", [-65, -255], 0.48507125007266594),
38
- TestCase("y = 4x + 3", [97, 391], 0.0),
39
- ]
40
- check_test_cases(test_cases, distance_linear)
41
-
42
-
43
- def test_distance_parabola():
44
- test_cases = [
45
- TestCase("y = 2x^2 + x + 1", [159, 50000], 1.137499072212397),
46
- TestCase("y = 2x^2 + 2x + 4", [130, 28390], 11.364547837422966),
47
- TestCase("y = 2x^2 + x + 4", [53, 10000], 17.4468675121177),
48
- TestCase(
49
- "y = 2x^2 + 2x + 2", [35, 1], 34.36171077312826
50
- ), # https://www.wolframalpha.com/input?i=minimize+%28x+-+35%29%5E2+%2B+%282x%5E2+%2B+2x+%2B+2+-+1%29%5E2
51
- TestCase("y = x^2 + x + 2", [197, 39008], 0.0),
52
- ]
53
- check_test_cases(test_cases, distance_parabola)
54
-
55
-
56
- def test_distance_plane():
57
- test_cases = [
58
- TestCase(
59
- "z = 4x + 4y + 1", [-4, 9, 1], 3.481553119113957
60
- ), # https://www.wolframalpha.com/input?i=minimize+sqrt%28%28x+%2B+4%29%5E2+%2B+%28y+-+9%29%5E2+%2B+%284x
61
- # +%2B+4y+%2B+1+-+1%29%5E2%29
62
- TestCase(
63
- "z = 3x + 5y + 4", [-10, 4, 3], 1.52127765851133
64
- ), # https://www.wolframalpha.com/input?i=minimize+sqrt%28%28x+%2B+10%29%5E2+%2B+%28y+-+4%29%5E2+%2B+%283
65
- # x+%2B+5y+%2B+4+-+3%29%5E2%29
66
- TestCase("z = 4x + 3y + 4", [-5, 4, -7], 0.5883484054145521),
67
- TestCase("z = 3x + 5y + 2", [-7, 10, 0], 5.239956379316803),
68
- TestCase("z = 5x + 2y + 3", [-2, -1, -9], 0.0),
69
- ]
70
- check_test_cases(test_cases, distance_plane)
71
-
72
-
73
- def test_distance_paraboloid():
74
- test_cases = [
75
- TestCase("z = x^2 + y^2 + 2", [0, 0, 2], 0.0),
76
- TestCase(
77
- "z = 2x^2 + y^2 + 2", [0, 11, 151], 1.2055445093982982
78
- ), # https://www.wolframalpha.com/input?i=minimize+x%5E2+%2B+%28y+-+11%29%5E2+%2B+%28%282x%5E2+%2B+y%5E2+%2B+2%29+-+151%29%5E2 # noqa
79
- TestCase(
80
- "z = 2x^2 + 2y^2 + 2", [0, 0, 6], 1.3919410907075054
81
- ), # https://www.wolframalpha.com/input?i=minimize+x%5E2+%2By%5E2+%2B+%28%282x%5E2+%2B+2y%5E2+%2B+2%29+-+6%29%5E2 # noqa
82
- TestCase(
83
- "z = x^2 + y^2 + 2", [0, 0, 20], 4.2130748865881795
84
- ), # https://www.wolframalpha.com/input?i=x%5E2+%2B+y%5E2+%2B+%28%28x%5E2+%2B+y%5E2+%2B+2%29+-+20%29%5E2
85
- TestCase("z = 2x^2 + xy + y^2 + 4", [6, 19, 519], 0.5290904095503263),
86
- TestCase("z = 2x^2 + xy + 2y^2 + 3", [0, 14, 380], 0.26248531385619783),
87
- TestCase("z = x^2 + 2y^2 + 1", [5, 14, 4], 13.354544558906934),
88
- TestCase("z = x^2 + xy + 2y^2 + 4", [3, 20, 1001], 1.4206031238856873),
89
- TestCase("z = x^2 + xy + 2y^2 + 4", [0, 0, 55], 51.0),
90
- TestCase("z = x^2 + xy + 2y^2 + 4", [0, 9, 55], 3.8558889386410757),
91
- TestCase("z = 2x^2 + 2y^2 + 1", [8, 9, 289], 0.04158555512549898),
92
- TestCase("z = 2x^2 + 2y^2 + 1", [8, 9, 291], 0.0),
93
- TestCase("z = x^2 + 2xy + 5y^2 + 4", [0, 9, 55], 5.7150737847649244),
94
- ]
95
- check_test_cases(test_cases, distance_paraboloid)