crfm-helm 0.5.6__py3-none-any.whl → 0.5.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crfm-helm might be problematic. Click here for more details.
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/METADATA +60 -125
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/RECORD +293 -229
- helm/benchmark/adaptation/adapter_spec.py +5 -0
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
- helm/benchmark/annotation/aci_bench_annotator.py +11 -22
- helm/benchmark/annotation/air_bench_annotator.py +1 -1
- helm/benchmark/annotation/alrage_annotator.py +90 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
- helm/benchmark/annotation/dischargeme_annotator.py +11 -22
- helm/benchmark/annotation/live_qa_annotator.py +1 -1
- helm/benchmark/annotation/med_dialog_annotator.py +11 -22
- helm/benchmark/annotation/medalign_annotator.py +11 -22
- helm/benchmark/annotation/medi_qa_annotator.py +11 -22
- helm/benchmark/annotation/medication_qa_annotator.py +11 -22
- helm/benchmark/annotation/mental_health_annotator.py +11 -22
- helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
- helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
- helm/benchmark/annotation/model_as_judge.py +23 -18
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
- helm/benchmark/metrics/air_bench_metrics.py +3157 -1
- helm/benchmark/metrics/alrage_metric.py +35 -0
- helm/benchmark/metrics/basic_metrics.py +267 -2
- helm/benchmark/metrics/classification_metrics.py +19 -1
- helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
- helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
- helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
- helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
- helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
- helm/benchmark/metrics/comet_metric.py +1 -1
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
- helm/benchmark/metrics/copyright_metrics.py +1 -1
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
- helm/benchmark/metrics/dry_run_metrics.py +30 -1
- helm/benchmark/metrics/efficiency_metrics.py +74 -0
- helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
- helm/benchmark/metrics/evaluate_reference_metrics.py +300 -1
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
- helm/benchmark/metrics/ifeval_metrics.py +13 -1
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
- helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
- helm/benchmark/metrics/language_modeling_metrics.py +13 -1
- helm/benchmark/metrics/live_qa_metrics.py +13 -1
- helm/benchmark/metrics/llm_jury_metrics.py +13 -1
- helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
- helm/benchmark/metrics/lmkt_metrics.py +47 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
- helm/benchmark/metrics/medec_metrics.py +25 -2
- helm/benchmark/metrics/melt_toxicity_metric.py +1 -1
- helm/benchmark/metrics/metric.py +25 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
- helm/benchmark/metrics/omni_math_metrics.py +13 -1
- helm/benchmark/metrics/seahelm_metrics.py +14 -1
- helm/benchmark/metrics/summac/model_summac.py +3 -3
- helm/benchmark/metrics/summarization_metrics.py +129 -1
- helm/benchmark/metrics/toxicity_metrics.py +31 -1
- helm/benchmark/metrics/wildbench_metrics.py +21 -1
- helm/benchmark/model_deployment_registry.py +11 -19
- helm/benchmark/presentation/create_plots.py +11 -2
- helm/benchmark/presentation/schema.py +10 -22
- helm/benchmark/presentation/summarize.py +189 -14
- helm/benchmark/presentation/taxonomy_info.py +20 -0
- helm/benchmark/presentation/test_create_plots.py +4 -1
- helm/benchmark/run.py +7 -1
- helm/benchmark/run_expander.py +4 -0
- helm/benchmark/run_specs/arabic_run_specs.py +191 -0
- helm/benchmark/run_specs/bluex_run_specs.py +40 -0
- helm/benchmark/run_specs/classic_run_specs.py +2 -55
- helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
- helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
- helm/benchmark/run_specs/heim_run_specs.py +3 -1
- helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
- helm/benchmark/run_specs/long_context_run_specs.py +48 -1
- helm/benchmark/run_specs/medhelm/__init__.py +0 -0
- helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +360 -50
- helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +5 -11
- helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
- helm/benchmark/scenarios/air_bench_scenario.py +21 -0
- helm/benchmark/scenarios/alghafa_scenario.py +126 -0
- helm/benchmark/scenarios/alrage_scenario.py +54 -0
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
- helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
- helm/benchmark/scenarios/arabic_mmlu_scenario.py +82 -0
- helm/benchmark/scenarios/aratrust_scenario.py +95 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/{ultra_suite_asr_classification.py → ultra_suite_asr_classification_scenario.py} +9 -8
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +13 -5
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +13 -5
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +13 -5
- helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
- helm/benchmark/scenarios/bbq_scenario.py +15 -0
- helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
- helm/benchmark/scenarios/bluex_scenario.py +70 -0
- helm/benchmark/scenarios/bold_scenario.py +15 -0
- helm/benchmark/scenarios/boolq_scenario.py +20 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
- helm/benchmark/scenarios/clear_scenario.py +23 -0
- helm/benchmark/scenarios/cleva_scenario.py +480 -1
- helm/benchmark/scenarios/code_scenario.py +28 -0
- helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
- helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
- helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
- helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
- helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
- helm/benchmark/scenarios/commonsense_scenario.py +26 -0
- helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
- helm/benchmark/scenarios/copyright_scenario.py +35 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
- helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
- helm/benchmark/scenarios/disinformation_scenario.py +22 -0
- helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
- helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
- helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
- helm/benchmark/scenarios/gpqa_scenario.py +18 -0
- helm/benchmark/scenarios/grammar_scenario.py +20 -1
- helm/benchmark/scenarios/gsm_scenario.py +15 -0
- helm/benchmark/scenarios/headqa_scenario.py +22 -0
- helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
- helm/benchmark/scenarios/ice_scenario.py +21 -1
- helm/benchmark/scenarios/ifeval_scenario.py +18 -0
- helm/benchmark/scenarios/imdb_scenario.py +15 -0
- helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +90 -0
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +1 -1
- helm/benchmark/scenarios/koala_scenario.py +21 -1
- helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
- helm/benchmark/scenarios/legal_support_scenario.py +13 -0
- helm/benchmark/scenarios/legalbench_scenario.py +20 -0
- helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
- helm/benchmark/scenarios/lextreme_scenario.py +11 -0
- helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
- helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
- helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
- helm/benchmark/scenarios/math_scenario.py +47 -20
- helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
- helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
- helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
- helm/benchmark/scenarios/med_qa_scenario.py +14 -0
- helm/benchmark/scenarios/medalign_scenario.py +23 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +19 -125
- helm/benchmark/scenarios/medbullets_scenario.py +22 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
- helm/benchmark/scenarios/medec_scenario.py +23 -0
- helm/benchmark/scenarios/medhallu_scenario.py +23 -0
- helm/benchmark/scenarios/medhelm/__init__.py +0 -0
- helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
- helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +23 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
- helm/benchmark/scenarios/melt_scenarios.py +2 -2
- helm/benchmark/scenarios/mental_health_scenario.py +23 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +25 -1
- helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
- helm/benchmark/scenarios/mmlu_scenario.py +15 -0
- helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
- helm/benchmark/scenarios/msmarco_scenario.py +30 -0
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +20 -0
- helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
- helm/benchmark/scenarios/omni_math_scenario.py +18 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
- helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
- helm/benchmark/scenarios/quac_scenario.py +14 -0
- helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
- helm/benchmark/scenarios/raft_scenario.py +15 -0
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
- helm/benchmark/scenarios/scenario.py +31 -0
- helm/benchmark/scenarios/seahelm_scenario.py +350 -2
- helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
- helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +22 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
- helm/benchmark/scenarios/situation_prompts.yaml +49 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
- helm/benchmark/scenarios/summarization_scenario.py +37 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
- helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
- helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
- helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
- helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
- helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
- helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
- helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
- helm/benchmark/scenarios/the_pile_scenario.py +13 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
- helm/benchmark/scenarios/vicuna_scenario.py +21 -1
- helm/benchmark/scenarios/wikifact_scenario.py +20 -0
- helm/benchmark/scenarios/wildbench_scenario.py +18 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +12 -0
- helm/benchmark/slurm_jobs.py +1 -2
- helm/benchmark/slurm_runner.py +8 -1
- helm/benchmark/static/schema_arabic.yaml +271 -0
- helm/benchmark/static/schema_classic.yaml +0 -17
- helm/benchmark/static/schema_long_context.yaml +24 -6
- helm/benchmark/static/schema_medhelm.yaml +36 -0
- helm/benchmark/static/schema_slp.yaml +219 -0
- helm/benchmark/static_build/assets/index-671a5e06.js +10 -0
- helm/benchmark/static_build/assets/index-9352595e.css +1 -0
- helm/benchmark/static_build/index.html +2 -2
- helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
- helm/clients/audio_language/llama_omni/arguments.py +61 -0
- helm/clients/audio_language/llama_omni/constants.py +9 -0
- helm/clients/audio_language/llama_omni/conversation.py +213 -0
- helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
- helm/clients/audio_language/llama_omni/model/builder.py +88 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
- helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
- helm/clients/audio_language/llama_omni/preprocess.py +295 -0
- helm/clients/audio_language/llama_omni/utils.py +202 -0
- helm/clients/audio_language/qwen2_5_omni_client.py +19 -7
- helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
- helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
- helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
- helm/clients/huggingface_client.py +2 -2
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
- helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
- helm/clients/openai_client.py +33 -20
- helm/clients/openai_responses_client.py +34 -8
- helm/clients/openrouter_client.py +31 -0
- helm/clients/test_huggingface_client.py +3 -3
- helm/clients/test_openrouter_client.py +69 -0
- helm/clients/together_client.py +48 -13
- helm/clients/vertexai_client.py +19 -11
- helm/clients/vllm_client.py +43 -7
- helm/clients/vllm_granite_thinking_client.py +56 -0
- helm/common/critique_request.py +0 -1
- helm/common/hierarchical_logger.py +83 -34
- helm/common/object_spec.py +23 -8
- helm/common/test_logging.py +94 -0
- helm/config/model_deployments.yaml +525 -172
- helm/config/model_metadata.yaml +185 -10
- helm/config/tokenizer_configs.yaml +100 -2
- helm/proxy/cli.py +1 -1
- helm/proxy/example_queries.py +8 -8
- helm/proxy/retry.py +5 -0
- helm/proxy/server.py +2 -1
- helm/proxy/static/index.css +4 -0
- helm/proxy/static/index.js +7 -1
- helm/tokenizers/grok_tokenizer.py +2 -0
- helm/benchmark/metrics/aci_bench_metrics.py +0 -14
- helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
- helm/benchmark/metrics/dischargeme_metrics.py +0 -14
- helm/benchmark/metrics/med_dialog_metrics.py +0 -14
- helm/benchmark/metrics/medalign_metrics.py +0 -14
- helm/benchmark/metrics/medi_qa_metrics.py +0 -14
- helm/benchmark/metrics/medication_qa_metrics.py +0 -14
- helm/benchmark/metrics/mental_health_metrics.py +0 -14
- helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
- helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
- helm/benchmark/metrics/numeracy_metrics.py +0 -72
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
- helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
- helm/benchmark/scenarios/numeracy_scenario.py +0 -794
- helm/benchmark/static_build/assets/index-94295e78.js +0 -10
- helm/benchmark/static_build/assets/index-b9779128.css +0 -1
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/WHEEL +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/licenses/LICENSE +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/top_level.txt +0 -0
helm/config/model_metadata.yaml
CHANGED
|
@@ -1253,6 +1253,14 @@ models:
|
|
|
1253
1253
|
release_date: 2025-06-17
|
|
1254
1254
|
tags: [TEXT_MODEL_TAG, VISION_LANGUAGE_MODEL_TAG, AUDIO_LANGUAGE_MODEL_TAG, GOOGLE_GEMINI_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
|
|
1255
1255
|
|
|
1256
|
+
- name: google/gemini-2.5-flash-lite
|
|
1257
|
+
display_name: Gemini 2.5 Flash-Lite
|
|
1258
|
+
description: Gemini 2.5 Flash-Lite ([blog](https://blog.google/products/gemini/gemini-2-5-model-family-expands/))
|
|
1259
|
+
creator_organization_name: Google
|
|
1260
|
+
access: limited
|
|
1261
|
+
release_date: 2025-07-22
|
|
1262
|
+
tags: [TEXT_MODEL_TAG, VISION_LANGUAGE_MODEL_TAG, AUDIO_LANGUAGE_MODEL_TAG, GOOGLE_GEMINI_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
|
|
1263
|
+
|
|
1256
1264
|
- name: google/gemini-2.5-flash-preview-04-17
|
|
1257
1265
|
display_name: Gemini 2.5 Flash (04-17 preview)
|
|
1258
1266
|
description: Gemini 2.5 Flash (04-17 preview) ([documentation](https://ai.google.dev/gemini-api/docs/models/gemini))
|
|
@@ -2624,6 +2632,15 @@ models:
|
|
|
2624
2632
|
release_date: 2024-11-18
|
|
2625
2633
|
tags: [TEXT_MODEL_TAG, VISION_LANGUAGE_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
|
|
2626
2634
|
|
|
2635
|
+
# Moonshot AI
|
|
2636
|
+
- name: moonshotai/kimi-k2-instruct
|
|
2637
|
+
display_name: Kimi K2 Instruct
|
|
2638
|
+
description: Kimi K2 Instruct is a mixture-of-experts (MoE) language model with 32 billion activated parameters and 1 trillion total parameters trained with the Muon optimizer on 15.5T tokens. ([blog](https://moonshotai.github.io/Kimi-K2/))
|
|
2639
|
+
creator_organization_name: Moonshot AI
|
|
2640
|
+
access: open
|
|
2641
|
+
num_parameters: 1029173256720
|
|
2642
|
+
release_date: 2025-07-14 # Blog post has no date, so use the date from this news article https://www.cnbc.com/2025/07/14/alibaba-backed-moonshot-releases-kimi-k2-ai-rivaling-chatgpt-claude.html
|
|
2643
|
+
tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
|
|
2627
2644
|
|
|
2628
2645
|
# MosaicML
|
|
2629
2646
|
- name: mosaicml/mpt-7b
|
|
@@ -3043,6 +3060,30 @@ models:
|
|
|
3043
3060
|
release_date: 2025-04-14
|
|
3044
3061
|
tags: [TEXT_MODEL_TAG, VISION_LANGUAGE_MODEL_TAG, OPENAI_CHATGPT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
|
|
3045
3062
|
|
|
3063
|
+
- name: openai/gpt-5-2025-08-07
|
|
3064
|
+
display_name: GPT-5 (2025-08-07)
|
|
3065
|
+
description: GPT-5 (2025-08-07) is a multimodal model trained for real-world coding tasks and long-running agentic tasks. ([blog](https://openai.com/index/introducing-gpt-5-for-developers/), [system card](https://cdn.openai.com/pdf/8124a3ce-ab78-4f06-96eb-49ea29ffb52f/gpt5-system-card-aug7.pdf))
|
|
3066
|
+
creator_organization_name: OpenAI
|
|
3067
|
+
access: limited
|
|
3068
|
+
release_date: 2025-08-07
|
|
3069
|
+
tags: [TEXT_MODEL_TAG, VISION_LANGUAGE_MODEL_TAG, OPENAI_CHATGPT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
|
|
3070
|
+
|
|
3071
|
+
- name: openai/gpt-5-mini-2025-08-07
|
|
3072
|
+
display_name: GPT-5 mini (2025-08-07)
|
|
3073
|
+
description: GPT-5 mini (2025-08-07) is a multimodal model trained for real-world coding tasks and long-running agentic tasks. ([blog](https://openai.com/index/introducing-gpt-5-for-developers/), [system card](https://cdn.openai.com/pdf/8124a3ce-ab78-4f06-96eb-49ea29ffb52f/gpt5-system-card-aug7.pdf))
|
|
3074
|
+
creator_organization_name: OpenAI
|
|
3075
|
+
access: limited
|
|
3076
|
+
release_date: 2025-08-07
|
|
3077
|
+
tags: [TEXT_MODEL_TAG, VISION_LANGUAGE_MODEL_TAG, OPENAI_CHATGPT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
|
|
3078
|
+
|
|
3079
|
+
- name: openai/gpt-5-nano-2025-08-07
|
|
3080
|
+
display_name: GPT-5 nano (2025-08-07)
|
|
3081
|
+
description: GPT-5 nano (2025-08-07) is a multimodal model trained for real-world coding tasks and long-running agentic tasks. ([blog](https://openai.com/index/introducing-gpt-5-for-developers/), [system card](https://cdn.openai.com/pdf/8124a3ce-ab78-4f06-96eb-49ea29ffb52f/gpt5-system-card-aug7.pdf))
|
|
3082
|
+
creator_organization_name: OpenAI
|
|
3083
|
+
access: limited
|
|
3084
|
+
release_date: 2025-08-07
|
|
3085
|
+
tags: [TEXT_MODEL_TAG, VISION_LANGUAGE_MODEL_TAG, OPENAI_CHATGPT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
|
|
3086
|
+
|
|
3046
3087
|
- name: openai/whisper-1_gpt-4o-2024-11-20
|
|
3047
3088
|
display_name: Whisper-1 + GPT-4o (2024-11-20)
|
|
3048
3089
|
description: Transcribes the text with Whisper-1 and then uses GPT-4o to generate a response.
|
|
@@ -3256,6 +3297,31 @@ models:
|
|
|
3256
3297
|
release_date: 2025-04-16
|
|
3257
3298
|
tags: [TEXT_MODEL_TAG, VISION_LANGUAGE_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
|
|
3258
3299
|
|
|
3300
|
+
- name: openai/o3-pro-2025-06-10-high-reasoning-effort
|
|
3301
|
+
display_name: o3-pro (2025-06-10, high reasoning effort)
|
|
3302
|
+
description: o3-pro is an o-series model designed to think longer and provide the most reliable responses. ([blog post](https://help.openai.com/en/articles/9624314-model-release-notes))
|
|
3303
|
+
creator_organization_name: OpenAI
|
|
3304
|
+
access: limited
|
|
3305
|
+
release_date: 2025-06-10
|
|
3306
|
+
tags: [TEXT_MODEL_TAG, VISION_LANGUAGE_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
|
|
3307
|
+
|
|
3308
|
+
## GPT-OSS
|
|
3309
|
+
- name: openai/gpt-oss-20b
|
|
3310
|
+
display_name: gpt-oss-20b
|
|
3311
|
+
description: gpt-oss-20b is an open-weight language model that was trained using a mix of reinforcement learning and other techniques informed by OpenAI's internal models. It uses a mixture-of-experts architecture and activates 3.6B parameters per token. ([blog](https://openai.com/index/introducing-gpt-oss/))
|
|
3312
|
+
creator_organization_name: OpenAI
|
|
3313
|
+
access: open
|
|
3314
|
+
release_date: 2025-08-05
|
|
3315
|
+
tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
|
|
3316
|
+
|
|
3317
|
+
- name: openai/gpt-oss-120b
|
|
3318
|
+
display_name: gpt-oss-120b
|
|
3319
|
+
description: gpt-oss-120b is an open-weight language model that was trained using a mix of reinforcement learning and other techniques informed by OpenAI's internal models. It uses a mixture-of-experts architecture and activates 5.1B parameters per token. ([blog](https://openai.com/index/introducing-gpt-oss/))
|
|
3320
|
+
creator_organization_name: OpenAI
|
|
3321
|
+
access: open
|
|
3322
|
+
release_date: 2025-08-05
|
|
3323
|
+
tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
|
|
3324
|
+
|
|
3259
3325
|
## Codex Models
|
|
3260
3326
|
# DEPRECATED: Codex models have been shut down on March 23 2023.
|
|
3261
3327
|
|
|
@@ -3532,6 +3598,14 @@ models:
|
|
|
3532
3598
|
release_date: 2025-04-29
|
|
3533
3599
|
tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
|
|
3534
3600
|
|
|
3601
|
+
- name: qwen/qwen3-235b-a22b-instruct-2507-fp8
|
|
3602
|
+
display_name: Qwen3 235B A22B Instruct 2507 FP8
|
|
3603
|
+
description: Qwen3 235B A22B Instruct 2507 FP8 is an updated version of the non-thinking mode of Qwen3 235B A22B FP8.
|
|
3604
|
+
creator_organization_name: Qwen
|
|
3605
|
+
access: open
|
|
3606
|
+
release_date: 2025-07-21 # https://x.com/Alibaba_Qwen/status/1947344511988076547
|
|
3607
|
+
tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
|
|
3608
|
+
|
|
3535
3609
|
- name: qwen/qwq-32b-preview
|
|
3536
3610
|
display_name: QwQ (32B Preview)
|
|
3537
3611
|
description: QwQ-32B-Preview is an experimental research model developed by the Qwen Team, focused on advancing AI reasoning capabilities. ([blog post](https://qwenlm.github.io/blog/qwq-32b-preview/)).
|
|
@@ -4163,6 +4237,14 @@ models:
|
|
|
4163
4237
|
release_date: 2025-04-03 # https://docs.x.ai/docs/release-notes#april-2025
|
|
4164
4238
|
tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
|
|
4165
4239
|
|
|
4240
|
+
- name: xai/grok-4-0709
|
|
4241
|
+
display_name: Grok 4 (0709)
|
|
4242
|
+
description: Grok 4 (0709) is a model that includes native tool use and real-time search integration. ([blog](https://x.ai/news/grok-4))
|
|
4243
|
+
creator_organization_name: xAI
|
|
4244
|
+
access: limited
|
|
4245
|
+
release_date: 2025-07-09
|
|
4246
|
+
tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
|
|
4247
|
+
|
|
4166
4248
|
# Yandex
|
|
4167
4249
|
- name: yandex/yalm
|
|
4168
4250
|
display_name: YaLM (100B)
|
|
@@ -4266,6 +4348,42 @@ models:
|
|
|
4266
4348
|
release_date: 2023-11-08
|
|
4267
4349
|
tags: [TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
|
|
4268
4350
|
|
|
4351
|
+
- name: maritaca-ai/sabiazinho-3
|
|
4352
|
+
display_name: Sabiazinho 3
|
|
4353
|
+
description: Sabiazinho-3 is a decoder-only language model designed for Portuguese text generation and understanding tasks. It supports a long context window of up to 128,000 tokens and is offered via API with scalable rate limits. The model is trained on diverse Portuguese corpora with knowledge up to July 2023.
|
|
4354
|
+
creator_organization_name: Maritaca AI
|
|
4355
|
+
access: limited
|
|
4356
|
+
release_date: 2025-02-06
|
|
4357
|
+
tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
|
|
4358
|
+
|
|
4359
|
+
- name: maritaca-ai/sabia-3
|
|
4360
|
+
display_name: Sabiá 3
|
|
4361
|
+
description: Sabiá-3 is a decoder-only language model designed for Portuguese text generation and understanding tasks. It supports a long context window of up to 128,000 tokens and is offered via API with scalable rate limits. The model is trained on diverse Portuguese corpora with knowledge up to July 2023.
|
|
4362
|
+
creator_organization_name: Maritaca AI
|
|
4363
|
+
access: limited
|
|
4364
|
+
release_date: 2024-12-11
|
|
4365
|
+
tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
|
|
4366
|
+
|
|
4367
|
+
- name: maritaca-ai/sabia-3.1-2025-05-08
|
|
4368
|
+
display_name: Sabiá 3.1
|
|
4369
|
+
description: Sabiá-3.1 is a decoder-only language model designed for Portuguese text generation and understanding tasks. It supports a long context window of up to 128,000 tokens and is offered via API with scalable rate limits. The model is trained on diverse Portuguese corpora with knowledge up to August 2024.
|
|
4370
|
+
creator_organization_name: Maritaca AI
|
|
4371
|
+
access: limited
|
|
4372
|
+
release_date: 2025-05-08
|
|
4373
|
+
tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
|
|
4374
|
+
|
|
4375
|
+
# Z.ai
|
|
4376
|
+
|
|
4377
|
+
- name: zai-org/glm-4.5-air-fp8
|
|
4378
|
+
display_name: GLM-4.5-Air-FP8
|
|
4379
|
+
description: GLM-4.5-Air-FP8 is a hybrid reasoning model designed to unify reasoning, coding, and agentic capabilities into a single model. It has 106 billion total parameters and 12 billion active parameters. The thinking mode is enabled by default. ([blog](https://z.ai/blog/glm-4.5))
|
|
4380
|
+
creator_organization_name: Z.ai
|
|
4381
|
+
access: open
|
|
4382
|
+
num_parameters: 110000000000
|
|
4383
|
+
release_date: 2025-07-28
|
|
4384
|
+
tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
|
|
4385
|
+
|
|
4386
|
+
|
|
4269
4387
|
# Granite - IBM
|
|
4270
4388
|
# https://www.ibm.com/granite
|
|
4271
4389
|
# https://github.com/ibm-granite/granite-3.0-language-models
|
|
@@ -4479,21 +4597,23 @@ models:
|
|
|
4479
4597
|
tags: [ TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG ]
|
|
4480
4598
|
|
|
4481
4599
|
- name: ibm/granite-3.3-8b-instruct
|
|
4482
|
-
display_name: Granite 3.3 8B Instruct
|
|
4483
|
-
description: Granite 3.3 8B Instruct is
|
|
4600
|
+
display_name: IBM Granite 3.3 8B Instruct
|
|
4601
|
+
description: IBM Granite 3.3 8B Instruct is an 8-billion parameter 128K context length language model fine-tuned for improved reasoning and instruction-following capabilities. ([model card](https://huggingface.co/ibm-granite/granite-3.3-8b-instruct))
|
|
4484
4602
|
creator_organization_name: IBM
|
|
4485
4603
|
access: open
|
|
4486
4604
|
num_parameters: 8170000000
|
|
4487
4605
|
release_date: 2025-04-16
|
|
4488
4606
|
tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, ABLATION_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
|
|
4489
4607
|
|
|
4490
|
-
- name:
|
|
4491
|
-
display_name:
|
|
4492
|
-
description:
|
|
4493
|
-
creator_organization_name:
|
|
4494
|
-
access:
|
|
4495
|
-
|
|
4496
|
-
|
|
4608
|
+
- name: ibm/granite-3.3-8b-instruct-with-guardian
|
|
4609
|
+
display_name: IBM Granite 3.3 8B Instruct (with guardian)
|
|
4610
|
+
description: IBM Granite 3.3 8B Instruct is an 8-billion parameter 128K context length language model fine-tuned for improved reasoning and instruction-following capabilities. All prompts were first evaluated for risk by [IBM Granite Guardian 3.2 5B](https://www.ibm.com/granite/docs/models/guardian/) and prompts that were deemed risky (with a risk threshold of 0.8) received the response "I'm very sorry, but I can't assist with that.". ([model card](https://huggingface.co/ibm-granite/granite-3.3-8b-instruct))
|
|
4611
|
+
creator_organization_name: IBM
|
|
4612
|
+
access: open
|
|
4613
|
+
num_parameters: 8170000000
|
|
4614
|
+
release_date: 2025-04-16
|
|
4615
|
+
# Unfortunately this setup is not easily reproducible, so we mark it with DEPRECATED_MODEL_TAG
|
|
4616
|
+
tags: [DEPRECATED_MODEL_TAG, TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, ABLATION_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
|
|
4497
4617
|
|
|
4498
4618
|
- name: ura-hcmut/ura-llama-2.1-8b
|
|
4499
4619
|
display_name: URA-Llama 2.1 (8B)
|
|
@@ -4682,4 +4802,59 @@ models:
|
|
|
4682
4802
|
access: open
|
|
4683
4803
|
num_parameters: 4000000000
|
|
4684
4804
|
release_date: 2024-04-02
|
|
4685
|
-
tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
|
|
4805
|
+
tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
|
|
4806
|
+
|
|
4807
|
+
- name: CEIA-UFG/Gemma-3-Gaia-PT-BR-4b-it
|
|
4808
|
+
display_name: Gemma-3 Gaia PT-BR 4b Instruct
|
|
4809
|
+
description: Gemma-3 Gaia PT-BR 4b Instruct is a model trained by CEIA-UFG for understanding and generating Brazilian Portuguese text.
|
|
4810
|
+
creator_organization_name: CEIA-UFG
|
|
4811
|
+
access: open
|
|
4812
|
+
num_parameters: 4000000000
|
|
4813
|
+
release_date: 2025-06-01
|
|
4814
|
+
tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
|
|
4815
|
+
|
|
4816
|
+
- name: recogna-nlp/bode-13b-alpaca-pt-br-no-peft
|
|
4817
|
+
display_name: Bode 13B Alpaca PT-BR
|
|
4818
|
+
description: Bode is a language model (LLM) for Portuguese, based on LLaMA 2 and fine-tuned with the Alpaca dataset translated into Portuguese. Suitable for instruction, text generation, translation and tasks in Portuguese.
|
|
4819
|
+
creator_organization_name: Recogna NLP
|
|
4820
|
+
access: open
|
|
4821
|
+
num_parameters: 13000000000
|
|
4822
|
+
release_date: 2024-01-05
|
|
4823
|
+
tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
|
|
4824
|
+
|
|
4825
|
+
- name: 22h/cabrita_7b_pt_850000
|
|
4826
|
+
display_name: Cabrita PT-BR 7B
|
|
4827
|
+
description: Cabrita is an OpenLLaMA-based model, continuously trained in Portuguese (mC4-pt subset) for 850000 steps with efficient tokenization adapted to the language.
|
|
4828
|
+
creator_organization_name: 22h
|
|
4829
|
+
access: open
|
|
4830
|
+
num_parameters: 7000000000
|
|
4831
|
+
release_date: 2023-08-23
|
|
4832
|
+
tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
|
|
4833
|
+
|
|
4834
|
+
- name: PORTULAN/gervasio-7b-portuguese-ptbr-decoder
|
|
4835
|
+
display_name: Gervásio PT-BR/PT-PT 7B Decoder
|
|
4836
|
+
description: Gervásio PT* is a 7B parameter decoder model, adapted from LLaMA27B, trained for both Brazilian and European Portuguese. Fine-tuned with translated data from benchmarks such as GLUE and SuperGLUE.
|
|
4837
|
+
creator_organization_name: PORTULAN (University of Lisbon NLX)
|
|
4838
|
+
access: open
|
|
4839
|
+
num_parameters: 6740000000
|
|
4840
|
+
release_date: 2024-02-29
|
|
4841
|
+
tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
|
|
4842
|
+
|
|
4843
|
+
- name: TucanoBR/Tucano-2b4
|
|
4844
|
+
display_name: Tucano PT-BR 2b4
|
|
4845
|
+
description: Tucano is a series of decoder models based on LLaMA2, natively pre-trained in Portuguese using the GigaVerbo dataset (200B tokens), with the 2B model trained for 1.96M steps over 845h (515B tokens, 4 epochs).
|
|
4846
|
+
creator_organization_name: TucanoBR (University of Bonn)
|
|
4847
|
+
access: open
|
|
4848
|
+
num_parameters: 2444618240
|
|
4849
|
+
release_date: 2024-12-11
|
|
4850
|
+
tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
|
|
4851
|
+
|
|
4852
|
+
- name: nicholasKluge/TeenyTinyLlama-460m
|
|
4853
|
+
display_name: TeenyTinyLlama 460M PT-BR
|
|
4854
|
+
description: TeenyTinyLlama-460m is a lightweight and efficient model based on LLaMA2, trained exclusively on Brazilian Portuguese. It uses RoPE embeddings and SwiGLU activations, with a refined SentencePiece tokenizer and a low-resource optimized architecture.
|
|
4855
|
+
creator_organization_name: Nicholas Kluge.
|
|
4856
|
+
access: open
|
|
4857
|
+
num_parameters: 460000000
|
|
4858
|
+
release_date: 2024-01-30
|
|
4859
|
+
tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
|
|
4860
|
+
|
|
@@ -265,6 +265,12 @@ tokenizer_configs:
|
|
|
265
265
|
end_of_text_token: ""
|
|
266
266
|
prefix_token: ""
|
|
267
267
|
|
|
268
|
+
- name: xai/grok-4-0709
|
|
269
|
+
tokenizer_spec:
|
|
270
|
+
class_name: "helm.tokenizers.grok_tokenizer.GrokAPITokenizer"
|
|
271
|
+
end_of_text_token: ""
|
|
272
|
+
prefix_token: ""
|
|
273
|
+
|
|
268
274
|
# Hf-internal-testing
|
|
269
275
|
|
|
270
276
|
# Tokenizer name hf-internal-testing/llama-tokenizer is taken from:
|
|
@@ -582,6 +588,17 @@ tokenizer_configs:
|
|
|
582
588
|
end_of_text_token: "</s>"
|
|
583
589
|
prefix_token: "<s>"
|
|
584
590
|
|
|
591
|
+
# Moonshot AI
|
|
592
|
+
- name: moonshotai/kimi-k2-instruct
|
|
593
|
+
tokenizer_spec:
|
|
594
|
+
class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
|
|
595
|
+
args:
|
|
596
|
+
pretrained_model_name_or_path: moonshotai/Kimi-K2-Instruct
|
|
597
|
+
trust_remote_code: true
|
|
598
|
+
revision: 4f239503ad9d1a042f0a4bacac457931ab972cfc
|
|
599
|
+
end_of_text_token: "[EOS]"
|
|
600
|
+
prefix_token: "[BOS]"
|
|
601
|
+
|
|
585
602
|
# Nectec
|
|
586
603
|
- name: nectec/OpenThaiLLM-Prebuilt-7B
|
|
587
604
|
tokenizer_spec:
|
|
@@ -633,6 +650,12 @@ tokenizer_configs:
|
|
|
633
650
|
end_of_text_token: "<|endoftext|>"
|
|
634
651
|
prefix_token: "<|endoftext|>"
|
|
635
652
|
|
|
653
|
+
- name: openai/o200k_harmony
|
|
654
|
+
tokenizer_spec:
|
|
655
|
+
class_name: "helm.tokenizers.tiktoken_tokenizer.TiktokenTokenizer"
|
|
656
|
+
end_of_text_token: "<|endoftext|>"
|
|
657
|
+
prefix_token: "<|startoftext|>"
|
|
658
|
+
|
|
636
659
|
- name: openai/clip-vit-large-patch14
|
|
637
660
|
tokenizer_spec:
|
|
638
661
|
class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
|
|
@@ -688,6 +711,12 @@ tokenizer_configs:
|
|
|
688
711
|
end_of_text_token: "<|im_end|>"
|
|
689
712
|
prefix_token: "<|im_start|>"
|
|
690
713
|
|
|
714
|
+
- name: qwen/qwen3-235b-a22b-instruct-2507-fp8
|
|
715
|
+
tokenizer_spec:
|
|
716
|
+
class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
|
|
717
|
+
end_of_text_token: "<|im_end|>"
|
|
718
|
+
prefix_token: ""
|
|
719
|
+
|
|
691
720
|
- name: qwen/qwq-32b-preview
|
|
692
721
|
tokenizer_spec:
|
|
693
722
|
class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
|
|
@@ -892,6 +921,7 @@ tokenizer_configs:
|
|
|
892
921
|
end_of_text_token: ""
|
|
893
922
|
prefix_token: ""
|
|
894
923
|
|
|
924
|
+
# Maritaca AI
|
|
895
925
|
- name: maritaca-ai/sabia-7b
|
|
896
926
|
tokenizer_spec:
|
|
897
927
|
class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
|
|
@@ -900,6 +930,14 @@ tokenizer_configs:
|
|
|
900
930
|
end_of_text_token: "</s>"
|
|
901
931
|
prefix_token: "<s>"
|
|
902
932
|
|
|
933
|
+
- name: maritaca-ai/sabia-2-tokenizer-medium
|
|
934
|
+
tokenizer_spec:
|
|
935
|
+
class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
|
|
936
|
+
args:
|
|
937
|
+
pretrained_model_name_or_path: maritaca-ai/sabia-2-tokenizer-medium
|
|
938
|
+
end_of_text_token: "</s>"
|
|
939
|
+
prefix_token: "<s>"
|
|
940
|
+
|
|
903
941
|
# Granite-3.1-8b-base
|
|
904
942
|
- name: ibm-granite/granite-3.1-8b-base
|
|
905
943
|
tokenizer_spec:
|
|
@@ -1022,7 +1060,6 @@ tokenizer_configs:
|
|
|
1022
1060
|
end_of_text_token: ""
|
|
1023
1061
|
|
|
1024
1062
|
# IBM Granite 3.3
|
|
1025
|
-
|
|
1026
1063
|
- name: ibm/granite-3.3-8b-instruct
|
|
1027
1064
|
tokenizer_spec:
|
|
1028
1065
|
class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
|
|
@@ -1031,6 +1068,13 @@ tokenizer_configs:
|
|
|
1031
1068
|
end_of_text_token: "<|end_of_text|>"
|
|
1032
1069
|
prefix_token: "<|end_of_text|>"
|
|
1033
1070
|
|
|
1071
|
+
# Z.ai GLM-4.5-AIR-FP8
|
|
1072
|
+
- name: zai-org/glm-4.5-air-fp8
|
|
1073
|
+
tokenizer_spec:
|
|
1074
|
+
class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
|
|
1075
|
+
end_of_text_token: "<|endoftext|>"
|
|
1076
|
+
prefix_token: ""
|
|
1077
|
+
|
|
1034
1078
|
|
|
1035
1079
|
|
|
1036
1080
|
# DeepSeek-R1-Distill-Llama-3.1-8b
|
|
@@ -1104,4 +1148,58 @@ tokenizer_configs:
|
|
|
1104
1148
|
args:
|
|
1105
1149
|
pretrained_model_name_or_path: vinai/PhoGPT-4B-Chat
|
|
1106
1150
|
end_of_text_token: "</s>"
|
|
1107
|
-
prefix_token: "<s>"
|
|
1151
|
+
prefix_token: "<s>"
|
|
1152
|
+
|
|
1153
|
+
# Gemma-3-Gaia-PT-BR-4b-it
|
|
1154
|
+
- name: CEIA-UFG/Gemma-3-Gaia-PT-BR-4b-it
|
|
1155
|
+
tokenizer_spec:
|
|
1156
|
+
class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
|
|
1157
|
+
args:
|
|
1158
|
+
pretrained_model_name_or_path: CEIA-UFG/Gemma-3-Gaia-PT-BR-4b-it
|
|
1159
|
+
end_of_text_token: "<eos>"
|
|
1160
|
+
prefix_token: "<bos>"
|
|
1161
|
+
|
|
1162
|
+
# Bode 13B Alpaca PT-BR
|
|
1163
|
+
- name: recogna-nlp/bode-13b-alpaca-pt-br-no-peft
|
|
1164
|
+
tokenizer_spec:
|
|
1165
|
+
class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
|
|
1166
|
+
args:
|
|
1167
|
+
pretrained_model_name_or_path: recogna-nlp/bode-13b-alpaca-pt-br-no-peft
|
|
1168
|
+
end_of_text_token: "</s>"
|
|
1169
|
+
prefix_token: "<s>"
|
|
1170
|
+
|
|
1171
|
+
# Cabrita 7B PT-BR tokenizer
|
|
1172
|
+
- name: 22h/cabrita_7b_pt_850000
|
|
1173
|
+
tokenizer_spec:
|
|
1174
|
+
class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
|
|
1175
|
+
args:
|
|
1176
|
+
pretrained_model_name_or_path: 22h/cabrita_7b_pt_850000
|
|
1177
|
+
end_of_text_token: "</s>"
|
|
1178
|
+
prefix_token: "<s>"
|
|
1179
|
+
|
|
1180
|
+
# Gervásio 7B PT‑BR/PT‑PT tokenizer
|
|
1181
|
+
- name: PORTULAN/gervasio-7b-portuguese-ptbr-decoder
|
|
1182
|
+
tokenizer_spec:
|
|
1183
|
+
class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
|
|
1184
|
+
args:
|
|
1185
|
+
pretrained_model_name_or_path: PORTULAN/gervasio-7b-portuguese-ptbr-decoder
|
|
1186
|
+
end_of_text_token: "</s>"
|
|
1187
|
+
prefix_token: "<s>"
|
|
1188
|
+
|
|
1189
|
+
# Tucano 2b4 PT-BR tokenizer
|
|
1190
|
+
- name: TucanoBR/Tucano-2b4
|
|
1191
|
+
tokenizer_spec:
|
|
1192
|
+
class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
|
|
1193
|
+
args:
|
|
1194
|
+
pretrained_model_name_or_path: TucanoBR/Tucano-2b4
|
|
1195
|
+
end_of_text_token: "</s>"
|
|
1196
|
+
prefix_token: "<s>"
|
|
1197
|
+
|
|
1198
|
+
# TeenyTinyLlama 460M PT-BR tokenizer
|
|
1199
|
+
- name: nicholasKluge/TeenyTinyLlama-460m
|
|
1200
|
+
tokenizer_spec:
|
|
1201
|
+
class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
|
|
1202
|
+
args:
|
|
1203
|
+
pretrained_model_name_or_path: nicholasKluge/TeenyTinyLlama-460m
|
|
1204
|
+
end_of_text_token: "</s>"
|
|
1205
|
+
prefix_token: "<s>"
|
helm/proxy/cli.py
CHANGED
|
@@ -123,7 +123,7 @@ def do_create_update_command(service: RemoteService, auth: Authentication, args)
|
|
|
123
123
|
|
|
124
124
|
# Update quotas
|
|
125
125
|
for quota_str in args.quotas:
|
|
126
|
-
m = re.match(
|
|
126
|
+
m = re.match(rf"(\w+)\.(\w+)=(\d+|{UNLIMITED_QUOTA})", quota_str)
|
|
127
127
|
if not m:
|
|
128
128
|
raise Exception(
|
|
129
129
|
f"Invalid format: {quota_str}, expect <model_group>.<granularity>=<quota> "
|
helm/proxy/example_queries.py
CHANGED
|
@@ -21,7 +21,7 @@ example_queries = [
|
|
|
21
21
|
"""
|
|
22
22
|
temperature: 0.5 # Medium amount of randomness
|
|
23
23
|
stop_sequences: [.] # Stop when you hit a period
|
|
24
|
-
model: openai/gpt-
|
|
24
|
+
model: openai/gpt-4.1-nano-2025-04-14
|
|
25
25
|
"""
|
|
26
26
|
),
|
|
27
27
|
environments="",
|
|
@@ -33,7 +33,7 @@ example_queries = [
|
|
|
33
33
|
temperature: 0.5 # Medium amount of randomness
|
|
34
34
|
stop_sequences: [\\n] # Stop when you hit a newline
|
|
35
35
|
num_completions: 5 # Generate many samples
|
|
36
|
-
model: openai/gpt-
|
|
36
|
+
model: openai/gpt-4.1-nano-2025-04-14
|
|
37
37
|
"""
|
|
38
38
|
),
|
|
39
39
|
environments="",
|
|
@@ -58,7 +58,7 @@ example_queries = [
|
|
|
58
58
|
"""
|
|
59
59
|
temperature: 0 # Deterministic
|
|
60
60
|
max_tokens: 50
|
|
61
|
-
model: openai/gpt-
|
|
61
|
+
model: openai/gpt-4.1-nano-2025-04-14
|
|
62
62
|
"""
|
|
63
63
|
),
|
|
64
64
|
environments="",
|
|
@@ -76,7 +76,7 @@ example_queries = [
|
|
|
76
76
|
environments=dedent(
|
|
77
77
|
"""
|
|
78
78
|
occupation: [mathematician, lawyer, doctor]
|
|
79
|
-
model: [openai/gpt-
|
|
79
|
+
model: [openai/gpt-4.1-nano-2025-04-14, openai/gpt-4.1-mini-2025-04-14]
|
|
80
80
|
"""
|
|
81
81
|
),
|
|
82
82
|
),
|
|
@@ -101,7 +101,7 @@ example_queries = [
|
|
|
101
101
|
),
|
|
102
102
|
environments=dedent(
|
|
103
103
|
"""
|
|
104
|
-
model: [openai/gpt-
|
|
104
|
+
model: [openai/gpt-4.1-nano-2025-04-14, openai/gpt-4.1-mini-2025-04-14]
|
|
105
105
|
"""
|
|
106
106
|
),
|
|
107
107
|
),
|
|
@@ -136,7 +136,7 @@ example_queries = [
|
|
|
136
136
|
),
|
|
137
137
|
environments=dedent(
|
|
138
138
|
"""
|
|
139
|
-
model: [openai/gpt-
|
|
139
|
+
model: [openai/gpt-4.1-nano-2025-04-14, openai/gpt-4.1-mini-2025-04-14]
|
|
140
140
|
"""
|
|
141
141
|
),
|
|
142
142
|
),
|
|
@@ -144,7 +144,7 @@ example_queries = [
|
|
|
144
144
|
prompt="Write a Python function that takes two vectors a and b and returns their Euclidean distance.",
|
|
145
145
|
settings=dedent(
|
|
146
146
|
"""
|
|
147
|
-
model: openai/gpt-
|
|
147
|
+
model: openai/gpt-4.1-nano-2025-04-14
|
|
148
148
|
"""
|
|
149
149
|
),
|
|
150
150
|
environments="",
|
|
@@ -161,7 +161,7 @@ example_queries = [
|
|
|
161
161
|
),
|
|
162
162
|
environments=dedent(
|
|
163
163
|
"""
|
|
164
|
-
model: [openai/gpt-
|
|
164
|
+
model: [openai/gpt-4.1-nano-2025-04-14, openai/gpt-4.1-mini-2025-04-14]
|
|
165
165
|
"""
|
|
166
166
|
),
|
|
167
167
|
),
|
helm/proxy/retry.py
CHANGED
|
@@ -5,6 +5,7 @@ from retrying import Retrying
|
|
|
5
5
|
from helm.common.request import RequestResult
|
|
6
6
|
from helm.common.tokenization_request import TokenizationRequestResult
|
|
7
7
|
from helm.common.hierarchical_logger import hlog
|
|
8
|
+
import os
|
|
8
9
|
import traceback
|
|
9
10
|
import threading
|
|
10
11
|
|
|
@@ -19,6 +20,10 @@ Example usage:
|
|
|
19
20
|
...
|
|
20
21
|
"""
|
|
21
22
|
|
|
23
|
+
# TODO: make these configurable at a config / cli level
|
|
24
|
+
HELM_RETRIES = int(os.environ.get("HELM_RETRIES", "5"))
|
|
25
|
+
HELM_TOKENIZER_RETRIES = int(os.environ.get("HELM_TOKENIZER_RETRIES", HELM_RETRIES))
|
|
26
|
+
|
|
22
27
|
# The lock is used to prevent multiple threads from printing at the same time.
|
|
23
28
|
# This can cause issues when printing the stack trace.
|
|
24
29
|
# (The stack traces can get mixed up and become unreadable.)
|
helm/proxy/server.py
CHANGED
|
@@ -23,7 +23,7 @@ from helm.benchmark.model_deployment_registry import get_default_model_deploymen
|
|
|
23
23
|
from helm.common.authentication import Authentication
|
|
24
24
|
from helm.common.cache_backend_config import CacheBackendConfig, MongoCacheBackendConfig, SqliteCacheBackendConfig
|
|
25
25
|
from helm.common.general import ensure_directory_exists
|
|
26
|
-
from helm.common.hierarchical_logger import hlog
|
|
26
|
+
from helm.common.hierarchical_logger import hlog, setup_default_logging
|
|
27
27
|
from helm.common.optional_dependencies import handle_module_not_found_error
|
|
28
28
|
from helm.common.request import Request
|
|
29
29
|
from helm.common.perspective_api_request import PerspectiveAPIRequest
|
|
@@ -273,6 +273,7 @@ def main():
|
|
|
273
273
|
default="",
|
|
274
274
|
)
|
|
275
275
|
args = parser.parse_args()
|
|
276
|
+
setup_default_logging()
|
|
276
277
|
|
|
277
278
|
register_builtin_configs_from_helm_package()
|
|
278
279
|
register_configs_from_directory(args.base_path)
|
helm/proxy/static/index.css
CHANGED
helm/proxy/static/index.js
CHANGED
|
@@ -282,7 +282,13 @@ $(function () {
|
|
|
282
282
|
requestResult.completions.forEach((completion) => {
|
|
283
283
|
const $contents = $("<span>", {
|
|
284
284
|
title: `logprob: ${completion.logprob}`,
|
|
285
|
-
})
|
|
285
|
+
});
|
|
286
|
+
if (completion.thinking) {
|
|
287
|
+
const $thinking = $("<span>", { class: "thinking" }).append(completion.thinking.text);
|
|
288
|
+
$contents.append($thinking);
|
|
289
|
+
}
|
|
290
|
+
const $resultText = completion.tokens.length > 0 ?renderTokens(completion.tokens) : $("<div>").append(completion.text);
|
|
291
|
+
$contents.append($resultText);
|
|
286
292
|
const $metadata = $("<span>", { class: "metadata" });
|
|
287
293
|
$metadata.append(
|
|
288
294
|
$("<span>", { title: "Log probability" }).append(
|
|
@@ -34,6 +34,8 @@ class GrokAPITokenizer(CachingTokenizer):
|
|
|
34
34
|
"Set grokApiKey in credentials.conf or set the GROK_API_KEY environment variable"
|
|
35
35
|
)
|
|
36
36
|
text = request["text"]
|
|
37
|
+
if not text:
|
|
38
|
+
return {"token_ids": []}
|
|
37
39
|
model = request["tokenizer"].split("/")[-1]
|
|
38
40
|
response = requests.post(
|
|
39
41
|
url="https://api.x.ai/v1/tokenize-text",
|
|
@@ -1,14 +0,0 @@
|
|
|
1
|
-
from helm.benchmark.annotation.aci_bench_annotator import ANNOTATOR_MODELS
|
|
2
|
-
from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
class ACIBenchMetric(LLMJuryMetric):
|
|
6
|
-
"""Score metrics for ACIBench."""
|
|
7
|
-
|
|
8
|
-
def __init__(self):
|
|
9
|
-
super().__init__(
|
|
10
|
-
metric_name="aci_bench_accuracy",
|
|
11
|
-
scenario_name="aci_bench",
|
|
12
|
-
annotator_models=ANNOTATOR_MODELS,
|
|
13
|
-
default_score=1.0,
|
|
14
|
-
)
|
|
@@ -1,14 +0,0 @@
|
|
|
1
|
-
from helm.benchmark.annotation.chw_care_plan_annotator import ANNOTATOR_MODELS
|
|
2
|
-
from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
class CHWCarePlanMetric(LLMJuryMetric):
|
|
6
|
-
"""Score metrics for CHWCarePlan."""
|
|
7
|
-
|
|
8
|
-
def __init__(self):
|
|
9
|
-
super().__init__(
|
|
10
|
-
metric_name="chw_care_plan_accuracy",
|
|
11
|
-
scenario_name="chw_care_plan",
|
|
12
|
-
annotator_models=ANNOTATOR_MODELS,
|
|
13
|
-
default_score=1.0,
|
|
14
|
-
)
|
|
@@ -1,14 +0,0 @@
|
|
|
1
|
-
from helm.benchmark.annotation.dischargeme_annotator import ANNOTATOR_MODELS
|
|
2
|
-
from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
class DischargeMeMetric(LLMJuryMetric):
|
|
6
|
-
"""Score metrics for DischargeMe."""
|
|
7
|
-
|
|
8
|
-
def __init__(self):
|
|
9
|
-
super().__init__(
|
|
10
|
-
metric_name="dischargeme_accuracy",
|
|
11
|
-
scenario_name="dischargeme",
|
|
12
|
-
annotator_models=ANNOTATOR_MODELS,
|
|
13
|
-
default_score=1.0,
|
|
14
|
-
)
|
|
@@ -1,14 +0,0 @@
|
|
|
1
|
-
from helm.benchmark.annotation.med_dialog_annotator import ANNOTATOR_MODELS
|
|
2
|
-
from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
class MedDialogMetric(LLMJuryMetric):
|
|
6
|
-
"""Score metrics for MedDialog."""
|
|
7
|
-
|
|
8
|
-
def __init__(self):
|
|
9
|
-
super().__init__(
|
|
10
|
-
metric_name="med_dialog_accuracy",
|
|
11
|
-
scenario_name="med_dialog",
|
|
12
|
-
annotator_models=ANNOTATOR_MODELS,
|
|
13
|
-
default_score=1.0,
|
|
14
|
-
)
|
|
@@ -1,14 +0,0 @@
|
|
|
1
|
-
from helm.benchmark.annotation.medalign_annotator import ANNOTATOR_MODELS
|
|
2
|
-
from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
class MedalignMetric(LLMJuryMetric):
|
|
6
|
-
"""Score metrics for Medalign."""
|
|
7
|
-
|
|
8
|
-
def __init__(self):
|
|
9
|
-
super().__init__(
|
|
10
|
-
metric_name="medalign_accuracy",
|
|
11
|
-
scenario_name="medalign",
|
|
12
|
-
annotator_models=ANNOTATOR_MODELS,
|
|
13
|
-
default_score=1.0,
|
|
14
|
-
)
|