crfm-helm 0.5.5__py3-none-any.whl → 0.5.6__py3-none-any.whl
This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/METADATA +27 -13
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/RECORD +203 -156
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +3 -3
- helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
- helm/benchmark/annotation/air_bench_annotator.py +1 -1
- helm/benchmark/annotation/bigcodebench_annotator.py +3 -3
- helm/benchmark/annotation/bird_sql_annotator.py +2 -2
- helm/benchmark/annotation/chw_care_plan_annotator.py +7 -12
- helm/benchmark/annotation/ehr_sql_annotator.py +2 -2
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +7 -7
- helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
- helm/benchmark/annotation/model_as_judge.py +12 -16
- helm/benchmark/annotation/omni_math_annotator.py +13 -14
- helm/benchmark/annotation/wildbench_annotator.py +9 -9
- helm/benchmark/executor.py +11 -12
- helm/benchmark/metrics/aci_bench_metrics.py +9 -29
- helm/benchmark/metrics/bias_word_lists.py +1 -1
- helm/benchmark/metrics/chw_care_plan_metrics.py +10 -30
- helm/benchmark/metrics/classification_metrics.py +3 -3
- helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +2 -2
- helm/benchmark/metrics/dischargeme_metrics.py +9 -29
- helm/benchmark/metrics/efficiency_metrics.py +3 -3
- helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
- helm/benchmark/metrics/ifeval_metrics.py +2 -2
- helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
- helm/benchmark/metrics/llm_jury_metrics.py +46 -0
- helm/benchmark/metrics/med_dialog_metrics.py +9 -29
- helm/benchmark/metrics/medalign_metrics.py +9 -29
- helm/benchmark/metrics/medi_qa_metrics.py +9 -29
- helm/benchmark/metrics/medication_qa_metrics.py +10 -30
- helm/benchmark/metrics/melt_bias_metric.py +234 -0
- helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
- helm/benchmark/metrics/melt_metric_specs.py +43 -0
- helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
- helm/benchmark/metrics/mental_health_metrics.py +9 -29
- helm/benchmark/metrics/metric_service.py +11 -11
- helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
- helm/benchmark/metrics/mimic_rrs_metrics.py +9 -29
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +9 -29
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +9 -29
- helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
- helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +9 -29
- helm/benchmark/metrics/summac/model_summac.py +1 -2
- helm/benchmark/metrics/summarization_metrics.py +2 -1
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +2 -2
- helm/benchmark/metrics/toxicity_metrics.py +2 -2
- helm/benchmark/metrics/unitxt_metrics.py +3 -4
- helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
- helm/benchmark/metrics/vision_language/image_utils.py +2 -2
- helm/benchmark/model_deployment_registry.py +6 -8
- helm/benchmark/presentation/contamination.py +3 -3
- helm/benchmark/presentation/create_plots.py +33 -12
- helm/benchmark/presentation/run_display.py +13 -0
- helm/benchmark/presentation/schema.py +2 -1
- helm/benchmark/presentation/summarize.py +76 -59
- helm/benchmark/reeval_run.py +3 -4
- helm/benchmark/reeval_runner.py +3 -3
- helm/benchmark/run.py +78 -73
- helm/benchmark/run_expander.py +12 -1
- helm/benchmark/run_spec_factory.py +7 -6
- helm/benchmark/run_specs/audio_run_specs.py +52 -8
- helm/benchmark/run_specs/enterprise_run_specs.py +20 -0
- helm/benchmark/run_specs/experimental_run_specs.py +31 -1
- helm/benchmark/run_specs/long_context_run_specs.py +67 -15
- helm/benchmark/run_specs/medhelm_run_specs.py +146 -41
- helm/benchmark/run_specs/melt_run_specs.py +783 -0
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +169 -0
- helm/benchmark/run_specs/vlm_run_specs.py +28 -0
- helm/benchmark/runner.py +5 -5
- helm/benchmark/scenarios/aci_bench_scenario.py +7 -1
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +3 -1
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +5 -5
- helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +103 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +110 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +109 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +15 -1
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +1 -2
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +2 -2
- helm/benchmark/scenarios/chw_care_plan_scenario.py +14 -13
- helm/benchmark/scenarios/clear_scenario.py +11 -7
- helm/benchmark/scenarios/dischargeme_scenario.py +36 -21
- helm/benchmark/scenarios/ehr_sql_scenario.py +7 -1
- helm/benchmark/scenarios/ehrshot_scenario.py +28 -55
- helm/benchmark/scenarios/grammar.py +2 -2
- helm/benchmark/scenarios/headqa_scenario.py +6 -1
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
- helm/benchmark/scenarios/{infinite_bench_sum_scenario.py → infinite_bench_en_sum_scenario.py} +10 -13
- helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
- helm/benchmark/scenarios/med_dialog_scenario.py +6 -1
- helm/benchmark/scenarios/medalign_scenario.py +9 -3
- helm/benchmark/scenarios/medalign_scenario_helper.py +8 -5
- helm/benchmark/scenarios/medbullets_scenario.py +7 -2
- helm/benchmark/scenarios/medcalc_bench_scenario.py +4 -2
- helm/benchmark/scenarios/medec_scenario.py +6 -1
- helm/benchmark/scenarios/medhallu_scenario.py +7 -1
- helm/benchmark/scenarios/medi_qa_scenario.py +10 -4
- helm/benchmark/scenarios/medication_qa_scenario.py +7 -1
- helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
- helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
- helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
- helm/benchmark/scenarios/melt_scenarios.py +793 -0
- helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
- helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
- helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
- helm/benchmark/scenarios/mental_health_scenario.py +16 -5
- helm/benchmark/scenarios/mimic_bhc_scenario.py +12 -7
- helm/benchmark/scenarios/mimic_rrs_scenario.py +17 -8
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +14 -8
- helm/benchmark/scenarios/mmlu_pro_scenario.py +1 -1
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +5 -2
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +3 -2
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +11 -5
- helm/benchmark/scenarios/numeracy_scenario.py +2 -1
- helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
- helm/benchmark/scenarios/pubmed_qa_scenario.py +6 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +18 -8
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +2 -2
- helm/benchmark/scenarios/ruler_qa_scenarios.py +2 -2
- helm/benchmark/scenarios/shc_bmt_scenario.py +12 -6
- helm/benchmark/scenarios/shc_cdi_scenario.py +11 -6
- helm/benchmark/scenarios/shc_conf_scenario.py +12 -6
- helm/benchmark/scenarios/shc_ent_scenario.py +11 -6
- helm/benchmark/scenarios/shc_gip_scenario.py +13 -5
- helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +12 -7
- helm/benchmark/scenarios/shc_sei_scenario.py +12 -7
- helm/benchmark/scenarios/shc_sequoia_scenario.py +13 -5
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +15 -8
- helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
- helm/benchmark/scenarios/truthful_qa_scenario.py +2 -1
- helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
- helm/benchmark/server.py +2 -1
- helm/benchmark/static/schema_audio.yaml +60 -49
- helm/benchmark/static/schema_enterprise.yaml +21 -0
- helm/benchmark/static/schema_long_context.yaml +63 -20
- helm/benchmark/static/schema_medhelm.yaml +272 -213
- helm/benchmark/static/schema_melt.yaml +1257 -0
- helm/benchmark/static/schema_slphelm.yaml +162 -0
- helm/benchmark/static/schema_vhelm.yaml +26 -26
- helm/benchmark/static/schema_video.yaml +219 -0
- helm/benchmark/static_build/assets/index-94295e78.js +10 -0
- helm/benchmark/static_build/assets/index-b9779128.css +1 -0
- helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
- helm/benchmark/static_build/assets/{tremor-9cefc3c5.js → tremor-38a10867.js} +1 -1
- helm/benchmark/static_build/index.html +4 -4
- helm/benchmark/window_services/encoder_decoder_window_service.py +3 -3
- helm/benchmark/window_services/test_utils.py +3 -4
- helm/benchmark/window_services/tokenizer_service.py +7 -8
- helm/clients/anthropic_client.py +69 -29
- helm/clients/audio_language/diva_llama_client.py +4 -2
- helm/clients/audio_language/qwen2_5_omni_client.py +197 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +8 -6
- helm/clients/audio_language/qwen_audiolm_client.py +4 -2
- helm/clients/audio_language/test.py +62 -0
- helm/clients/bedrock_client.py +3 -1
- helm/clients/client.py +7 -7
- helm/clients/grok_client.py +36 -0
- helm/clients/huggingface_client.py +42 -3
- helm/clients/huggingface_pipeline_client.py +138 -0
- helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
- helm/clients/image_generation/dalle_mini/model/modeling.py +1 -1
- helm/clients/image_generation/dalle_mini/model/processor.py +1 -1
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
- helm/clients/openai_client.py +100 -54
- helm/clients/openai_responses_client.py +174 -0
- helm/clients/palmyra_client.py +2 -5
- helm/clients/reka_client.py +2 -2
- helm/clients/together_client.py +31 -4
- helm/clients/vertexai_client.py +6 -0
- helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
- helm/clients/vision_language/huggingface_vlm_client.py +2 -2
- helm/clients/vision_language/idefics_client.py +6 -2
- helm/clients/vision_language/paligemma_client.py +2 -2
- helm/clients/vision_language/qwen2_vlm_client.py +66 -53
- helm/clients/vision_language/qwen_vlm_client.py +7 -5
- helm/clients/writer_client.py +102 -0
- helm/common/context.py +80 -0
- helm/common/credentials_utils.py +5 -5
- helm/common/general.py +9 -2
- helm/common/hierarchical_logger.py +46 -3
- helm/common/local_context.py +140 -0
- helm/common/remote_context.py +61 -0
- helm/common/request.py +8 -0
- helm/config/model_deployments.yaml +864 -193
- helm/config/model_metadata.yaml +667 -53
- helm/config/tokenizer_configs.yaml +144 -3
- helm/proxy/cli.py +3 -1
- helm/proxy/critique/mechanical_turk_utils.py +1 -1
- helm/proxy/services/server_service.py +21 -85
- helm/tokenizers/grok_tokenizer.py +53 -0
- helm/tokenizers/huggingface_tokenizer.py +1 -1
- helm/tokenizers/test_grok_tokenizer.py +33 -0
- helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +0 -46
- helm/benchmark/static_build/assets/index-262903c1.js +0 -10
- helm/benchmark/static_build/assets/index-42060d71.css +0 -1
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/licenses/LICENSE +0 -0
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/top_level.txt +0 -0
- /helm/benchmark/static_build/assets/{medhelm-overview-3ddfcd65.png → medhelm-v1-overview-3ddfcd65.png} +0 -0
helm/config/tokenizer_configs.yaml
CHANGED

@@ -252,6 +252,19 @@ tokenizer_configs:
     end_of_text_token: "<eos>"
     prefix_token: "<bos>"
 
+  # Grok
+  - name: xai/grok-3-beta
+    tokenizer_spec:
+      class_name: "helm.tokenizers.grok_tokenizer.GrokAPITokenizer"
+    end_of_text_token: ""
+    prefix_token: ""
+
+  - name: xai/grok-3-mini-beta
+    tokenizer_spec:
+      class_name: "helm.tokenizers.grok_tokenizer.GrokAPITokenizer"
+    end_of_text_token: ""
+    prefix_token: ""
+
   # Hf-internal-testing
 
   # Tokenizer name hf-internal-testing/llama-tokenizer is taken from:
@@ -299,6 +312,22 @@ tokenizer_configs:
     end_of_text_token: "<|endoftext|>"
     prefix_token: "<|endoftext|>"
 
+  - name: huggingface/smollm2-135m
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+      args:
+        pretrained_model_name_or_path: HuggingFaceTB/SmolLM2-135M
+    end_of_text_token: "<|endoftext|>"
+    prefix_token: "<|endoftext|>"
+
+  - name: huggingface/smollm2-135m-instruct
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+      args:
+        pretrained_model_name_or_path: HuggingFaceTB/SmolLM2-135M-Instruct
+    end_of_text_token: "<|endoftext|>"
+    prefix_token: "<|im_end|>"
+
   # Lighting AI
   - name: lightningai/lit-gpt
     tokenizer_spec:
@@ -396,6 +425,14 @@ tokenizer_configs:
     prefix_token: "<|begin_of_text|>"
     end_of_text_token: "<|eot_id|>"
 
+  - name: meta/llama-4-scout-17b-16e-instruct
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+      args:
+        pretrained_model_name_or_path: meta-llama/Llama-4-Scout-17B-16E-Instruct
+    prefix_token: "<|begin_of_text|>"
+    end_of_text_token: "<|end_of_text|>"
+
   # 01-ai
   - name: 01-ai/Yi-6B
     tokenizer_spec:
@@ -432,9 +469,33 @@ tokenizer_configs:
     end_of_text_token: "<|endoftext|>"
     prefix_token: ""
 
+  - name: allenai/olmo-2-1124-7b-instruct
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+    end_of_text_token: "<|endoftext|>"
+    prefix_token: "<|endoftext|>"
+
+  - name: allenai/olmo-2-0325-32b-instruct
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+    end_of_text_token: "<|endoftext|>"
+    prefix_token: "<|endoftext|>"
+
+  - name: allenai/olmoe-1b-7b-0125-instruct
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+    end_of_text_token: "|||IP_ADDRESS|||"
+    prefix_token: "|||IP_ADDRESS|||"
+
+  # Marin Community
+  - name: marin-community/marin-8b-instruct
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+    end_of_text_token: "<|eot_id|>"
+    prefix_token: "<|begin_of_text|>"
 
   # Microsoft
-  - name: microsoft/phi-2
+  - name: microsoft/phi-2
     tokenizer_spec:
       class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
     end_of_text_token: "<|endoftext|>"
@@ -619,6 +680,14 @@ tokenizer_configs:
     end_of_text_token: "<|im_end|>"
     prefix_token: "<|im_start|>"
 
+  - name: qwen/qwen3-235b-a22b
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+      args:
+        pretrained_model_name_or_path: Qwen/Qwen3-235B-A22B
+    end_of_text_token: "<|im_end|>"
+    prefix_token: "<|im_start|>"
+
   - name: qwen/qwq-32b-preview
     tokenizer_spec:
       class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
@@ -664,6 +733,15 @@ tokenizer_configs:
     end_of_text_token: "<|endoftext|>"
     prefix_token: ""
 
+  - name: qwen/qwen2.5-omni-7b
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+      args:
+        pretrained_model_name_or_path: Qwen/Qwen2.5-Omni-7B
+        trust_remote_code: false
+    end_of_text_token: "<|endoftext|>"
+    prefix_token: ""
+
   # SambaLingo
   - name: sambanova/sambalingo-thai-base
     tokenizer_spec:
@@ -910,8 +988,6 @@ tokenizer_configs:
     prefix_token: ""
     end_of_text_token: ""
 
-
-
   - name: ibm-granite/granite-34b-code-instruct-8k
     tokenizer_spec:
      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
@@ -945,7 +1021,17 @@ tokenizer_configs:
     prefix_token: ""
     end_of_text_token: ""
 
+  # IBM Granite 3.3
+
+  - name: ibm/granite-3.3-8b-instruct
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+      args:
+        pretrained_model_name_or_path: ibm-granite/granite-3.3-8b-instruct
+    end_of_text_token: "<|end_of_text|>"
+    prefix_token: "<|end_of_text|>"
 
+
 
   # DeepSeek-R1-Distill-Llama-3.1-8b
   - name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
@@ -964,3 +1050,58 @@ tokenizer_configs:
         pretrained_model_name_or_path: deepseek-ai/deepseek-coder-6.7b-instruct
     end_of_text_token: "<|end▁of▁sentence|>"
     prefix_token: "<|begin▁of▁sentence|>"
+
+
+  # vilm/vinallama-2.7b-chat
+  - name: vilm/vinallama-2.7b-chat
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+      args:
+        pretrained_model_name_or_path: vilm/vinallama-2.7b-chat
+    end_of_text_token: "<im_end>"
+    prefix_token: "<im_start>"
+
+  # vilm/vinallama-7b-chat
+  - name: vilm/vinallama-7b-chat
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+      args:
+        pretrained_model_name_or_path: vilm/vinallama-7b-chat
+    end_of_text_token: "<im_end>"
+    prefix_token: "<im_start>"
+
+  # vilm/vietcuna-7b-v3
+  - name: vilm/vietcuna-7b-v3
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+      args:
+        pretrained_model_name_or_path: vilm/vietcuna-7b-v3
+    end_of_text_token: "</s>"
+    prefix_token: "<s>"
+
+  # Viet-Mistral/Vistral-7B-Chat
+  - name: Viet-Mistral/Vistral-7B-Chat
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+      args:
+        pretrained_model_name_or_path: Viet-Mistral/Vistral-7B-Chat
+    end_of_text_token: "</s>"
+    prefix_token: "<s>"
+
+  # vinai/PhoGPT-7B5-Instruct
+  - name: vinai/PhoGPT-7B5-Instruct
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+      args:
+        pretrained_model_name_or_path: vinai/PhoGPT-7B5-Instruct
+    end_of_text_token: "</s>"
+    prefix_token: "<s>"
+
+  # vinai/PhoGPT-4B-Chat
+  - name: vinai/PhoGPT-4B-Chat
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+      args:
+        pretrained_model_name_or_path: vinai/PhoGPT-4B-Chat
+    end_of_text_token: "</s>"
+    prefix_token: "<s>"
helm/proxy/cli.py
CHANGED

@@ -21,7 +21,7 @@ from typing import List, Dict
 import re
 import sys
 
-from helm.common.hierarchical_logger import hlog
+from helm.common.hierarchical_logger import hlog, setup_default_logging
 from helm.common.authentication import Authentication
 from helm.proxy.accounts import Usage, Account
 from helm.proxy.services.remote_service import RemoteService, add_service_args, create_authentication
@@ -198,6 +198,8 @@ def main():
 
     args = parser.parse_args()
 
+    setup_default_logging()
+
     service = create_remote_service(args)
     auth = create_authentication(args)
 
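
The cli.py change imports setup_default_logging and calls it at the top of main(), before any service calls, so that everything logged afterwards is configured. A rough sketch of the same pattern using only the standard library — setup_default_logging_sketch is a stand-in, not the real helper:

import argparse
import logging


def setup_default_logging_sketch() -> None:
    # Stand-in for helm.common.hierarchical_logger.setup_default_logging.
    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(message)s")


def main() -> None:
    parser = argparse.ArgumentParser()
    parser.parse_args()
    setup_default_logging_sketch()  # configure once, before any log output
    logging.info("service starting")


if __name__ == "__main__":
    main()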
helm/proxy/critique/mechanical_turk_utils.py
CHANGED

@@ -38,7 +38,7 @@ def replace_emoji_characters(s: str) -> str:
         highpoints = re.compile("[\U00010000-\U0010ffff]")
     elif sys.maxunicode == 65535:
         # Python was built with '--enable-unicode=ucs2'
-        highpoints = re.compile("[\
+        highpoints = re.compile("[\ud800-\udbff][\udc00-\udfff]")
     else:
         raise UnicodeError("Unable to determine if Python was built using UCS-2 or UCS-4")
 
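
The repaired branch covers narrow (UCS-2) Python builds, where characters outside the Basic Multilingual Plane are stored as surrogate pairs, so an emoji has to be matched as a high-surrogate/low-surrogate pair rather than as a single code point. A small self-contained check of that encoding (modern CPython is always a wide build, so only the first pattern is exercised in practice):

import re

astral = "otter \U0001F9A6"  # U+1F9A6 is outside the BMP

# Wide builds can match astral code points directly:
wide_pattern = re.compile("[\U00010000-\U0010ffff]")
print(wide_pattern.sub("<emoji>", astral))  # otter <emoji>

# What narrow builds stored instead: the UTF-16 surrogate pair.
high, low = divmod(ord("\U0001F9A6") - 0x10000, 0x400)
print(hex(0xD800 + high), hex(0xDC00 + low))  # 0xd83e 0xdda6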
helm/proxy/services/server_service.py
CHANGED

@@ -1,8 +1,8 @@
-import dataclasses
 import os
 import signal
-from typing import List
+from typing import List
 
+from helm.common.local_context import LocalContext
 from helm.common.cache import CacheConfig
 from helm.common.cache_backend_config import CacheBackendConfig, BlackHoleCacheBackendConfig
 from helm.common.critique_request import CritiqueRequest, CritiqueRequestResult
@@ -11,7 +11,6 @@ from helm.common.moderations_api_request import ModerationAPIRequest, Moderation
 from helm.common.clip_score_request import CLIPScoreRequest, CLIPScoreResult
 from helm.common.nudity_check_request import NudityCheckRequest, NudityCheckResult
 from helm.common.file_upload_request import FileUploadRequest, FileUploadResult
-from helm.common.general import ensure_directory_exists, parse_hocon, get_credentials
 from helm.common.perspective_api_request import PerspectiveAPIRequest, PerspectiveAPIRequestResult
 from helm.common.tokenization_request import (
     TokenizationRequest,
@@ -22,27 +21,13 @@ from helm.common.tokenization_request import (
 from helm.common.request import Request, RequestResult
 from helm.common.hierarchical_logger import hlog
 from helm.proxy.accounts import Accounts, Account
-from helm.clients.auto_client import AutoClient
-from helm.clients.moderation_api_client import ModerationAPIClient
-from helm.clients.image_generation.nudity_check_client import NudityCheckClient
-from helm.clients.gcs_client import GCSClient
-from helm.clients.clip_score_client import CLIPScoreClient
-from helm.clients.toxicity_classifier_client import ToxicityClassifierClient
-from helm.proxy.example_queries import example_queries
-from helm.benchmark.model_metadata_registry import ALL_MODELS_METADATA
 from helm.benchmark.model_deployment_registry import get_model_deployment_host_organization
 from helm.proxy.query import Query, QueryResult
-from helm.proxy.retry import retry_request
 from helm.proxy.token_counters.auto_token_counter import AutoTokenCounter
-from helm.tokenizers.auto_tokenizer import AutoTokenizer
 from helm.proxy.services.service import (
     Service,
-    CACHE_DIR,
     ACCOUNTS_FILE,
     GeneralInfo,
-    VERSION,
-    expand_environments,
-    synthesize_request,
 )
 
 
@@ -57,43 +42,17 @@ class ServerService(Service):
         root_mode: bool = False,
         cache_backend_config: CacheBackendConfig = BlackHoleCacheBackendConfig(),
     ):
-        ensure_directory_exists(base_path)
-        client_file_storage_path = os.path.join(base_path, CACHE_DIR)
-        ensure_directory_exists(client_file_storage_path)
-
-        credentials = get_credentials(base_path)
         accounts_path = os.path.join(base_path, ACCOUNTS_FILE)
 
-        self.
-        self.
-        self.tokenizer = AutoTokenizer(credentials, cache_backend_config)
-        self.token_counter = AutoTokenCounter(self.tokenizer)
+        self.context = LocalContext(base_path, cache_backend_config)
+        self.token_counter = AutoTokenCounter(self.context.tokenizer)
         self.accounts = Accounts(accounts_path, root_mode=root_mode)
 
-        # Lazily instantiate the following clients
-        self.moderation_api_client: Optional[ModerationAPIClient] = None
-        self.toxicity_classifier_client: Optional[ToxicityClassifierClient] = None
-        self.perspective_api_client: Optional[ToxicityClassifierClient] = None
-        self.nudity_check_client: Optional[NudityCheckClient] = None
-        self.clip_score_client: Optional[CLIPScoreClient] = None
-        self.gcs_client: Optional[GCSClient] = None
-
     def get_general_info(self) -> GeneralInfo:
-
-        # TODO(#2158): Either fix this or delete get_general_info.
-        all_models = [dataclasses.replace(model_metadata, release_date=None) for model_metadata in ALL_MODELS_METADATA]
-        return GeneralInfo(version=VERSION, example_queries=example_queries, all_models=all_models)
+        return self.context.get_general_info()
 
     def expand_query(self, query: Query) -> QueryResult:
-
-        prompt = query.prompt
-        settings = query.settings
-        environments = parse_hocon(query.environments)
-        requests = []
-        for environment in expand_environments(environments):
-            request = synthesize_request(prompt, settings, environment)
-            requests.append(request)
-        return QueryResult(requests=requests)
+        return self.context.expand_query(query)
 
     def _get_model_group_for_model_deployment(self, model_deployment: str) -> str:
         if model_deployment.startswith("openai/"):
@@ -105,7 +64,11 @@ class ServerService(Service):
             return "gpt4"
         elif model_deployment.startswith("openai/gpt-3"):
             return "gpt3"
-        elif
+        elif (
+            model_deployment.startswith("openai/o1")
+            or model_deployment.startswith("openai/o3")
+            or model_deployment.startswith("openai/o4")
+        ):
             return "o1"
         else:
             return "openai"
@@ -126,7 +89,7 @@ class ServerService(Service):
         self.accounts.check_can_use(auth.api_key, model_group)
 
         # Use!
-        request_result: RequestResult = self.
+        request_result: RequestResult = self.context.make_request(request)
 
         # Only deduct if not cached
         if not request_result.cached:
@@ -139,66 +102,39 @@ class ServerService(Service):
     def tokenize(self, auth: Authentication, request: TokenizationRequest) -> TokenizationRequestResult:
         """Tokenize via an API."""
         self.accounts.authenticate(auth)
-        return self.
+        return self.context.tokenize(request)
 
     def decode(self, auth: Authentication, request: DecodeRequest) -> DecodeRequestResult:
         """Decodes to text."""
         self.accounts.authenticate(auth)
-        return self.
+        return self.context.decode(request)
 
     def upload(self, auth: Authentication, request: FileUploadRequest) -> FileUploadResult:
         """Uploads a file to external storage."""
         self.accounts.authenticate(auth)
-
-        if not self.gcs_client:
-            self.gcs_client = self.client.get_gcs_client()
-
-        assert self.gcs_client
-        return self.gcs_client.upload(request)
+        return self.context.upload(request)
 
     def check_nudity(self, auth: Authentication, request: NudityCheckRequest) -> NudityCheckResult:
         """Check for nudity."""
         self.accounts.authenticate(auth)
-
-        if not self.nudity_check_client:
-            self.nudity_check_client = self.client.get_nudity_check_client()
-
-        assert self.nudity_check_client
-        return self.nudity_check_client.check_nudity(request)
+        return self.context.check_nudity(request)
 
     def compute_clip_score(self, auth: Authentication, request: CLIPScoreRequest) -> CLIPScoreResult:
         """Computes CLIPScore for a given caption and image."""
         self.accounts.authenticate(auth)
-
-        if not self.clip_score_client:
-            self.clip_score_client = self.client.get_clip_score_client()
-
-        assert self.clip_score_client
-        return self.clip_score_client.compute_score(request)
+        return self.context.compute_clip_score(request)
 
     def get_toxicity_scores(self, auth: Authentication, request: PerspectiveAPIRequest) -> PerspectiveAPIRequestResult:
-        @retry_request
-        def get_toxicity_scores_with_retry(request: PerspectiveAPIRequest) -> PerspectiveAPIRequestResult:
-            if not self.toxicity_classifier_client:
-                self.toxicity_classifier_client = self.client.get_toxicity_classifier_client()
-            return self.toxicity_classifier_client.get_toxicity_scores(request)
-
         self.accounts.authenticate(auth)
-        return
+        return self.context.get_toxicity_scores(request)
 
     def get_moderation_results(self, auth: Authentication, request: ModerationAPIRequest) -> ModerationAPIRequestResult:
-        @retry_request
-        def get_moderation_results_with_retry(request: ModerationAPIRequest) -> ModerationAPIRequestResult:
-            if not self.moderation_api_client:
-                self.moderation_api_client = self.client.get_moderation_api_client()
-            return self.moderation_api_client.get_moderation_results(request)
-
         self.accounts.authenticate(auth)
-        return
+        return self.context.get_moderation_results(request)
 
     def make_critique_request(self, auth: Authentication, request: CritiqueRequest) -> CritiqueRequestResult:
         self.accounts.authenticate(auth)
-        return self.
+        return self.context.make_critique_request(request)
 
     def create_account(self, auth: Authentication) -> Account:
         """Creates a new account."""
@@ -233,4 +169,4 @@ class ServerService(Service):
         hlog("Done.")
 
     def get_cache_config(self, shard_name: str) -> CacheConfig:
-        return self.
+        return self.context.get_cache_config(shard_name)
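
The net effect of this refactor is that ServerService no longer owns an AutoClient, AutoTokenizer, and half a dozen lazily created clients; it keeps authentication and account metering, and forwards each operation to a single LocalContext. A minimal sketch of that delegation shape — all class and method names below are illustrative stand-ins, not HELM's real API:

from dataclasses import dataclass


@dataclass
class FakeResult:
    text: str
    cached: bool = False


class ContextSketch:
    """Stand-in for helm.common.local_context.LocalContext: owns the clients."""

    def make_request(self, prompt: str) -> FakeResult:
        return FakeResult(text=f"response to: {prompt}")


class ServiceSketch:
    """Stand-in for ServerService: authenticates, then delegates."""

    def __init__(self, context: ContextSketch) -> None:
        self.context = context

    def make_request(self, api_key: str, prompt: str) -> FakeResult:
        assert api_key, "authentication and quota checks happen here"
        return self.context.make_request(prompt)


print(ServiceSketch(ContextSketch()).make_request("key", "hello").text)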
helm/tokenizers/grok_tokenizer.py
ADDED

@@ -0,0 +1,53 @@
+import dataclasses
+import os
+from typing import Any, Dict, List, Optional
+
+import requests
+
+from helm.common.cache import CacheConfig
+from helm.common.tokenization_request import (
+    TokenizationRequest,
+    TokenizationToken,
+)
+from helm.tokenizers.caching_tokenizer import CachingTokenizer
+
+
+class GrokAPITokenizer(CachingTokenizer):
+    """Tokenizer that uses the xAI Grok Tokenize Text API
+
+    Doc: https://docs.x.ai/docs/api-reference#tokenize-text"""
+
+    def __init__(self, cache_config: CacheConfig, api_key: Optional[str] = None) -> None:
+        super().__init__(cache_config)
+        self.api_key = api_key or os.environ.get("XAI_API_KEY")
+
+    def _tokenization_request_to_cache_key(self, request: TokenizationRequest) -> Dict[str, Any]:
+        cache_key = dataclasses.asdict(request)
+        # Delete encode because the Grok API simultaneously gives string and integer tokens.
+        del cache_key["encode"]
+        return cache_key
+
+    def _tokenize_do_it(self, request: Dict[str, Any]) -> Dict[str, Any]:
+        if not self.api_key:
+            raise Exception(
+                "No Grok API key found. "
+                "Set grokApiKey in credentials.conf or set the GROK_API_KEY environment variable"
+            )
+        text = request["text"]
+        model = request["tokenizer"].split("/")[-1]
+        response = requests.post(
+            url="https://api.x.ai/v1/tokenize-text",
+            headers={"Authorization": f"Bearer {self.api_key}"},
+            json={"text": text, "model": model},
+        )
+        response.raise_for_status()
+        return response.json()
+
+    def _tokenization_raw_response_to_tokens(
+        self, response: Dict[str, Any], request: TokenizationRequest
+    ) -> List[TokenizationToken]:
+        raw_token_field_name = "token_id" if request.encode else "string_token"
+        return [TokenizationToken(raw_token[raw_token_field_name]) for raw_token in response["token_ids"]]
+
+    def _decode_do_it(self, request: Dict[str, Any]) -> Dict[str, Any]:
+        raise NotImplementedError("The xAI API does not support decoding.")
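
For reference, the HTTP exchange that _tokenize_do_it wraps, reconstructed only from the endpoint, payload, and response fields visible above (each token_ids entry carries both token_id and string_token); this sketch assumes a valid key in XAI_API_KEY:

import os

import requests

response = requests.post(
    "https://api.x.ai/v1/tokenize-text",
    headers={"Authorization": f"Bearer {os.environ['XAI_API_KEY']}"},
    json={"text": "otter", "model": "grok-3-beta"},
)
response.raise_for_status()
# Each entry provides the integer id and its string form side by side.
for token in response.json()["token_ids"]:
    print(token["token_id"], token["string_token"])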
helm/tokenizers/huggingface_tokenizer.py
CHANGED

@@ -119,7 +119,7 @@ class HuggingFaceTokenizer(CachingTokenizer):
             tokens = tokenizer.encode(
                 request["text"],
                 truncation=request["truncation"],
-                max_length=request["max_length"],
+                max_length=max(request["max_length"], 0),
                 add_special_tokens=False,
             )
         else:
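
This one-line change floors max_length at zero before it reaches tokenizer.encode, presumably guarding against a negative length budget computed upstream. The clamp in isolation, with illustrative values only:

for requested in (512, 0, -7):
    print(f"max_length={requested} -> {max(requested, 0)}")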
helm/tokenizers/test_grok_tokenizer.py
ADDED

@@ -0,0 +1,33 @@
+import os
+import pytest
+
+from helm.common.cache import BlackHoleCacheConfig
+from helm.common.tokenization_request import (
+    TokenizationRequest,
+    TokenizationToken,
+)
+from helm.tokenizers.grok_tokenizer import GrokAPITokenizer
+
+
+@pytest.mark.models
+def test_tokenize():
+    if not os.environ.get("XAI_API_KEY"):
+        pytest.skip("No xAI API key found; skipping test")
+    tokenizer = GrokAPITokenizer(cache_config=BlackHoleCacheConfig())
+    request = TokenizationRequest(tokenizer="xai/grok-3-beta", text="otter 🦦")
+    result = tokenizer.tokenize(request)
+    assert result.success
+    assert not result.cached
+    assert result.tokens == [TokenizationToken(token) for token in ["otter", "", "", ""]]
+
+
+@pytest.mark.models
+def test_encode():
+    if not os.environ.get("XAI_API_KEY"):
+        pytest.skip("No xAI API key found; skipping test")
+    tokenizer = GrokAPITokenizer(cache_config=BlackHoleCacheConfig())
+    request = TokenizationRequest(tokenizer="xai/grok-3-beta", text="otter 🦦", encode=True)
+    result = tokenizer.tokenize(request)
+    assert result.success
+    assert not result.cached
+    assert result.tokens == [TokenizationToken(token) for token in [142507, 11637, 294, 294]]
|
@@ -1,46 +0,0 @@
|
|
|
1
|
-
import pytest
|
|
2
|
-
import re
|
|
3
|
-
from tempfile import TemporaryDirectory
|
|
4
|
-
from helm.benchmark.scenarios.infinite_bench_sum_scenario import InfiniteBenchSumScenario
|
|
5
|
-
from helm.benchmark.scenarios.scenario import CORRECT_TAG
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
def count_words(text: str) -> int:
|
|
9
|
-
return len(re.split(r"\s+", text.strip()))
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
@pytest.mark.scenarios
|
|
13
|
-
def test_infinite_bench_sum_scenario():
|
|
14
|
-
with TemporaryDirectory() as tmpdir:
|
|
15
|
-
scenario = InfiniteBenchSumScenario(min_num_words=0, max_num_words=10000000)
|
|
16
|
-
instances = scenario.get_instances(tmpdir)
|
|
17
|
-
assert len(instances) == 103
|
|
18
|
-
assert instances[0].split == "test"
|
|
19
|
-
assert len(instances[0].input.text) == 1745528
|
|
20
|
-
assert instances[0].extra_data
|
|
21
|
-
assert instances[0].extra_data["word_count"] == 308762
|
|
22
|
-
references = instances[0].references
|
|
23
|
-
assert len(references[0].output.text) == 2865
|
|
24
|
-
assert references[0].tags == [CORRECT_TAG]
|
|
25
|
-
|
|
26
|
-
scenario = InfiniteBenchSumScenario(min_num_words=0, max_num_words=100000)
|
|
27
|
-
instances = scenario.get_instances(tmpdir)
|
|
28
|
-
assert len(instances) == 48
|
|
29
|
-
assert instances[0].split == "test"
|
|
30
|
-
assert len(instances[0].input.text) == 381778
|
|
31
|
-
assert instances[0].extra_data
|
|
32
|
-
assert instances[0].extra_data["word_count"] == 69458
|
|
33
|
-
references = instances[0].references
|
|
34
|
-
assert len(references[0].output.text) == 4217
|
|
35
|
-
assert references[0].tags == [CORRECT_TAG]
|
|
36
|
-
|
|
37
|
-
scenario = InfiniteBenchSumScenario(min_num_words=30000, max_num_words=80000)
|
|
38
|
-
instances = scenario.get_instances(tmpdir)
|
|
39
|
-
assert len(instances) == 32
|
|
40
|
-
assert instances[0].split == "test"
|
|
41
|
-
assert len(instances[1].input.text) == 383396
|
|
42
|
-
assert instances[1].extra_data
|
|
43
|
-
assert instances[1].extra_data["word_count"] == 68482
|
|
44
|
-
references = instances[1].references
|
|
45
|
-
assert len(references[0].output.text) == 5667
|
|
46
|
-
assert references[0].tags == [CORRECT_TAG]
|