PyPI - crfm-helm - Versions diffs - 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl - Mend

crfm-helm 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (68) hide show

{crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/METADATA +11 -8
{crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/RECORD +67 -38
{crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/WHEEL +1 -1
{crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/entry_points.txt +2 -1
helm/benchmark/__init__.py +13 -0
helm/benchmark/adaptation/adapter_spec.py +3 -0
helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +20 -7
helm/benchmark/augmentations/correct_to_misspelling.json +1 -0
helm/benchmark/contamination/__init__.py +0 -0
helm/benchmark/metrics/classification_metrics.py +70 -0
helm/benchmark/metrics/machine_translation_metrics.py +36 -0
helm/benchmark/metrics/summarization_metrics.py +7 -8
helm/benchmark/metrics/test_classification_metrics.py +150 -0
helm/benchmark/presentation/create_plots.py +617 -0
helm/benchmark/presentation/run_display.py +7 -48
helm/benchmark/presentation/summarize.py +4 -2
helm/benchmark/presentation/test_create_plots.py +32 -0
helm/benchmark/run.py +144 -48
helm/benchmark/run_expander.py +164 -47
helm/benchmark/run_specs.py +346 -39
helm/benchmark/runner.py +34 -6
helm/benchmark/scenarios/copyright_scenario.py +1 -1
helm/benchmark/scenarios/covid_dialog_scenario.py +84 -0
helm/benchmark/scenarios/imdb_listdir.json +50014 -0
helm/benchmark/scenarios/lex_glue_scenario.py +253 -0
helm/benchmark/scenarios/lextreme_scenario.py +458 -0
helm/benchmark/scenarios/me_q_sum_scenario.py +86 -0
helm/benchmark/scenarios/med_dialog_scenario.py +132 -0
helm/benchmark/scenarios/med_mcqa_scenario.py +102 -0
helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +119 -0
helm/benchmark/scenarios/med_qa_scenario.py +96 -0
helm/benchmark/scenarios/opinions_qa_scenario.py +194 -0
helm/benchmark/scenarios/scenario.py +5 -0
helm/benchmark/scenarios/the_pile_scenario.py +1 -1
helm/benchmark/scenarios/wmt_14_scenario.py +96 -0
helm/benchmark/static/benchmarking.css +14 -0
helm/benchmark/static/benchmarking.js +43 -0
helm/benchmark/static/index.html +2 -0
helm/benchmark/static/json-urls.js +4 -0
helm/benchmark/static/plot-captions.js +16 -0
helm/benchmark/static/schema.yaml +154 -1
helm/benchmark/window_services/cohere_window_service.py +20 -0
helm/benchmark/window_services/flan_t5_window_service.py +29 -0
helm/benchmark/window_services/huggingface_window_service.py +39 -0
helm/benchmark/window_services/santacoder_window_service.py +27 -0
helm/benchmark/window_services/test_flan_t5_window_service.py +12 -0
helm/benchmark/window_services/wider_ai21_window_service.py +13 -0
helm/benchmark/window_services/window_service_factory.py +34 -7
helm/common/codec.py +123 -0
helm/common/general.py +12 -5
helm/common/test_codec.py +144 -0
helm/proxy/clients/aleph_alpha_client.py +47 -28
helm/proxy/clients/auto_client.py +32 -24
helm/proxy/clients/google_client.py +88 -0
helm/proxy/clients/huggingface_client.py +32 -16
helm/proxy/clients/huggingface_model_registry.py +111 -0
helm/proxy/clients/huggingface_tokenizer.py +25 -7
helm/proxy/clients/openai_client.py +60 -2
helm/proxy/clients/test_huggingface_model_registry.py +57 -0
helm/proxy/clients/test_huggingface_tokenizer.py +3 -0
helm/proxy/clients/together_client.py +17 -2
helm/proxy/clients/yalm_tokenizer/voc_100b.sp +0 -0
helm/proxy/clients/yalm_tokenizer/yalm_tokenizer.py +8 -2
helm/proxy/models.py +115 -7
helm/proxy/test_models.py +1 -1
helm/benchmark/presentation/present.py +0 -249
{crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/LICENSE +0 -0
{crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/top_level.txt +0 -0

{crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: crfm-helm
-Version: 0.2.0
+Version: 0.2.2
 Summary: Benchmark for language models
 Home-page: https://github.com/stanford-crfm/helm
 Author: Stanford CRFM
@@ -12,15 +12,11 @@ Classifier: Programming Language :: Python :: 3.8
 Classifier: License :: OSI Approved :: Apache Software License
 Requires-Python: ~=3.8
 License-File: LICENSE
-Requires-Dist: pytest (~=7.2.0)
-Requires-Dist: black (~=22.10.0)
-Requires-Dist: mypy (~=0.982)
-Requires-Dist: pre-commit (~=2.20.0)
-Requires-Dist: flake8 (~=5.0.4)
 Requires-Dist: zstandard (~=0.18.0)
 Requires-Dist: tqdm (~=4.64.1)
 Requires-Dist: pyhocon (~=0.3.59)
 Requires-Dist: dacite (~=1.6.0)
+Requires-Dist: aleph-alpha-client (~=2.14.0)
 Requires-Dist: bottle (~=0.12.23)
 Requires-Dist: gunicorn (~=20.1.0)
 Requires-Dist: Mako (~=1.2.3)
@@ -28,8 +24,9 @@ Requires-Dist: sqlitedict (~=1.7.0)
 Requires-Dist: pymongo (~=4.2.0)
 Requires-Dist: retrying (~=1.3.3)
 Requires-Dist: websocket-client (~=1.3.2)
-Requires-Dist: openai (~=0.25.0)
-Requires-Dist: transformers (~=4.22.2)
+Requires-Dist: openai (~=0.27.0)
+Requires-Dist: transformers (~=4.26.1)
+Requires-Dist: tokenizers (~=0.13.2)
 Requires-Dist: icetk (~=0.0.4)
 Requires-Dist: protobuf (~=3.20.2)
 Requires-Dist: google-api-python-client (~=2.64.0)
@@ -40,6 +37,8 @@ Requires-Dist: jsonlines (~=3.1.0)
 Requires-Dist: sympy (~=1.11.1)
 Requires-Dist: sentencepiece (~=0.1.97)
 Requires-Dist: numba (~=0.56.4)
+Requires-Dist: cattrs (~=22.2.0)
+Requires-Dist: xlrd (~=2.0.1)
 Requires-Dist: importlib-resources (~=5.10.0)
 Requires-Dist: nltk (~=3.7)
 Requires-Dist: scipy (~=1.9.1)
@@ -53,5 +52,9 @@ Requires-Dist: spacy (~=3.2.4)
 Requires-Dist: summ-eval (~=0.892)
 Requires-Dist: torch (~=1.12.1)
 Requires-Dist: torchvision (~=0.13.1)
+Requires-Dist: colorcet (~=3.0.1)
+Requires-Dist: matplotlib (~=3.6.0)
+Requires-Dist: numpy (~=1.23.3)
+Requires-Dist: seaborn (~=0.11.0)
 Benchmark for language models

{crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/RECORD RENAMED Viewed

@@ -1,15 +1,15 @@
 helm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-helm/benchmark/__init__.py,sha256=XY0Yjn_tSit3lA16scPTqnfRcft4TnllSEHxrdy9v3U,3909
+helm/benchmark/__init__.py,sha256=haJrJawd2zOTaxV_nkk6-V05vnePuHwCi0DytuJ0898,4450
 helm/benchmark/data_preprocessor.py,sha256=aNdM-o2t4qkLIQHiQeWUFg03DjjJ8HTBIphYCK8pXVo,2173
 helm/benchmark/executor.py,sha256=Vkmc4wmar2MRIavfiUOa2mu8Pp-zXsguYOevbjog4-4,3299
-helm/benchmark/run.py,sha256=EAQsOXMVHJeEVv_iTRS-saU6zmydNAzEeFyCS7nM7u8,5794
-helm/benchmark/run_expander.py,sha256=IHzZs8Wsp9Bkw5vw5hSa7NLkYgFlxGr4aLAO2YJyVCc,28842
-helm/benchmark/run_specs.py,sha256=jQcAqhMk1kLi0loWNUaZ95BEiFXLRjvYYXOq0TDKaJA,62653
-helm/benchmark/runner.py,sha256=aNlOa7OLivyAuDvTJMn6-8CG0xPEQkEJ7VKWEYXnwXU,7593
+helm/benchmark/run.py,sha256=AWa862BtEh5aOTjKZ9OkSv3be2ZrU4R1qiwJtRTQwfk,9402
+helm/benchmark/run_expander.py,sha256=vnq-zRmuXLzgr3sS3XYaXJFarNC7-QKc0_DtPjwXq3Q,32952
+helm/benchmark/run_specs.py,sha256=ssBJYMZVMF4XGk6lvCSlQJh6A-Pmh2_ndi_JAwgW0CQ,71441
+helm/benchmark/runner.py,sha256=zYDe8UeB1LFmbpChmRdRqEIZo-X0xWMenOCp2NnZ9Ws,8802
 helm/benchmark/server.py,sha256=HsuVsch1SPjQ4YyZi60kjr3JZeL82h8jgkxTUlfb130,1620
 helm/benchmark/test_data_preprocessor.py,sha256=adT-pgVeWvmZXLUUehxH0C-lMhXhtdxsvYdr69o1BD4,2047
 helm/benchmark/adaptation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-helm/benchmark/adaptation/adapter_spec.py,sha256=KgO6KGF3sRrM0rKTMulFp4GtwHbGNlOurE48d0Lv5hg,2679
+helm/benchmark/adaptation/adapter_spec.py,sha256=YoxMyN4RJM8GG_DeZ-k0edyARZ69hHLkvOlvKCk-u2o,2811
 helm/benchmark/adaptation/prompt.py,sha256=MATerIUIhFp_BMGvK7bLpNtWH6Oi4kknjBjOkr2bHv4,1948
 helm/benchmark/adaptation/request_state.py,sha256=o3OpZbB0TJFiZ2Nmhvg3vWmByaUSYTffT_WnoNb7w68,2712
 helm/benchmark/adaptation/scenario_state.py,sha256=ZflBuNgvN0JqUhshFcy0kTweO1WJs6j5UCaTxWTMe0o,1747
@@ -18,7 +18,7 @@ helm/benchmark/adaptation/adapters/adapter.py,sha256=8wK28jISxW8rUfXP-_-FfQJRRzc
 helm/benchmark/adaptation/adapters/adapter_factory.py,sha256=N2n-xIoGt_DxlN0LT4GUgVvdoaqhyUU8rSWr_nyfb80,2318
 helm/benchmark/adaptation/adapters/binary_ranking_adapter.py,sha256=3j24nFQuZE0Zl6DMAB4aYUpjieerdSMLsJbpMT9Nzfw,5646
 helm/benchmark/adaptation/adapters/generation_adapter.py,sha256=-on4QAo8hhzJVgAnM6G8lFFqaoiSiVF-KxwwfHwE61A,1927
-helm/benchmark/adaptation/adapters/in_context_learning_adapter.py,sha256=qRmFP1wweGIcZX55M2bljaQ22Xof45_oyY7Bxg4c3yQ,12657
+helm/benchmark/adaptation/adapters/in_context_learning_adapter.py,sha256=q5K4Hag8LOpfpkeEzwIMPLNpBqMThcB1LXLGr_n_Xfo,13118
 helm/benchmark/adaptation/adapters/language_modeling_adapter.py,sha256=vPo2EVgbMfzmwPPcljoXdDfBW3c80LLKaUhA-RefU2w,11967
 helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py,sha256=pV14yvmH_mRQpeXF0teAxGpJcouSQViipr-aMkNE-AM,1711
 helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py,sha256=4CALqc--PUaEl3cLzmjP9nFSuarCZMKBwrPQxde5TYM,3471
@@ -30,6 +30,7 @@ helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py,sha256=
 helm/benchmark/augmentations/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 helm/benchmark/augmentations/contraction_expansion_perturbation.py,sha256=bqcXlazuoss4hOxOarQptNEh52SU4Gy58Zph1PZ34W4,4803
 helm/benchmark/augmentations/contrast_sets_perturbation.py,sha256=fpLtZDgQY8beQLBlCId3gSyvCoPHVq3J7PGzcNdR0kM,3454
+helm/benchmark/augmentations/correct_to_misspelling.json,sha256=L44RiJXlJCa6zQzTLf0MFHCOhFyRDRKfLQNXH-n3XIs,213429
 helm/benchmark/augmentations/data_augmenter.py,sha256=57LA6h7z1tVMy_xGcW46F1KRi3D4wnv0fi8XeJjsi2c,3849
 helm/benchmark/augmentations/dialect_perturbation.py,sha256=zy3SJtYAxHf2fMB7w-u5gsEC6q8g-94sKnwNLVp0pFc,6227
 helm/benchmark/augmentations/extra_space_perturbation.py,sha256=9_pmthcyFfuYu6GsJB03hKkhvDZqqfH7hOeSNZRohSg,835
@@ -45,6 +46,7 @@ helm/benchmark/augmentations/space_perturbation.py,sha256=7OdpoibdizoPDBPpLc1ENy
 helm/benchmark/augmentations/synonym_perturbation.py,sha256=2qFx7xparhEPd82tvs59HkAr1hwQWv7asWtmNCbcQrQ,4209
 helm/benchmark/augmentations/test_perturbation.py,sha256=v_U5CmBpA5aXqg4EJUYZrSfGsNbZTwCP0inxz1XNGq0,9991
 helm/benchmark/augmentations/typos_perturbation.py,sha256=nfF1Zw2REKZEnnyPVFWD87MP8L5ANbaZXeI2n70Sonw,2790
+helm/benchmark/contamination/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 helm/benchmark/efficiency_data/inference_denoised_runtimes.json,sha256=ios_dt-_8wtXvkVAx0iI2zwCxqHvk3XKTx31qHPalsI,4203
 helm/benchmark/efficiency_data/inference_idealized_runtimes.json,sha256=5w7reeZc0yc4cjH8kJGxQQSoe8yaRVX2SSlSrx0QWFQ,12348
 helm/benchmark/efficiency_data/training_efficiency.json,sha256=aH2moiBLStOLVi8Ci2KTK5ZkWlTBLK-B3fRfNZwhoSg,9763
@@ -53,18 +55,21 @@ helm/benchmark/metrics/basic_metrics.py,sha256=xBCTgLVdiWVxdpB08MbWFc2nvwCJTgD1-
 helm/benchmark/metrics/bbq_metrics.py,sha256=H44mwKXLJ0PXo-sVKgRHCEKjZGnCahDD6GOQMWpnbOQ,6061
 helm/benchmark/metrics/bias_metrics.py,sha256=uVJFQvDSzvPR1ELu0FNYyExjRy2ThaJRCw8beEMDqJs,11309
 helm/benchmark/metrics/bias_word_lists.py,sha256=mx5JjW3mHffXIqo4GcQN-zENUEttBqQnEjPTz3J3J_4,13909
+helm/benchmark/metrics/classification_metrics.py,sha256=1q7gPnWRrx4QwE8T0m269vFJWg_bKfVx21a5spDBbjU,3701
 helm/benchmark/metrics/code_metrics.py,sha256=uWdigk0QyEsfVHQzq9KxkOc-LROvcqWXeui42Mr0YF4,5119
 helm/benchmark/metrics/code_metrics_helper.py,sha256=byyuI1lJgbIDPVJzywaBsam9zFMPPyn28g1grsK9xyA,22336
 helm/benchmark/metrics/copyright_metrics.py,sha256=8sk85mLTasWIgHIXxOho0z_nQYyLqtzSWHSAwd5ayAQ,7560
 helm/benchmark/metrics/disinformation_metrics.py,sha256=YH8QJ5s8LvIRzp49_O20UvOm8_z7PKyleoOX3hdX0HE,10499
+helm/benchmark/metrics/machine_translation_metrics.py,sha256=Ki7wBa9Odko_wVg1ec1MIoY2fHn0oY_vYrT3r7Ya6tc,1559
 helm/benchmark/metrics/metric.py,sha256=zF3IHmjGxRXrUoIOEIb2wRbsTPldzgclF8uPiGsZv4g,18789
 helm/benchmark/metrics/metric_name.py,sha256=hk5WQ6uj_9EjgKKFawPenL2-XOMf-aKvNRkOxlu4nCo,1355
 helm/benchmark/metrics/metric_service.py,sha256=aJ21wWdc1Spfi3mjrj8JEnsANL45P7wr7fk_EdDObko,709
 helm/benchmark/metrics/numeracy_metrics.py,sha256=panMWD3a1NPerg3Ix7l6NhR7jGOIQOQV9i_KysBeDA8,2818
 helm/benchmark/metrics/ranking_metrics.py,sha256=b3qxTRnr62zz1Gr1dsVDYtdwB8WBIb-v98yoRB3Wtvs,17231
 helm/benchmark/metrics/statistic.py,sha256=9VM5JA1-M_iYCNziWm2qeDZaAQqPQ_ySdaSMcqAeYdM,3048
-helm/benchmark/metrics/summarization_metrics.py,sha256=xFQOiiIRM8AnIS9NUt74vzu7dfvCsoc-0Mh9m4fkexc,16011
+helm/benchmark/metrics/summarization_metrics.py,sha256=hHNWGYA1bNfgCg7o1RSiTo7E-SJujHhkKh9G204icoo,16083
 helm/benchmark/metrics/test_bias_metrics.py,sha256=brut1rdnKNtTVJoe6qkllmJwZTFBZkLcyI_4qmqZ_vA,6264
+helm/benchmark/metrics/test_classification_metrics.py,sha256=usW5ciUYu2ZUUqVjFk4NfZTGNIoBArwia_-8uGOvFpw,5475
 helm/benchmark/metrics/test_metric.py,sha256=S7LGHNCHuhMk582eHylw1tOasUBEf_7F0T4u3tey7b4,757
 helm/benchmark/metrics/test_numeracy_metrics.py,sha256=ls1ZIHDePKpHMoqAbf4HmJ1SIBjLFuLIzGbfg6OiZvM,4162
 helm/benchmark/metrics/test_statistic.py,sha256=WQv9i8wSNTCzlw-L1wir0lmW0g3D4CM_ebpii7IB9Lw,406
@@ -85,13 +90,14 @@ helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py,sha256=gPqoYNI
 helm/benchmark/metrics/tokens/token_cost_estimator.py,sha256=fTGUfhHV6yMwpTkCEMTGMxKO8jskqJz4sAtwXT6M_C8,425
 helm/benchmark/presentation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 helm/benchmark/presentation/contamination.py,sha256=5wLwq266sCxT62MdzAXT9V-au6b07HaL44DLj_2qiSk,2788
-helm/benchmark/presentation/present.py,sha256=dHlDGixc7dkHFAQ_yMGAd5ik9G-cQ-sq8MDzKdBKNT0,9083
-helm/benchmark/presentation/run_display.py,sha256=JumIiYfm0UElrXNz-iJmjdMWvxLUJ1opWwcgyfFiSwg,12207
+helm/benchmark/presentation/create_plots.py,sha256=-YyrhEmfVOMnESJ8m2yk7RWAOYdZkVrLAt2K8XnpNF0,28442
+helm/benchmark/presentation/run_display.py,sha256=HSvV71ZRshMIhHZHGtlbYfRxK9xx1GQgn6YmGPVncME,9892
 helm/benchmark/presentation/run_entry.py,sha256=J1QgLOP99N7N4bs7nzXWxyU3pOd-a1j8xwL9ag1nP_Y,1158
 helm/benchmark/presentation/schema.py,sha256=i1utCqiNkbTK9CcDcOFwi7e91KPaKkpFU07ZcBfWXTc,8753
-helm/benchmark/presentation/summarize.py,sha256=UxqU0nNdKv80h3ROwP7fWrxmaivGJ3yUBiNKB2sBWOw,45067
+helm/benchmark/presentation/summarize.py,sha256=BUXog2m_UPbftyzFHx_U4mE2FrG56iv9mvcCdXoZVmI,45071
 helm/benchmark/presentation/table.py,sha256=VzVMwsgP3kItAM6FPRUaTphzJ-ZjriiuFbWlO1rJUMU,2879
 helm/benchmark/presentation/test_contamination.py,sha256=8mnzUzxUW9pXUOuLpU4BBBg0V7Mn1d1s4AQgwy6_kl4,459
+helm/benchmark/presentation/test_create_plots.py,sha256=2q3v2Qdh_hBKCEX9toygXFLIryu1FlcLMt2PXprx7j8,1251
 helm/benchmark/presentation/test_run_entry.py,sha256=M5z4dnVb7fM3PWrZWIZNlG8CT4KnDxjnEE4FBb1ZFNU,621
 helm/benchmark/scenarios/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 helm/benchmark/scenarios/babi_qa_scenario.py,sha256=SkIJjqBuMLYxZR8l9epu9arBeirvJPtQsIBXv4bzkx4,5030
@@ -104,7 +110,8 @@ helm/benchmark/scenarios/civil_comments_scenario.py,sha256=vXq6KxyS5C0-tD8xUmkG5
 helm/benchmark/scenarios/code_scenario.py,sha256=Q_TP_vWewkClvibPFHXpsOjR-CWexYgu5kl4OpfZXNc,11355
 helm/benchmark/scenarios/code_scenario_helper.py,sha256=EbQNfHqhQXaMMPmYT2mG2dRjzYaI2FvcPb9j6NlNHDU,5853
 helm/benchmark/scenarios/commonsense_scenario.py,sha256=9roSJS3iGSNgqxTbLI87xuZGB8IxJkbbtzr-ep0HUn0,10661
-helm/benchmark/scenarios/copyright_scenario.py,sha256=3dzDZ4B2a3ZmY6zMlQ98Ni-9836kPE4V1U6fecYrQHM,3646
+helm/benchmark/scenarios/copyright_scenario.py,sha256=APYQPC-esq3oM2qQxW6JNxa4pkv_yHDKfePjpvvi6nQ,3660
+helm/benchmark/scenarios/covid_dialog_scenario.py,sha256=FmYIuRr81xD_d0iyRa5blPC8OTqpfv8XGTz5XXUOd2E,3958
 helm/benchmark/scenarios/dialogue_scenarios.py,sha256=SPwo1iYiLbPwNtOgAVkTr-dO8FQLshmrfXdjPcayW5A,5616
 helm/benchmark/scenarios/disinformation_scenario.py,sha256=Ff66LxBm8APuMziLfGvTM0WIatrAty5_q_8ObaLW5lo,8491
 helm/benchmark/scenarios/dyck_language_scenario.py,sha256=Yua5S2gwLX2C-odJY3LeL-Yj47H1xXChdlI8cratVz4,9300
@@ -112,42 +119,53 @@ helm/benchmark/scenarios/entity_data_imputation_scenario.py,sha256=oChQzEptlf731
 helm/benchmark/scenarios/entity_matching_scenario.py,sha256=gtrLSCw2JSNnBgFQFsUm4EcICsjVWtp9wsOdqcyBU4k,6863
 helm/benchmark/scenarios/gsm_scenario.py,sha256=PmX0zutkGqnqGirWidUdk166cWv_23RtaTFcVQGBpzc,2619
 helm/benchmark/scenarios/ice_scenario.py,sha256=smrpTOwtMDL-m40zfKfNz9btOGoINZNv3_2oBcLBMmk,16156
+helm/benchmark/scenarios/imdb_listdir.json,sha256=eczxp9gslYYwx5XR86ATnZorIxuujFMDTfzR4h5NCpo,1015402
 helm/benchmark/scenarios/imdb_scenario.py,sha256=VTD0Ur6ATyY9NWxcnkGzn9Iw5vl4d94o0FbFm61ZZTA,6057
 helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py,sha256=wYaivSqqYYZPjPHTKaS6D7j960dcLIVTfmuZ8awd1Zs,2192
 helm/benchmark/scenarios/legal_support_scenario.py,sha256=K9HfTyHJnnLFvGKNwYQayu7JV4uyNT5wH48wc9ixRa0,3912
+helm/benchmark/scenarios/lex_glue_scenario.py,sha256=r89KevvM1Kifu3ZkUIXAV8jXktclNtL0-JL9T6qOx_Q,10224
+helm/benchmark/scenarios/lextreme_scenario.py,sha256=tejeKE08YX5MYFXrJRY48rIzO5V5fdwHDWEzg883K2E,20300
 helm/benchmark/scenarios/lsat_qa_scenario.py,sha256=2-GrPB8whoBFcQ5608jrwlAcRJgpkT1P2UehcR6-EYY,5977
 helm/benchmark/scenarios/math_scenario.py,sha256=5PespNtseDOnPgAwtdP0vMkXz1CaJM0BkJsWdeG5gUM,13825
+helm/benchmark/scenarios/me_q_sum_scenario.py,sha256=ZIn7tlBC4baV8CYcU_-mYe2RbYaQ-8dX1Ca_hOvZTfI,3988
+helm/benchmark/scenarios/med_dialog_scenario.py,sha256=w_s0s5TY6VjnCdmJ5BcSbDAYKZtnb5c7KSP1wYd9z9A,7282
+helm/benchmark/scenarios/med_mcqa_scenario.py,sha256=OgxZbyqzNnADTjYMlejV2I54G7tK3awGecTaxSgO9W8,5022
+helm/benchmark/scenarios/med_paragraph_simplification_scenario.py,sha256=nw6EhUtZq6ABFXxNU0Uh2mG5yiCUDesKqLQdZo7Kr90,7578
+helm/benchmark/scenarios/med_qa_scenario.py,sha256=rHSoIfb50Uq8T7l37m9Wn_fC16otifaP77qAEctf1oI,4404
 helm/benchmark/scenarios/mmlu_scenario.py,sha256=pzIRmLGikWTgB0AD2VFj64Q3GUsQg7nJzUqTSo-7pZo,3777
 helm/benchmark/scenarios/msmarco_scenario.py,sha256=_EaKsppb2Ax8f_ETc3cBy27i5w5ajTtIbF3xNWO8lUA,33669
 helm/benchmark/scenarios/narrativeqa_scenario.py,sha256=r5TwYkZH_YaCws6BSnjbiDQkZH_YM2BtxtDFVl4jj5s,5595
 helm/benchmark/scenarios/natural_qa_scenario.py,sha256=nuL5Qlh26xq7Q_lvzK2sKVN7eNYh7F5SjbIjahKaMNg,12527
 helm/benchmark/scenarios/newsqa_scenario.py,sha256=vMuIZyYxufH2AqhDoIZzzllfq8ScJIhSDH6lM3IUxGM,7242
 helm/benchmark/scenarios/numeracy_scenario.py,sha256=Iwtypyb0249zKYyV6p4YUX4bYke6uIyL6R3aopiGUb8,30552
+helm/benchmark/scenarios/opinions_qa_scenario.py,sha256=dtIYKL9ZiccX_F3-5OrnHXJdNBBTxCoTHc2Kc-XX79E,7380
 helm/benchmark/scenarios/pubmed_qa_scenario.py,sha256=cpMmQWwCDXIYO0btGLhevMT-Mhs9-5Es9cDvQYkIlL0,7493
 helm/benchmark/scenarios/quac_scenario.py,sha256=46nqmeVgkWu6jDGCHl-KHu351bmJj7jx_1p5kPwcOjc,6615
 helm/benchmark/scenarios/raft_scenario.py,sha256=RzewlMVkHJ2XbZ9_9FzBvbdV__BuBRgtX2HyhCnmH1o,4500
 helm/benchmark/scenarios/real_toxicity_prompts_scenario.py,sha256=8DQD7KkilAfxwzHUwCPrPPHeYEz_dEOWSls8yZo15do,2387
-helm/benchmark/scenarios/scenario.py,sha256=aj2golyi1TAAAebfW7eouTWRwZ07KbK034HlLx-q-1g,7164
+helm/benchmark/scenarios/scenario.py,sha256=bdRcv-YoLkxjlpNcq4MXiu8HQgjByHkkLWOdih4ahsM,7365
 helm/benchmark/scenarios/simple_scenarios.py,sha256=rcHzukhjBgvNRqkjcg0cms_zWtAsLPk0xiFN9I25_hI,1947
 helm/benchmark/scenarios/summarization_scenario.py,sha256=fKeRSkXrH26WyfeIhn43_fZxnAO4bIX1Xh5HoKcjOQM,6550
 helm/benchmark/scenarios/synthetic_efficiency_scenario.py,sha256=dtMMeULjw6pcobBRp0r6f9N4VCUKamx1Jy-6xPu85q0,3083
 helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py,sha256=YvEgO5qVZ4hLpnvjer4CG0Ct1upssZRjZWxnNi1ZUtY,16308
 helm/benchmark/scenarios/synthetic_reasoning_scenario.py,sha256=Pm63shscbGuigg4zWfcw3rOIPP8LgxBCnpcQKAA_CX0,8327
 helm/benchmark/scenarios/test_scenario.py,sha256=o8w8ElDPF-RzeCmecwyvie_nRMYj01b38BufXS-igqY,1612
-helm/benchmark/scenarios/the_pile_scenario.py,sha256=UXxRlaZdVRoWTjw8h5TUhXouq5JLaidPbL5-Itai0KE,4988
+helm/benchmark/scenarios/the_pile_scenario.py,sha256=aCcjZp0wabu8lpPVNAdCr1x6m_3QvgKe7dIGS2qgGm4,4981
 helm/benchmark/scenarios/truthful_qa_scenario.py,sha256=zQgv_qUU5h5ODoQE06rpJa6O_8FwF695cIie3PG7bx8,5969
 helm/benchmark/scenarios/twitter_aae_scenario.py,sha256=_qWotvPagNj3ATnyaww3U1XZtxN-wgueUf34fFmKXQI,2083
 helm/benchmark/scenarios/wikifact_scenario.py,sha256=pucEuLYz5N9qIobnEbJnRUnK6PrkWFdkl7yPuCJj3SE,5778
 helm/benchmark/scenarios/wikitext_103_scenario.py,sha256=rXVbUzOZi4eWM-_HP1gzY5SBmMwOX1vk12WrLkR3NHo,3074
-helm/benchmark/static/benchmarking.css,sha256=EkLEyXEL5Qwq-312D01y9EaQV2IBa67fw8Bjc3PQPJs,1928
-helm/benchmark/static/benchmarking.js,sha256=NQUoE05neH_YN9BgyNVFwEXX09YDRZOcunK_6tCZomA,47399
+helm/benchmark/scenarios/wmt_14_scenario.py,sha256=u24E_w0AOXpl3PzEFLmiBcl8qyJEy-1Yc-i4YHgU99M,4356
+helm/benchmark/static/benchmarking.css,sha256=DGC4Huh4tVD2o9wEeUf3YOc3MYcq2fmJQXvhTjVDumE,2057
+helm/benchmark/static/benchmarking.js,sha256=qiXAY_9fiWZ4ydzhBQAUhylzxNCoIN8ciLlEFMg33uE,49107
 helm/benchmark/static/contamination.yaml,sha256=LbISh56ORvfkkWptm7ZWmlPvWxtls6pBF1TbGiWD7hk,3096
 helm/benchmark/static/general.js,sha256=L3S4CBUED0k7RsjLHCeWjO29ZMFJckZgNTAYAARzaEg,3029
-helm/benchmark/static/index.html,sha256=LZMmoydG2LLqRfvGSpK3eRt1n92o19AELAzxCi-kok0,2994
+helm/benchmark/static/index.html,sha256=yzf_VEGW15CPZOW9a2Z6opoQ0Gg2YQCtj182niUb0fk,3130
 helm/benchmark/static/info-icon.png,sha256=P-PW3Ek3NGiRAW5BXOjJRPBfMVqprjAqtQheGWu7zNI,3428
 helm/benchmark/static/json-urls-root.js,sha256=G3qenwLgBojh3ukzp_gyMUaZja83ZFqvT1WQ_Rg11BU,98
-helm/benchmark/static/json-urls.js,sha256=UzWTp-dowxZhJuWtoPyi5vpfwlRr561DGYWRHdkvZ1E,1634
-helm/benchmark/static/schema.yaml,sha256=c3ipoxCLWCIY2wxNhDJF2JC0mSrC_g0NORnXNWCbs7k,89602
+helm/benchmark/static/json-urls.js,sha256=wvsG2Lrz2XArwwMOl_tGXL9y4mjqzjod4gcqlvVCiQA,1750
+helm/benchmark/static/plot-captions.js,sha256=gTBn-IPPD4BkzryVYj3KkGqLhWqWvBbvufeJaDygQxk,3010
+helm/benchmark/static/schema.yaml,sha256=TQdzlpOPvTwELiU0w0HBAZqKQBmjj1LOkEGM85oa8e4,95657
 helm/benchmark/static/utils.js,sha256=H2PKYjuXZ392DlALCPJ1XRwGxBDRFjL9eTFiTd4vBU8,7338
 helm/benchmark/static/images/crfm-logo.png,sha256=dDkauL_wJR_Luu7L7pltphS3a9HSLjDkpVLa6C9vcA4,62712
 helm/benchmark/static/images/helm-logo-simple.png,sha256=LtVAC4OgcWgMAob53rTrf7cRDu-O0z85ZOGGj9wR9hw,86133
@@ -172,23 +190,27 @@ helm/benchmark/window_services/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5
 helm/benchmark/window_services/ai21_window_service.py,sha256=Mo0Zzj2a9hDiUg6hTuJWb3ABhBUPlOYS-kHhaE-pHUE,12672
 helm/benchmark/window_services/anthropic_window_service.py,sha256=mA9aWKfc-dbSnpt37k6zDFdNNMfbevAoleCzSaar-uE,799
 helm/benchmark/window_services/bloom_window_service.py,sha256=o7MVedt6khdoj8zikLDuVraEzuoBZk7j4Fzjsas0sD4,1023
-helm/benchmark/window_services/cohere_window_service.py,sha256=MXBRAjuAQGq0iEpU9OLORH1FvHMak1-Nf5-D7k7UO9I,6182
+helm/benchmark/window_services/cohere_window_service.py,sha256=3SJT97CaxNxtUNS9_qKvKMVCA6lvNavS-xo6jAsxPbM,7070
 helm/benchmark/window_services/encoder_decoder_window_service.py,sha256=Dg_rD0RtKIALtpvT4Wi4Am3zgFcIvgxfzdglw5fbbTU,2478
+helm/benchmark/window_services/flan_t5_window_service.py,sha256=39IZX89_tay3bpSGVoWDoekmhW-RUNATQuT-bNyFRTs,915
 helm/benchmark/window_services/gpt2_window_service.py,sha256=d-ys9FHoVI1u1GZOH734JPAOc8W6QZb1N3ZjooKmwz8,990
 helm/benchmark/window_services/gptj_window_service.py,sha256=C2OEl-3ZatwxYVoaQgyvAZ5SBS2TK-2PnM02zUNjhiA,1103
 helm/benchmark/window_services/gptneox_window_service.py,sha256=FD0NDlBxDu1fZ8vlaTsOBh2IKYfrMV73qNGetkrp1P4,944
+helm/benchmark/window_services/huggingface_window_service.py,sha256=tDbhjn81Aw3ZSYMKh4aQU3e8JX1IVYCu_2gPOLaIWD8,1440
 helm/benchmark/window_services/ice_window_service.py,sha256=5z52rP-xAF_jckIPoogyoNFW6FQXqaq3SHybCaBuRn0,2005
 helm/benchmark/window_services/local_window_service.py,sha256=wgNBB-p9Zk-uFLRXYNxrz16_DhWyQ8x9sltSLBhzUh4,4247
 helm/benchmark/window_services/luminous_window_service.py,sha256=0w-nyfXXDwnIDBzU3Y84LZPnQyJGiSbvf5Y_MQh-hCg,1791
 helm/benchmark/window_services/mt_nlg_window_service.py,sha256=7zvEEZhqfdefUUFiDaqbJqrBu3Pt7BBuT_Si-4swV5s,838
 helm/benchmark/window_services/openai_window_service.py,sha256=Oguy_ewlL2Uydq-B1QrqqHWKTenL4mtwtY24RNqhFCM,466
 helm/benchmark/window_services/opt_window_service.py,sha256=ilj1G_pslwYeRZ2fhRMXsg2WjQ4rCfTUwuSmjItH-t4,1050
+helm/benchmark/window_services/santacoder_window_service.py,sha256=sfbuAAdhEkQkFON_bHxDI9Ek3jZiwCF6upa41bsjUO4,674
 helm/benchmark/window_services/t0pp_window_service.py,sha256=oa1vJRiyFPbkTb8eYnfjZNyKFJSvceipOq0U3Ys5e04,1196
 helm/benchmark/window_services/t511b_window_service.py,sha256=8CvkSfuG_Bg17gAEre_bmM4tUwoi9fAWpPxx6qCJwAE,1005
 helm/benchmark/window_services/test_ai21_window_service.py,sha256=U_n2mQ5GqD8oYv7e9vfNkACDic2-zDVuUIHN4meSC-I,8177
 helm/benchmark/window_services/test_bloom_window_service.py,sha256=xeYpsvdR8Ug31BAb4a-PU4Sc0oihpCIMJ5OzTfgUhM0,4221
 helm/benchmark/window_services/test_cohere_window_service.py,sha256=6WpIiuEyfgTZS3BIeAGOR8AAr2djpVIfGbItQRI19Ck,3205
 helm/benchmark/window_services/test_cohere_window_service_utils.py,sha256=sf25f9MeXzoqsbDzZ7d7le13hm8RkDe54nhLtKF2pqo,158150
+helm/benchmark/window_services/test_flan_t5_window_service.py,sha256=xv_EXbiRklveJPQtThYCSYF7qBYwjL7K4wH3Xu5z2Fg,591
 helm/benchmark/window_services/test_gpt2_window_service.py,sha256=3k25pLa_z__g4yoQL40DEXj-T4dGtrgif13N2NXs59U,2568
 helm/benchmark/window_services/test_gptj_window_service.py,sha256=sxsTpozKv9N-wZXtGl1prkQr9Md_q-tnCjO9zt226Co,2267
 helm/benchmark/window_services/test_gptneox_window_service.py,sha256=MCYHZIoulJf_WCx6de7rWB3nqku6wCyAokA-SWIPEks,4140
@@ -203,57 +225,64 @@ helm/benchmark/window_services/test_utils.py,sha256=1k2TlPdDIRjum669jpH3O7UOqm4G
 helm/benchmark/window_services/test_yalm_window_service.py,sha256=NdunSxq-qDzfzYMBYZ-0my6LaU2qUxtm7Ii0c4fyKnY,4273
 helm/benchmark/window_services/tokenizer_service.py,sha256=RNznJBAxcCUMCurb7mbraZULx_ZtB0G7IxbrnUe0Urk,865
 helm/benchmark/window_services/ul2_window_service.py,sha256=R_VEzOb59zQE9mmbTLunQeIvLAtK3-97h-B2_oc0Uxs,1021
+helm/benchmark/window_services/wider_ai21_window_service.py,sha256=VZ6EERN48FSYsmJ_aiwI30SEbobLt27c1QqL29Zg_8M,414
 helm/benchmark/window_services/wider_openai_window_service.py,sha256=cpm4mDEGTY-cmrECcDCL0flONBdh4g40uWNLI-v46BU,539
 helm/benchmark/window_services/window_service.py,sha256=aV4YnbXl7T23runB8xmSWRwV7YtliUlXrveEejOMJ1Y,2885
-helm/benchmark/window_services/window_service_factory.py,sha256=03UL8pE98Bh3lDSXe93m-R4j4riUJm1o7RkZ-UZfAgg,4403
+helm/benchmark/window_services/window_service_factory.py,sha256=UDrXRUXGwBMEiO7Cw_nNbr0XM6mzzXrXiyJsvCmW9wg,5845
 helm/benchmark/window_services/yalm_window_service.py,sha256=g7NeXvlRq3FXf2HwRStBMTkbWDxTdEITkrVRe-fv3mg,1805
 helm/common/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 helm/common/authentication.py,sha256=RlMx29_TSrfU7ujE7dJkxmFub5EqLj2NswV5lAVFFDk,179
 helm/common/cache.py,sha256=ustgsRHX0W3zoLPN05W3mFl9m9JYp9Ppq5cjMbdmm6Y,13116
-helm/common/general.py,sha256=bIhgtiEIz1zA7cQVH-U_6FuW_bmDzYaff4QZ779tR3U,10087
+helm/common/codec.py,sha256=zm8MP9Aqfh64D2HMZiCPEMoPkkiJxEzvzmuupGvkRh0,5499
+helm/common/general.py,sha256=7vFw10h_hTrxUCfxE0LH13hp8Lunp77lbVmucgoeq2Y,10181
 helm/common/hierarchical_logger.py,sha256=EnKLnfbQftca08EJfjGEQb4tcnCKbx-JtwLnoCnhMQs,2908
 helm/common/object_spec.py,sha256=COMd4RpYgfulW940a5M_npbsfRBvLkmhjfwDIq4Gpqs,1833
 helm/common/perspective_api_request.py,sha256=WAVwtajNVmi5XJNsPcorGEAVrqkpPSk-Kd3b0hJghbA,2427
 helm/common/request.py,sha256=QBpkZBpU1nCTqtEM8Ekki3bZgxpASuVRC3sQ_4RYaRE,5793
 helm/common/test_cache.py,sha256=XqboYHQAkFWIHPsuIjuageRSLeN7QoATKF7wwxggPqE,7054
+helm/common/test_codec.py,sha256=igL--k-2DwAy0eoMr8D9Xs8MOjBoT0LutbMPzDlTNkM,5885
 helm/common/test_general.py,sha256=zOxSwWNgWnWHsXKcG4NZ50GkWicn4uZ4jPVypSwFaQE,1672
 helm/common/tokenization_request.py,sha256=aDyf4A6QlTgISXy4IyXJVQytrOLwYVX9-TCa2CK2h1M,3226
 helm/proxy/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 helm/proxy/accounts.py,sha256=xq-zVggvueB4D5QK58mFWGPxZe-hnIUAT4D341yd0ac,13503
 helm/proxy/cli.py,sha256=2SOxIF55PDjzXXcDOYRT8m-oyQM_2VyZheKRC3fXDVw,8094
 helm/proxy/example_queries.py,sha256=p1wH-tp1pRUslkAwaJYrrG5aDfmFWK3KYn4M6WQfPqQ,4120
-helm/proxy/models.py,sha256=N-5PYF0CQij7UrWl5v4hJ2TAlTsMjF2wghK9bZtaGEc,24838
+helm/proxy/models.py,sha256=sfdp7DasJXbun-VMZz5p7iD6uhoUW1mFsy64BWfDoV4,29681
 helm/proxy/query.py,sha256=eftbiUICMh8QIHVs-7cLtv_rDXKeKdRPmwjLMu0TDxQ,645
 helm/proxy/retry.py,sha256=GLLDW1iGCwHfgTle8YK7ZB3vV-7XqsHcqeruKoVdsxE,1953
 helm/proxy/server.py,sha256=uBispGXfn39s_Pskd9Xjud0rijTjqXtSKU_2YvE6zGE,7356
-helm/proxy/test_models.py,sha256=-vb1s5WFhv10qvCLxwVhpoSY-yXSVF8CUQ2GwK8QXJU,782
+helm/proxy/test_models.py,sha256=hWeDcBw1GkPvyJUd-ABxRVe1FhSUfz8bzyrKYdsqmyY,726
 helm/proxy/test_retry.py,sha256=8h398auzjW9VnlTJWllxR-bdpub-XFp8EN8LWDEnEHM,1049
 helm/proxy/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 helm/proxy/clients/ai21_client.py,sha256=c4D64-1mLaFcGsqOb5CYkaXTmAWEJYk9cIjtr6DbgZQ,7770
-helm/proxy/clients/aleph_alpha_client.py,sha256=kh7Wrf9D5aZo5HYbvW1KFdRvdOn1EO64V44tNXlUKlk,6530
+helm/proxy/clients/aleph_alpha_client.py,sha256=VV7dbgh7sYqoSWfNkxxAiQ7i3yPW33rBPnBig9EXL10,7707
 helm/proxy/clients/anthropic_client.py,sha256=vrfc8lS9bxQbUxMxbElV2z5cMDq-JD6yFDTS2cJdFO0,15526
-helm/proxy/clients/auto_client.py,sha256=6m9datYzA3izIAlv1M14RTfzwjOAafILp9gsoml9ud4,10635
+helm/proxy/clients/auto_client.py,sha256=oktJQ4rCntp7id4vj-d3x_EA3RWGNZyXUMFwzRse--c,11063
 helm/proxy/clients/chat_gpt_client.py,sha256=nG3opHbnzX50r9Ialh3RaRErOZo6k4Q7gVzRHeGQgj8,5312
 helm/proxy/clients/client.py,sha256=bh6FvYFjw6MoHp5n7-KN1asXrIrOC-jfYsg3aW4xMgo,4570
 helm/proxy/clients/cohere_client.py,sha256=KF21m7qUjuhrpEEQv68FNeX0rsWSmxZgw52Oa7CZ5pI,11362
+helm/proxy/clients/google_client.py,sha256=sGGxDWD22c9a9KMzLAFL3vAEDHxp5jSY2W3RDpVDIak,3334
 helm/proxy/clients/goose_ai_client.py,sha256=2tqJK_AhD2-ScXtOTdt9S9khzVjal5pm38BJWiFhwq8,4217
-helm/proxy/clients/huggingface_client.py,sha256=W3R4IfA75tZuSEI3oH_b3fSQjKIAXAmH98ebYonZVOg,10805
-helm/proxy/clients/huggingface_tokenizer.py,sha256=4tsAW9oLW3NczBWWLAZuvE1gL-5HAj0p0Vi6c0UIB9M,3679
+helm/proxy/clients/huggingface_client.py,sha256=LypY3YfyoaGFH83UkYwDARGuoN2JOUk2S-nEaJ6GemI,11813
+helm/proxy/clients/huggingface_model_registry.py,sha256=0WHyWPxxBI4KtTs2Yt6-Cw16FC4XBEe6yqUc0-YSn1Y,3891
+helm/proxy/clients/huggingface_tokenizer.py,sha256=ujtsBupMMrE9efds2205c8NiPTcxHX8XM0UoV9spLK0,4591
 helm/proxy/clients/ice_tokenizer_client.py,sha256=Ui8YhAXoY1Q0vC3icoeFs6X9xAcESF6Tl2EGERGWVGU,2325
 helm/proxy/clients/microsoft_client.py,sha256=-VC8IrgrpSp1_FvRSI_8MSxhNp5I6dMc4qWSHc4Oulg,8237
-helm/proxy/clients/openai_client.py,sha256=rPV74pbiAlkLS4YO6J-dnzpVmsShWKYISD2nJTk6Sds,6025
+helm/proxy/clients/openai_client.py,sha256=frW9fOjYWkRXdfzE88ppxaLhVl9pCnXfheUqeANW6QQ,9415
 helm/proxy/clients/perspective_api_client.py,sha256=-L8IwokuktWPoOu7nXwsfoab_U1QRGCt8xT1SrcGfYE,5491
 helm/proxy/clients/simple_client.py,sha256=GXHTCRB58XAxnUVqgpynidc7h6kaDBOP7TedVHrOpD4,2915
 helm/proxy/clients/test_client.py,sha256=bvkFob_Yoy8bALrVeQ0h757g9RU687JYI0g3AISPFQ8,1268
 helm/proxy/clients/test_huggingface_client.py,sha256=n-6D-RXqwQyxPxCLCSqHxqfK5JA-PdP5ffP17XwTe2I,3520
-helm/proxy/clients/test_huggingface_tokenizer.py,sha256=OtG8kVSSsXlYkTBiSF9eHODHmUad99GUu2oYR2zrRME,2072
+helm/proxy/clients/test_huggingface_model_registry.py,sha256=zMboFlwMtDEV7hkd9SZFuItye-vkzz3CE5mWQrw--W4,2554
+helm/proxy/clients/test_huggingface_tokenizer.py,sha256=KmlAXezQ6R7DAEpV85_JRdTRrOJoJxfmtylqybWn5VA,2189
 helm/proxy/clients/test_ice_tokenizer_client.py,sha256=Ugmn5a7QdAPEAbLtreLS5-sji8yrzxy5mhFPAl3rOuI,2404
 helm/proxy/clients/test_yalm_tokenizer_client.py,sha256=tnrYl7T1DcZ9FN09nBWV1gesjWQ3osiUpsGHSm_IypQ,2336
-helm/proxy/clients/together_client.py,sha256=k7TmK940KHdtvb2pxAlX4kHUHrfiZL2HAzM3dgwJowg,5731
+helm/proxy/clients/together_client.py,sha256=epyiYElD0BfAZgUSu4zZKC5Oe8yIVVyJn2RtTwjPMzM,6334
 helm/proxy/clients/yalm_tokenizer_client.py,sha256=cpBoc8eHQoBGQguZsDaVnGWLdZnPgkHjqLSO0B94O0U,2420
 helm/proxy/clients/yalm_tokenizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 helm/proxy/clients/yalm_tokenizer/test_yalm_tokenizer.py,sha256=W9p5QNn1GSm-y85yVEQe_82zn5CVK_vR6jvhk7JTs_k,869
-helm/proxy/clients/yalm_tokenizer/yalm_tokenizer.py,sha256=z1JEJynC51T70JSuig6TrNwLEUXiS2SVTMvWECya7ww,5743
+helm/proxy/clients/yalm_tokenizer/voc_100b.sp,sha256=LmPD0_OIOXi8dWuNjXUYOSPhf8kPp2xhvK-g3bXcwrQ,2815034
+helm/proxy/clients/yalm_tokenizer/yalm_tokenizer.py,sha256=7Y4_nCZptFWzifCJ5aPmM3_OOxhtomIAQVjpJGV1D8g,5954
 helm/proxy/services/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 helm/proxy/services/remote_service.py,sha256=xKS-0P-EqKTPn7odTXDoQPjn9FliQRLaMFUnCEsUmCU,6965
 helm/proxy/services/server_service.py,sha256=gps7PwXqCi8b0yGYC0nQwFdKbfxCriSHt5CD1N1kkJs,5696
@@ -270,9 +299,9 @@ helm/proxy/token_counters/openai_token_counter.py,sha256=gPo_VrkEH07xmprzdfIhmJ_
 helm/proxy/token_counters/test_ai21_token_counter.py,sha256=42J1fCi20kQUwAD18bIa6h9TaP7KZnlgF-mLbvKURro,5508
 helm/proxy/token_counters/test_openai_token_counter.py,sha256=EovaVCZSr9moITZ9-AKiv_YM-D3OUsUDs4iQhEvpazQ,4823
 helm/proxy/token_counters/token_counter.py,sha256=x8KyTR82EedgCQUuneQiVq9AiU1B3_CHPmKPNumClHc,429
-crfm_helm-0.2.0.dist-info/LICENSE,sha256=bJiay7Nn5SHQ2n_4ZIT3AE0W1RGq4O7pxOApgBsaT64,11349
-crfm_helm-0.2.0.dist-info/METADATA,sha256=rkgCIgDkX6yAGYGEzrFrlbIO_2MEeIGXlXneQad-mx0,1949
-crfm_helm-0.2.0.dist-info/WHEEL,sha256=2wepM1nk4DS4eFpYrW1TTqPcoGNfHhhO_i5m4cOimbo,92
-crfm_helm-0.2.0.dist-info/entry_points.txt,sha256=o2pZIIQCZp4hBs4ZzZkKK0qvThIXXK57YV584ANCK7E,251
-crfm_helm-0.2.0.dist-info/top_level.txt,sha256=s9yl-XmuTId6n_W_xRjCS99MHTwPXOlkKxmTr8xZUNY,5
-crfm_helm-0.2.0.dist-info/RECORD,,
+crfm_helm-0.2.2.dist-info/LICENSE,sha256=bJiay7Nn5SHQ2n_4ZIT3AE0W1RGq4O7pxOApgBsaT64,11349
+crfm_helm-0.2.2.dist-info/METADATA,sha256=_OlkKmj1P7vaZvlpvOnNnzzm3w1IEW6de75SK7TmuPw,2066
+crfm_helm-0.2.2.dist-info/WHEEL,sha256=pkctZYzUS4AYVn6dJ-7367OJZivF2e8RA9b_ZBjif18,92
+crfm_helm-0.2.2.dist-info/entry_points.txt,sha256=AvH9soAH3uey9xffisWewd0yrmPWGASC036jHd1SFyg,300
+crfm_helm-0.2.2.dist-info/top_level.txt,sha256=s9yl-XmuTId6n_W_xRjCS99MHTwPXOlkKxmTr8xZUNY,5
+crfm_helm-0.2.2.dist-info/RECORD,,

{crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/WHEEL RENAMED Viewed

@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: bdist_wheel (0.38.4)
+Generator: bdist_wheel (0.40.0)
 Root-Is-Purelib: true
 Tag: py3-none-any

{crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/entry_points.txt RENAMED Viewed

@@ -1,6 +1,7 @@
 [console_scripts]
 crfm-proxy-cli = helm.proxy.cli:main
 crfm-proxy-server = helm.proxy.server:main
-helm-run = helm.benchmark.presentation.present:main
+helm-create-plots = helm.benchmark.presentation.create_plots:main
+helm-run = helm.benchmark.run:main
 helm-server = helm.benchmark.server:main
 helm-summarize = helm.benchmark.presentation.summarize:main

helm/benchmark/__init__.py CHANGED Viewed

@@ -42,12 +42,24 @@ from .scenarios import legal_support_scenario  # noqa
 from .scenarios import entity_matching_scenario  # noqa
 from .scenarios import entity_data_imputation_scenario  # noqa
 from .scenarios import big_bench_scenario  # noqa
+from .scenarios import opinions_qa_scenario  # noqa
+# Biomedical
+from .scenarios import covid_dialog_scenario  # noqa
+from .scenarios import me_q_sum_scenario  # noqa
+from .scenarios import med_dialog_scenario  # noqa
+from .scenarios import med_mcqa_scenario  # noqa
+from .scenarios import med_paragraph_simplification_scenario  # noqa
+from .scenarios import med_qa_scenario  # noqa
 from .scenarios import pubmed_qa_scenario  # noqa
+from .scenarios import wmt_14_scenario  # noqa
 # Metrics
 from .metrics import basic_metrics  # noqa
 from .metrics import bbq_metrics  # noqa
 from .metrics import bias_metrics  # noqa
+from .metrics import classification_metrics  # noqa
 from .metrics import code_metrics  # noqa
 from .metrics import copyright_metrics  # noqa
 from .metrics import disinformation_metrics  # noqa
@@ -56,6 +68,7 @@ from .metrics import ranking_metrics  # noqa
 from .metrics import summarization_metrics  # noqa
 from .metrics import toxicity_metrics  # noqa
 from .metrics import tokens_metric  # noqa
+from .metrics import machine_translation_metrics  # noqa
 # Perturbations for data augmentation
 from .augmentations.extra_space_perturbation import ExtraSpacePerturbation  # noqa

helm/benchmark/adaptation/adapter_spec.py CHANGED Viewed

@@ -68,6 +68,9 @@ class AdapterSpec:
     # set of training instances.  Used to compute error bars.
     num_train_trials: int = 1
+    # If true, randomly sample N training examples; if false, select N consecutive training examples
+    sample_train: bool = True
     # Decoding parameters (inherited by `Request`)
     # Model to make the request to (need to fill in)

helm/benchmark/adaptation/adapters/in_context_learning_adapter.py CHANGED Viewed

@@ -23,7 +23,7 @@ class InContextLearningAdapter(Adapter, ABC):
     @htrack(None)
     def adapt(self, instances: List[Instance], parallelism: int) -> ScenarioState:
         """
-        Takes a a list of `Instance`s and builds a list of corresponding `RequestState`s.
+        Takes a list of `Instance`s and builds a list of corresponding `RequestState`s.
         The reason we don't do this per eval instance is that we create a common set of
         training instances which is shared across all eval instances.
         """
@@ -65,7 +65,9 @@ class InContextLearningAdapter(Adapter, ABC):
         parallelism: int,
     ) -> List[RequestState]:
         self.train_trial_index: int = train_trial_index
-        self.train_instances: List[Instance] = self.sample_examples(all_train_instances, seed=train_trial_index)
+        self.train_instances: List[Instance] = self.sample_examples(
+            all_train_instances, seed=train_trial_index, sample_train=self.adapter_spec.sample_train
+        )
         hlog(f"Sampled {len(self.train_instances)} examples for trial #{self.train_trial_index}.")
         # Generate request_states
@@ -93,7 +95,9 @@ class InContextLearningAdapter(Adapter, ABC):
         return [request_state for result in results for request_state in result]
-    def sample_examples(self, all_train_instances: List[Instance], seed: int) -> List[Instance]:
+    def sample_examples(
+        self, all_train_instances: List[Instance], seed: int, sample_train: bool = True
+    ) -> List[Instance]:
         """
         Sample a random set of train instances to use as examples by following the steps below:
         1. Sort the class labels (i.e., correct References) by the number of Instances that belong to the
@@ -121,9 +125,14 @@ class InContextLearningAdapter(Adapter, ABC):
         random.seed(seed)
         num_instances_to_sample: int = min(len(all_train_instances), self.adapter_spec.max_train_instances)
+        examples: List[Instance] = []
+        if not sample_train:
+            # Select sequentially from the train set
+            examples = all_train_instances[num_instances_to_sample * seed : num_instances_to_sample * (seed + 1)]
+            return examples
         unlabeled_instances: List[Instance] = []
         label_to_instances: Dict[str, List[Instance]] = defaultdict(list)
         for instance in all_train_instances:
             if instance.first_correct_reference:
                 label_to_instances[instance.first_correct_reference.output.text].append(instance)
@@ -145,7 +154,6 @@ class InContextLearningAdapter(Adapter, ABC):
             sorted_labels.extend(labels)
         labels_iterable = cycle(sorted_labels)
-        examples: List[Instance] = []
         while num_instances_to_sample > 0:
             next_label: Optional[str] = next(labels_iterable, None)
             if not next_label:
@@ -218,10 +226,15 @@ class InContextLearningAdapter(Adapter, ABC):
         # References (optionally) and output
         output: str
+        delimiter = ","
         if reference_index is None:
             # Put only the correct reference as the output
-            correct_reference: Optional[Reference] = instance.first_correct_reference
-            output = correct_reference.output.text if correct_reference is not None else "n/a"
+            correct_references: List[Reference] = instance.all_correct_references
+            if not correct_references:
+                output = "n/a"
+            else:
+                output = delimiter.join([correct_reference.output.text for correct_reference in correct_references])
         else:
             reference = instance.references[reference_index]
             output = reference.output.text

crfm-helm 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl

crfm-helm 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl