PyPI - crfm-helm - Versions diffs - 0.2.1__py3-none-any.whl → 0.2.2__py3-none-any.whl - Mend

crfm-helm 0.2.1-py3-none-any.whl → 0.2.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (50)

{crfm_helm-0.2.1.dist-info → crfm_helm-0.2.2.dist-info}/METADATA +10 -8
{crfm_helm-0.2.1.dist-info → crfm_helm-0.2.2.dist-info}/RECORD +50 -37
{crfm_helm-0.2.1.dist-info → crfm_helm-0.2.2.dist-info}/WHEEL +1 -1
{crfm_helm-0.2.1.dist-info → crfm_helm-0.2.2.dist-info}/entry_points.txt +1 -0
helm/benchmark/__init__.py +2 -0
helm/benchmark/adaptation/adapter_spec.py +3 -0
helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +20 -7
helm/benchmark/contamination/__init__.py +0 -0
helm/benchmark/metrics/classification_metrics.py +28 -23
helm/benchmark/metrics/test_classification_metrics.py +44 -9
helm/benchmark/presentation/create_plots.py +617 -0
helm/benchmark/presentation/summarize.py +4 -2
helm/benchmark/presentation/test_create_plots.py +32 -0
helm/benchmark/run.py +23 -1
helm/benchmark/run_expander.py +161 -47
helm/benchmark/run_specs.py +84 -10
helm/benchmark/runner.py +31 -3
helm/benchmark/scenarios/copyright_scenario.py +1 -1
helm/benchmark/scenarios/imdb_listdir.json +50014 -0
helm/benchmark/scenarios/lex_glue_scenario.py +58 -17
helm/benchmark/scenarios/lextreme_scenario.py +37 -25
helm/benchmark/scenarios/opinions_qa_scenario.py +194 -0
helm/benchmark/scenarios/scenario.py +5 -0
helm/benchmark/scenarios/the_pile_scenario.py +1 -1
helm/benchmark/static/benchmarking.css +14 -0
helm/benchmark/static/benchmarking.js +43 -0
helm/benchmark/static/index.html +2 -0
helm/benchmark/static/json-urls.js +4 -0
helm/benchmark/static/plot-captions.js +16 -0
helm/benchmark/static/schema.yaml +66 -8
helm/benchmark/window_services/cohere_window_service.py +20 -0
helm/benchmark/window_services/flan_t5_window_service.py +29 -0
helm/benchmark/window_services/huggingface_window_service.py +39 -0
helm/benchmark/window_services/test_flan_t5_window_service.py +12 -0
helm/benchmark/window_services/wider_ai21_window_service.py +13 -0
helm/benchmark/window_services/window_service_factory.py +27 -6
helm/common/general.py +12 -5
helm/proxy/clients/aleph_alpha_client.py +47 -28
helm/proxy/clients/auto_client.py +28 -24
helm/proxy/clients/huggingface_client.py +30 -17
helm/proxy/clients/huggingface_model_registry.py +111 -0
helm/proxy/clients/huggingface_tokenizer.py +23 -7
helm/proxy/clients/openai_client.py +60 -2
helm/proxy/clients/test_huggingface_model_registry.py +57 -0
helm/proxy/clients/together_client.py +17 -2
helm/proxy/clients/yalm_tokenizer/voc_100b.sp +0 -0
helm/proxy/clients/yalm_tokenizer/yalm_tokenizer.py +8 -2
helm/proxy/models.py +82 -2
{crfm_helm-0.2.1.dist-info → crfm_helm-0.2.2.dist-info}/LICENSE +0 -0
{crfm_helm-0.2.1.dist-info → crfm_helm-0.2.2.dist-info}/top_level.txt +0 -0

{crfm_helm-0.2.1.dist-info → crfm_helm-0.2.2.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: crfm-helm
-Version: 0.2.1
+Version: 0.2.2
 Summary: Benchmark for language models
 Home-page: https://github.com/stanford-crfm/helm
 Author: Stanford CRFM
@@ -12,15 +12,11 @@ Classifier: Programming Language :: Python :: 3.8
 Classifier: License :: OSI Approved :: Apache Software License
 Requires-Python: ~=3.8
 License-File: LICENSE
-Requires-Dist: pytest (~=7.2.0)
-Requires-Dist: black (~=22.10.0)
-Requires-Dist: mypy (~=0.982)
-Requires-Dist: pre-commit (~=2.20.0)
-Requires-Dist: flake8 (~=5.0.4)
 Requires-Dist: zstandard (~=0.18.0)
 Requires-Dist: tqdm (~=4.64.1)
 Requires-Dist: pyhocon (~=0.3.59)
 Requires-Dist: dacite (~=1.6.0)
+Requires-Dist: aleph-alpha-client (~=2.14.0)
 Requires-Dist: bottle (~=0.12.23)
 Requires-Dist: gunicorn (~=20.1.0)
 Requires-Dist: Mako (~=1.2.3)
@@ -28,8 +24,9 @@ Requires-Dist: sqlitedict (~=1.7.0)
 Requires-Dist: pymongo (~=4.2.0)
 Requires-Dist: retrying (~=1.3.3)
 Requires-Dist: websocket-client (~=1.3.2)
-Requires-Dist: openai (~=0.25.0)
-Requires-Dist: transformers (~=4.22.2)
+Requires-Dist: openai (~=0.27.0)
+Requires-Dist: transformers (~=4.26.1)
+Requires-Dist: tokenizers (~=0.13.2)
 Requires-Dist: icetk (~=0.0.4)
 Requires-Dist: protobuf (~=3.20.2)
 Requires-Dist: google-api-python-client (~=2.64.0)
@@ -41,6 +38,7 @@ Requires-Dist: sympy (~=1.11.1)
 Requires-Dist: sentencepiece (~=0.1.97)
 Requires-Dist: numba (~=0.56.4)
 Requires-Dist: cattrs (~=22.2.0)
+Requires-Dist: xlrd (~=2.0.1)
 Requires-Dist: importlib-resources (~=5.10.0)
 Requires-Dist: nltk (~=3.7)
 Requires-Dist: scipy (~=1.9.1)
@@ -54,5 +52,9 @@ Requires-Dist: spacy (~=3.2.4)
 Requires-Dist: summ-eval (~=0.892)
 Requires-Dist: torch (~=1.12.1)
 Requires-Dist: torchvision (~=0.13.1)
+Requires-Dist: colorcet (~=3.0.1)
+Requires-Dist: matplotlib (~=3.6.0)
+Requires-Dist: numpy (~=1.23.3)
+Requires-Dist: seaborn (~=0.11.0)
 Benchmark for language models

{crfm_helm-0.2.1.dist-info → crfm_helm-0.2.2.dist-info}/RECORD RENAMED Viewed

@@ -1,15 +1,15 @@
 helm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-helm/benchmark/__init__.py,sha256=HrVUkpDalZyBFNqyRao1CQ3Z05bLWmlT9K7-zMzr9-Y,4397
+helm/benchmark/__init__.py,sha256=haJrJawd2zOTaxV_nkk6-V05vnePuHwCi0DytuJ0898,4450
 helm/benchmark/data_preprocessor.py,sha256=aNdM-o2t4qkLIQHiQeWUFg03DjjJ8HTBIphYCK8pXVo,2173
 helm/benchmark/executor.py,sha256=Vkmc4wmar2MRIavfiUOa2mu8Pp-zXsguYOevbjog4-4,3299
-helm/benchmark/run.py,sha256=_6jBWeTiyUz9A7XmtElM_6KaYs7Z7M-kAKcY6bgzwN4,8537
-helm/benchmark/run_expander.py,sha256=W91zJZ98h2gI_f9IX8cvtA3BxoQxf26W2Wr7FM2xnLM,29077
-helm/benchmark/run_specs.py,sha256=7KDzTtZXVip4avTHV8IKgcHxXt_85EhS8PBtjOBuZMg,69191
-helm/benchmark/runner.py,sha256=wHC3A5XdhQ77ctbtNUNfGLzqeJLkLJJvMdZj69gH6n4,7559
+helm/benchmark/run.py,sha256=AWa862BtEh5aOTjKZ9OkSv3be2ZrU4R1qiwJtRTQwfk,9402
+helm/benchmark/run_expander.py,sha256=vnq-zRmuXLzgr3sS3XYaXJFarNC7-QKc0_DtPjwXq3Q,32952
+helm/benchmark/run_specs.py,sha256=ssBJYMZVMF4XGk6lvCSlQJh6A-Pmh2_ndi_JAwgW0CQ,71441
+helm/benchmark/runner.py,sha256=zYDe8UeB1LFmbpChmRdRqEIZo-X0xWMenOCp2NnZ9Ws,8802
 helm/benchmark/server.py,sha256=HsuVsch1SPjQ4YyZi60kjr3JZeL82h8jgkxTUlfb130,1620
 helm/benchmark/test_data_preprocessor.py,sha256=adT-pgVeWvmZXLUUehxH0C-lMhXhtdxsvYdr69o1BD4,2047
 helm/benchmark/adaptation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-helm/benchmark/adaptation/adapter_spec.py,sha256=KgO6KGF3sRrM0rKTMulFp4GtwHbGNlOurE48d0Lv5hg,2679
+helm/benchmark/adaptation/adapter_spec.py,sha256=YoxMyN4RJM8GG_DeZ-k0edyARZ69hHLkvOlvKCk-u2o,2811
 helm/benchmark/adaptation/prompt.py,sha256=MATerIUIhFp_BMGvK7bLpNtWH6Oi4kknjBjOkr2bHv4,1948
 helm/benchmark/adaptation/request_state.py,sha256=o3OpZbB0TJFiZ2Nmhvg3vWmByaUSYTffT_WnoNb7w68,2712
 helm/benchmark/adaptation/scenario_state.py,sha256=ZflBuNgvN0JqUhshFcy0kTweO1WJs6j5UCaTxWTMe0o,1747
@@ -18,7 +18,7 @@ helm/benchmark/adaptation/adapters/adapter.py,sha256=8wK28jISxW8rUfXP-_-FfQJRRzc
 helm/benchmark/adaptation/adapters/adapter_factory.py,sha256=N2n-xIoGt_DxlN0LT4GUgVvdoaqhyUU8rSWr_nyfb80,2318
 helm/benchmark/adaptation/adapters/binary_ranking_adapter.py,sha256=3j24nFQuZE0Zl6DMAB4aYUpjieerdSMLsJbpMT9Nzfw,5646
 helm/benchmark/adaptation/adapters/generation_adapter.py,sha256=-on4QAo8hhzJVgAnM6G8lFFqaoiSiVF-KxwwfHwE61A,1927
-helm/benchmark/adaptation/adapters/in_context_learning_adapter.py,sha256=qRmFP1wweGIcZX55M2bljaQ22Xof45_oyY7Bxg4c3yQ,12657
+helm/benchmark/adaptation/adapters/in_context_learning_adapter.py,sha256=q5K4Hag8LOpfpkeEzwIMPLNpBqMThcB1LXLGr_n_Xfo,13118
 helm/benchmark/adaptation/adapters/language_modeling_adapter.py,sha256=vPo2EVgbMfzmwPPcljoXdDfBW3c80LLKaUhA-RefU2w,11967
 helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py,sha256=pV14yvmH_mRQpeXF0teAxGpJcouSQViipr-aMkNE-AM,1711
 helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py,sha256=4CALqc--PUaEl3cLzmjP9nFSuarCZMKBwrPQxde5TYM,3471
@@ -46,6 +46,7 @@ helm/benchmark/augmentations/space_perturbation.py,sha256=7OdpoibdizoPDBPpLc1ENy
 helm/benchmark/augmentations/synonym_perturbation.py,sha256=2qFx7xparhEPd82tvs59HkAr1hwQWv7asWtmNCbcQrQ,4209
 helm/benchmark/augmentations/test_perturbation.py,sha256=v_U5CmBpA5aXqg4EJUYZrSfGsNbZTwCP0inxz1XNGq0,9991
 helm/benchmark/augmentations/typos_perturbation.py,sha256=nfF1Zw2REKZEnnyPVFWD87MP8L5ANbaZXeI2n70Sonw,2790
+helm/benchmark/contamination/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 helm/benchmark/efficiency_data/inference_denoised_runtimes.json,sha256=ios_dt-_8wtXvkVAx0iI2zwCxqHvk3XKTx31qHPalsI,4203
 helm/benchmark/efficiency_data/inference_idealized_runtimes.json,sha256=5w7reeZc0yc4cjH8kJGxQQSoe8yaRVX2SSlSrx0QWFQ,12348
 helm/benchmark/efficiency_data/training_efficiency.json,sha256=aH2moiBLStOLVi8Ci2KTK5ZkWlTBLK-B3fRfNZwhoSg,9763
@@ -54,7 +55,7 @@ helm/benchmark/metrics/basic_metrics.py,sha256=xBCTgLVdiWVxdpB08MbWFc2nvwCJTgD1-
 helm/benchmark/metrics/bbq_metrics.py,sha256=H44mwKXLJ0PXo-sVKgRHCEKjZGnCahDD6GOQMWpnbOQ,6061
 helm/benchmark/metrics/bias_metrics.py,sha256=uVJFQvDSzvPR1ELu0FNYyExjRy2ThaJRCw8beEMDqJs,11309
 helm/benchmark/metrics/bias_word_lists.py,sha256=mx5JjW3mHffXIqo4GcQN-zENUEttBqQnEjPTz3J3J_4,13909
-helm/benchmark/metrics/classification_metrics.py,sha256=J5us8vRybPa-SVywrH6DYmi6luOFhf2smVwk5D3rkY8,3335
+helm/benchmark/metrics/classification_metrics.py,sha256=1q7gPnWRrx4QwE8T0m269vFJWg_bKfVx21a5spDBbjU,3701
 helm/benchmark/metrics/code_metrics.py,sha256=uWdigk0QyEsfVHQzq9KxkOc-LROvcqWXeui42Mr0YF4,5119
 helm/benchmark/metrics/code_metrics_helper.py,sha256=byyuI1lJgbIDPVJzywaBsam9zFMPPyn28g1grsK9xyA,22336
 helm/benchmark/metrics/copyright_metrics.py,sha256=8sk85mLTasWIgHIXxOho0z_nQYyLqtzSWHSAwd5ayAQ,7560
@@ -68,7 +69,7 @@ helm/benchmark/metrics/ranking_metrics.py,sha256=b3qxTRnr62zz1Gr1dsVDYtdwB8WBIb-
 helm/benchmark/metrics/statistic.py,sha256=9VM5JA1-M_iYCNziWm2qeDZaAQqPQ_ySdaSMcqAeYdM,3048
 helm/benchmark/metrics/summarization_metrics.py,sha256=hHNWGYA1bNfgCg7o1RSiTo7E-SJujHhkKh9G204icoo,16083
 helm/benchmark/metrics/test_bias_metrics.py,sha256=brut1rdnKNtTVJoe6qkllmJwZTFBZkLcyI_4qmqZ_vA,6264
-helm/benchmark/metrics/test_classification_metrics.py,sha256=bVVTw7UM_pUVFzhyjHgx16EBU3X81g8DeD0NGjP1OUA,4215
+helm/benchmark/metrics/test_classification_metrics.py,sha256=usW5ciUYu2ZUUqVjFk4NfZTGNIoBArwia_-8uGOvFpw,5475
 helm/benchmark/metrics/test_metric.py,sha256=S7LGHNCHuhMk582eHylw1tOasUBEf_7F0T4u3tey7b4,757
 helm/benchmark/metrics/test_numeracy_metrics.py,sha256=ls1ZIHDePKpHMoqAbf4HmJ1SIBjLFuLIzGbfg6OiZvM,4162
 helm/benchmark/metrics/test_statistic.py,sha256=WQv9i8wSNTCzlw-L1wir0lmW0g3D4CM_ebpii7IB9Lw,406
@@ -89,12 +90,14 @@ helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py,sha256=gPqoYNI
 helm/benchmark/metrics/tokens/token_cost_estimator.py,sha256=fTGUfhHV6yMwpTkCEMTGMxKO8jskqJz4sAtwXT6M_C8,425
 helm/benchmark/presentation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 helm/benchmark/presentation/contamination.py,sha256=5wLwq266sCxT62MdzAXT9V-au6b07HaL44DLj_2qiSk,2788
+helm/benchmark/presentation/create_plots.py,sha256=-YyrhEmfVOMnESJ8m2yk7RWAOYdZkVrLAt2K8XnpNF0,28442
 helm/benchmark/presentation/run_display.py,sha256=HSvV71ZRshMIhHZHGtlbYfRxK9xx1GQgn6YmGPVncME,9892
 helm/benchmark/presentation/run_entry.py,sha256=J1QgLOP99N7N4bs7nzXWxyU3pOd-a1j8xwL9ag1nP_Y,1158
 helm/benchmark/presentation/schema.py,sha256=i1utCqiNkbTK9CcDcOFwi7e91KPaKkpFU07ZcBfWXTc,8753
-helm/benchmark/presentation/summarize.py,sha256=UxqU0nNdKv80h3ROwP7fWrxmaivGJ3yUBiNKB2sBWOw,45067
+helm/benchmark/presentation/summarize.py,sha256=BUXog2m_UPbftyzFHx_U4mE2FrG56iv9mvcCdXoZVmI,45071
 helm/benchmark/presentation/table.py,sha256=VzVMwsgP3kItAM6FPRUaTphzJ-ZjriiuFbWlO1rJUMU,2879
 helm/benchmark/presentation/test_contamination.py,sha256=8mnzUzxUW9pXUOuLpU4BBBg0V7Mn1d1s4AQgwy6_kl4,459
+helm/benchmark/presentation/test_create_plots.py,sha256=2q3v2Qdh_hBKCEX9toygXFLIryu1FlcLMt2PXprx7j8,1251
 helm/benchmark/presentation/test_run_entry.py,sha256=M5z4dnVb7fM3PWrZWIZNlG8CT4KnDxjnEE4FBb1ZFNU,621
 helm/benchmark/scenarios/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 helm/benchmark/scenarios/babi_qa_scenario.py,sha256=SkIJjqBuMLYxZR8l9epu9arBeirvJPtQsIBXv4bzkx4,5030
@@ -107,7 +110,7 @@ helm/benchmark/scenarios/civil_comments_scenario.py,sha256=vXq6KxyS5C0-tD8xUmkG5
 helm/benchmark/scenarios/code_scenario.py,sha256=Q_TP_vWewkClvibPFHXpsOjR-CWexYgu5kl4OpfZXNc,11355
 helm/benchmark/scenarios/code_scenario_helper.py,sha256=EbQNfHqhQXaMMPmYT2mG2dRjzYaI2FvcPb9j6NlNHDU,5853
 helm/benchmark/scenarios/commonsense_scenario.py,sha256=9roSJS3iGSNgqxTbLI87xuZGB8IxJkbbtzr-ep0HUn0,10661
-helm/benchmark/scenarios/copyright_scenario.py,sha256=3dzDZ4B2a3ZmY6zMlQ98Ni-9836kPE4V1U6fecYrQHM,3646
+helm/benchmark/scenarios/copyright_scenario.py,sha256=APYQPC-esq3oM2qQxW6JNxa4pkv_yHDKfePjpvvi6nQ,3660
 helm/benchmark/scenarios/covid_dialog_scenario.py,sha256=FmYIuRr81xD_d0iyRa5blPC8OTqpfv8XGTz5XXUOd2E,3958
 helm/benchmark/scenarios/dialogue_scenarios.py,sha256=SPwo1iYiLbPwNtOgAVkTr-dO8FQLshmrfXdjPcayW5A,5616
 helm/benchmark/scenarios/disinformation_scenario.py,sha256=Ff66LxBm8APuMziLfGvTM0WIatrAty5_q_8ObaLW5lo,8491
@@ -116,11 +119,12 @@ helm/benchmark/scenarios/entity_data_imputation_scenario.py,sha256=oChQzEptlf731
 helm/benchmark/scenarios/entity_matching_scenario.py,sha256=gtrLSCw2JSNnBgFQFsUm4EcICsjVWtp9wsOdqcyBU4k,6863
 helm/benchmark/scenarios/gsm_scenario.py,sha256=PmX0zutkGqnqGirWidUdk166cWv_23RtaTFcVQGBpzc,2619
 helm/benchmark/scenarios/ice_scenario.py,sha256=smrpTOwtMDL-m40zfKfNz9btOGoINZNv3_2oBcLBMmk,16156
+helm/benchmark/scenarios/imdb_listdir.json,sha256=eczxp9gslYYwx5XR86ATnZorIxuujFMDTfzR4h5NCpo,1015402
 helm/benchmark/scenarios/imdb_scenario.py,sha256=VTD0Ur6ATyY9NWxcnkGzn9Iw5vl4d94o0FbFm61ZZTA,6057
 helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py,sha256=wYaivSqqYYZPjPHTKaS6D7j960dcLIVTfmuZ8awd1Zs,2192
 helm/benchmark/scenarios/legal_support_scenario.py,sha256=K9HfTyHJnnLFvGKNwYQayu7JV4uyNT5wH48wc9ixRa0,3912
-helm/benchmark/scenarios/lex_glue_scenario.py,sha256=k_VofKY5xlAjUEZrZV2ZyyVegGRVYEm0GuSsNbv7u0M,9254
-helm/benchmark/scenarios/lextreme_scenario.py,sha256=b0KfKQbwDv5AFCRala4GwbtJH3KmbKnTsL5CazODfkA,18996
+helm/benchmark/scenarios/lex_glue_scenario.py,sha256=r89KevvM1Kifu3ZkUIXAV8jXktclNtL0-JL9T6qOx_Q,10224
+helm/benchmark/scenarios/lextreme_scenario.py,sha256=tejeKE08YX5MYFXrJRY48rIzO5V5fdwHDWEzg883K2E,20300
 helm/benchmark/scenarios/lsat_qa_scenario.py,sha256=2-GrPB8whoBFcQ5608jrwlAcRJgpkT1P2UehcR6-EYY,5977
 helm/benchmark/scenarios/math_scenario.py,sha256=5PespNtseDOnPgAwtdP0vMkXz1CaJM0BkJsWdeG5gUM,13825
 helm/benchmark/scenarios/me_q_sum_scenario.py,sha256=ZIn7tlBC4baV8CYcU_-mYe2RbYaQ-8dX1Ca_hOvZTfI,3988
@@ -134,32 +138,34 @@ helm/benchmark/scenarios/narrativeqa_scenario.py,sha256=r5TwYkZH_YaCws6BSnjbiDQk
 helm/benchmark/scenarios/natural_qa_scenario.py,sha256=nuL5Qlh26xq7Q_lvzK2sKVN7eNYh7F5SjbIjahKaMNg,12527
 helm/benchmark/scenarios/newsqa_scenario.py,sha256=vMuIZyYxufH2AqhDoIZzzllfq8ScJIhSDH6lM3IUxGM,7242
 helm/benchmark/scenarios/numeracy_scenario.py,sha256=Iwtypyb0249zKYyV6p4YUX4bYke6uIyL6R3aopiGUb8,30552
+helm/benchmark/scenarios/opinions_qa_scenario.py,sha256=dtIYKL9ZiccX_F3-5OrnHXJdNBBTxCoTHc2Kc-XX79E,7380
 helm/benchmark/scenarios/pubmed_qa_scenario.py,sha256=cpMmQWwCDXIYO0btGLhevMT-Mhs9-5Es9cDvQYkIlL0,7493
 helm/benchmark/scenarios/quac_scenario.py,sha256=46nqmeVgkWu6jDGCHl-KHu351bmJj7jx_1p5kPwcOjc,6615
 helm/benchmark/scenarios/raft_scenario.py,sha256=RzewlMVkHJ2XbZ9_9FzBvbdV__BuBRgtX2HyhCnmH1o,4500
 helm/benchmark/scenarios/real_toxicity_prompts_scenario.py,sha256=8DQD7KkilAfxwzHUwCPrPPHeYEz_dEOWSls8yZo15do,2387
-helm/benchmark/scenarios/scenario.py,sha256=aj2golyi1TAAAebfW7eouTWRwZ07KbK034HlLx-q-1g,7164
+helm/benchmark/scenarios/scenario.py,sha256=bdRcv-YoLkxjlpNcq4MXiu8HQgjByHkkLWOdih4ahsM,7365
 helm/benchmark/scenarios/simple_scenarios.py,sha256=rcHzukhjBgvNRqkjcg0cms_zWtAsLPk0xiFN9I25_hI,1947
 helm/benchmark/scenarios/summarization_scenario.py,sha256=fKeRSkXrH26WyfeIhn43_fZxnAO4bIX1Xh5HoKcjOQM,6550
 helm/benchmark/scenarios/synthetic_efficiency_scenario.py,sha256=dtMMeULjw6pcobBRp0r6f9N4VCUKamx1Jy-6xPu85q0,3083
 helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py,sha256=YvEgO5qVZ4hLpnvjer4CG0Ct1upssZRjZWxnNi1ZUtY,16308
 helm/benchmark/scenarios/synthetic_reasoning_scenario.py,sha256=Pm63shscbGuigg4zWfcw3rOIPP8LgxBCnpcQKAA_CX0,8327
 helm/benchmark/scenarios/test_scenario.py,sha256=o8w8ElDPF-RzeCmecwyvie_nRMYj01b38BufXS-igqY,1612
-helm/benchmark/scenarios/the_pile_scenario.py,sha256=UXxRlaZdVRoWTjw8h5TUhXouq5JLaidPbL5-Itai0KE,4988
+helm/benchmark/scenarios/the_pile_scenario.py,sha256=aCcjZp0wabu8lpPVNAdCr1x6m_3QvgKe7dIGS2qgGm4,4981
 helm/benchmark/scenarios/truthful_qa_scenario.py,sha256=zQgv_qUU5h5ODoQE06rpJa6O_8FwF695cIie3PG7bx8,5969
 helm/benchmark/scenarios/twitter_aae_scenario.py,sha256=_qWotvPagNj3ATnyaww3U1XZtxN-wgueUf34fFmKXQI,2083
 helm/benchmark/scenarios/wikifact_scenario.py,sha256=pucEuLYz5N9qIobnEbJnRUnK6PrkWFdkl7yPuCJj3SE,5778
 helm/benchmark/scenarios/wikitext_103_scenario.py,sha256=rXVbUzOZi4eWM-_HP1gzY5SBmMwOX1vk12WrLkR3NHo,3074
 helm/benchmark/scenarios/wmt_14_scenario.py,sha256=u24E_w0AOXpl3PzEFLmiBcl8qyJEy-1Yc-i4YHgU99M,4356
-helm/benchmark/static/benchmarking.css,sha256=EkLEyXEL5Qwq-312D01y9EaQV2IBa67fw8Bjc3PQPJs,1928
-helm/benchmark/static/benchmarking.js,sha256=NQUoE05neH_YN9BgyNVFwEXX09YDRZOcunK_6tCZomA,47399
+helm/benchmark/static/benchmarking.css,sha256=DGC4Huh4tVD2o9wEeUf3YOc3MYcq2fmJQXvhTjVDumE,2057
+helm/benchmark/static/benchmarking.js,sha256=qiXAY_9fiWZ4ydzhBQAUhylzxNCoIN8ciLlEFMg33uE,49107
 helm/benchmark/static/contamination.yaml,sha256=LbISh56ORvfkkWptm7ZWmlPvWxtls6pBF1TbGiWD7hk,3096
 helm/benchmark/static/general.js,sha256=L3S4CBUED0k7RsjLHCeWjO29ZMFJckZgNTAYAARzaEg,3029
-helm/benchmark/static/index.html,sha256=LZMmoydG2LLqRfvGSpK3eRt1n92o19AELAzxCi-kok0,2994
+helm/benchmark/static/index.html,sha256=yzf_VEGW15CPZOW9a2Z6opoQ0Gg2YQCtj182niUb0fk,3130
 helm/benchmark/static/info-icon.png,sha256=P-PW3Ek3NGiRAW5BXOjJRPBfMVqprjAqtQheGWu7zNI,3428
 helm/benchmark/static/json-urls-root.js,sha256=G3qenwLgBojh3ukzp_gyMUaZja83ZFqvT1WQ_Rg11BU,98
-helm/benchmark/static/json-urls.js,sha256=UzWTp-dowxZhJuWtoPyi5vpfwlRr561DGYWRHdkvZ1E,1634
-helm/benchmark/static/schema.yaml,sha256=XgXDcd67hvhGCOTFKJGGymIglzYS8jL8QjN2cO3gi1Q,93172
+helm/benchmark/static/json-urls.js,sha256=wvsG2Lrz2XArwwMOl_tGXL9y4mjqzjod4gcqlvVCiQA,1750
+helm/benchmark/static/plot-captions.js,sha256=gTBn-IPPD4BkzryVYj3KkGqLhWqWvBbvufeJaDygQxk,3010
+helm/benchmark/static/schema.yaml,sha256=TQdzlpOPvTwELiU0w0HBAZqKQBmjj1LOkEGM85oa8e4,95657
 helm/benchmark/static/utils.js,sha256=H2PKYjuXZ392DlALCPJ1XRwGxBDRFjL9eTFiTd4vBU8,7338
 helm/benchmark/static/images/crfm-logo.png,sha256=dDkauL_wJR_Luu7L7pltphS3a9HSLjDkpVLa6C9vcA4,62712
 helm/benchmark/static/images/helm-logo-simple.png,sha256=LtVAC4OgcWgMAob53rTrf7cRDu-O0z85ZOGGj9wR9hw,86133
@@ -184,11 +190,13 @@ helm/benchmark/window_services/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5
 helm/benchmark/window_services/ai21_window_service.py,sha256=Mo0Zzj2a9hDiUg6hTuJWb3ABhBUPlOYS-kHhaE-pHUE,12672
 helm/benchmark/window_services/anthropic_window_service.py,sha256=mA9aWKfc-dbSnpt37k6zDFdNNMfbevAoleCzSaar-uE,799
 helm/benchmark/window_services/bloom_window_service.py,sha256=o7MVedt6khdoj8zikLDuVraEzuoBZk7j4Fzjsas0sD4,1023
-helm/benchmark/window_services/cohere_window_service.py,sha256=MXBRAjuAQGq0iEpU9OLORH1FvHMak1-Nf5-D7k7UO9I,6182
+helm/benchmark/window_services/cohere_window_service.py,sha256=3SJT97CaxNxtUNS9_qKvKMVCA6lvNavS-xo6jAsxPbM,7070
 helm/benchmark/window_services/encoder_decoder_window_service.py,sha256=Dg_rD0RtKIALtpvT4Wi4Am3zgFcIvgxfzdglw5fbbTU,2478
+helm/benchmark/window_services/flan_t5_window_service.py,sha256=39IZX89_tay3bpSGVoWDoekmhW-RUNATQuT-bNyFRTs,915
 helm/benchmark/window_services/gpt2_window_service.py,sha256=d-ys9FHoVI1u1GZOH734JPAOc8W6QZb1N3ZjooKmwz8,990
 helm/benchmark/window_services/gptj_window_service.py,sha256=C2OEl-3ZatwxYVoaQgyvAZ5SBS2TK-2PnM02zUNjhiA,1103
 helm/benchmark/window_services/gptneox_window_service.py,sha256=FD0NDlBxDu1fZ8vlaTsOBh2IKYfrMV73qNGetkrp1P4,944
+helm/benchmark/window_services/huggingface_window_service.py,sha256=tDbhjn81Aw3ZSYMKh4aQU3e8JX1IVYCu_2gPOLaIWD8,1440
 helm/benchmark/window_services/ice_window_service.py,sha256=5z52rP-xAF_jckIPoogyoNFW6FQXqaq3SHybCaBuRn0,2005
 helm/benchmark/window_services/local_window_service.py,sha256=wgNBB-p9Zk-uFLRXYNxrz16_DhWyQ8x9sltSLBhzUh4,4247
 helm/benchmark/window_services/luminous_window_service.py,sha256=0w-nyfXXDwnIDBzU3Y84LZPnQyJGiSbvf5Y_MQh-hCg,1791
@@ -202,6 +210,7 @@ helm/benchmark/window_services/test_ai21_window_service.py,sha256=U_n2mQ5GqD8oYv
 helm/benchmark/window_services/test_bloom_window_service.py,sha256=xeYpsvdR8Ug31BAb4a-PU4Sc0oihpCIMJ5OzTfgUhM0,4221
 helm/benchmark/window_services/test_cohere_window_service.py,sha256=6WpIiuEyfgTZS3BIeAGOR8AAr2djpVIfGbItQRI19Ck,3205
 helm/benchmark/window_services/test_cohere_window_service_utils.py,sha256=sf25f9MeXzoqsbDzZ7d7le13hm8RkDe54nhLtKF2pqo,158150
+helm/benchmark/window_services/test_flan_t5_window_service.py,sha256=xv_EXbiRklveJPQtThYCSYF7qBYwjL7K4wH3Xu5z2Fg,591
 helm/benchmark/window_services/test_gpt2_window_service.py,sha256=3k25pLa_z__g4yoQL40DEXj-T4dGtrgif13N2NXs59U,2568
 helm/benchmark/window_services/test_gptj_window_service.py,sha256=sxsTpozKv9N-wZXtGl1prkQr9Md_q-tnCjO9zt226Co,2267
 helm/benchmark/window_services/test_gptneox_window_service.py,sha256=MCYHZIoulJf_WCx6de7rWB3nqku6wCyAokA-SWIPEks,4140
@@ -216,15 +225,16 @@ helm/benchmark/window_services/test_utils.py,sha256=1k2TlPdDIRjum669jpH3O7UOqm4G
 helm/benchmark/window_services/test_yalm_window_service.py,sha256=NdunSxq-qDzfzYMBYZ-0my6LaU2qUxtm7Ii0c4fyKnY,4273
 helm/benchmark/window_services/tokenizer_service.py,sha256=RNznJBAxcCUMCurb7mbraZULx_ZtB0G7IxbrnUe0Urk,865
 helm/benchmark/window_services/ul2_window_service.py,sha256=R_VEzOb59zQE9mmbTLunQeIvLAtK3-97h-B2_oc0Uxs,1021
+helm/benchmark/window_services/wider_ai21_window_service.py,sha256=VZ6EERN48FSYsmJ_aiwI30SEbobLt27c1QqL29Zg_8M,414
 helm/benchmark/window_services/wider_openai_window_service.py,sha256=cpm4mDEGTY-cmrECcDCL0flONBdh4g40uWNLI-v46BU,539
 helm/benchmark/window_services/window_service.py,sha256=aV4YnbXl7T23runB8xmSWRwV7YtliUlXrveEejOMJ1Y,2885
-helm/benchmark/window_services/window_service_factory.py,sha256=kb9Dxig-HuHssO6dyMT0M_FDu35WgXqlaXs_j88QAd4,4773
+helm/benchmark/window_services/window_service_factory.py,sha256=UDrXRUXGwBMEiO7Cw_nNbr0XM6mzzXrXiyJsvCmW9wg,5845
 helm/benchmark/window_services/yalm_window_service.py,sha256=g7NeXvlRq3FXf2HwRStBMTkbWDxTdEITkrVRe-fv3mg,1805
 helm/common/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 helm/common/authentication.py,sha256=RlMx29_TSrfU7ujE7dJkxmFub5EqLj2NswV5lAVFFDk,179
 helm/common/cache.py,sha256=ustgsRHX0W3zoLPN05W3mFl9m9JYp9Ppq5cjMbdmm6Y,13116
 helm/common/codec.py,sha256=zm8MP9Aqfh64D2HMZiCPEMoPkkiJxEzvzmuupGvkRh0,5499
-helm/common/general.py,sha256=bIhgtiEIz1zA7cQVH-U_6FuW_bmDzYaff4QZ779tR3U,10087
+helm/common/general.py,sha256=7vFw10h_hTrxUCfxE0LH13hp8Lunp77lbVmucgoeq2Y,10181
 helm/common/hierarchical_logger.py,sha256=EnKLnfbQftca08EJfjGEQb4tcnCKbx-JtwLnoCnhMQs,2908
 helm/common/object_spec.py,sha256=COMd4RpYgfulW940a5M_npbsfRBvLkmhjfwDIq4Gpqs,1833
 helm/common/perspective_api_request.py,sha256=WAVwtajNVmi5XJNsPcorGEAVrqkpPSk-Kd3b0hJghbA,2427
@@ -237,7 +247,7 @@ helm/proxy/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 helm/proxy/accounts.py,sha256=xq-zVggvueB4D5QK58mFWGPxZe-hnIUAT4D341yd0ac,13503
 helm/proxy/cli.py,sha256=2SOxIF55PDjzXXcDOYRT8m-oyQM_2VyZheKRC3fXDVw,8094
 helm/proxy/example_queries.py,sha256=p1wH-tp1pRUslkAwaJYrrG5aDfmFWK3KYn4M6WQfPqQ,4120
-helm/proxy/models.py,sha256=4cqSq_cBnTRPgSFWafTkuAwJxMg6CA9y-6NBIfeZPv0,25926
+helm/proxy/models.py,sha256=sfdp7DasJXbun-VMZz5p7iD6uhoUW1mFsy64BWfDoV4,29681
 helm/proxy/query.py,sha256=eftbiUICMh8QIHVs-7cLtv_rDXKeKdRPmwjLMu0TDxQ,645
 helm/proxy/retry.py,sha256=GLLDW1iGCwHfgTle8YK7ZB3vV-7XqsHcqeruKoVdsxE,1953
 helm/proxy/server.py,sha256=uBispGXfn39s_Pskd9Xjud0rijTjqXtSKU_2YvE6zGE,7356
@@ -245,31 +255,34 @@ helm/proxy/test_models.py,sha256=hWeDcBw1GkPvyJUd-ABxRVe1FhSUfz8bzyrKYdsqmyY,726
 helm/proxy/test_retry.py,sha256=8h398auzjW9VnlTJWllxR-bdpub-XFp8EN8LWDEnEHM,1049
 helm/proxy/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 helm/proxy/clients/ai21_client.py,sha256=c4D64-1mLaFcGsqOb5CYkaXTmAWEJYk9cIjtr6DbgZQ,7770
-helm/proxy/clients/aleph_alpha_client.py,sha256=kh7Wrf9D5aZo5HYbvW1KFdRvdOn1EO64V44tNXlUKlk,6530
+helm/proxy/clients/aleph_alpha_client.py,sha256=VV7dbgh7sYqoSWfNkxxAiQ7i3yPW33rBPnBig9EXL10,7707
 helm/proxy/clients/anthropic_client.py,sha256=vrfc8lS9bxQbUxMxbElV2z5cMDq-JD6yFDTS2cJdFO0,15526
-helm/proxy/clients/auto_client.py,sha256=nzGU_SYaQD8v6eZbKNt1ej2mvPWTsXYn_DDAnPTxfrA,10810
+helm/proxy/clients/auto_client.py,sha256=oktJQ4rCntp7id4vj-d3x_EA3RWGNZyXUMFwzRse--c,11063
 helm/proxy/clients/chat_gpt_client.py,sha256=nG3opHbnzX50r9Ialh3RaRErOZo6k4Q7gVzRHeGQgj8,5312
 helm/proxy/clients/client.py,sha256=bh6FvYFjw6MoHp5n7-KN1asXrIrOC-jfYsg3aW4xMgo,4570
 helm/proxy/clients/cohere_client.py,sha256=KF21m7qUjuhrpEEQv68FNeX0rsWSmxZgw52Oa7CZ5pI,11362
 helm/proxy/clients/google_client.py,sha256=sGGxDWD22c9a9KMzLAFL3vAEDHxp5jSY2W3RDpVDIak,3334
 helm/proxy/clients/goose_ai_client.py,sha256=2tqJK_AhD2-ScXtOTdt9S9khzVjal5pm38BJWiFhwq8,4217
-helm/proxy/clients/huggingface_client.py,sha256=7CYD53G6tGxhFEE6yhlrQM_agfbr9V4hKqpE9OW5ks0,11045
-helm/proxy/clients/huggingface_tokenizer.py,sha256=KOa7xA10bIRN9lcSfUBBJSrES49By2KXqlDzlVdEHWM,3801
+helm/proxy/clients/huggingface_client.py,sha256=LypY3YfyoaGFH83UkYwDARGuoN2JOUk2S-nEaJ6GemI,11813
+helm/proxy/clients/huggingface_model_registry.py,sha256=0WHyWPxxBI4KtTs2Yt6-Cw16FC4XBEe6yqUc0-YSn1Y,3891
+helm/proxy/clients/huggingface_tokenizer.py,sha256=ujtsBupMMrE9efds2205c8NiPTcxHX8XM0UoV9spLK0,4591
 helm/proxy/clients/ice_tokenizer_client.py,sha256=Ui8YhAXoY1Q0vC3icoeFs6X9xAcESF6Tl2EGERGWVGU,2325
 helm/proxy/clients/microsoft_client.py,sha256=-VC8IrgrpSp1_FvRSI_8MSxhNp5I6dMc4qWSHc4Oulg,8237
-helm/proxy/clients/openai_client.py,sha256=rPV74pbiAlkLS4YO6J-dnzpVmsShWKYISD2nJTk6Sds,6025
+helm/proxy/clients/openai_client.py,sha256=frW9fOjYWkRXdfzE88ppxaLhVl9pCnXfheUqeANW6QQ,9415
 helm/proxy/clients/perspective_api_client.py,sha256=-L8IwokuktWPoOu7nXwsfoab_U1QRGCt8xT1SrcGfYE,5491
 helm/proxy/clients/simple_client.py,sha256=GXHTCRB58XAxnUVqgpynidc7h6kaDBOP7TedVHrOpD4,2915
 helm/proxy/clients/test_client.py,sha256=bvkFob_Yoy8bALrVeQ0h757g9RU687JYI0g3AISPFQ8,1268
 helm/proxy/clients/test_huggingface_client.py,sha256=n-6D-RXqwQyxPxCLCSqHxqfK5JA-PdP5ffP17XwTe2I,3520
+helm/proxy/clients/test_huggingface_model_registry.py,sha256=zMboFlwMtDEV7hkd9SZFuItye-vkzz3CE5mWQrw--W4,2554
 helm/proxy/clients/test_huggingface_tokenizer.py,sha256=KmlAXezQ6R7DAEpV85_JRdTRrOJoJxfmtylqybWn5VA,2189
 helm/proxy/clients/test_ice_tokenizer_client.py,sha256=Ugmn5a7QdAPEAbLtreLS5-sji8yrzxy5mhFPAl3rOuI,2404
 helm/proxy/clients/test_yalm_tokenizer_client.py,sha256=tnrYl7T1DcZ9FN09nBWV1gesjWQ3osiUpsGHSm_IypQ,2336
-helm/proxy/clients/together_client.py,sha256=k7TmK940KHdtvb2pxAlX4kHUHrfiZL2HAzM3dgwJowg,5731
+helm/proxy/clients/together_client.py,sha256=epyiYElD0BfAZgUSu4zZKC5Oe8yIVVyJn2RtTwjPMzM,6334
 helm/proxy/clients/yalm_tokenizer_client.py,sha256=cpBoc8eHQoBGQguZsDaVnGWLdZnPgkHjqLSO0B94O0U,2420
 helm/proxy/clients/yalm_tokenizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 helm/proxy/clients/yalm_tokenizer/test_yalm_tokenizer.py,sha256=W9p5QNn1GSm-y85yVEQe_82zn5CVK_vR6jvhk7JTs_k,869
-helm/proxy/clients/yalm_tokenizer/yalm_tokenizer.py,sha256=z1JEJynC51T70JSuig6TrNwLEUXiS2SVTMvWECya7ww,5743
+helm/proxy/clients/yalm_tokenizer/voc_100b.sp,sha256=LmPD0_OIOXi8dWuNjXUYOSPhf8kPp2xhvK-g3bXcwrQ,2815034
+helm/proxy/clients/yalm_tokenizer/yalm_tokenizer.py,sha256=7Y4_nCZptFWzifCJ5aPmM3_OOxhtomIAQVjpJGV1D8g,5954
 helm/proxy/services/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 helm/proxy/services/remote_service.py,sha256=xKS-0P-EqKTPn7odTXDoQPjn9FliQRLaMFUnCEsUmCU,6965
 helm/proxy/services/server_service.py,sha256=gps7PwXqCi8b0yGYC0nQwFdKbfxCriSHt5CD1N1kkJs,5696
@@ -286,9 +299,9 @@ helm/proxy/token_counters/openai_token_counter.py,sha256=gPo_VrkEH07xmprzdfIhmJ_
 helm/proxy/token_counters/test_ai21_token_counter.py,sha256=42J1fCi20kQUwAD18bIa6h9TaP7KZnlgF-mLbvKURro,5508
 helm/proxy/token_counters/test_openai_token_counter.py,sha256=EovaVCZSr9moITZ9-AKiv_YM-D3OUsUDs4iQhEvpazQ,4823
 helm/proxy/token_counters/token_counter.py,sha256=x8KyTR82EedgCQUuneQiVq9AiU1B3_CHPmKPNumClHc,429
-crfm_helm-0.2.1.dist-info/LICENSE,sha256=bJiay7Nn5SHQ2n_4ZIT3AE0W1RGq4O7pxOApgBsaT64,11349
-crfm_helm-0.2.1.dist-info/METADATA,sha256=0U51YnO1QR8Xhq4no4yDaiWTwwLVFj99tf611k5MeX4,1982
-crfm_helm-0.2.1.dist-info/WHEEL,sha256=2wepM1nk4DS4eFpYrW1TTqPcoGNfHhhO_i5m4cOimbo,92
-crfm_helm-0.2.1.dist-info/entry_points.txt,sha256=8vW_ahx0Ar_ubyDTqUavUwXeZ5O8w0gLtdSVagchycU,234
-crfm_helm-0.2.1.dist-info/top_level.txt,sha256=s9yl-XmuTId6n_W_xRjCS99MHTwPXOlkKxmTr8xZUNY,5
-crfm_helm-0.2.1.dist-info/RECORD,,
+crfm_helm-0.2.2.dist-info/LICENSE,sha256=bJiay7Nn5SHQ2n_4ZIT3AE0W1RGq4O7pxOApgBsaT64,11349
+crfm_helm-0.2.2.dist-info/METADATA,sha256=_OlkKmj1P7vaZvlpvOnNnzzm3w1IEW6de75SK7TmuPw,2066
+crfm_helm-0.2.2.dist-info/WHEEL,sha256=pkctZYzUS4AYVn6dJ-7367OJZivF2e8RA9b_ZBjif18,92
+crfm_helm-0.2.2.dist-info/entry_points.txt,sha256=AvH9soAH3uey9xffisWewd0yrmPWGASC036jHd1SFyg,300
+crfm_helm-0.2.2.dist-info/top_level.txt,sha256=s9yl-XmuTId6n_W_xRjCS99MHTwPXOlkKxmTr8xZUNY,5
+crfm_helm-0.2.2.dist-info/RECORD,,

{crfm_helm-0.2.1.dist-info → crfm_helm-0.2.2.dist-info}/WHEEL RENAMED Viewed

@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: bdist_wheel (0.38.4)
+Generator: bdist_wheel (0.40.0)
 Root-Is-Purelib: true
 Tag: py3-none-any

{crfm_helm-0.2.1.dist-info → crfm_helm-0.2.2.dist-info}/entry_points.txt RENAMED Viewed

@@ -1,6 +1,7 @@
 [console_scripts]
 crfm-proxy-cli = helm.proxy.cli:main
 crfm-proxy-server = helm.proxy.server:main
+helm-create-plots = helm.benchmark.presentation.create_plots:main
 helm-run = helm.benchmark.run:main
 helm-server = helm.benchmark.server:main
 helm-summarize = helm.benchmark.presentation.summarize:main

helm/benchmark/__init__.py CHANGED Viewed

@@ -42,6 +42,8 @@ from .scenarios import legal_support_scenario  # noqa
 from .scenarios import entity_matching_scenario  # noqa
 from .scenarios import entity_data_imputation_scenario  # noqa
 from .scenarios import big_bench_scenario  # noqa
+from .scenarios import opinions_qa_scenario  # noqa
 # Biomedical
 from .scenarios import covid_dialog_scenario  # noqa

helm/benchmark/adaptation/adapter_spec.py CHANGED Viewed

@@ -68,6 +68,9 @@ class AdapterSpec:
     # set of training instances.  Used to compute error bars.
     num_train_trials: int = 1
+    # If true, randomly sample N training examples; if false, select N consecutive training examples
+    sample_train: bool = True
     # Decoding parameters (inherited by `Request`)
     # Model to make the request to (need to fill in)

helm/benchmark/adaptation/adapters/in_context_learning_adapter.py CHANGED Viewed

@@ -23,7 +23,7 @@ class InContextLearningAdapter(Adapter, ABC):
     @htrack(None)
     def adapt(self, instances: List[Instance], parallelism: int) -> ScenarioState:
         """
-        Takes a a list of `Instance`s and builds a list of corresponding `RequestState`s.
+        Takes a list of `Instance`s and builds a list of corresponding `RequestState`s.
         The reason we don't do this per eval instance is that we create a common set of
         training instances which is shared across all eval instances.
         """
@@ -65,7 +65,9 @@ class InContextLearningAdapter(Adapter, ABC):
         parallelism: int,
     ) -> List[RequestState]:
         self.train_trial_index: int = train_trial_index
-        self.train_instances: List[Instance] = self.sample_examples(all_train_instances, seed=train_trial_index)
+        self.train_instances: List[Instance] = self.sample_examples(
+            all_train_instances, seed=train_trial_index, sample_train=self.adapter_spec.sample_train
+        )
         hlog(f"Sampled {len(self.train_instances)} examples for trial #{self.train_trial_index}.")
         # Generate request_states
@@ -93,7 +95,9 @@ class InContextLearningAdapter(Adapter, ABC):
         return [request_state for result in results for request_state in result]
-    def sample_examples(self, all_train_instances: List[Instance], seed: int) -> List[Instance]:
+    def sample_examples(
+        self, all_train_instances: List[Instance], seed: int, sample_train: bool = True
+    ) -> List[Instance]:
         """
         Sample a random set of train instances to use as examples by following the steps below:
         1. Sort the class labels (i.e., correct References) by the number of Instances that belong to the
@@ -121,9 +125,14 @@ class InContextLearningAdapter(Adapter, ABC):
         random.seed(seed)
         num_instances_to_sample: int = min(len(all_train_instances), self.adapter_spec.max_train_instances)
+        examples: List[Instance] = []
+        if not sample_train:
+            # Select sequentially from the train set
+            examples = all_train_instances[num_instances_to_sample * seed : num_instances_to_sample * (seed + 1)]
+            return examples
         unlabeled_instances: List[Instance] = []
         label_to_instances: Dict[str, List[Instance]] = defaultdict(list)
         for instance in all_train_instances:
             if instance.first_correct_reference:
                 label_to_instances[instance.first_correct_reference.output.text].append(instance)
@@ -145,7 +154,6 @@ class InContextLearningAdapter(Adapter, ABC):
             sorted_labels.extend(labels)
         labels_iterable = cycle(sorted_labels)
-        examples: List[Instance] = []
         while num_instances_to_sample > 0:
             next_label: Optional[str] = next(labels_iterable, None)
             if not next_label:
@@ -218,10 +226,15 @@ class InContextLearningAdapter(Adapter, ABC):
         # References (optionally) and output
         output: str
+        delimiter = ","
         if reference_index is None:
             # Put only the correct reference as the output
-            correct_reference: Optional[Reference] = instance.first_correct_reference
-            output = correct_reference.output.text if correct_reference is not None else "n/a"
+            correct_references: List[Reference] = instance.all_correct_references
+            if not correct_references:
+                output = "n/a"
+            else:
+                output = delimiter.join([correct_reference.output.text for correct_reference in correct_references])
         else:
             reference = instance.references[reference_index]
             output = reference.output.text

helm/benchmark/contamination/__init__.py ADDED Viewed

File without changes

helm/benchmark/metrics/classification_metrics.py CHANGED Viewed

@@ -1,6 +1,7 @@
-from typing import List
+from typing import List, Optional
 from sklearn.metrics import f1_score
+from sklearn.preprocessing import MultiLabelBinarizer
 from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.metrics.basic_metrics import normalize_text
@@ -20,8 +21,7 @@ class ClassificationMetric(Metric):
     Note:
     - The set of classes is derived from the correct references from all the instances.
-      This means that classes may be omitted if they never are never used as a correct
-      reference.
+      This means that classes may be omitted if they are never used as a correct reference.
     - Generations that are not in any of the known classes are counted as a
       negative prediction for every class.
     - Perturbed classes are considered different classes from unperturbed
@@ -29,10 +29,16 @@ class ClassificationMetric(Metric):
     - Currently, multi-label classification is not supported.
     """
+    def __init__(self, delimiter: Optional[str] = None):
+        self.delimiter = delimiter
+    def is_multi_label(self) -> bool:
+        return bool(self.delimiter)
     def evaluate_instances(self, request_states: List[RequestState]) -> List[Stat]:
-        y_pred: List[str] = []
-        y_true: List[str] = []
-        for request_state in request_states:
+        y_pred: List[List[str]] = []
+        y_true: List[List[str]] = []
+        for request_state in request_states:  # one request state per instance
             # Only the generation adapter is supported.
             # TODO: Support multiple_choice_* adapters.
             if request_state.reference_index is not None:
@@ -42,24 +48,23 @@ class ClassificationMetric(Metric):
             assert request_state.result is not None
             if len(request_state.result.completions) != 1:
                 raise ValueError("Result must contain exactly one completion")
-            num_correct = 0
-            for reference in request_state.instance.references:
-                if reference.is_correct:
-                    num_correct += 1
-                    y_true.append(normalize_text(reference.output.text))
-            if num_correct != 1:
-                # TODO: Support multi-label classification.
-                raise ValueError("ClassificationMetric does not support multi-label classification")
             if request_state.output_mapping:
                 raise ValueError("ClassificationMetric does not support multiple choice adapters")
-            y_pred.append(normalize_text(request_state.result.completions[0].text))
-        labels = list(set(y_true))
+            references = request_state.instance.all_correct_references
+            if not self.is_multi_label():
+                assert len(references) == 1
+            correct_ref_texts = [normalize_text(ref.output.text) for ref in references if ref.output.text]
+            y_true.append(correct_ref_texts)
+            input_text = request_state.result.completions[0].text
+            predictions = input_text.split(self.delimiter) if self.is_multi_label() else [input_text]
+            y_pred.append([normalize_text(pred) for pred in predictions if pred])
+        labels: List[str] = list(set(y for ys in y_true for y in ys))
+        mlb = MultiLabelBinarizer().fit([labels])
+        y_true = mlb.transform(y_true)
+        y_pred = mlb.transform(y_pred)
         return [
-            Stat(MetricName("classification_macro_f1")).add(
-                f1_score(y_pred=y_pred, y_true=y_true, labels=list(labels), average="macro")
-            ),
-            Stat(MetricName("classification_micro_f1")).add(
-                f1_score(y_pred=y_pred, y_true=y_true, labels=list(labels), average="micro")
-            ),
+            Stat(MetricName("classification_macro_f1")).add(f1_score(y_pred=y_pred, y_true=y_true, average="macro")),
+            Stat(MetricName("classification_micro_f1")).add(f1_score(y_pred=y_pred, y_true=y_true, average="micro")),
         ]

helm/benchmark/metrics/test_classification_metrics.py CHANGED Viewed

@@ -63,7 +63,8 @@ def _expected_stats(all_classes_counts: Dict[str, Dict[str, int]]):
 def test_evaluate_instances_binary_generation():
-    metric = ClassificationMetric()
+    metric = ClassificationMetric(delimiter=None)
     request_states = [
         _request_state("yes", [_Option("yes", True)]),
         _request_state("yes", [_Option("yes", True)]),
@@ -86,20 +87,21 @@ def test_evaluate_instances_binary_generation():
 def test_evaluate_instances_multi_class():
-    metric = ClassificationMetric()
+    # Note: no "a" because it would get filtered out by normalize_text()
+    metric = ClassificationMetric(delimiter=None)
     def _options(correct: str):
-        return [_Option(text, text == correct) for text in ["a", "b", "c"]]
+        return [_Option(text, text == correct) for text in ["d", "b", "c"]]
     request_states = [
-        _request_state("a", _options("a")),
-        _request_state("a", _options("a")),
-        _request_state("a", _options("a")),
-        _request_state("a", _options("b")),
+        _request_state("d", _options("d")),
+        _request_state("d", _options("d")),
+        _request_state("d", _options("d")),
+        _request_state("d", _options("b")),
         _request_state("b", _options("b")),
         _request_state("b", _options("b")),
         _request_state("b", _options("c")),
-        _request_state("c", _options("a")),
+        _request_state("c", _options("d")),
         _request_state("c", _options("c")),
         _request_state("invalid", _options("c")),
     ]
@@ -107,9 +109,42 @@ def test_evaluate_instances_multi_class():
         metric.evaluate_instances(request_states),
         _expected_stats(
             {
-                "a": {"tp": 3, "fp": 1, "tn": 5, "fn": 1},
+                "d": {"tp": 3, "fp": 1, "tn": 5, "fn": 1},
                 "b": {"tp": 2, "fp": 1, "tn": 6, "fn": 1},
                 "c": {"tp": 1, "fp": 1, "tn": 6, "fn": 2},
             }
         ),
     )
+def test_evaluate_instances_multilabel():
+    # Note: no "a" because it would get filtered out by normalize_text()
+    metric = ClassificationMetric(delimiter=",")
+    def _options(correct: List[str]):
+        return [_Option(text, text in correct) for text in ["d", "b", "c"]]
+    request_states = [
+        _request_state("d,b", _options(["d", "b"])),
+        _request_state("d,b", _options(["d", "c"])),
+        _request_state("d", _options(["d"])),
+        _request_state("c", _options(["b"])),
+        _request_state("b", _options(["b", "c"])),
+        _request_state("d,b", _options(["c"])),
+        _request_state("d,c", _options(["d"])),
+        _request_state("d,b,c", _options(["d", "b", "c"])),
+        _request_state("", []),
+        _request_state("n/a", []),
+        _request_state("invalid", _options(["c"])),
+    ]
+    assert_stats_equal(
+        metric.evaluate_instances(request_states),
+        _expected_stats(
+            {
+                "d": {"tp": 5, "fp": 1, "tn": 5, "fn": 0},
+                "b": {"tp": 3, "fp": 2, "tn": 5, "fn": 1},
+                "c": {"tp": 1, "fp": 2, "tn": 4, "fn": 4},
+            }
+        ),
+    )

crfm-helm 0.2.1__py3-none-any.whl → 0.2.2__py3-none-any.whl

crfm-helm 0.2.1__py3-none-any.whl → 0.2.2__py3-none-any.whl