crfm-helm 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/METADATA +11 -8
  2. {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/RECORD +67 -38
  3. {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/WHEEL +1 -1
  4. {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/entry_points.txt +2 -1
  5. helm/benchmark/__init__.py +13 -0
  6. helm/benchmark/adaptation/adapter_spec.py +3 -0
  7. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +20 -7
  8. helm/benchmark/augmentations/correct_to_misspelling.json +1 -0
  9. helm/benchmark/contamination/__init__.py +0 -0
  10. helm/benchmark/metrics/classification_metrics.py +70 -0
  11. helm/benchmark/metrics/machine_translation_metrics.py +36 -0
  12. helm/benchmark/metrics/summarization_metrics.py +7 -8
  13. helm/benchmark/metrics/test_classification_metrics.py +150 -0
  14. helm/benchmark/presentation/create_plots.py +617 -0
  15. helm/benchmark/presentation/run_display.py +7 -48
  16. helm/benchmark/presentation/summarize.py +4 -2
  17. helm/benchmark/presentation/test_create_plots.py +32 -0
  18. helm/benchmark/run.py +144 -48
  19. helm/benchmark/run_expander.py +164 -47
  20. helm/benchmark/run_specs.py +346 -39
  21. helm/benchmark/runner.py +34 -6
  22. helm/benchmark/scenarios/copyright_scenario.py +1 -1
  23. helm/benchmark/scenarios/covid_dialog_scenario.py +84 -0
  24. helm/benchmark/scenarios/imdb_listdir.json +50014 -0
  25. helm/benchmark/scenarios/lex_glue_scenario.py +253 -0
  26. helm/benchmark/scenarios/lextreme_scenario.py +458 -0
  27. helm/benchmark/scenarios/me_q_sum_scenario.py +86 -0
  28. helm/benchmark/scenarios/med_dialog_scenario.py +132 -0
  29. helm/benchmark/scenarios/med_mcqa_scenario.py +102 -0
  30. helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +119 -0
  31. helm/benchmark/scenarios/med_qa_scenario.py +96 -0
  32. helm/benchmark/scenarios/opinions_qa_scenario.py +194 -0
  33. helm/benchmark/scenarios/scenario.py +5 -0
  34. helm/benchmark/scenarios/the_pile_scenario.py +1 -1
  35. helm/benchmark/scenarios/wmt_14_scenario.py +96 -0
  36. helm/benchmark/static/benchmarking.css +14 -0
  37. helm/benchmark/static/benchmarking.js +43 -0
  38. helm/benchmark/static/index.html +2 -0
  39. helm/benchmark/static/json-urls.js +4 -0
  40. helm/benchmark/static/plot-captions.js +16 -0
  41. helm/benchmark/static/schema.yaml +154 -1
  42. helm/benchmark/window_services/cohere_window_service.py +20 -0
  43. helm/benchmark/window_services/flan_t5_window_service.py +29 -0
  44. helm/benchmark/window_services/huggingface_window_service.py +39 -0
  45. helm/benchmark/window_services/santacoder_window_service.py +27 -0
  46. helm/benchmark/window_services/test_flan_t5_window_service.py +12 -0
  47. helm/benchmark/window_services/wider_ai21_window_service.py +13 -0
  48. helm/benchmark/window_services/window_service_factory.py +34 -7
  49. helm/common/codec.py +123 -0
  50. helm/common/general.py +12 -5
  51. helm/common/test_codec.py +144 -0
  52. helm/proxy/clients/aleph_alpha_client.py +47 -28
  53. helm/proxy/clients/auto_client.py +32 -24
  54. helm/proxy/clients/google_client.py +88 -0
  55. helm/proxy/clients/huggingface_client.py +32 -16
  56. helm/proxy/clients/huggingface_model_registry.py +111 -0
  57. helm/proxy/clients/huggingface_tokenizer.py +25 -7
  58. helm/proxy/clients/openai_client.py +60 -2
  59. helm/proxy/clients/test_huggingface_model_registry.py +57 -0
  60. helm/proxy/clients/test_huggingface_tokenizer.py +3 -0
  61. helm/proxy/clients/together_client.py +17 -2
  62. helm/proxy/clients/yalm_tokenizer/voc_100b.sp +0 -0
  63. helm/proxy/clients/yalm_tokenizer/yalm_tokenizer.py +8 -2
  64. helm/proxy/models.py +115 -7
  65. helm/proxy/test_models.py +1 -1
  66. helm/benchmark/presentation/present.py +0 -249
  67. {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/LICENSE +0 -0
  68. {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: crfm-helm
3
- Version: 0.2.0
3
+ Version: 0.2.2
4
4
  Summary: Benchmark for language models
5
5
  Home-page: https://github.com/stanford-crfm/helm
6
6
  Author: Stanford CRFM
@@ -12,15 +12,11 @@ Classifier: Programming Language :: Python :: 3.8
12
12
  Classifier: License :: OSI Approved :: Apache Software License
13
13
  Requires-Python: ~=3.8
14
14
  License-File: LICENSE
15
- Requires-Dist: pytest (~=7.2.0)
16
- Requires-Dist: black (~=22.10.0)
17
- Requires-Dist: mypy (~=0.982)
18
- Requires-Dist: pre-commit (~=2.20.0)
19
- Requires-Dist: flake8 (~=5.0.4)
20
15
  Requires-Dist: zstandard (~=0.18.0)
21
16
  Requires-Dist: tqdm (~=4.64.1)
22
17
  Requires-Dist: pyhocon (~=0.3.59)
23
18
  Requires-Dist: dacite (~=1.6.0)
19
+ Requires-Dist: aleph-alpha-client (~=2.14.0)
24
20
  Requires-Dist: bottle (~=0.12.23)
25
21
  Requires-Dist: gunicorn (~=20.1.0)
26
22
  Requires-Dist: Mako (~=1.2.3)
@@ -28,8 +24,9 @@ Requires-Dist: sqlitedict (~=1.7.0)
28
24
  Requires-Dist: pymongo (~=4.2.0)
29
25
  Requires-Dist: retrying (~=1.3.3)
30
26
  Requires-Dist: websocket-client (~=1.3.2)
31
- Requires-Dist: openai (~=0.25.0)
32
- Requires-Dist: transformers (~=4.22.2)
27
+ Requires-Dist: openai (~=0.27.0)
28
+ Requires-Dist: transformers (~=4.26.1)
29
+ Requires-Dist: tokenizers (~=0.13.2)
33
30
  Requires-Dist: icetk (~=0.0.4)
34
31
  Requires-Dist: protobuf (~=3.20.2)
35
32
  Requires-Dist: google-api-python-client (~=2.64.0)
@@ -40,6 +37,8 @@ Requires-Dist: jsonlines (~=3.1.0)
40
37
  Requires-Dist: sympy (~=1.11.1)
41
38
  Requires-Dist: sentencepiece (~=0.1.97)
42
39
  Requires-Dist: numba (~=0.56.4)
40
+ Requires-Dist: cattrs (~=22.2.0)
41
+ Requires-Dist: xlrd (~=2.0.1)
43
42
  Requires-Dist: importlib-resources (~=5.10.0)
44
43
  Requires-Dist: nltk (~=3.7)
45
44
  Requires-Dist: scipy (~=1.9.1)
@@ -53,5 +52,9 @@ Requires-Dist: spacy (~=3.2.4)
53
52
  Requires-Dist: summ-eval (~=0.892)
54
53
  Requires-Dist: torch (~=1.12.1)
55
54
  Requires-Dist: torchvision (~=0.13.1)
55
+ Requires-Dist: colorcet (~=3.0.1)
56
+ Requires-Dist: matplotlib (~=3.6.0)
57
+ Requires-Dist: numpy (~=1.23.3)
58
+ Requires-Dist: seaborn (~=0.11.0)
56
59
 
57
60
  Benchmark for language models
@@ -1,15 +1,15 @@
1
1
  helm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- helm/benchmark/__init__.py,sha256=XY0Yjn_tSit3lA16scPTqnfRcft4TnllSEHxrdy9v3U,3909
2
+ helm/benchmark/__init__.py,sha256=haJrJawd2zOTaxV_nkk6-V05vnePuHwCi0DytuJ0898,4450
3
3
  helm/benchmark/data_preprocessor.py,sha256=aNdM-o2t4qkLIQHiQeWUFg03DjjJ8HTBIphYCK8pXVo,2173
4
4
  helm/benchmark/executor.py,sha256=Vkmc4wmar2MRIavfiUOa2mu8Pp-zXsguYOevbjog4-4,3299
5
- helm/benchmark/run.py,sha256=EAQsOXMVHJeEVv_iTRS-saU6zmydNAzEeFyCS7nM7u8,5794
6
- helm/benchmark/run_expander.py,sha256=IHzZs8Wsp9Bkw5vw5hSa7NLkYgFlxGr4aLAO2YJyVCc,28842
7
- helm/benchmark/run_specs.py,sha256=jQcAqhMk1kLi0loWNUaZ95BEiFXLRjvYYXOq0TDKaJA,62653
8
- helm/benchmark/runner.py,sha256=aNlOa7OLivyAuDvTJMn6-8CG0xPEQkEJ7VKWEYXnwXU,7593
5
+ helm/benchmark/run.py,sha256=AWa862BtEh5aOTjKZ9OkSv3be2ZrU4R1qiwJtRTQwfk,9402
6
+ helm/benchmark/run_expander.py,sha256=vnq-zRmuXLzgr3sS3XYaXJFarNC7-QKc0_DtPjwXq3Q,32952
7
+ helm/benchmark/run_specs.py,sha256=ssBJYMZVMF4XGk6lvCSlQJh6A-Pmh2_ndi_JAwgW0CQ,71441
8
+ helm/benchmark/runner.py,sha256=zYDe8UeB1LFmbpChmRdRqEIZo-X0xWMenOCp2NnZ9Ws,8802
9
9
  helm/benchmark/server.py,sha256=HsuVsch1SPjQ4YyZi60kjr3JZeL82h8jgkxTUlfb130,1620
10
10
  helm/benchmark/test_data_preprocessor.py,sha256=adT-pgVeWvmZXLUUehxH0C-lMhXhtdxsvYdr69o1BD4,2047
11
11
  helm/benchmark/adaptation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
12
- helm/benchmark/adaptation/adapter_spec.py,sha256=KgO6KGF3sRrM0rKTMulFp4GtwHbGNlOurE48d0Lv5hg,2679
12
+ helm/benchmark/adaptation/adapter_spec.py,sha256=YoxMyN4RJM8GG_DeZ-k0edyARZ69hHLkvOlvKCk-u2o,2811
13
13
  helm/benchmark/adaptation/prompt.py,sha256=MATerIUIhFp_BMGvK7bLpNtWH6Oi4kknjBjOkr2bHv4,1948
14
14
  helm/benchmark/adaptation/request_state.py,sha256=o3OpZbB0TJFiZ2Nmhvg3vWmByaUSYTffT_WnoNb7w68,2712
15
15
  helm/benchmark/adaptation/scenario_state.py,sha256=ZflBuNgvN0JqUhshFcy0kTweO1WJs6j5UCaTxWTMe0o,1747
@@ -18,7 +18,7 @@ helm/benchmark/adaptation/adapters/adapter.py,sha256=8wK28jISxW8rUfXP-_-FfQJRRzc
18
18
  helm/benchmark/adaptation/adapters/adapter_factory.py,sha256=N2n-xIoGt_DxlN0LT4GUgVvdoaqhyUU8rSWr_nyfb80,2318
19
19
  helm/benchmark/adaptation/adapters/binary_ranking_adapter.py,sha256=3j24nFQuZE0Zl6DMAB4aYUpjieerdSMLsJbpMT9Nzfw,5646
20
20
  helm/benchmark/adaptation/adapters/generation_adapter.py,sha256=-on4QAo8hhzJVgAnM6G8lFFqaoiSiVF-KxwwfHwE61A,1927
21
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py,sha256=qRmFP1wweGIcZX55M2bljaQ22Xof45_oyY7Bxg4c3yQ,12657
21
+ helm/benchmark/adaptation/adapters/in_context_learning_adapter.py,sha256=q5K4Hag8LOpfpkeEzwIMPLNpBqMThcB1LXLGr_n_Xfo,13118
22
22
  helm/benchmark/adaptation/adapters/language_modeling_adapter.py,sha256=vPo2EVgbMfzmwPPcljoXdDfBW3c80LLKaUhA-RefU2w,11967
23
23
  helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py,sha256=pV14yvmH_mRQpeXF0teAxGpJcouSQViipr-aMkNE-AM,1711
24
24
  helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py,sha256=4CALqc--PUaEl3cLzmjP9nFSuarCZMKBwrPQxde5TYM,3471
@@ -30,6 +30,7 @@ helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py,sha256=
30
30
  helm/benchmark/augmentations/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
31
31
  helm/benchmark/augmentations/contraction_expansion_perturbation.py,sha256=bqcXlazuoss4hOxOarQptNEh52SU4Gy58Zph1PZ34W4,4803
32
32
  helm/benchmark/augmentations/contrast_sets_perturbation.py,sha256=fpLtZDgQY8beQLBlCId3gSyvCoPHVq3J7PGzcNdR0kM,3454
33
+ helm/benchmark/augmentations/correct_to_misspelling.json,sha256=L44RiJXlJCa6zQzTLf0MFHCOhFyRDRKfLQNXH-n3XIs,213429
33
34
  helm/benchmark/augmentations/data_augmenter.py,sha256=57LA6h7z1tVMy_xGcW46F1KRi3D4wnv0fi8XeJjsi2c,3849
34
35
  helm/benchmark/augmentations/dialect_perturbation.py,sha256=zy3SJtYAxHf2fMB7w-u5gsEC6q8g-94sKnwNLVp0pFc,6227
35
36
  helm/benchmark/augmentations/extra_space_perturbation.py,sha256=9_pmthcyFfuYu6GsJB03hKkhvDZqqfH7hOeSNZRohSg,835
@@ -45,6 +46,7 @@ helm/benchmark/augmentations/space_perturbation.py,sha256=7OdpoibdizoPDBPpLc1ENy
45
46
  helm/benchmark/augmentations/synonym_perturbation.py,sha256=2qFx7xparhEPd82tvs59HkAr1hwQWv7asWtmNCbcQrQ,4209
46
47
  helm/benchmark/augmentations/test_perturbation.py,sha256=v_U5CmBpA5aXqg4EJUYZrSfGsNbZTwCP0inxz1XNGq0,9991
47
48
  helm/benchmark/augmentations/typos_perturbation.py,sha256=nfF1Zw2REKZEnnyPVFWD87MP8L5ANbaZXeI2n70Sonw,2790
49
+ helm/benchmark/contamination/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
48
50
  helm/benchmark/efficiency_data/inference_denoised_runtimes.json,sha256=ios_dt-_8wtXvkVAx0iI2zwCxqHvk3XKTx31qHPalsI,4203
49
51
  helm/benchmark/efficiency_data/inference_idealized_runtimes.json,sha256=5w7reeZc0yc4cjH8kJGxQQSoe8yaRVX2SSlSrx0QWFQ,12348
50
52
  helm/benchmark/efficiency_data/training_efficiency.json,sha256=aH2moiBLStOLVi8Ci2KTK5ZkWlTBLK-B3fRfNZwhoSg,9763
@@ -53,18 +55,21 @@ helm/benchmark/metrics/basic_metrics.py,sha256=xBCTgLVdiWVxdpB08MbWFc2nvwCJTgD1-
53
55
  helm/benchmark/metrics/bbq_metrics.py,sha256=H44mwKXLJ0PXo-sVKgRHCEKjZGnCahDD6GOQMWpnbOQ,6061
54
56
  helm/benchmark/metrics/bias_metrics.py,sha256=uVJFQvDSzvPR1ELu0FNYyExjRy2ThaJRCw8beEMDqJs,11309
55
57
  helm/benchmark/metrics/bias_word_lists.py,sha256=mx5JjW3mHffXIqo4GcQN-zENUEttBqQnEjPTz3J3J_4,13909
58
+ helm/benchmark/metrics/classification_metrics.py,sha256=1q7gPnWRrx4QwE8T0m269vFJWg_bKfVx21a5spDBbjU,3701
56
59
  helm/benchmark/metrics/code_metrics.py,sha256=uWdigk0QyEsfVHQzq9KxkOc-LROvcqWXeui42Mr0YF4,5119
57
60
  helm/benchmark/metrics/code_metrics_helper.py,sha256=byyuI1lJgbIDPVJzywaBsam9zFMPPyn28g1grsK9xyA,22336
58
61
  helm/benchmark/metrics/copyright_metrics.py,sha256=8sk85mLTasWIgHIXxOho0z_nQYyLqtzSWHSAwd5ayAQ,7560
59
62
  helm/benchmark/metrics/disinformation_metrics.py,sha256=YH8QJ5s8LvIRzp49_O20UvOm8_z7PKyleoOX3hdX0HE,10499
63
+ helm/benchmark/metrics/machine_translation_metrics.py,sha256=Ki7wBa9Odko_wVg1ec1MIoY2fHn0oY_vYrT3r7Ya6tc,1559
60
64
  helm/benchmark/metrics/metric.py,sha256=zF3IHmjGxRXrUoIOEIb2wRbsTPldzgclF8uPiGsZv4g,18789
61
65
  helm/benchmark/metrics/metric_name.py,sha256=hk5WQ6uj_9EjgKKFawPenL2-XOMf-aKvNRkOxlu4nCo,1355
62
66
  helm/benchmark/metrics/metric_service.py,sha256=aJ21wWdc1Spfi3mjrj8JEnsANL45P7wr7fk_EdDObko,709
63
67
  helm/benchmark/metrics/numeracy_metrics.py,sha256=panMWD3a1NPerg3Ix7l6NhR7jGOIQOQV9i_KysBeDA8,2818
64
68
  helm/benchmark/metrics/ranking_metrics.py,sha256=b3qxTRnr62zz1Gr1dsVDYtdwB8WBIb-v98yoRB3Wtvs,17231
65
69
  helm/benchmark/metrics/statistic.py,sha256=9VM5JA1-M_iYCNziWm2qeDZaAQqPQ_ySdaSMcqAeYdM,3048
66
- helm/benchmark/metrics/summarization_metrics.py,sha256=xFQOiiIRM8AnIS9NUt74vzu7dfvCsoc-0Mh9m4fkexc,16011
70
+ helm/benchmark/metrics/summarization_metrics.py,sha256=hHNWGYA1bNfgCg7o1RSiTo7E-SJujHhkKh9G204icoo,16083
67
71
  helm/benchmark/metrics/test_bias_metrics.py,sha256=brut1rdnKNtTVJoe6qkllmJwZTFBZkLcyI_4qmqZ_vA,6264
72
+ helm/benchmark/metrics/test_classification_metrics.py,sha256=usW5ciUYu2ZUUqVjFk4NfZTGNIoBArwia_-8uGOvFpw,5475
68
73
  helm/benchmark/metrics/test_metric.py,sha256=S7LGHNCHuhMk582eHylw1tOasUBEf_7F0T4u3tey7b4,757
69
74
  helm/benchmark/metrics/test_numeracy_metrics.py,sha256=ls1ZIHDePKpHMoqAbf4HmJ1SIBjLFuLIzGbfg6OiZvM,4162
70
75
  helm/benchmark/metrics/test_statistic.py,sha256=WQv9i8wSNTCzlw-L1wir0lmW0g3D4CM_ebpii7IB9Lw,406
@@ -85,13 +90,14 @@ helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py,sha256=gPqoYNI
85
90
  helm/benchmark/metrics/tokens/token_cost_estimator.py,sha256=fTGUfhHV6yMwpTkCEMTGMxKO8jskqJz4sAtwXT6M_C8,425
86
91
  helm/benchmark/presentation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
87
92
  helm/benchmark/presentation/contamination.py,sha256=5wLwq266sCxT62MdzAXT9V-au6b07HaL44DLj_2qiSk,2788
88
- helm/benchmark/presentation/present.py,sha256=dHlDGixc7dkHFAQ_yMGAd5ik9G-cQ-sq8MDzKdBKNT0,9083
89
- helm/benchmark/presentation/run_display.py,sha256=JumIiYfm0UElrXNz-iJmjdMWvxLUJ1opWwcgyfFiSwg,12207
93
+ helm/benchmark/presentation/create_plots.py,sha256=-YyrhEmfVOMnESJ8m2yk7RWAOYdZkVrLAt2K8XnpNF0,28442
94
+ helm/benchmark/presentation/run_display.py,sha256=HSvV71ZRshMIhHZHGtlbYfRxK9xx1GQgn6YmGPVncME,9892
90
95
  helm/benchmark/presentation/run_entry.py,sha256=J1QgLOP99N7N4bs7nzXWxyU3pOd-a1j8xwL9ag1nP_Y,1158
91
96
  helm/benchmark/presentation/schema.py,sha256=i1utCqiNkbTK9CcDcOFwi7e91KPaKkpFU07ZcBfWXTc,8753
92
- helm/benchmark/presentation/summarize.py,sha256=UxqU0nNdKv80h3ROwP7fWrxmaivGJ3yUBiNKB2sBWOw,45067
97
+ helm/benchmark/presentation/summarize.py,sha256=BUXog2m_UPbftyzFHx_U4mE2FrG56iv9mvcCdXoZVmI,45071
93
98
  helm/benchmark/presentation/table.py,sha256=VzVMwsgP3kItAM6FPRUaTphzJ-ZjriiuFbWlO1rJUMU,2879
94
99
  helm/benchmark/presentation/test_contamination.py,sha256=8mnzUzxUW9pXUOuLpU4BBBg0V7Mn1d1s4AQgwy6_kl4,459
100
+ helm/benchmark/presentation/test_create_plots.py,sha256=2q3v2Qdh_hBKCEX9toygXFLIryu1FlcLMt2PXprx7j8,1251
95
101
  helm/benchmark/presentation/test_run_entry.py,sha256=M5z4dnVb7fM3PWrZWIZNlG8CT4KnDxjnEE4FBb1ZFNU,621
96
102
  helm/benchmark/scenarios/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
97
103
  helm/benchmark/scenarios/babi_qa_scenario.py,sha256=SkIJjqBuMLYxZR8l9epu9arBeirvJPtQsIBXv4bzkx4,5030
@@ -104,7 +110,8 @@ helm/benchmark/scenarios/civil_comments_scenario.py,sha256=vXq6KxyS5C0-tD8xUmkG5
104
110
  helm/benchmark/scenarios/code_scenario.py,sha256=Q_TP_vWewkClvibPFHXpsOjR-CWexYgu5kl4OpfZXNc,11355
105
111
  helm/benchmark/scenarios/code_scenario_helper.py,sha256=EbQNfHqhQXaMMPmYT2mG2dRjzYaI2FvcPb9j6NlNHDU,5853
106
112
  helm/benchmark/scenarios/commonsense_scenario.py,sha256=9roSJS3iGSNgqxTbLI87xuZGB8IxJkbbtzr-ep0HUn0,10661
107
- helm/benchmark/scenarios/copyright_scenario.py,sha256=3dzDZ4B2a3ZmY6zMlQ98Ni-9836kPE4V1U6fecYrQHM,3646
113
+ helm/benchmark/scenarios/copyright_scenario.py,sha256=APYQPC-esq3oM2qQxW6JNxa4pkv_yHDKfePjpvvi6nQ,3660
114
+ helm/benchmark/scenarios/covid_dialog_scenario.py,sha256=FmYIuRr81xD_d0iyRa5blPC8OTqpfv8XGTz5XXUOd2E,3958
108
115
  helm/benchmark/scenarios/dialogue_scenarios.py,sha256=SPwo1iYiLbPwNtOgAVkTr-dO8FQLshmrfXdjPcayW5A,5616
109
116
  helm/benchmark/scenarios/disinformation_scenario.py,sha256=Ff66LxBm8APuMziLfGvTM0WIatrAty5_q_8ObaLW5lo,8491
110
117
  helm/benchmark/scenarios/dyck_language_scenario.py,sha256=Yua5S2gwLX2C-odJY3LeL-Yj47H1xXChdlI8cratVz4,9300
@@ -112,42 +119,53 @@ helm/benchmark/scenarios/entity_data_imputation_scenario.py,sha256=oChQzEptlf731
112
119
  helm/benchmark/scenarios/entity_matching_scenario.py,sha256=gtrLSCw2JSNnBgFQFsUm4EcICsjVWtp9wsOdqcyBU4k,6863
113
120
  helm/benchmark/scenarios/gsm_scenario.py,sha256=PmX0zutkGqnqGirWidUdk166cWv_23RtaTFcVQGBpzc,2619
114
121
  helm/benchmark/scenarios/ice_scenario.py,sha256=smrpTOwtMDL-m40zfKfNz9btOGoINZNv3_2oBcLBMmk,16156
122
+ helm/benchmark/scenarios/imdb_listdir.json,sha256=eczxp9gslYYwx5XR86ATnZorIxuujFMDTfzR4h5NCpo,1015402
115
123
  helm/benchmark/scenarios/imdb_scenario.py,sha256=VTD0Ur6ATyY9NWxcnkGzn9Iw5vl4d94o0FbFm61ZZTA,6057
116
124
  helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py,sha256=wYaivSqqYYZPjPHTKaS6D7j960dcLIVTfmuZ8awd1Zs,2192
117
125
  helm/benchmark/scenarios/legal_support_scenario.py,sha256=K9HfTyHJnnLFvGKNwYQayu7JV4uyNT5wH48wc9ixRa0,3912
126
+ helm/benchmark/scenarios/lex_glue_scenario.py,sha256=r89KevvM1Kifu3ZkUIXAV8jXktclNtL0-JL9T6qOx_Q,10224
127
+ helm/benchmark/scenarios/lextreme_scenario.py,sha256=tejeKE08YX5MYFXrJRY48rIzO5V5fdwHDWEzg883K2E,20300
118
128
  helm/benchmark/scenarios/lsat_qa_scenario.py,sha256=2-GrPB8whoBFcQ5608jrwlAcRJgpkT1P2UehcR6-EYY,5977
119
129
  helm/benchmark/scenarios/math_scenario.py,sha256=5PespNtseDOnPgAwtdP0vMkXz1CaJM0BkJsWdeG5gUM,13825
130
+ helm/benchmark/scenarios/me_q_sum_scenario.py,sha256=ZIn7tlBC4baV8CYcU_-mYe2RbYaQ-8dX1Ca_hOvZTfI,3988
131
+ helm/benchmark/scenarios/med_dialog_scenario.py,sha256=w_s0s5TY6VjnCdmJ5BcSbDAYKZtnb5c7KSP1wYd9z9A,7282
132
+ helm/benchmark/scenarios/med_mcqa_scenario.py,sha256=OgxZbyqzNnADTjYMlejV2I54G7tK3awGecTaxSgO9W8,5022
133
+ helm/benchmark/scenarios/med_paragraph_simplification_scenario.py,sha256=nw6EhUtZq6ABFXxNU0Uh2mG5yiCUDesKqLQdZo7Kr90,7578
134
+ helm/benchmark/scenarios/med_qa_scenario.py,sha256=rHSoIfb50Uq8T7l37m9Wn_fC16otifaP77qAEctf1oI,4404
120
135
  helm/benchmark/scenarios/mmlu_scenario.py,sha256=pzIRmLGikWTgB0AD2VFj64Q3GUsQg7nJzUqTSo-7pZo,3777
121
136
  helm/benchmark/scenarios/msmarco_scenario.py,sha256=_EaKsppb2Ax8f_ETc3cBy27i5w5ajTtIbF3xNWO8lUA,33669
122
137
  helm/benchmark/scenarios/narrativeqa_scenario.py,sha256=r5TwYkZH_YaCws6BSnjbiDQkZH_YM2BtxtDFVl4jj5s,5595
123
138
  helm/benchmark/scenarios/natural_qa_scenario.py,sha256=nuL5Qlh26xq7Q_lvzK2sKVN7eNYh7F5SjbIjahKaMNg,12527
124
139
  helm/benchmark/scenarios/newsqa_scenario.py,sha256=vMuIZyYxufH2AqhDoIZzzllfq8ScJIhSDH6lM3IUxGM,7242
125
140
  helm/benchmark/scenarios/numeracy_scenario.py,sha256=Iwtypyb0249zKYyV6p4YUX4bYke6uIyL6R3aopiGUb8,30552
141
+ helm/benchmark/scenarios/opinions_qa_scenario.py,sha256=dtIYKL9ZiccX_F3-5OrnHXJdNBBTxCoTHc2Kc-XX79E,7380
126
142
  helm/benchmark/scenarios/pubmed_qa_scenario.py,sha256=cpMmQWwCDXIYO0btGLhevMT-Mhs9-5Es9cDvQYkIlL0,7493
127
143
  helm/benchmark/scenarios/quac_scenario.py,sha256=46nqmeVgkWu6jDGCHl-KHu351bmJj7jx_1p5kPwcOjc,6615
128
144
  helm/benchmark/scenarios/raft_scenario.py,sha256=RzewlMVkHJ2XbZ9_9FzBvbdV__BuBRgtX2HyhCnmH1o,4500
129
145
  helm/benchmark/scenarios/real_toxicity_prompts_scenario.py,sha256=8DQD7KkilAfxwzHUwCPrPPHeYEz_dEOWSls8yZo15do,2387
130
- helm/benchmark/scenarios/scenario.py,sha256=aj2golyi1TAAAebfW7eouTWRwZ07KbK034HlLx-q-1g,7164
146
+ helm/benchmark/scenarios/scenario.py,sha256=bdRcv-YoLkxjlpNcq4MXiu8HQgjByHkkLWOdih4ahsM,7365
131
147
  helm/benchmark/scenarios/simple_scenarios.py,sha256=rcHzukhjBgvNRqkjcg0cms_zWtAsLPk0xiFN9I25_hI,1947
132
148
  helm/benchmark/scenarios/summarization_scenario.py,sha256=fKeRSkXrH26WyfeIhn43_fZxnAO4bIX1Xh5HoKcjOQM,6550
133
149
  helm/benchmark/scenarios/synthetic_efficiency_scenario.py,sha256=dtMMeULjw6pcobBRp0r6f9N4VCUKamx1Jy-6xPu85q0,3083
134
150
  helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py,sha256=YvEgO5qVZ4hLpnvjer4CG0Ct1upssZRjZWxnNi1ZUtY,16308
135
151
  helm/benchmark/scenarios/synthetic_reasoning_scenario.py,sha256=Pm63shscbGuigg4zWfcw3rOIPP8LgxBCnpcQKAA_CX0,8327
136
152
  helm/benchmark/scenarios/test_scenario.py,sha256=o8w8ElDPF-RzeCmecwyvie_nRMYj01b38BufXS-igqY,1612
137
- helm/benchmark/scenarios/the_pile_scenario.py,sha256=UXxRlaZdVRoWTjw8h5TUhXouq5JLaidPbL5-Itai0KE,4988
153
+ helm/benchmark/scenarios/the_pile_scenario.py,sha256=aCcjZp0wabu8lpPVNAdCr1x6m_3QvgKe7dIGS2qgGm4,4981
138
154
  helm/benchmark/scenarios/truthful_qa_scenario.py,sha256=zQgv_qUU5h5ODoQE06rpJa6O_8FwF695cIie3PG7bx8,5969
139
155
  helm/benchmark/scenarios/twitter_aae_scenario.py,sha256=_qWotvPagNj3ATnyaww3U1XZtxN-wgueUf34fFmKXQI,2083
140
156
  helm/benchmark/scenarios/wikifact_scenario.py,sha256=pucEuLYz5N9qIobnEbJnRUnK6PrkWFdkl7yPuCJj3SE,5778
141
157
  helm/benchmark/scenarios/wikitext_103_scenario.py,sha256=rXVbUzOZi4eWM-_HP1gzY5SBmMwOX1vk12WrLkR3NHo,3074
142
- helm/benchmark/static/benchmarking.css,sha256=EkLEyXEL5Qwq-312D01y9EaQV2IBa67fw8Bjc3PQPJs,1928
143
- helm/benchmark/static/benchmarking.js,sha256=NQUoE05neH_YN9BgyNVFwEXX09YDRZOcunK_6tCZomA,47399
158
+ helm/benchmark/scenarios/wmt_14_scenario.py,sha256=u24E_w0AOXpl3PzEFLmiBcl8qyJEy-1Yc-i4YHgU99M,4356
159
+ helm/benchmark/static/benchmarking.css,sha256=DGC4Huh4tVD2o9wEeUf3YOc3MYcq2fmJQXvhTjVDumE,2057
160
+ helm/benchmark/static/benchmarking.js,sha256=qiXAY_9fiWZ4ydzhBQAUhylzxNCoIN8ciLlEFMg33uE,49107
144
161
  helm/benchmark/static/contamination.yaml,sha256=LbISh56ORvfkkWptm7ZWmlPvWxtls6pBF1TbGiWD7hk,3096
145
162
  helm/benchmark/static/general.js,sha256=L3S4CBUED0k7RsjLHCeWjO29ZMFJckZgNTAYAARzaEg,3029
146
- helm/benchmark/static/index.html,sha256=LZMmoydG2LLqRfvGSpK3eRt1n92o19AELAzxCi-kok0,2994
163
+ helm/benchmark/static/index.html,sha256=yzf_VEGW15CPZOW9a2Z6opoQ0Gg2YQCtj182niUb0fk,3130
147
164
  helm/benchmark/static/info-icon.png,sha256=P-PW3Ek3NGiRAW5BXOjJRPBfMVqprjAqtQheGWu7zNI,3428
148
165
  helm/benchmark/static/json-urls-root.js,sha256=G3qenwLgBojh3ukzp_gyMUaZja83ZFqvT1WQ_Rg11BU,98
149
- helm/benchmark/static/json-urls.js,sha256=UzWTp-dowxZhJuWtoPyi5vpfwlRr561DGYWRHdkvZ1E,1634
150
- helm/benchmark/static/schema.yaml,sha256=c3ipoxCLWCIY2wxNhDJF2JC0mSrC_g0NORnXNWCbs7k,89602
166
+ helm/benchmark/static/json-urls.js,sha256=wvsG2Lrz2XArwwMOl_tGXL9y4mjqzjod4gcqlvVCiQA,1750
167
+ helm/benchmark/static/plot-captions.js,sha256=gTBn-IPPD4BkzryVYj3KkGqLhWqWvBbvufeJaDygQxk,3010
168
+ helm/benchmark/static/schema.yaml,sha256=TQdzlpOPvTwELiU0w0HBAZqKQBmjj1LOkEGM85oa8e4,95657
151
169
  helm/benchmark/static/utils.js,sha256=H2PKYjuXZ392DlALCPJ1XRwGxBDRFjL9eTFiTd4vBU8,7338
152
170
  helm/benchmark/static/images/crfm-logo.png,sha256=dDkauL_wJR_Luu7L7pltphS3a9HSLjDkpVLa6C9vcA4,62712
153
171
  helm/benchmark/static/images/helm-logo-simple.png,sha256=LtVAC4OgcWgMAob53rTrf7cRDu-O0z85ZOGGj9wR9hw,86133
@@ -172,23 +190,27 @@ helm/benchmark/window_services/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5
172
190
  helm/benchmark/window_services/ai21_window_service.py,sha256=Mo0Zzj2a9hDiUg6hTuJWb3ABhBUPlOYS-kHhaE-pHUE,12672
173
191
  helm/benchmark/window_services/anthropic_window_service.py,sha256=mA9aWKfc-dbSnpt37k6zDFdNNMfbevAoleCzSaar-uE,799
174
192
  helm/benchmark/window_services/bloom_window_service.py,sha256=o7MVedt6khdoj8zikLDuVraEzuoBZk7j4Fzjsas0sD4,1023
175
- helm/benchmark/window_services/cohere_window_service.py,sha256=MXBRAjuAQGq0iEpU9OLORH1FvHMak1-Nf5-D7k7UO9I,6182
193
+ helm/benchmark/window_services/cohere_window_service.py,sha256=3SJT97CaxNxtUNS9_qKvKMVCA6lvNavS-xo6jAsxPbM,7070
176
194
  helm/benchmark/window_services/encoder_decoder_window_service.py,sha256=Dg_rD0RtKIALtpvT4Wi4Am3zgFcIvgxfzdglw5fbbTU,2478
195
+ helm/benchmark/window_services/flan_t5_window_service.py,sha256=39IZX89_tay3bpSGVoWDoekmhW-RUNATQuT-bNyFRTs,915
177
196
  helm/benchmark/window_services/gpt2_window_service.py,sha256=d-ys9FHoVI1u1GZOH734JPAOc8W6QZb1N3ZjooKmwz8,990
178
197
  helm/benchmark/window_services/gptj_window_service.py,sha256=C2OEl-3ZatwxYVoaQgyvAZ5SBS2TK-2PnM02zUNjhiA,1103
179
198
  helm/benchmark/window_services/gptneox_window_service.py,sha256=FD0NDlBxDu1fZ8vlaTsOBh2IKYfrMV73qNGetkrp1P4,944
199
+ helm/benchmark/window_services/huggingface_window_service.py,sha256=tDbhjn81Aw3ZSYMKh4aQU3e8JX1IVYCu_2gPOLaIWD8,1440
180
200
  helm/benchmark/window_services/ice_window_service.py,sha256=5z52rP-xAF_jckIPoogyoNFW6FQXqaq3SHybCaBuRn0,2005
181
201
  helm/benchmark/window_services/local_window_service.py,sha256=wgNBB-p9Zk-uFLRXYNxrz16_DhWyQ8x9sltSLBhzUh4,4247
182
202
  helm/benchmark/window_services/luminous_window_service.py,sha256=0w-nyfXXDwnIDBzU3Y84LZPnQyJGiSbvf5Y_MQh-hCg,1791
183
203
  helm/benchmark/window_services/mt_nlg_window_service.py,sha256=7zvEEZhqfdefUUFiDaqbJqrBu3Pt7BBuT_Si-4swV5s,838
184
204
  helm/benchmark/window_services/openai_window_service.py,sha256=Oguy_ewlL2Uydq-B1QrqqHWKTenL4mtwtY24RNqhFCM,466
185
205
  helm/benchmark/window_services/opt_window_service.py,sha256=ilj1G_pslwYeRZ2fhRMXsg2WjQ4rCfTUwuSmjItH-t4,1050
206
+ helm/benchmark/window_services/santacoder_window_service.py,sha256=sfbuAAdhEkQkFON_bHxDI9Ek3jZiwCF6upa41bsjUO4,674
186
207
  helm/benchmark/window_services/t0pp_window_service.py,sha256=oa1vJRiyFPbkTb8eYnfjZNyKFJSvceipOq0U3Ys5e04,1196
187
208
  helm/benchmark/window_services/t511b_window_service.py,sha256=8CvkSfuG_Bg17gAEre_bmM4tUwoi9fAWpPxx6qCJwAE,1005
188
209
  helm/benchmark/window_services/test_ai21_window_service.py,sha256=U_n2mQ5GqD8oYv7e9vfNkACDic2-zDVuUIHN4meSC-I,8177
189
210
  helm/benchmark/window_services/test_bloom_window_service.py,sha256=xeYpsvdR8Ug31BAb4a-PU4Sc0oihpCIMJ5OzTfgUhM0,4221
190
211
  helm/benchmark/window_services/test_cohere_window_service.py,sha256=6WpIiuEyfgTZS3BIeAGOR8AAr2djpVIfGbItQRI19Ck,3205
191
212
  helm/benchmark/window_services/test_cohere_window_service_utils.py,sha256=sf25f9MeXzoqsbDzZ7d7le13hm8RkDe54nhLtKF2pqo,158150
213
+ helm/benchmark/window_services/test_flan_t5_window_service.py,sha256=xv_EXbiRklveJPQtThYCSYF7qBYwjL7K4wH3Xu5z2Fg,591
192
214
  helm/benchmark/window_services/test_gpt2_window_service.py,sha256=3k25pLa_z__g4yoQL40DEXj-T4dGtrgif13N2NXs59U,2568
193
215
  helm/benchmark/window_services/test_gptj_window_service.py,sha256=sxsTpozKv9N-wZXtGl1prkQr9Md_q-tnCjO9zt226Co,2267
194
216
  helm/benchmark/window_services/test_gptneox_window_service.py,sha256=MCYHZIoulJf_WCx6de7rWB3nqku6wCyAokA-SWIPEks,4140
@@ -203,57 +225,64 @@ helm/benchmark/window_services/test_utils.py,sha256=1k2TlPdDIRjum669jpH3O7UOqm4G
203
225
  helm/benchmark/window_services/test_yalm_window_service.py,sha256=NdunSxq-qDzfzYMBYZ-0my6LaU2qUxtm7Ii0c4fyKnY,4273
204
226
  helm/benchmark/window_services/tokenizer_service.py,sha256=RNznJBAxcCUMCurb7mbraZULx_ZtB0G7IxbrnUe0Urk,865
205
227
  helm/benchmark/window_services/ul2_window_service.py,sha256=R_VEzOb59zQE9mmbTLunQeIvLAtK3-97h-B2_oc0Uxs,1021
228
+ helm/benchmark/window_services/wider_ai21_window_service.py,sha256=VZ6EERN48FSYsmJ_aiwI30SEbobLt27c1QqL29Zg_8M,414
206
229
  helm/benchmark/window_services/wider_openai_window_service.py,sha256=cpm4mDEGTY-cmrECcDCL0flONBdh4g40uWNLI-v46BU,539
207
230
  helm/benchmark/window_services/window_service.py,sha256=aV4YnbXl7T23runB8xmSWRwV7YtliUlXrveEejOMJ1Y,2885
208
- helm/benchmark/window_services/window_service_factory.py,sha256=03UL8pE98Bh3lDSXe93m-R4j4riUJm1o7RkZ-UZfAgg,4403
231
+ helm/benchmark/window_services/window_service_factory.py,sha256=UDrXRUXGwBMEiO7Cw_nNbr0XM6mzzXrXiyJsvCmW9wg,5845
209
232
  helm/benchmark/window_services/yalm_window_service.py,sha256=g7NeXvlRq3FXf2HwRStBMTkbWDxTdEITkrVRe-fv3mg,1805
210
233
  helm/common/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
211
234
  helm/common/authentication.py,sha256=RlMx29_TSrfU7ujE7dJkxmFub5EqLj2NswV5lAVFFDk,179
212
235
  helm/common/cache.py,sha256=ustgsRHX0W3zoLPN05W3mFl9m9JYp9Ppq5cjMbdmm6Y,13116
213
- helm/common/general.py,sha256=bIhgtiEIz1zA7cQVH-U_6FuW_bmDzYaff4QZ779tR3U,10087
236
+ helm/common/codec.py,sha256=zm8MP9Aqfh64D2HMZiCPEMoPkkiJxEzvzmuupGvkRh0,5499
237
+ helm/common/general.py,sha256=7vFw10h_hTrxUCfxE0LH13hp8Lunp77lbVmucgoeq2Y,10181
214
238
  helm/common/hierarchical_logger.py,sha256=EnKLnfbQftca08EJfjGEQb4tcnCKbx-JtwLnoCnhMQs,2908
215
239
  helm/common/object_spec.py,sha256=COMd4RpYgfulW940a5M_npbsfRBvLkmhjfwDIq4Gpqs,1833
216
240
  helm/common/perspective_api_request.py,sha256=WAVwtajNVmi5XJNsPcorGEAVrqkpPSk-Kd3b0hJghbA,2427
217
241
  helm/common/request.py,sha256=QBpkZBpU1nCTqtEM8Ekki3bZgxpASuVRC3sQ_4RYaRE,5793
218
242
  helm/common/test_cache.py,sha256=XqboYHQAkFWIHPsuIjuageRSLeN7QoATKF7wwxggPqE,7054
243
+ helm/common/test_codec.py,sha256=igL--k-2DwAy0eoMr8D9Xs8MOjBoT0LutbMPzDlTNkM,5885
219
244
  helm/common/test_general.py,sha256=zOxSwWNgWnWHsXKcG4NZ50GkWicn4uZ4jPVypSwFaQE,1672
220
245
  helm/common/tokenization_request.py,sha256=aDyf4A6QlTgISXy4IyXJVQytrOLwYVX9-TCa2CK2h1M,3226
221
246
  helm/proxy/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
222
247
  helm/proxy/accounts.py,sha256=xq-zVggvueB4D5QK58mFWGPxZe-hnIUAT4D341yd0ac,13503
223
248
  helm/proxy/cli.py,sha256=2SOxIF55PDjzXXcDOYRT8m-oyQM_2VyZheKRC3fXDVw,8094
224
249
  helm/proxy/example_queries.py,sha256=p1wH-tp1pRUslkAwaJYrrG5aDfmFWK3KYn4M6WQfPqQ,4120
225
- helm/proxy/models.py,sha256=N-5PYF0CQij7UrWl5v4hJ2TAlTsMjF2wghK9bZtaGEc,24838
250
+ helm/proxy/models.py,sha256=sfdp7DasJXbun-VMZz5p7iD6uhoUW1mFsy64BWfDoV4,29681
226
251
  helm/proxy/query.py,sha256=eftbiUICMh8QIHVs-7cLtv_rDXKeKdRPmwjLMu0TDxQ,645
227
252
  helm/proxy/retry.py,sha256=GLLDW1iGCwHfgTle8YK7ZB3vV-7XqsHcqeruKoVdsxE,1953
228
253
  helm/proxy/server.py,sha256=uBispGXfn39s_Pskd9Xjud0rijTjqXtSKU_2YvE6zGE,7356
229
- helm/proxy/test_models.py,sha256=-vb1s5WFhv10qvCLxwVhpoSY-yXSVF8CUQ2GwK8QXJU,782
254
+ helm/proxy/test_models.py,sha256=hWeDcBw1GkPvyJUd-ABxRVe1FhSUfz8bzyrKYdsqmyY,726
230
255
  helm/proxy/test_retry.py,sha256=8h398auzjW9VnlTJWllxR-bdpub-XFp8EN8LWDEnEHM,1049
231
256
  helm/proxy/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
232
257
  helm/proxy/clients/ai21_client.py,sha256=c4D64-1mLaFcGsqOb5CYkaXTmAWEJYk9cIjtr6DbgZQ,7770
233
- helm/proxy/clients/aleph_alpha_client.py,sha256=kh7Wrf9D5aZo5HYbvW1KFdRvdOn1EO64V44tNXlUKlk,6530
258
+ helm/proxy/clients/aleph_alpha_client.py,sha256=VV7dbgh7sYqoSWfNkxxAiQ7i3yPW33rBPnBig9EXL10,7707
234
259
  helm/proxy/clients/anthropic_client.py,sha256=vrfc8lS9bxQbUxMxbElV2z5cMDq-JD6yFDTS2cJdFO0,15526
235
- helm/proxy/clients/auto_client.py,sha256=6m9datYzA3izIAlv1M14RTfzwjOAafILp9gsoml9ud4,10635
260
+ helm/proxy/clients/auto_client.py,sha256=oktJQ4rCntp7id4vj-d3x_EA3RWGNZyXUMFwzRse--c,11063
236
261
  helm/proxy/clients/chat_gpt_client.py,sha256=nG3opHbnzX50r9Ialh3RaRErOZo6k4Q7gVzRHeGQgj8,5312
237
262
  helm/proxy/clients/client.py,sha256=bh6FvYFjw6MoHp5n7-KN1asXrIrOC-jfYsg3aW4xMgo,4570
238
263
  helm/proxy/clients/cohere_client.py,sha256=KF21m7qUjuhrpEEQv68FNeX0rsWSmxZgw52Oa7CZ5pI,11362
264
+ helm/proxy/clients/google_client.py,sha256=sGGxDWD22c9a9KMzLAFL3vAEDHxp5jSY2W3RDpVDIak,3334
239
265
  helm/proxy/clients/goose_ai_client.py,sha256=2tqJK_AhD2-ScXtOTdt9S9khzVjal5pm38BJWiFhwq8,4217
240
- helm/proxy/clients/huggingface_client.py,sha256=W3R4IfA75tZuSEI3oH_b3fSQjKIAXAmH98ebYonZVOg,10805
241
- helm/proxy/clients/huggingface_tokenizer.py,sha256=4tsAW9oLW3NczBWWLAZuvE1gL-5HAj0p0Vi6c0UIB9M,3679
266
+ helm/proxy/clients/huggingface_client.py,sha256=LypY3YfyoaGFH83UkYwDARGuoN2JOUk2S-nEaJ6GemI,11813
267
+ helm/proxy/clients/huggingface_model_registry.py,sha256=0WHyWPxxBI4KtTs2Yt6-Cw16FC4XBEe6yqUc0-YSn1Y,3891
268
+ helm/proxy/clients/huggingface_tokenizer.py,sha256=ujtsBupMMrE9efds2205c8NiPTcxHX8XM0UoV9spLK0,4591
242
269
  helm/proxy/clients/ice_tokenizer_client.py,sha256=Ui8YhAXoY1Q0vC3icoeFs6X9xAcESF6Tl2EGERGWVGU,2325
243
270
  helm/proxy/clients/microsoft_client.py,sha256=-VC8IrgrpSp1_FvRSI_8MSxhNp5I6dMc4qWSHc4Oulg,8237
244
- helm/proxy/clients/openai_client.py,sha256=rPV74pbiAlkLS4YO6J-dnzpVmsShWKYISD2nJTk6Sds,6025
271
+ helm/proxy/clients/openai_client.py,sha256=frW9fOjYWkRXdfzE88ppxaLhVl9pCnXfheUqeANW6QQ,9415
245
272
  helm/proxy/clients/perspective_api_client.py,sha256=-L8IwokuktWPoOu7nXwsfoab_U1QRGCt8xT1SrcGfYE,5491
246
273
  helm/proxy/clients/simple_client.py,sha256=GXHTCRB58XAxnUVqgpynidc7h6kaDBOP7TedVHrOpD4,2915
247
274
  helm/proxy/clients/test_client.py,sha256=bvkFob_Yoy8bALrVeQ0h757g9RU687JYI0g3AISPFQ8,1268
248
275
  helm/proxy/clients/test_huggingface_client.py,sha256=n-6D-RXqwQyxPxCLCSqHxqfK5JA-PdP5ffP17XwTe2I,3520
249
- helm/proxy/clients/test_huggingface_tokenizer.py,sha256=OtG8kVSSsXlYkTBiSF9eHODHmUad99GUu2oYR2zrRME,2072
276
+ helm/proxy/clients/test_huggingface_model_registry.py,sha256=zMboFlwMtDEV7hkd9SZFuItye-vkzz3CE5mWQrw--W4,2554
277
+ helm/proxy/clients/test_huggingface_tokenizer.py,sha256=KmlAXezQ6R7DAEpV85_JRdTRrOJoJxfmtylqybWn5VA,2189
250
278
  helm/proxy/clients/test_ice_tokenizer_client.py,sha256=Ugmn5a7QdAPEAbLtreLS5-sji8yrzxy5mhFPAl3rOuI,2404
251
279
  helm/proxy/clients/test_yalm_tokenizer_client.py,sha256=tnrYl7T1DcZ9FN09nBWV1gesjWQ3osiUpsGHSm_IypQ,2336
252
- helm/proxy/clients/together_client.py,sha256=k7TmK940KHdtvb2pxAlX4kHUHrfiZL2HAzM3dgwJowg,5731
280
+ helm/proxy/clients/together_client.py,sha256=epyiYElD0BfAZgUSu4zZKC5Oe8yIVVyJn2RtTwjPMzM,6334
253
281
  helm/proxy/clients/yalm_tokenizer_client.py,sha256=cpBoc8eHQoBGQguZsDaVnGWLdZnPgkHjqLSO0B94O0U,2420
254
282
  helm/proxy/clients/yalm_tokenizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
255
283
  helm/proxy/clients/yalm_tokenizer/test_yalm_tokenizer.py,sha256=W9p5QNn1GSm-y85yVEQe_82zn5CVK_vR6jvhk7JTs_k,869
256
- helm/proxy/clients/yalm_tokenizer/yalm_tokenizer.py,sha256=z1JEJynC51T70JSuig6TrNwLEUXiS2SVTMvWECya7ww,5743
284
+ helm/proxy/clients/yalm_tokenizer/voc_100b.sp,sha256=LmPD0_OIOXi8dWuNjXUYOSPhf8kPp2xhvK-g3bXcwrQ,2815034
285
+ helm/proxy/clients/yalm_tokenizer/yalm_tokenizer.py,sha256=7Y4_nCZptFWzifCJ5aPmM3_OOxhtomIAQVjpJGV1D8g,5954
257
286
  helm/proxy/services/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
258
287
  helm/proxy/services/remote_service.py,sha256=xKS-0P-EqKTPn7odTXDoQPjn9FliQRLaMFUnCEsUmCU,6965
259
288
  helm/proxy/services/server_service.py,sha256=gps7PwXqCi8b0yGYC0nQwFdKbfxCriSHt5CD1N1kkJs,5696
@@ -270,9 +299,9 @@ helm/proxy/token_counters/openai_token_counter.py,sha256=gPo_VrkEH07xmprzdfIhmJ_
270
299
  helm/proxy/token_counters/test_ai21_token_counter.py,sha256=42J1fCi20kQUwAD18bIa6h9TaP7KZnlgF-mLbvKURro,5508
271
300
  helm/proxy/token_counters/test_openai_token_counter.py,sha256=EovaVCZSr9moITZ9-AKiv_YM-D3OUsUDs4iQhEvpazQ,4823
272
301
  helm/proxy/token_counters/token_counter.py,sha256=x8KyTR82EedgCQUuneQiVq9AiU1B3_CHPmKPNumClHc,429
273
- crfm_helm-0.2.0.dist-info/LICENSE,sha256=bJiay7Nn5SHQ2n_4ZIT3AE0W1RGq4O7pxOApgBsaT64,11349
274
- crfm_helm-0.2.0.dist-info/METADATA,sha256=rkgCIgDkX6yAGYGEzrFrlbIO_2MEeIGXlXneQad-mx0,1949
275
- crfm_helm-0.2.0.dist-info/WHEEL,sha256=2wepM1nk4DS4eFpYrW1TTqPcoGNfHhhO_i5m4cOimbo,92
276
- crfm_helm-0.2.0.dist-info/entry_points.txt,sha256=o2pZIIQCZp4hBs4ZzZkKK0qvThIXXK57YV584ANCK7E,251
277
- crfm_helm-0.2.0.dist-info/top_level.txt,sha256=s9yl-XmuTId6n_W_xRjCS99MHTwPXOlkKxmTr8xZUNY,5
278
- crfm_helm-0.2.0.dist-info/RECORD,,
302
+ crfm_helm-0.2.2.dist-info/LICENSE,sha256=bJiay7Nn5SHQ2n_4ZIT3AE0W1RGq4O7pxOApgBsaT64,11349
303
+ crfm_helm-0.2.2.dist-info/METADATA,sha256=_OlkKmj1P7vaZvlpvOnNnzzm3w1IEW6de75SK7TmuPw,2066
304
+ crfm_helm-0.2.2.dist-info/WHEEL,sha256=pkctZYzUS4AYVn6dJ-7367OJZivF2e8RA9b_ZBjif18,92
305
+ crfm_helm-0.2.2.dist-info/entry_points.txt,sha256=AvH9soAH3uey9xffisWewd0yrmPWGASC036jHd1SFyg,300
306
+ crfm_helm-0.2.2.dist-info/top_level.txt,sha256=s9yl-XmuTId6n_W_xRjCS99MHTwPXOlkKxmTr8xZUNY,5
307
+ crfm_helm-0.2.2.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: bdist_wheel (0.38.4)
2
+ Generator: bdist_wheel (0.40.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -1,6 +1,7 @@
1
1
  [console_scripts]
2
2
  crfm-proxy-cli = helm.proxy.cli:main
3
3
  crfm-proxy-server = helm.proxy.server:main
4
- helm-run = helm.benchmark.presentation.present:main
4
+ helm-create-plots = helm.benchmark.presentation.create_plots:main
5
+ helm-run = helm.benchmark.run:main
5
6
  helm-server = helm.benchmark.server:main
6
7
  helm-summarize = helm.benchmark.presentation.summarize:main
@@ -42,12 +42,24 @@ from .scenarios import legal_support_scenario # noqa
42
42
  from .scenarios import entity_matching_scenario # noqa
43
43
  from .scenarios import entity_data_imputation_scenario # noqa
44
44
  from .scenarios import big_bench_scenario # noqa
45
+ from .scenarios import opinions_qa_scenario # noqa
46
+
47
+
48
+ # Biomedical
49
+ from .scenarios import covid_dialog_scenario # noqa
50
+ from .scenarios import me_q_sum_scenario # noqa
51
+ from .scenarios import med_dialog_scenario # noqa
52
+ from .scenarios import med_mcqa_scenario # noqa
53
+ from .scenarios import med_paragraph_simplification_scenario # noqa
54
+ from .scenarios import med_qa_scenario # noqa
45
55
  from .scenarios import pubmed_qa_scenario # noqa
56
+ from .scenarios import wmt_14_scenario # noqa
46
57
 
47
58
  # Metrics
48
59
  from .metrics import basic_metrics # noqa
49
60
  from .metrics import bbq_metrics # noqa
50
61
  from .metrics import bias_metrics # noqa
62
+ from .metrics import classification_metrics # noqa
51
63
  from .metrics import code_metrics # noqa
52
64
  from .metrics import copyright_metrics # noqa
53
65
  from .metrics import disinformation_metrics # noqa
@@ -56,6 +68,7 @@ from .metrics import ranking_metrics # noqa
56
68
  from .metrics import summarization_metrics # noqa
57
69
  from .metrics import toxicity_metrics # noqa
58
70
  from .metrics import tokens_metric # noqa
71
+ from .metrics import machine_translation_metrics # noqa
59
72
 
60
73
  # Perturbations for data augmentation
61
74
  from .augmentations.extra_space_perturbation import ExtraSpacePerturbation # noqa
@@ -68,6 +68,9 @@ class AdapterSpec:
68
68
  # set of training instances. Used to compute error bars.
69
69
  num_train_trials: int = 1
70
70
 
71
+ # If true, randomly sample N training examples; if false, select N consecutive training examples
72
+ sample_train: bool = True
73
+
71
74
  # Decoding parameters (inherited by `Request`)
72
75
 
73
76
  # Model to make the request to (need to fill in)
@@ -23,7 +23,7 @@ class InContextLearningAdapter(Adapter, ABC):
23
23
  @htrack(None)
24
24
  def adapt(self, instances: List[Instance], parallelism: int) -> ScenarioState:
25
25
  """
26
- Takes a a list of `Instance`s and builds a list of corresponding `RequestState`s.
26
+ Takes a list of `Instance`s and builds a list of corresponding `RequestState`s.
27
27
  The reason we don't do this per eval instance is that we create a common set of
28
28
  training instances which is shared across all eval instances.
29
29
  """
@@ -65,7 +65,9 @@ class InContextLearningAdapter(Adapter, ABC):
65
65
  parallelism: int,
66
66
  ) -> List[RequestState]:
67
67
  self.train_trial_index: int = train_trial_index
68
- self.train_instances: List[Instance] = self.sample_examples(all_train_instances, seed=train_trial_index)
68
+ self.train_instances: List[Instance] = self.sample_examples(
69
+ all_train_instances, seed=train_trial_index, sample_train=self.adapter_spec.sample_train
70
+ )
69
71
  hlog(f"Sampled {len(self.train_instances)} examples for trial #{self.train_trial_index}.")
70
72
 
71
73
  # Generate request_states
@@ -93,7 +95,9 @@ class InContextLearningAdapter(Adapter, ABC):
93
95
 
94
96
  return [request_state for result in results for request_state in result]
95
97
 
96
- def sample_examples(self, all_train_instances: List[Instance], seed: int) -> List[Instance]:
98
+ def sample_examples(
99
+ self, all_train_instances: List[Instance], seed: int, sample_train: bool = True
100
+ ) -> List[Instance]:
97
101
  """
98
102
  Sample a random set of train instances to use as examples by following the steps below:
99
103
  1. Sort the class labels (i.e., correct References) by the number of Instances that belong to the
@@ -121,9 +125,14 @@ class InContextLearningAdapter(Adapter, ABC):
121
125
  random.seed(seed)
122
126
  num_instances_to_sample: int = min(len(all_train_instances), self.adapter_spec.max_train_instances)
123
127
 
128
+ examples: List[Instance] = []
129
+ if not sample_train:
130
+ # Select sequentially from the train set
131
+ examples = all_train_instances[num_instances_to_sample * seed : num_instances_to_sample * (seed + 1)]
132
+ return examples
133
+
124
134
  unlabeled_instances: List[Instance] = []
125
135
  label_to_instances: Dict[str, List[Instance]] = defaultdict(list)
126
-
127
136
  for instance in all_train_instances:
128
137
  if instance.first_correct_reference:
129
138
  label_to_instances[instance.first_correct_reference.output.text].append(instance)
@@ -145,7 +154,6 @@ class InContextLearningAdapter(Adapter, ABC):
145
154
  sorted_labels.extend(labels)
146
155
 
147
156
  labels_iterable = cycle(sorted_labels)
148
- examples: List[Instance] = []
149
157
  while num_instances_to_sample > 0:
150
158
  next_label: Optional[str] = next(labels_iterable, None)
151
159
  if not next_label:
@@ -218,10 +226,15 @@ class InContextLearningAdapter(Adapter, ABC):
218
226
 
219
227
  # References (optionally) and output
220
228
  output: str
229
+
230
+ delimiter = ","
221
231
  if reference_index is None:
222
232
  # Put only the correct reference as the output
223
- correct_reference: Optional[Reference] = instance.first_correct_reference
224
- output = correct_reference.output.text if correct_reference is not None else "n/a"
233
+ correct_references: List[Reference] = instance.all_correct_references
234
+ if not correct_references:
235
+ output = "n/a"
236
+ else:
237
+ output = delimiter.join([correct_reference.output.text for correct_reference in correct_references])
225
238
  else:
226
239
  reference = instance.references[reference_index]
227
240
  output = reference.output.text