crfm-helm 0.2.1__py3-none-any.whl → 0.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. {crfm_helm-0.2.1.dist-info → crfm_helm-0.2.2.dist-info}/METADATA +10 -8
  2. {crfm_helm-0.2.1.dist-info → crfm_helm-0.2.2.dist-info}/RECORD +50 -37
  3. {crfm_helm-0.2.1.dist-info → crfm_helm-0.2.2.dist-info}/WHEEL +1 -1
  4. {crfm_helm-0.2.1.dist-info → crfm_helm-0.2.2.dist-info}/entry_points.txt +1 -0
  5. helm/benchmark/__init__.py +2 -0
  6. helm/benchmark/adaptation/adapter_spec.py +3 -0
  7. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +20 -7
  8. helm/benchmark/contamination/__init__.py +0 -0
  9. helm/benchmark/metrics/classification_metrics.py +28 -23
  10. helm/benchmark/metrics/test_classification_metrics.py +44 -9
  11. helm/benchmark/presentation/create_plots.py +617 -0
  12. helm/benchmark/presentation/summarize.py +4 -2
  13. helm/benchmark/presentation/test_create_plots.py +32 -0
  14. helm/benchmark/run.py +23 -1
  15. helm/benchmark/run_expander.py +161 -47
  16. helm/benchmark/run_specs.py +84 -10
  17. helm/benchmark/runner.py +31 -3
  18. helm/benchmark/scenarios/copyright_scenario.py +1 -1
  19. helm/benchmark/scenarios/imdb_listdir.json +50014 -0
  20. helm/benchmark/scenarios/lex_glue_scenario.py +58 -17
  21. helm/benchmark/scenarios/lextreme_scenario.py +37 -25
  22. helm/benchmark/scenarios/opinions_qa_scenario.py +194 -0
  23. helm/benchmark/scenarios/scenario.py +5 -0
  24. helm/benchmark/scenarios/the_pile_scenario.py +1 -1
  25. helm/benchmark/static/benchmarking.css +14 -0
  26. helm/benchmark/static/benchmarking.js +43 -0
  27. helm/benchmark/static/index.html +2 -0
  28. helm/benchmark/static/json-urls.js +4 -0
  29. helm/benchmark/static/plot-captions.js +16 -0
  30. helm/benchmark/static/schema.yaml +66 -8
  31. helm/benchmark/window_services/cohere_window_service.py +20 -0
  32. helm/benchmark/window_services/flan_t5_window_service.py +29 -0
  33. helm/benchmark/window_services/huggingface_window_service.py +39 -0
  34. helm/benchmark/window_services/test_flan_t5_window_service.py +12 -0
  35. helm/benchmark/window_services/wider_ai21_window_service.py +13 -0
  36. helm/benchmark/window_services/window_service_factory.py +27 -6
  37. helm/common/general.py +12 -5
  38. helm/proxy/clients/aleph_alpha_client.py +47 -28
  39. helm/proxy/clients/auto_client.py +28 -24
  40. helm/proxy/clients/huggingface_client.py +30 -17
  41. helm/proxy/clients/huggingface_model_registry.py +111 -0
  42. helm/proxy/clients/huggingface_tokenizer.py +23 -7
  43. helm/proxy/clients/openai_client.py +60 -2
  44. helm/proxy/clients/test_huggingface_model_registry.py +57 -0
  45. helm/proxy/clients/together_client.py +17 -2
  46. helm/proxy/clients/yalm_tokenizer/voc_100b.sp +0 -0
  47. helm/proxy/clients/yalm_tokenizer/yalm_tokenizer.py +8 -2
  48. helm/proxy/models.py +82 -2
  49. {crfm_helm-0.2.1.dist-info → crfm_helm-0.2.2.dist-info}/LICENSE +0 -0
  50. {crfm_helm-0.2.1.dist-info → crfm_helm-0.2.2.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: crfm-helm
3
- Version: 0.2.1
3
+ Version: 0.2.2
4
4
  Summary: Benchmark for language models
5
5
  Home-page: https://github.com/stanford-crfm/helm
6
6
  Author: Stanford CRFM
@@ -12,15 +12,11 @@ Classifier: Programming Language :: Python :: 3.8
12
12
  Classifier: License :: OSI Approved :: Apache Software License
13
13
  Requires-Python: ~=3.8
14
14
  License-File: LICENSE
15
- Requires-Dist: pytest (~=7.2.0)
16
- Requires-Dist: black (~=22.10.0)
17
- Requires-Dist: mypy (~=0.982)
18
- Requires-Dist: pre-commit (~=2.20.0)
19
- Requires-Dist: flake8 (~=5.0.4)
20
15
  Requires-Dist: zstandard (~=0.18.0)
21
16
  Requires-Dist: tqdm (~=4.64.1)
22
17
  Requires-Dist: pyhocon (~=0.3.59)
23
18
  Requires-Dist: dacite (~=1.6.0)
19
+ Requires-Dist: aleph-alpha-client (~=2.14.0)
24
20
  Requires-Dist: bottle (~=0.12.23)
25
21
  Requires-Dist: gunicorn (~=20.1.0)
26
22
  Requires-Dist: Mako (~=1.2.3)
@@ -28,8 +24,9 @@ Requires-Dist: sqlitedict (~=1.7.0)
28
24
  Requires-Dist: pymongo (~=4.2.0)
29
25
  Requires-Dist: retrying (~=1.3.3)
30
26
  Requires-Dist: websocket-client (~=1.3.2)
31
- Requires-Dist: openai (~=0.25.0)
32
- Requires-Dist: transformers (~=4.22.2)
27
+ Requires-Dist: openai (~=0.27.0)
28
+ Requires-Dist: transformers (~=4.26.1)
29
+ Requires-Dist: tokenizers (~=0.13.2)
33
30
  Requires-Dist: icetk (~=0.0.4)
34
31
  Requires-Dist: protobuf (~=3.20.2)
35
32
  Requires-Dist: google-api-python-client (~=2.64.0)
@@ -41,6 +38,7 @@ Requires-Dist: sympy (~=1.11.1)
41
38
  Requires-Dist: sentencepiece (~=0.1.97)
42
39
  Requires-Dist: numba (~=0.56.4)
43
40
  Requires-Dist: cattrs (~=22.2.0)
41
+ Requires-Dist: xlrd (~=2.0.1)
44
42
  Requires-Dist: importlib-resources (~=5.10.0)
45
43
  Requires-Dist: nltk (~=3.7)
46
44
  Requires-Dist: scipy (~=1.9.1)
@@ -54,5 +52,9 @@ Requires-Dist: spacy (~=3.2.4)
54
52
  Requires-Dist: summ-eval (~=0.892)
55
53
  Requires-Dist: torch (~=1.12.1)
56
54
  Requires-Dist: torchvision (~=0.13.1)
55
+ Requires-Dist: colorcet (~=3.0.1)
56
+ Requires-Dist: matplotlib (~=3.6.0)
57
+ Requires-Dist: numpy (~=1.23.3)
58
+ Requires-Dist: seaborn (~=0.11.0)
57
59
 
58
60
  Benchmark for language models
@@ -1,15 +1,15 @@
1
1
  helm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- helm/benchmark/__init__.py,sha256=HrVUkpDalZyBFNqyRao1CQ3Z05bLWmlT9K7-zMzr9-Y,4397
2
+ helm/benchmark/__init__.py,sha256=haJrJawd2zOTaxV_nkk6-V05vnePuHwCi0DytuJ0898,4450
3
3
  helm/benchmark/data_preprocessor.py,sha256=aNdM-o2t4qkLIQHiQeWUFg03DjjJ8HTBIphYCK8pXVo,2173
4
4
  helm/benchmark/executor.py,sha256=Vkmc4wmar2MRIavfiUOa2mu8Pp-zXsguYOevbjog4-4,3299
5
- helm/benchmark/run.py,sha256=_6jBWeTiyUz9A7XmtElM_6KaYs7Z7M-kAKcY6bgzwN4,8537
6
- helm/benchmark/run_expander.py,sha256=W91zJZ98h2gI_f9IX8cvtA3BxoQxf26W2Wr7FM2xnLM,29077
7
- helm/benchmark/run_specs.py,sha256=7KDzTtZXVip4avTHV8IKgcHxXt_85EhS8PBtjOBuZMg,69191
8
- helm/benchmark/runner.py,sha256=wHC3A5XdhQ77ctbtNUNfGLzqeJLkLJJvMdZj69gH6n4,7559
5
+ helm/benchmark/run.py,sha256=AWa862BtEh5aOTjKZ9OkSv3be2ZrU4R1qiwJtRTQwfk,9402
6
+ helm/benchmark/run_expander.py,sha256=vnq-zRmuXLzgr3sS3XYaXJFarNC7-QKc0_DtPjwXq3Q,32952
7
+ helm/benchmark/run_specs.py,sha256=ssBJYMZVMF4XGk6lvCSlQJh6A-Pmh2_ndi_JAwgW0CQ,71441
8
+ helm/benchmark/runner.py,sha256=zYDe8UeB1LFmbpChmRdRqEIZo-X0xWMenOCp2NnZ9Ws,8802
9
9
  helm/benchmark/server.py,sha256=HsuVsch1SPjQ4YyZi60kjr3JZeL82h8jgkxTUlfb130,1620
10
10
  helm/benchmark/test_data_preprocessor.py,sha256=adT-pgVeWvmZXLUUehxH0C-lMhXhtdxsvYdr69o1BD4,2047
11
11
  helm/benchmark/adaptation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
12
- helm/benchmark/adaptation/adapter_spec.py,sha256=KgO6KGF3sRrM0rKTMulFp4GtwHbGNlOurE48d0Lv5hg,2679
12
+ helm/benchmark/adaptation/adapter_spec.py,sha256=YoxMyN4RJM8GG_DeZ-k0edyARZ69hHLkvOlvKCk-u2o,2811
13
13
  helm/benchmark/adaptation/prompt.py,sha256=MATerIUIhFp_BMGvK7bLpNtWH6Oi4kknjBjOkr2bHv4,1948
14
14
  helm/benchmark/adaptation/request_state.py,sha256=o3OpZbB0TJFiZ2Nmhvg3vWmByaUSYTffT_WnoNb7w68,2712
15
15
  helm/benchmark/adaptation/scenario_state.py,sha256=ZflBuNgvN0JqUhshFcy0kTweO1WJs6j5UCaTxWTMe0o,1747
@@ -18,7 +18,7 @@ helm/benchmark/adaptation/adapters/adapter.py,sha256=8wK28jISxW8rUfXP-_-FfQJRRzc
18
18
  helm/benchmark/adaptation/adapters/adapter_factory.py,sha256=N2n-xIoGt_DxlN0LT4GUgVvdoaqhyUU8rSWr_nyfb80,2318
19
19
  helm/benchmark/adaptation/adapters/binary_ranking_adapter.py,sha256=3j24nFQuZE0Zl6DMAB4aYUpjieerdSMLsJbpMT9Nzfw,5646
20
20
  helm/benchmark/adaptation/adapters/generation_adapter.py,sha256=-on4QAo8hhzJVgAnM6G8lFFqaoiSiVF-KxwwfHwE61A,1927
21
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py,sha256=qRmFP1wweGIcZX55M2bljaQ22Xof45_oyY7Bxg4c3yQ,12657
21
+ helm/benchmark/adaptation/adapters/in_context_learning_adapter.py,sha256=q5K4Hag8LOpfpkeEzwIMPLNpBqMThcB1LXLGr_n_Xfo,13118
22
22
  helm/benchmark/adaptation/adapters/language_modeling_adapter.py,sha256=vPo2EVgbMfzmwPPcljoXdDfBW3c80LLKaUhA-RefU2w,11967
23
23
  helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py,sha256=pV14yvmH_mRQpeXF0teAxGpJcouSQViipr-aMkNE-AM,1711
24
24
  helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py,sha256=4CALqc--PUaEl3cLzmjP9nFSuarCZMKBwrPQxde5TYM,3471
@@ -46,6 +46,7 @@ helm/benchmark/augmentations/space_perturbation.py,sha256=7OdpoibdizoPDBPpLc1ENy
46
46
  helm/benchmark/augmentations/synonym_perturbation.py,sha256=2qFx7xparhEPd82tvs59HkAr1hwQWv7asWtmNCbcQrQ,4209
47
47
  helm/benchmark/augmentations/test_perturbation.py,sha256=v_U5CmBpA5aXqg4EJUYZrSfGsNbZTwCP0inxz1XNGq0,9991
48
48
  helm/benchmark/augmentations/typos_perturbation.py,sha256=nfF1Zw2REKZEnnyPVFWD87MP8L5ANbaZXeI2n70Sonw,2790
49
+ helm/benchmark/contamination/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
49
50
  helm/benchmark/efficiency_data/inference_denoised_runtimes.json,sha256=ios_dt-_8wtXvkVAx0iI2zwCxqHvk3XKTx31qHPalsI,4203
50
51
  helm/benchmark/efficiency_data/inference_idealized_runtimes.json,sha256=5w7reeZc0yc4cjH8kJGxQQSoe8yaRVX2SSlSrx0QWFQ,12348
51
52
  helm/benchmark/efficiency_data/training_efficiency.json,sha256=aH2moiBLStOLVi8Ci2KTK5ZkWlTBLK-B3fRfNZwhoSg,9763
@@ -54,7 +55,7 @@ helm/benchmark/metrics/basic_metrics.py,sha256=xBCTgLVdiWVxdpB08MbWFc2nvwCJTgD1-
54
55
  helm/benchmark/metrics/bbq_metrics.py,sha256=H44mwKXLJ0PXo-sVKgRHCEKjZGnCahDD6GOQMWpnbOQ,6061
55
56
  helm/benchmark/metrics/bias_metrics.py,sha256=uVJFQvDSzvPR1ELu0FNYyExjRy2ThaJRCw8beEMDqJs,11309
56
57
  helm/benchmark/metrics/bias_word_lists.py,sha256=mx5JjW3mHffXIqo4GcQN-zENUEttBqQnEjPTz3J3J_4,13909
57
- helm/benchmark/metrics/classification_metrics.py,sha256=J5us8vRybPa-SVywrH6DYmi6luOFhf2smVwk5D3rkY8,3335
58
+ helm/benchmark/metrics/classification_metrics.py,sha256=1q7gPnWRrx4QwE8T0m269vFJWg_bKfVx21a5spDBbjU,3701
58
59
  helm/benchmark/metrics/code_metrics.py,sha256=uWdigk0QyEsfVHQzq9KxkOc-LROvcqWXeui42Mr0YF4,5119
59
60
  helm/benchmark/metrics/code_metrics_helper.py,sha256=byyuI1lJgbIDPVJzywaBsam9zFMPPyn28g1grsK9xyA,22336
60
61
  helm/benchmark/metrics/copyright_metrics.py,sha256=8sk85mLTasWIgHIXxOho0z_nQYyLqtzSWHSAwd5ayAQ,7560
@@ -68,7 +69,7 @@ helm/benchmark/metrics/ranking_metrics.py,sha256=b3qxTRnr62zz1Gr1dsVDYtdwB8WBIb-
68
69
  helm/benchmark/metrics/statistic.py,sha256=9VM5JA1-M_iYCNziWm2qeDZaAQqPQ_ySdaSMcqAeYdM,3048
69
70
  helm/benchmark/metrics/summarization_metrics.py,sha256=hHNWGYA1bNfgCg7o1RSiTo7E-SJujHhkKh9G204icoo,16083
70
71
  helm/benchmark/metrics/test_bias_metrics.py,sha256=brut1rdnKNtTVJoe6qkllmJwZTFBZkLcyI_4qmqZ_vA,6264
71
- helm/benchmark/metrics/test_classification_metrics.py,sha256=bVVTw7UM_pUVFzhyjHgx16EBU3X81g8DeD0NGjP1OUA,4215
72
+ helm/benchmark/metrics/test_classification_metrics.py,sha256=usW5ciUYu2ZUUqVjFk4NfZTGNIoBArwia_-8uGOvFpw,5475
72
73
  helm/benchmark/metrics/test_metric.py,sha256=S7LGHNCHuhMk582eHylw1tOasUBEf_7F0T4u3tey7b4,757
73
74
  helm/benchmark/metrics/test_numeracy_metrics.py,sha256=ls1ZIHDePKpHMoqAbf4HmJ1SIBjLFuLIzGbfg6OiZvM,4162
74
75
  helm/benchmark/metrics/test_statistic.py,sha256=WQv9i8wSNTCzlw-L1wir0lmW0g3D4CM_ebpii7IB9Lw,406
@@ -89,12 +90,14 @@ helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py,sha256=gPqoYNI
89
90
  helm/benchmark/metrics/tokens/token_cost_estimator.py,sha256=fTGUfhHV6yMwpTkCEMTGMxKO8jskqJz4sAtwXT6M_C8,425
90
91
  helm/benchmark/presentation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
91
92
  helm/benchmark/presentation/contamination.py,sha256=5wLwq266sCxT62MdzAXT9V-au6b07HaL44DLj_2qiSk,2788
93
+ helm/benchmark/presentation/create_plots.py,sha256=-YyrhEmfVOMnESJ8m2yk7RWAOYdZkVrLAt2K8XnpNF0,28442
92
94
  helm/benchmark/presentation/run_display.py,sha256=HSvV71ZRshMIhHZHGtlbYfRxK9xx1GQgn6YmGPVncME,9892
93
95
  helm/benchmark/presentation/run_entry.py,sha256=J1QgLOP99N7N4bs7nzXWxyU3pOd-a1j8xwL9ag1nP_Y,1158
94
96
  helm/benchmark/presentation/schema.py,sha256=i1utCqiNkbTK9CcDcOFwi7e91KPaKkpFU07ZcBfWXTc,8753
95
- helm/benchmark/presentation/summarize.py,sha256=UxqU0nNdKv80h3ROwP7fWrxmaivGJ3yUBiNKB2sBWOw,45067
97
+ helm/benchmark/presentation/summarize.py,sha256=BUXog2m_UPbftyzFHx_U4mE2FrG56iv9mvcCdXoZVmI,45071
96
98
  helm/benchmark/presentation/table.py,sha256=VzVMwsgP3kItAM6FPRUaTphzJ-ZjriiuFbWlO1rJUMU,2879
97
99
  helm/benchmark/presentation/test_contamination.py,sha256=8mnzUzxUW9pXUOuLpU4BBBg0V7Mn1d1s4AQgwy6_kl4,459
100
+ helm/benchmark/presentation/test_create_plots.py,sha256=2q3v2Qdh_hBKCEX9toygXFLIryu1FlcLMt2PXprx7j8,1251
98
101
  helm/benchmark/presentation/test_run_entry.py,sha256=M5z4dnVb7fM3PWrZWIZNlG8CT4KnDxjnEE4FBb1ZFNU,621
99
102
  helm/benchmark/scenarios/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
100
103
  helm/benchmark/scenarios/babi_qa_scenario.py,sha256=SkIJjqBuMLYxZR8l9epu9arBeirvJPtQsIBXv4bzkx4,5030
@@ -107,7 +110,7 @@ helm/benchmark/scenarios/civil_comments_scenario.py,sha256=vXq6KxyS5C0-tD8xUmkG5
107
110
  helm/benchmark/scenarios/code_scenario.py,sha256=Q_TP_vWewkClvibPFHXpsOjR-CWexYgu5kl4OpfZXNc,11355
108
111
  helm/benchmark/scenarios/code_scenario_helper.py,sha256=EbQNfHqhQXaMMPmYT2mG2dRjzYaI2FvcPb9j6NlNHDU,5853
109
112
  helm/benchmark/scenarios/commonsense_scenario.py,sha256=9roSJS3iGSNgqxTbLI87xuZGB8IxJkbbtzr-ep0HUn0,10661
110
- helm/benchmark/scenarios/copyright_scenario.py,sha256=3dzDZ4B2a3ZmY6zMlQ98Ni-9836kPE4V1U6fecYrQHM,3646
113
+ helm/benchmark/scenarios/copyright_scenario.py,sha256=APYQPC-esq3oM2qQxW6JNxa4pkv_yHDKfePjpvvi6nQ,3660
111
114
  helm/benchmark/scenarios/covid_dialog_scenario.py,sha256=FmYIuRr81xD_d0iyRa5blPC8OTqpfv8XGTz5XXUOd2E,3958
112
115
  helm/benchmark/scenarios/dialogue_scenarios.py,sha256=SPwo1iYiLbPwNtOgAVkTr-dO8FQLshmrfXdjPcayW5A,5616
113
116
  helm/benchmark/scenarios/disinformation_scenario.py,sha256=Ff66LxBm8APuMziLfGvTM0WIatrAty5_q_8ObaLW5lo,8491
@@ -116,11 +119,12 @@ helm/benchmark/scenarios/entity_data_imputation_scenario.py,sha256=oChQzEptlf731
116
119
  helm/benchmark/scenarios/entity_matching_scenario.py,sha256=gtrLSCw2JSNnBgFQFsUm4EcICsjVWtp9wsOdqcyBU4k,6863
117
120
  helm/benchmark/scenarios/gsm_scenario.py,sha256=PmX0zutkGqnqGirWidUdk166cWv_23RtaTFcVQGBpzc,2619
118
121
  helm/benchmark/scenarios/ice_scenario.py,sha256=smrpTOwtMDL-m40zfKfNz9btOGoINZNv3_2oBcLBMmk,16156
122
+ helm/benchmark/scenarios/imdb_listdir.json,sha256=eczxp9gslYYwx5XR86ATnZorIxuujFMDTfzR4h5NCpo,1015402
119
123
  helm/benchmark/scenarios/imdb_scenario.py,sha256=VTD0Ur6ATyY9NWxcnkGzn9Iw5vl4d94o0FbFm61ZZTA,6057
120
124
  helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py,sha256=wYaivSqqYYZPjPHTKaS6D7j960dcLIVTfmuZ8awd1Zs,2192
121
125
  helm/benchmark/scenarios/legal_support_scenario.py,sha256=K9HfTyHJnnLFvGKNwYQayu7JV4uyNT5wH48wc9ixRa0,3912
122
- helm/benchmark/scenarios/lex_glue_scenario.py,sha256=k_VofKY5xlAjUEZrZV2ZyyVegGRVYEm0GuSsNbv7u0M,9254
123
- helm/benchmark/scenarios/lextreme_scenario.py,sha256=b0KfKQbwDv5AFCRala4GwbtJH3KmbKnTsL5CazODfkA,18996
126
+ helm/benchmark/scenarios/lex_glue_scenario.py,sha256=r89KevvM1Kifu3ZkUIXAV8jXktclNtL0-JL9T6qOx_Q,10224
127
+ helm/benchmark/scenarios/lextreme_scenario.py,sha256=tejeKE08YX5MYFXrJRY48rIzO5V5fdwHDWEzg883K2E,20300
124
128
  helm/benchmark/scenarios/lsat_qa_scenario.py,sha256=2-GrPB8whoBFcQ5608jrwlAcRJgpkT1P2UehcR6-EYY,5977
125
129
  helm/benchmark/scenarios/math_scenario.py,sha256=5PespNtseDOnPgAwtdP0vMkXz1CaJM0BkJsWdeG5gUM,13825
126
130
  helm/benchmark/scenarios/me_q_sum_scenario.py,sha256=ZIn7tlBC4baV8CYcU_-mYe2RbYaQ-8dX1Ca_hOvZTfI,3988
@@ -134,32 +138,34 @@ helm/benchmark/scenarios/narrativeqa_scenario.py,sha256=r5TwYkZH_YaCws6BSnjbiDQk
134
138
  helm/benchmark/scenarios/natural_qa_scenario.py,sha256=nuL5Qlh26xq7Q_lvzK2sKVN7eNYh7F5SjbIjahKaMNg,12527
135
139
  helm/benchmark/scenarios/newsqa_scenario.py,sha256=vMuIZyYxufH2AqhDoIZzzllfq8ScJIhSDH6lM3IUxGM,7242
136
140
  helm/benchmark/scenarios/numeracy_scenario.py,sha256=Iwtypyb0249zKYyV6p4YUX4bYke6uIyL6R3aopiGUb8,30552
141
+ helm/benchmark/scenarios/opinions_qa_scenario.py,sha256=dtIYKL9ZiccX_F3-5OrnHXJdNBBTxCoTHc2Kc-XX79E,7380
137
142
  helm/benchmark/scenarios/pubmed_qa_scenario.py,sha256=cpMmQWwCDXIYO0btGLhevMT-Mhs9-5Es9cDvQYkIlL0,7493
138
143
  helm/benchmark/scenarios/quac_scenario.py,sha256=46nqmeVgkWu6jDGCHl-KHu351bmJj7jx_1p5kPwcOjc,6615
139
144
  helm/benchmark/scenarios/raft_scenario.py,sha256=RzewlMVkHJ2XbZ9_9FzBvbdV__BuBRgtX2HyhCnmH1o,4500
140
145
  helm/benchmark/scenarios/real_toxicity_prompts_scenario.py,sha256=8DQD7KkilAfxwzHUwCPrPPHeYEz_dEOWSls8yZo15do,2387
141
- helm/benchmark/scenarios/scenario.py,sha256=aj2golyi1TAAAebfW7eouTWRwZ07KbK034HlLx-q-1g,7164
146
+ helm/benchmark/scenarios/scenario.py,sha256=bdRcv-YoLkxjlpNcq4MXiu8HQgjByHkkLWOdih4ahsM,7365
142
147
  helm/benchmark/scenarios/simple_scenarios.py,sha256=rcHzukhjBgvNRqkjcg0cms_zWtAsLPk0xiFN9I25_hI,1947
143
148
  helm/benchmark/scenarios/summarization_scenario.py,sha256=fKeRSkXrH26WyfeIhn43_fZxnAO4bIX1Xh5HoKcjOQM,6550
144
149
  helm/benchmark/scenarios/synthetic_efficiency_scenario.py,sha256=dtMMeULjw6pcobBRp0r6f9N4VCUKamx1Jy-6xPu85q0,3083
145
150
  helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py,sha256=YvEgO5qVZ4hLpnvjer4CG0Ct1upssZRjZWxnNi1ZUtY,16308
146
151
  helm/benchmark/scenarios/synthetic_reasoning_scenario.py,sha256=Pm63shscbGuigg4zWfcw3rOIPP8LgxBCnpcQKAA_CX0,8327
147
152
  helm/benchmark/scenarios/test_scenario.py,sha256=o8w8ElDPF-RzeCmecwyvie_nRMYj01b38BufXS-igqY,1612
148
- helm/benchmark/scenarios/the_pile_scenario.py,sha256=UXxRlaZdVRoWTjw8h5TUhXouq5JLaidPbL5-Itai0KE,4988
153
+ helm/benchmark/scenarios/the_pile_scenario.py,sha256=aCcjZp0wabu8lpPVNAdCr1x6m_3QvgKe7dIGS2qgGm4,4981
149
154
  helm/benchmark/scenarios/truthful_qa_scenario.py,sha256=zQgv_qUU5h5ODoQE06rpJa6O_8FwF695cIie3PG7bx8,5969
150
155
  helm/benchmark/scenarios/twitter_aae_scenario.py,sha256=_qWotvPagNj3ATnyaww3U1XZtxN-wgueUf34fFmKXQI,2083
151
156
  helm/benchmark/scenarios/wikifact_scenario.py,sha256=pucEuLYz5N9qIobnEbJnRUnK6PrkWFdkl7yPuCJj3SE,5778
152
157
  helm/benchmark/scenarios/wikitext_103_scenario.py,sha256=rXVbUzOZi4eWM-_HP1gzY5SBmMwOX1vk12WrLkR3NHo,3074
153
158
  helm/benchmark/scenarios/wmt_14_scenario.py,sha256=u24E_w0AOXpl3PzEFLmiBcl8qyJEy-1Yc-i4YHgU99M,4356
154
- helm/benchmark/static/benchmarking.css,sha256=EkLEyXEL5Qwq-312D01y9EaQV2IBa67fw8Bjc3PQPJs,1928
155
- helm/benchmark/static/benchmarking.js,sha256=NQUoE05neH_YN9BgyNVFwEXX09YDRZOcunK_6tCZomA,47399
159
+ helm/benchmark/static/benchmarking.css,sha256=DGC4Huh4tVD2o9wEeUf3YOc3MYcq2fmJQXvhTjVDumE,2057
160
+ helm/benchmark/static/benchmarking.js,sha256=qiXAY_9fiWZ4ydzhBQAUhylzxNCoIN8ciLlEFMg33uE,49107
156
161
  helm/benchmark/static/contamination.yaml,sha256=LbISh56ORvfkkWptm7ZWmlPvWxtls6pBF1TbGiWD7hk,3096
157
162
  helm/benchmark/static/general.js,sha256=L3S4CBUED0k7RsjLHCeWjO29ZMFJckZgNTAYAARzaEg,3029
158
- helm/benchmark/static/index.html,sha256=LZMmoydG2LLqRfvGSpK3eRt1n92o19AELAzxCi-kok0,2994
163
+ helm/benchmark/static/index.html,sha256=yzf_VEGW15CPZOW9a2Z6opoQ0Gg2YQCtj182niUb0fk,3130
159
164
  helm/benchmark/static/info-icon.png,sha256=P-PW3Ek3NGiRAW5BXOjJRPBfMVqprjAqtQheGWu7zNI,3428
160
165
  helm/benchmark/static/json-urls-root.js,sha256=G3qenwLgBojh3ukzp_gyMUaZja83ZFqvT1WQ_Rg11BU,98
161
- helm/benchmark/static/json-urls.js,sha256=UzWTp-dowxZhJuWtoPyi5vpfwlRr561DGYWRHdkvZ1E,1634
162
- helm/benchmark/static/schema.yaml,sha256=XgXDcd67hvhGCOTFKJGGymIglzYS8jL8QjN2cO3gi1Q,93172
166
+ helm/benchmark/static/json-urls.js,sha256=wvsG2Lrz2XArwwMOl_tGXL9y4mjqzjod4gcqlvVCiQA,1750
167
+ helm/benchmark/static/plot-captions.js,sha256=gTBn-IPPD4BkzryVYj3KkGqLhWqWvBbvufeJaDygQxk,3010
168
+ helm/benchmark/static/schema.yaml,sha256=TQdzlpOPvTwELiU0w0HBAZqKQBmjj1LOkEGM85oa8e4,95657
163
169
  helm/benchmark/static/utils.js,sha256=H2PKYjuXZ392DlALCPJ1XRwGxBDRFjL9eTFiTd4vBU8,7338
164
170
  helm/benchmark/static/images/crfm-logo.png,sha256=dDkauL_wJR_Luu7L7pltphS3a9HSLjDkpVLa6C9vcA4,62712
165
171
  helm/benchmark/static/images/helm-logo-simple.png,sha256=LtVAC4OgcWgMAob53rTrf7cRDu-O0z85ZOGGj9wR9hw,86133
@@ -184,11 +190,13 @@ helm/benchmark/window_services/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5
184
190
  helm/benchmark/window_services/ai21_window_service.py,sha256=Mo0Zzj2a9hDiUg6hTuJWb3ABhBUPlOYS-kHhaE-pHUE,12672
185
191
  helm/benchmark/window_services/anthropic_window_service.py,sha256=mA9aWKfc-dbSnpt37k6zDFdNNMfbevAoleCzSaar-uE,799
186
192
  helm/benchmark/window_services/bloom_window_service.py,sha256=o7MVedt6khdoj8zikLDuVraEzuoBZk7j4Fzjsas0sD4,1023
187
- helm/benchmark/window_services/cohere_window_service.py,sha256=MXBRAjuAQGq0iEpU9OLORH1FvHMak1-Nf5-D7k7UO9I,6182
193
+ helm/benchmark/window_services/cohere_window_service.py,sha256=3SJT97CaxNxtUNS9_qKvKMVCA6lvNavS-xo6jAsxPbM,7070
188
194
  helm/benchmark/window_services/encoder_decoder_window_service.py,sha256=Dg_rD0RtKIALtpvT4Wi4Am3zgFcIvgxfzdglw5fbbTU,2478
195
+ helm/benchmark/window_services/flan_t5_window_service.py,sha256=39IZX89_tay3bpSGVoWDoekmhW-RUNATQuT-bNyFRTs,915
189
196
  helm/benchmark/window_services/gpt2_window_service.py,sha256=d-ys9FHoVI1u1GZOH734JPAOc8W6QZb1N3ZjooKmwz8,990
190
197
  helm/benchmark/window_services/gptj_window_service.py,sha256=C2OEl-3ZatwxYVoaQgyvAZ5SBS2TK-2PnM02zUNjhiA,1103
191
198
  helm/benchmark/window_services/gptneox_window_service.py,sha256=FD0NDlBxDu1fZ8vlaTsOBh2IKYfrMV73qNGetkrp1P4,944
199
+ helm/benchmark/window_services/huggingface_window_service.py,sha256=tDbhjn81Aw3ZSYMKh4aQU3e8JX1IVYCu_2gPOLaIWD8,1440
192
200
  helm/benchmark/window_services/ice_window_service.py,sha256=5z52rP-xAF_jckIPoogyoNFW6FQXqaq3SHybCaBuRn0,2005
193
201
  helm/benchmark/window_services/local_window_service.py,sha256=wgNBB-p9Zk-uFLRXYNxrz16_DhWyQ8x9sltSLBhzUh4,4247
194
202
  helm/benchmark/window_services/luminous_window_service.py,sha256=0w-nyfXXDwnIDBzU3Y84LZPnQyJGiSbvf5Y_MQh-hCg,1791
@@ -202,6 +210,7 @@ helm/benchmark/window_services/test_ai21_window_service.py,sha256=U_n2mQ5GqD8oYv
202
210
  helm/benchmark/window_services/test_bloom_window_service.py,sha256=xeYpsvdR8Ug31BAb4a-PU4Sc0oihpCIMJ5OzTfgUhM0,4221
203
211
  helm/benchmark/window_services/test_cohere_window_service.py,sha256=6WpIiuEyfgTZS3BIeAGOR8AAr2djpVIfGbItQRI19Ck,3205
204
212
  helm/benchmark/window_services/test_cohere_window_service_utils.py,sha256=sf25f9MeXzoqsbDzZ7d7le13hm8RkDe54nhLtKF2pqo,158150
213
+ helm/benchmark/window_services/test_flan_t5_window_service.py,sha256=xv_EXbiRklveJPQtThYCSYF7qBYwjL7K4wH3Xu5z2Fg,591
205
214
  helm/benchmark/window_services/test_gpt2_window_service.py,sha256=3k25pLa_z__g4yoQL40DEXj-T4dGtrgif13N2NXs59U,2568
206
215
  helm/benchmark/window_services/test_gptj_window_service.py,sha256=sxsTpozKv9N-wZXtGl1prkQr9Md_q-tnCjO9zt226Co,2267
207
216
  helm/benchmark/window_services/test_gptneox_window_service.py,sha256=MCYHZIoulJf_WCx6de7rWB3nqku6wCyAokA-SWIPEks,4140
@@ -216,15 +225,16 @@ helm/benchmark/window_services/test_utils.py,sha256=1k2TlPdDIRjum669jpH3O7UOqm4G
216
225
  helm/benchmark/window_services/test_yalm_window_service.py,sha256=NdunSxq-qDzfzYMBYZ-0my6LaU2qUxtm7Ii0c4fyKnY,4273
217
226
  helm/benchmark/window_services/tokenizer_service.py,sha256=RNznJBAxcCUMCurb7mbraZULx_ZtB0G7IxbrnUe0Urk,865
218
227
  helm/benchmark/window_services/ul2_window_service.py,sha256=R_VEzOb59zQE9mmbTLunQeIvLAtK3-97h-B2_oc0Uxs,1021
228
+ helm/benchmark/window_services/wider_ai21_window_service.py,sha256=VZ6EERN48FSYsmJ_aiwI30SEbobLt27c1QqL29Zg_8M,414
219
229
  helm/benchmark/window_services/wider_openai_window_service.py,sha256=cpm4mDEGTY-cmrECcDCL0flONBdh4g40uWNLI-v46BU,539
220
230
  helm/benchmark/window_services/window_service.py,sha256=aV4YnbXl7T23runB8xmSWRwV7YtliUlXrveEejOMJ1Y,2885
221
- helm/benchmark/window_services/window_service_factory.py,sha256=kb9Dxig-HuHssO6dyMT0M_FDu35WgXqlaXs_j88QAd4,4773
231
+ helm/benchmark/window_services/window_service_factory.py,sha256=UDrXRUXGwBMEiO7Cw_nNbr0XM6mzzXrXiyJsvCmW9wg,5845
222
232
  helm/benchmark/window_services/yalm_window_service.py,sha256=g7NeXvlRq3FXf2HwRStBMTkbWDxTdEITkrVRe-fv3mg,1805
223
233
  helm/common/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
224
234
  helm/common/authentication.py,sha256=RlMx29_TSrfU7ujE7dJkxmFub5EqLj2NswV5lAVFFDk,179
225
235
  helm/common/cache.py,sha256=ustgsRHX0W3zoLPN05W3mFl9m9JYp9Ppq5cjMbdmm6Y,13116
226
236
  helm/common/codec.py,sha256=zm8MP9Aqfh64D2HMZiCPEMoPkkiJxEzvzmuupGvkRh0,5499
227
- helm/common/general.py,sha256=bIhgtiEIz1zA7cQVH-U_6FuW_bmDzYaff4QZ779tR3U,10087
237
+ helm/common/general.py,sha256=7vFw10h_hTrxUCfxE0LH13hp8Lunp77lbVmucgoeq2Y,10181
228
238
  helm/common/hierarchical_logger.py,sha256=EnKLnfbQftca08EJfjGEQb4tcnCKbx-JtwLnoCnhMQs,2908
229
239
  helm/common/object_spec.py,sha256=COMd4RpYgfulW940a5M_npbsfRBvLkmhjfwDIq4Gpqs,1833
230
240
  helm/common/perspective_api_request.py,sha256=WAVwtajNVmi5XJNsPcorGEAVrqkpPSk-Kd3b0hJghbA,2427
@@ -237,7 +247,7 @@ helm/proxy/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
237
247
  helm/proxy/accounts.py,sha256=xq-zVggvueB4D5QK58mFWGPxZe-hnIUAT4D341yd0ac,13503
238
248
  helm/proxy/cli.py,sha256=2SOxIF55PDjzXXcDOYRT8m-oyQM_2VyZheKRC3fXDVw,8094
239
249
  helm/proxy/example_queries.py,sha256=p1wH-tp1pRUslkAwaJYrrG5aDfmFWK3KYn4M6WQfPqQ,4120
240
- helm/proxy/models.py,sha256=4cqSq_cBnTRPgSFWafTkuAwJxMg6CA9y-6NBIfeZPv0,25926
250
+ helm/proxy/models.py,sha256=sfdp7DasJXbun-VMZz5p7iD6uhoUW1mFsy64BWfDoV4,29681
241
251
  helm/proxy/query.py,sha256=eftbiUICMh8QIHVs-7cLtv_rDXKeKdRPmwjLMu0TDxQ,645
242
252
  helm/proxy/retry.py,sha256=GLLDW1iGCwHfgTle8YK7ZB3vV-7XqsHcqeruKoVdsxE,1953
243
253
  helm/proxy/server.py,sha256=uBispGXfn39s_Pskd9Xjud0rijTjqXtSKU_2YvE6zGE,7356
@@ -245,31 +255,34 @@ helm/proxy/test_models.py,sha256=hWeDcBw1GkPvyJUd-ABxRVe1FhSUfz8bzyrKYdsqmyY,726
245
255
  helm/proxy/test_retry.py,sha256=8h398auzjW9VnlTJWllxR-bdpub-XFp8EN8LWDEnEHM,1049
246
256
  helm/proxy/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
247
257
  helm/proxy/clients/ai21_client.py,sha256=c4D64-1mLaFcGsqOb5CYkaXTmAWEJYk9cIjtr6DbgZQ,7770
248
- helm/proxy/clients/aleph_alpha_client.py,sha256=kh7Wrf9D5aZo5HYbvW1KFdRvdOn1EO64V44tNXlUKlk,6530
258
+ helm/proxy/clients/aleph_alpha_client.py,sha256=VV7dbgh7sYqoSWfNkxxAiQ7i3yPW33rBPnBig9EXL10,7707
249
259
  helm/proxy/clients/anthropic_client.py,sha256=vrfc8lS9bxQbUxMxbElV2z5cMDq-JD6yFDTS2cJdFO0,15526
250
- helm/proxy/clients/auto_client.py,sha256=nzGU_SYaQD8v6eZbKNt1ej2mvPWTsXYn_DDAnPTxfrA,10810
260
+ helm/proxy/clients/auto_client.py,sha256=oktJQ4rCntp7id4vj-d3x_EA3RWGNZyXUMFwzRse--c,11063
251
261
  helm/proxy/clients/chat_gpt_client.py,sha256=nG3opHbnzX50r9Ialh3RaRErOZo6k4Q7gVzRHeGQgj8,5312
252
262
  helm/proxy/clients/client.py,sha256=bh6FvYFjw6MoHp5n7-KN1asXrIrOC-jfYsg3aW4xMgo,4570
253
263
  helm/proxy/clients/cohere_client.py,sha256=KF21m7qUjuhrpEEQv68FNeX0rsWSmxZgw52Oa7CZ5pI,11362
254
264
  helm/proxy/clients/google_client.py,sha256=sGGxDWD22c9a9KMzLAFL3vAEDHxp5jSY2W3RDpVDIak,3334
255
265
  helm/proxy/clients/goose_ai_client.py,sha256=2tqJK_AhD2-ScXtOTdt9S9khzVjal5pm38BJWiFhwq8,4217
256
- helm/proxy/clients/huggingface_client.py,sha256=7CYD53G6tGxhFEE6yhlrQM_agfbr9V4hKqpE9OW5ks0,11045
257
- helm/proxy/clients/huggingface_tokenizer.py,sha256=KOa7xA10bIRN9lcSfUBBJSrES49By2KXqlDzlVdEHWM,3801
266
+ helm/proxy/clients/huggingface_client.py,sha256=LypY3YfyoaGFH83UkYwDARGuoN2JOUk2S-nEaJ6GemI,11813
267
+ helm/proxy/clients/huggingface_model_registry.py,sha256=0WHyWPxxBI4KtTs2Yt6-Cw16FC4XBEe6yqUc0-YSn1Y,3891
268
+ helm/proxy/clients/huggingface_tokenizer.py,sha256=ujtsBupMMrE9efds2205c8NiPTcxHX8XM0UoV9spLK0,4591
258
269
  helm/proxy/clients/ice_tokenizer_client.py,sha256=Ui8YhAXoY1Q0vC3icoeFs6X9xAcESF6Tl2EGERGWVGU,2325
259
270
  helm/proxy/clients/microsoft_client.py,sha256=-VC8IrgrpSp1_FvRSI_8MSxhNp5I6dMc4qWSHc4Oulg,8237
260
- helm/proxy/clients/openai_client.py,sha256=rPV74pbiAlkLS4YO6J-dnzpVmsShWKYISD2nJTk6Sds,6025
271
+ helm/proxy/clients/openai_client.py,sha256=frW9fOjYWkRXdfzE88ppxaLhVl9pCnXfheUqeANW6QQ,9415
261
272
  helm/proxy/clients/perspective_api_client.py,sha256=-L8IwokuktWPoOu7nXwsfoab_U1QRGCt8xT1SrcGfYE,5491
262
273
  helm/proxy/clients/simple_client.py,sha256=GXHTCRB58XAxnUVqgpynidc7h6kaDBOP7TedVHrOpD4,2915
263
274
  helm/proxy/clients/test_client.py,sha256=bvkFob_Yoy8bALrVeQ0h757g9RU687JYI0g3AISPFQ8,1268
264
275
  helm/proxy/clients/test_huggingface_client.py,sha256=n-6D-RXqwQyxPxCLCSqHxqfK5JA-PdP5ffP17XwTe2I,3520
276
+ helm/proxy/clients/test_huggingface_model_registry.py,sha256=zMboFlwMtDEV7hkd9SZFuItye-vkzz3CE5mWQrw--W4,2554
265
277
  helm/proxy/clients/test_huggingface_tokenizer.py,sha256=KmlAXezQ6R7DAEpV85_JRdTRrOJoJxfmtylqybWn5VA,2189
266
278
  helm/proxy/clients/test_ice_tokenizer_client.py,sha256=Ugmn5a7QdAPEAbLtreLS5-sji8yrzxy5mhFPAl3rOuI,2404
267
279
  helm/proxy/clients/test_yalm_tokenizer_client.py,sha256=tnrYl7T1DcZ9FN09nBWV1gesjWQ3osiUpsGHSm_IypQ,2336
268
- helm/proxy/clients/together_client.py,sha256=k7TmK940KHdtvb2pxAlX4kHUHrfiZL2HAzM3dgwJowg,5731
280
+ helm/proxy/clients/together_client.py,sha256=epyiYElD0BfAZgUSu4zZKC5Oe8yIVVyJn2RtTwjPMzM,6334
269
281
  helm/proxy/clients/yalm_tokenizer_client.py,sha256=cpBoc8eHQoBGQguZsDaVnGWLdZnPgkHjqLSO0B94O0U,2420
270
282
  helm/proxy/clients/yalm_tokenizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
271
283
  helm/proxy/clients/yalm_tokenizer/test_yalm_tokenizer.py,sha256=W9p5QNn1GSm-y85yVEQe_82zn5CVK_vR6jvhk7JTs_k,869
272
- helm/proxy/clients/yalm_tokenizer/yalm_tokenizer.py,sha256=z1JEJynC51T70JSuig6TrNwLEUXiS2SVTMvWECya7ww,5743
284
+ helm/proxy/clients/yalm_tokenizer/voc_100b.sp,sha256=LmPD0_OIOXi8dWuNjXUYOSPhf8kPp2xhvK-g3bXcwrQ,2815034
285
+ helm/proxy/clients/yalm_tokenizer/yalm_tokenizer.py,sha256=7Y4_nCZptFWzifCJ5aPmM3_OOxhtomIAQVjpJGV1D8g,5954
273
286
  helm/proxy/services/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
274
287
  helm/proxy/services/remote_service.py,sha256=xKS-0P-EqKTPn7odTXDoQPjn9FliQRLaMFUnCEsUmCU,6965
275
288
  helm/proxy/services/server_service.py,sha256=gps7PwXqCi8b0yGYC0nQwFdKbfxCriSHt5CD1N1kkJs,5696
@@ -286,9 +299,9 @@ helm/proxy/token_counters/openai_token_counter.py,sha256=gPo_VrkEH07xmprzdfIhmJ_
286
299
  helm/proxy/token_counters/test_ai21_token_counter.py,sha256=42J1fCi20kQUwAD18bIa6h9TaP7KZnlgF-mLbvKURro,5508
287
300
  helm/proxy/token_counters/test_openai_token_counter.py,sha256=EovaVCZSr9moITZ9-AKiv_YM-D3OUsUDs4iQhEvpazQ,4823
288
301
  helm/proxy/token_counters/token_counter.py,sha256=x8KyTR82EedgCQUuneQiVq9AiU1B3_CHPmKPNumClHc,429
289
- crfm_helm-0.2.1.dist-info/LICENSE,sha256=bJiay7Nn5SHQ2n_4ZIT3AE0W1RGq4O7pxOApgBsaT64,11349
290
- crfm_helm-0.2.1.dist-info/METADATA,sha256=0U51YnO1QR8Xhq4no4yDaiWTwwLVFj99tf611k5MeX4,1982
291
- crfm_helm-0.2.1.dist-info/WHEEL,sha256=2wepM1nk4DS4eFpYrW1TTqPcoGNfHhhO_i5m4cOimbo,92
292
- crfm_helm-0.2.1.dist-info/entry_points.txt,sha256=8vW_ahx0Ar_ubyDTqUavUwXeZ5O8w0gLtdSVagchycU,234
293
- crfm_helm-0.2.1.dist-info/top_level.txt,sha256=s9yl-XmuTId6n_W_xRjCS99MHTwPXOlkKxmTr8xZUNY,5
294
- crfm_helm-0.2.1.dist-info/RECORD,,
302
+ crfm_helm-0.2.2.dist-info/LICENSE,sha256=bJiay7Nn5SHQ2n_4ZIT3AE0W1RGq4O7pxOApgBsaT64,11349
303
+ crfm_helm-0.2.2.dist-info/METADATA,sha256=_OlkKmj1P7vaZvlpvOnNnzzm3w1IEW6de75SK7TmuPw,2066
304
+ crfm_helm-0.2.2.dist-info/WHEEL,sha256=pkctZYzUS4AYVn6dJ-7367OJZivF2e8RA9b_ZBjif18,92
305
+ crfm_helm-0.2.2.dist-info/entry_points.txt,sha256=AvH9soAH3uey9xffisWewd0yrmPWGASC036jHd1SFyg,300
306
+ crfm_helm-0.2.2.dist-info/top_level.txt,sha256=s9yl-XmuTId6n_W_xRjCS99MHTwPXOlkKxmTr8xZUNY,5
307
+ crfm_helm-0.2.2.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: bdist_wheel (0.38.4)
2
+ Generator: bdist_wheel (0.40.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -1,6 +1,7 @@
1
1
  [console_scripts]
2
2
  crfm-proxy-cli = helm.proxy.cli:main
3
3
  crfm-proxy-server = helm.proxy.server:main
4
+ helm-create-plots = helm.benchmark.presentation.create_plots:main
4
5
  helm-run = helm.benchmark.run:main
5
6
  helm-server = helm.benchmark.server:main
6
7
  helm-summarize = helm.benchmark.presentation.summarize:main
@@ -42,6 +42,8 @@ from .scenarios import legal_support_scenario # noqa
42
42
  from .scenarios import entity_matching_scenario # noqa
43
43
  from .scenarios import entity_data_imputation_scenario # noqa
44
44
  from .scenarios import big_bench_scenario # noqa
45
+ from .scenarios import opinions_qa_scenario # noqa
46
+
45
47
 
46
48
  # Biomedical
47
49
  from .scenarios import covid_dialog_scenario # noqa
@@ -68,6 +68,9 @@ class AdapterSpec:
68
68
  # set of training instances. Used to compute error bars.
69
69
  num_train_trials: int = 1
70
70
 
71
+ # If true, randomly sample N training examples; if false, select N consecutive training examples
72
+ sample_train: bool = True
73
+
71
74
  # Decoding parameters (inherited by `Request`)
72
75
 
73
76
  # Model to make the request to (need to fill in)
@@ -23,7 +23,7 @@ class InContextLearningAdapter(Adapter, ABC):
23
23
  @htrack(None)
24
24
  def adapt(self, instances: List[Instance], parallelism: int) -> ScenarioState:
25
25
  """
26
- Takes a a list of `Instance`s and builds a list of corresponding `RequestState`s.
26
+ Takes a list of `Instance`s and builds a list of corresponding `RequestState`s.
27
27
  The reason we don't do this per eval instance is that we create a common set of
28
28
  training instances which is shared across all eval instances.
29
29
  """
@@ -65,7 +65,9 @@ class InContextLearningAdapter(Adapter, ABC):
65
65
  parallelism: int,
66
66
  ) -> List[RequestState]:
67
67
  self.train_trial_index: int = train_trial_index
68
- self.train_instances: List[Instance] = self.sample_examples(all_train_instances, seed=train_trial_index)
68
+ self.train_instances: List[Instance] = self.sample_examples(
69
+ all_train_instances, seed=train_trial_index, sample_train=self.adapter_spec.sample_train
70
+ )
69
71
  hlog(f"Sampled {len(self.train_instances)} examples for trial #{self.train_trial_index}.")
70
72
 
71
73
  # Generate request_states
@@ -93,7 +95,9 @@ class InContextLearningAdapter(Adapter, ABC):
93
95
 
94
96
  return [request_state for result in results for request_state in result]
95
97
 
96
- def sample_examples(self, all_train_instances: List[Instance], seed: int) -> List[Instance]:
98
+ def sample_examples(
99
+ self, all_train_instances: List[Instance], seed: int, sample_train: bool = True
100
+ ) -> List[Instance]:
97
101
  """
98
102
  Sample a random set of train instances to use as examples by following the steps below:
99
103
  1. Sort the class labels (i.e., correct References) by the number of Instances that belong to the
@@ -121,9 +125,14 @@ class InContextLearningAdapter(Adapter, ABC):
121
125
  random.seed(seed)
122
126
  num_instances_to_sample: int = min(len(all_train_instances), self.adapter_spec.max_train_instances)
123
127
 
128
+ examples: List[Instance] = []
129
+ if not sample_train:
130
+ # Select sequentially from the train set
131
+ examples = all_train_instances[num_instances_to_sample * seed : num_instances_to_sample * (seed + 1)]
132
+ return examples
133
+
124
134
  unlabeled_instances: List[Instance] = []
125
135
  label_to_instances: Dict[str, List[Instance]] = defaultdict(list)
126
-
127
136
  for instance in all_train_instances:
128
137
  if instance.first_correct_reference:
129
138
  label_to_instances[instance.first_correct_reference.output.text].append(instance)
@@ -145,7 +154,6 @@ class InContextLearningAdapter(Adapter, ABC):
145
154
  sorted_labels.extend(labels)
146
155
 
147
156
  labels_iterable = cycle(sorted_labels)
148
- examples: List[Instance] = []
149
157
  while num_instances_to_sample > 0:
150
158
  next_label: Optional[str] = next(labels_iterable, None)
151
159
  if not next_label:
@@ -218,10 +226,15 @@ class InContextLearningAdapter(Adapter, ABC):
218
226
 
219
227
  # References (optionally) and output
220
228
  output: str
229
+
230
+ delimiter = ","
221
231
  if reference_index is None:
222
232
  # Put only the correct reference as the output
223
- correct_reference: Optional[Reference] = instance.first_correct_reference
224
- output = correct_reference.output.text if correct_reference is not None else "n/a"
233
+ correct_references: List[Reference] = instance.all_correct_references
234
+ if not correct_references:
235
+ output = "n/a"
236
+ else:
237
+ output = delimiter.join([correct_reference.output.text for correct_reference in correct_references])
225
238
  else:
226
239
  reference = instance.references[reference_index]
227
240
  output = reference.output.text
File without changes
@@ -1,6 +1,7 @@
1
- from typing import List
1
+ from typing import List, Optional
2
2
 
3
3
  from sklearn.metrics import f1_score
4
+ from sklearn.preprocessing import MultiLabelBinarizer
4
5
 
5
6
  from helm.benchmark.adaptation.request_state import RequestState
6
7
  from helm.benchmark.metrics.basic_metrics import normalize_text
@@ -20,8 +21,7 @@ class ClassificationMetric(Metric):
20
21
 
21
22
  Note:
22
23
  - The set of classes is derived from the correct references from all the instances.
23
- This means that classes may be omitted if they never are never used as a correct
24
- reference.
24
+ This means that classes may be omitted if they are never used as a correct reference.
25
25
  - Generations that are not in any of the known classes are counted as a
26
26
  negative prediction for every class.
27
27
  - Perturbed classes are considered different classes from unperturbed
@@ -29,10 +29,16 @@ class ClassificationMetric(Metric):
29
29
  - Currently, multi-label classification is not supported.
30
30
  """
31
31
 
32
+ def __init__(self, delimiter: Optional[str] = None):
33
+ self.delimiter = delimiter
34
+
35
+ def is_multi_label(self) -> bool:
36
+ return bool(self.delimiter)
37
+
32
38
  def evaluate_instances(self, request_states: List[RequestState]) -> List[Stat]:
33
- y_pred: List[str] = []
34
- y_true: List[str] = []
35
- for request_state in request_states:
39
+ y_pred: List[List[str]] = []
40
+ y_true: List[List[str]] = []
41
+ for request_state in request_states: # one request state per instance
36
42
  # Only the generation adapter is supported.
37
43
  # TODO: Support multiple_choice_* adapters.
38
44
  if request_state.reference_index is not None:
@@ -42,24 +48,23 @@ class ClassificationMetric(Metric):
42
48
  assert request_state.result is not None
43
49
  if len(request_state.result.completions) != 1:
44
50
  raise ValueError("Result must contain exactly one completion")
45
-
46
- num_correct = 0
47
- for reference in request_state.instance.references:
48
- if reference.is_correct:
49
- num_correct += 1
50
- y_true.append(normalize_text(reference.output.text))
51
- if num_correct != 1:
52
- # TODO: Support multi-label classification.
53
- raise ValueError("ClassificationMetric does not support multi-label classification")
54
51
  if request_state.output_mapping:
55
52
  raise ValueError("ClassificationMetric does not support multiple choice adapters")
56
- y_pred.append(normalize_text(request_state.result.completions[0].text))
57
- labels = list(set(y_true))
53
+
54
+ references = request_state.instance.all_correct_references
55
+ if not self.is_multi_label():
56
+ assert len(references) == 1
57
+ correct_ref_texts = [normalize_text(ref.output.text) for ref in references if ref.output.text]
58
+ y_true.append(correct_ref_texts)
59
+
60
+ input_text = request_state.result.completions[0].text
61
+ predictions = input_text.split(self.delimiter) if self.is_multi_label() else [input_text]
62
+ y_pred.append([normalize_text(pred) for pred in predictions if pred])
63
+ labels: List[str] = list(set(y for ys in y_true for y in ys))
64
+ mlb = MultiLabelBinarizer().fit([labels])
65
+ y_true = mlb.transform(y_true)
66
+ y_pred = mlb.transform(y_pred)
58
67
  return [
59
- Stat(MetricName("classification_macro_f1")).add(
60
- f1_score(y_pred=y_pred, y_true=y_true, labels=list(labels), average="macro")
61
- ),
62
- Stat(MetricName("classification_micro_f1")).add(
63
- f1_score(y_pred=y_pred, y_true=y_true, labels=list(labels), average="micro")
64
- ),
68
+ Stat(MetricName("classification_macro_f1")).add(f1_score(y_pred=y_pred, y_true=y_true, average="macro")),
69
+ Stat(MetricName("classification_micro_f1")).add(f1_score(y_pred=y_pred, y_true=y_true, average="micro")),
65
70
  ]
@@ -63,7 +63,8 @@ def _expected_stats(all_classes_counts: Dict[str, Dict[str, int]]):
63
63
 
64
64
 
65
65
  def test_evaluate_instances_binary_generation():
66
- metric = ClassificationMetric()
66
+ metric = ClassificationMetric(delimiter=None)
67
+
67
68
  request_states = [
68
69
  _request_state("yes", [_Option("yes", True)]),
69
70
  _request_state("yes", [_Option("yes", True)]),
@@ -86,20 +87,21 @@ def test_evaluate_instances_binary_generation():
86
87
 
87
88
 
88
89
  def test_evaluate_instances_multi_class():
89
- metric = ClassificationMetric()
90
+ # Note: no "a" because it would get filtered out by normalize_text()
91
+ metric = ClassificationMetric(delimiter=None)
90
92
 
91
93
  def _options(correct: str):
92
- return [_Option(text, text == correct) for text in ["a", "b", "c"]]
94
+ return [_Option(text, text == correct) for text in ["d", "b", "c"]]
93
95
 
94
96
  request_states = [
95
- _request_state("a", _options("a")),
96
- _request_state("a", _options("a")),
97
- _request_state("a", _options("a")),
98
- _request_state("a", _options("b")),
97
+ _request_state("d", _options("d")),
98
+ _request_state("d", _options("d")),
99
+ _request_state("d", _options("d")),
100
+ _request_state("d", _options("b")),
99
101
  _request_state("b", _options("b")),
100
102
  _request_state("b", _options("b")),
101
103
  _request_state("b", _options("c")),
102
- _request_state("c", _options("a")),
104
+ _request_state("c", _options("d")),
103
105
  _request_state("c", _options("c")),
104
106
  _request_state("invalid", _options("c")),
105
107
  ]
@@ -107,9 +109,42 @@ def test_evaluate_instances_multi_class():
107
109
  metric.evaluate_instances(request_states),
108
110
  _expected_stats(
109
111
  {
110
- "a": {"tp": 3, "fp": 1, "tn": 5, "fn": 1},
112
+ "d": {"tp": 3, "fp": 1, "tn": 5, "fn": 1},
111
113
  "b": {"tp": 2, "fp": 1, "tn": 6, "fn": 1},
112
114
  "c": {"tp": 1, "fp": 1, "tn": 6, "fn": 2},
113
115
  }
114
116
  ),
115
117
  )
118
+
119
+
120
+ def test_evaluate_instances_multilabel():
121
+ # Note: no "a" because it would get filtered out by normalize_text()
122
+ metric = ClassificationMetric(delimiter=",")
123
+
124
+ def _options(correct: List[str]):
125
+ return [_Option(text, text in correct) for text in ["d", "b", "c"]]
126
+
127
+ request_states = [
128
+ _request_state("d,b", _options(["d", "b"])),
129
+ _request_state("d,b", _options(["d", "c"])),
130
+ _request_state("d", _options(["d"])),
131
+ _request_state("c", _options(["b"])),
132
+ _request_state("b", _options(["b", "c"])),
133
+ _request_state("d,b", _options(["c"])),
134
+ _request_state("d,c", _options(["d"])),
135
+ _request_state("d,b,c", _options(["d", "b", "c"])),
136
+ _request_state("", []),
137
+ _request_state("n/a", []),
138
+ _request_state("invalid", _options(["c"])),
139
+ ]
140
+
141
+ assert_stats_equal(
142
+ metric.evaluate_instances(request_states),
143
+ _expected_stats(
144
+ {
145
+ "d": {"tp": 5, "fp": 1, "tn": 5, "fn": 0},
146
+ "b": {"tp": 3, "fp": 2, "tn": 5, "fn": 1},
147
+ "c": {"tp": 1, "fp": 2, "tn": 4, "fn": 4},
148
+ }
149
+ ),
150
+ )