crfm-helm 0.2.1__py3-none-any.whl → 0.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {crfm_helm-0.2.1.dist-info → crfm_helm-0.2.2.dist-info}/METADATA +10 -8
- {crfm_helm-0.2.1.dist-info → crfm_helm-0.2.2.dist-info}/RECORD +50 -37
- {crfm_helm-0.2.1.dist-info → crfm_helm-0.2.2.dist-info}/WHEEL +1 -1
- {crfm_helm-0.2.1.dist-info → crfm_helm-0.2.2.dist-info}/entry_points.txt +1 -0
- helm/benchmark/__init__.py +2 -0
- helm/benchmark/adaptation/adapter_spec.py +3 -0
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +20 -7
- helm/benchmark/contamination/__init__.py +0 -0
- helm/benchmark/metrics/classification_metrics.py +28 -23
- helm/benchmark/metrics/test_classification_metrics.py +44 -9
- helm/benchmark/presentation/create_plots.py +617 -0
- helm/benchmark/presentation/summarize.py +4 -2
- helm/benchmark/presentation/test_create_plots.py +32 -0
- helm/benchmark/run.py +23 -1
- helm/benchmark/run_expander.py +161 -47
- helm/benchmark/run_specs.py +84 -10
- helm/benchmark/runner.py +31 -3
- helm/benchmark/scenarios/copyright_scenario.py +1 -1
- helm/benchmark/scenarios/imdb_listdir.json +50014 -0
- helm/benchmark/scenarios/lex_glue_scenario.py +58 -17
- helm/benchmark/scenarios/lextreme_scenario.py +37 -25
- helm/benchmark/scenarios/opinions_qa_scenario.py +194 -0
- helm/benchmark/scenarios/scenario.py +5 -0
- helm/benchmark/scenarios/the_pile_scenario.py +1 -1
- helm/benchmark/static/benchmarking.css +14 -0
- helm/benchmark/static/benchmarking.js +43 -0
- helm/benchmark/static/index.html +2 -0
- helm/benchmark/static/json-urls.js +4 -0
- helm/benchmark/static/plot-captions.js +16 -0
- helm/benchmark/static/schema.yaml +66 -8
- helm/benchmark/window_services/cohere_window_service.py +20 -0
- helm/benchmark/window_services/flan_t5_window_service.py +29 -0
- helm/benchmark/window_services/huggingface_window_service.py +39 -0
- helm/benchmark/window_services/test_flan_t5_window_service.py +12 -0
- helm/benchmark/window_services/wider_ai21_window_service.py +13 -0
- helm/benchmark/window_services/window_service_factory.py +27 -6
- helm/common/general.py +12 -5
- helm/proxy/clients/aleph_alpha_client.py +47 -28
- helm/proxy/clients/auto_client.py +28 -24
- helm/proxy/clients/huggingface_client.py +30 -17
- helm/proxy/clients/huggingface_model_registry.py +111 -0
- helm/proxy/clients/huggingface_tokenizer.py +23 -7
- helm/proxy/clients/openai_client.py +60 -2
- helm/proxy/clients/test_huggingface_model_registry.py +57 -0
- helm/proxy/clients/together_client.py +17 -2
- helm/proxy/clients/yalm_tokenizer/voc_100b.sp +0 -0
- helm/proxy/clients/yalm_tokenizer/yalm_tokenizer.py +8 -2
- helm/proxy/models.py +82 -2
- {crfm_helm-0.2.1.dist-info → crfm_helm-0.2.2.dist-info}/LICENSE +0 -0
- {crfm_helm-0.2.1.dist-info → crfm_helm-0.2.2.dist-info}/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: crfm-helm
|
|
3
|
-
Version: 0.2.
|
|
3
|
+
Version: 0.2.2
|
|
4
4
|
Summary: Benchmark for language models
|
|
5
5
|
Home-page: https://github.com/stanford-crfm/helm
|
|
6
6
|
Author: Stanford CRFM
|
|
@@ -12,15 +12,11 @@ Classifier: Programming Language :: Python :: 3.8
|
|
|
12
12
|
Classifier: License :: OSI Approved :: Apache Software License
|
|
13
13
|
Requires-Python: ~=3.8
|
|
14
14
|
License-File: LICENSE
|
|
15
|
-
Requires-Dist: pytest (~=7.2.0)
|
|
16
|
-
Requires-Dist: black (~=22.10.0)
|
|
17
|
-
Requires-Dist: mypy (~=0.982)
|
|
18
|
-
Requires-Dist: pre-commit (~=2.20.0)
|
|
19
|
-
Requires-Dist: flake8 (~=5.0.4)
|
|
20
15
|
Requires-Dist: zstandard (~=0.18.0)
|
|
21
16
|
Requires-Dist: tqdm (~=4.64.1)
|
|
22
17
|
Requires-Dist: pyhocon (~=0.3.59)
|
|
23
18
|
Requires-Dist: dacite (~=1.6.0)
|
|
19
|
+
Requires-Dist: aleph-alpha-client (~=2.14.0)
|
|
24
20
|
Requires-Dist: bottle (~=0.12.23)
|
|
25
21
|
Requires-Dist: gunicorn (~=20.1.0)
|
|
26
22
|
Requires-Dist: Mako (~=1.2.3)
|
|
@@ -28,8 +24,9 @@ Requires-Dist: sqlitedict (~=1.7.0)
|
|
|
28
24
|
Requires-Dist: pymongo (~=4.2.0)
|
|
29
25
|
Requires-Dist: retrying (~=1.3.3)
|
|
30
26
|
Requires-Dist: websocket-client (~=1.3.2)
|
|
31
|
-
Requires-Dist: openai (~=0.
|
|
32
|
-
Requires-Dist: transformers (~=4.
|
|
27
|
+
Requires-Dist: openai (~=0.27.0)
|
|
28
|
+
Requires-Dist: transformers (~=4.26.1)
|
|
29
|
+
Requires-Dist: tokenizers (~=0.13.2)
|
|
33
30
|
Requires-Dist: icetk (~=0.0.4)
|
|
34
31
|
Requires-Dist: protobuf (~=3.20.2)
|
|
35
32
|
Requires-Dist: google-api-python-client (~=2.64.0)
|
|
@@ -41,6 +38,7 @@ Requires-Dist: sympy (~=1.11.1)
|
|
|
41
38
|
Requires-Dist: sentencepiece (~=0.1.97)
|
|
42
39
|
Requires-Dist: numba (~=0.56.4)
|
|
43
40
|
Requires-Dist: cattrs (~=22.2.0)
|
|
41
|
+
Requires-Dist: xlrd (~=2.0.1)
|
|
44
42
|
Requires-Dist: importlib-resources (~=5.10.0)
|
|
45
43
|
Requires-Dist: nltk (~=3.7)
|
|
46
44
|
Requires-Dist: scipy (~=1.9.1)
|
|
@@ -54,5 +52,9 @@ Requires-Dist: spacy (~=3.2.4)
|
|
|
54
52
|
Requires-Dist: summ-eval (~=0.892)
|
|
55
53
|
Requires-Dist: torch (~=1.12.1)
|
|
56
54
|
Requires-Dist: torchvision (~=0.13.1)
|
|
55
|
+
Requires-Dist: colorcet (~=3.0.1)
|
|
56
|
+
Requires-Dist: matplotlib (~=3.6.0)
|
|
57
|
+
Requires-Dist: numpy (~=1.23.3)
|
|
58
|
+
Requires-Dist: seaborn (~=0.11.0)
|
|
57
59
|
|
|
58
60
|
Benchmark for language models
|
|
@@ -1,15 +1,15 @@
|
|
|
1
1
|
helm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
-
helm/benchmark/__init__.py,sha256=
|
|
2
|
+
helm/benchmark/__init__.py,sha256=haJrJawd2zOTaxV_nkk6-V05vnePuHwCi0DytuJ0898,4450
|
|
3
3
|
helm/benchmark/data_preprocessor.py,sha256=aNdM-o2t4qkLIQHiQeWUFg03DjjJ8HTBIphYCK8pXVo,2173
|
|
4
4
|
helm/benchmark/executor.py,sha256=Vkmc4wmar2MRIavfiUOa2mu8Pp-zXsguYOevbjog4-4,3299
|
|
5
|
-
helm/benchmark/run.py,sha256=
|
|
6
|
-
helm/benchmark/run_expander.py,sha256=
|
|
7
|
-
helm/benchmark/run_specs.py,sha256=
|
|
8
|
-
helm/benchmark/runner.py,sha256=
|
|
5
|
+
helm/benchmark/run.py,sha256=AWa862BtEh5aOTjKZ9OkSv3be2ZrU4R1qiwJtRTQwfk,9402
|
|
6
|
+
helm/benchmark/run_expander.py,sha256=vnq-zRmuXLzgr3sS3XYaXJFarNC7-QKc0_DtPjwXq3Q,32952
|
|
7
|
+
helm/benchmark/run_specs.py,sha256=ssBJYMZVMF4XGk6lvCSlQJh6A-Pmh2_ndi_JAwgW0CQ,71441
|
|
8
|
+
helm/benchmark/runner.py,sha256=zYDe8UeB1LFmbpChmRdRqEIZo-X0xWMenOCp2NnZ9Ws,8802
|
|
9
9
|
helm/benchmark/server.py,sha256=HsuVsch1SPjQ4YyZi60kjr3JZeL82h8jgkxTUlfb130,1620
|
|
10
10
|
helm/benchmark/test_data_preprocessor.py,sha256=adT-pgVeWvmZXLUUehxH0C-lMhXhtdxsvYdr69o1BD4,2047
|
|
11
11
|
helm/benchmark/adaptation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
12
|
-
helm/benchmark/adaptation/adapter_spec.py,sha256=
|
|
12
|
+
helm/benchmark/adaptation/adapter_spec.py,sha256=YoxMyN4RJM8GG_DeZ-k0edyARZ69hHLkvOlvKCk-u2o,2811
|
|
13
13
|
helm/benchmark/adaptation/prompt.py,sha256=MATerIUIhFp_BMGvK7bLpNtWH6Oi4kknjBjOkr2bHv4,1948
|
|
14
14
|
helm/benchmark/adaptation/request_state.py,sha256=o3OpZbB0TJFiZ2Nmhvg3vWmByaUSYTffT_WnoNb7w68,2712
|
|
15
15
|
helm/benchmark/adaptation/scenario_state.py,sha256=ZflBuNgvN0JqUhshFcy0kTweO1WJs6j5UCaTxWTMe0o,1747
|
|
@@ -18,7 +18,7 @@ helm/benchmark/adaptation/adapters/adapter.py,sha256=8wK28jISxW8rUfXP-_-FfQJRRzc
|
|
|
18
18
|
helm/benchmark/adaptation/adapters/adapter_factory.py,sha256=N2n-xIoGt_DxlN0LT4GUgVvdoaqhyUU8rSWr_nyfb80,2318
|
|
19
19
|
helm/benchmark/adaptation/adapters/binary_ranking_adapter.py,sha256=3j24nFQuZE0Zl6DMAB4aYUpjieerdSMLsJbpMT9Nzfw,5646
|
|
20
20
|
helm/benchmark/adaptation/adapters/generation_adapter.py,sha256=-on4QAo8hhzJVgAnM6G8lFFqaoiSiVF-KxwwfHwE61A,1927
|
|
21
|
-
helm/benchmark/adaptation/adapters/in_context_learning_adapter.py,sha256=
|
|
21
|
+
helm/benchmark/adaptation/adapters/in_context_learning_adapter.py,sha256=q5K4Hag8LOpfpkeEzwIMPLNpBqMThcB1LXLGr_n_Xfo,13118
|
|
22
22
|
helm/benchmark/adaptation/adapters/language_modeling_adapter.py,sha256=vPo2EVgbMfzmwPPcljoXdDfBW3c80LLKaUhA-RefU2w,11967
|
|
23
23
|
helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py,sha256=pV14yvmH_mRQpeXF0teAxGpJcouSQViipr-aMkNE-AM,1711
|
|
24
24
|
helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py,sha256=4CALqc--PUaEl3cLzmjP9nFSuarCZMKBwrPQxde5TYM,3471
|
|
@@ -46,6 +46,7 @@ helm/benchmark/augmentations/space_perturbation.py,sha256=7OdpoibdizoPDBPpLc1ENy
|
|
|
46
46
|
helm/benchmark/augmentations/synonym_perturbation.py,sha256=2qFx7xparhEPd82tvs59HkAr1hwQWv7asWtmNCbcQrQ,4209
|
|
47
47
|
helm/benchmark/augmentations/test_perturbation.py,sha256=v_U5CmBpA5aXqg4EJUYZrSfGsNbZTwCP0inxz1XNGq0,9991
|
|
48
48
|
helm/benchmark/augmentations/typos_perturbation.py,sha256=nfF1Zw2REKZEnnyPVFWD87MP8L5ANbaZXeI2n70Sonw,2790
|
|
49
|
+
helm/benchmark/contamination/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
49
50
|
helm/benchmark/efficiency_data/inference_denoised_runtimes.json,sha256=ios_dt-_8wtXvkVAx0iI2zwCxqHvk3XKTx31qHPalsI,4203
|
|
50
51
|
helm/benchmark/efficiency_data/inference_idealized_runtimes.json,sha256=5w7reeZc0yc4cjH8kJGxQQSoe8yaRVX2SSlSrx0QWFQ,12348
|
|
51
52
|
helm/benchmark/efficiency_data/training_efficiency.json,sha256=aH2moiBLStOLVi8Ci2KTK5ZkWlTBLK-B3fRfNZwhoSg,9763
|
|
@@ -54,7 +55,7 @@ helm/benchmark/metrics/basic_metrics.py,sha256=xBCTgLVdiWVxdpB08MbWFc2nvwCJTgD1-
|
|
|
54
55
|
helm/benchmark/metrics/bbq_metrics.py,sha256=H44mwKXLJ0PXo-sVKgRHCEKjZGnCahDD6GOQMWpnbOQ,6061
|
|
55
56
|
helm/benchmark/metrics/bias_metrics.py,sha256=uVJFQvDSzvPR1ELu0FNYyExjRy2ThaJRCw8beEMDqJs,11309
|
|
56
57
|
helm/benchmark/metrics/bias_word_lists.py,sha256=mx5JjW3mHffXIqo4GcQN-zENUEttBqQnEjPTz3J3J_4,13909
|
|
57
|
-
helm/benchmark/metrics/classification_metrics.py,sha256=
|
|
58
|
+
helm/benchmark/metrics/classification_metrics.py,sha256=1q7gPnWRrx4QwE8T0m269vFJWg_bKfVx21a5spDBbjU,3701
|
|
58
59
|
helm/benchmark/metrics/code_metrics.py,sha256=uWdigk0QyEsfVHQzq9KxkOc-LROvcqWXeui42Mr0YF4,5119
|
|
59
60
|
helm/benchmark/metrics/code_metrics_helper.py,sha256=byyuI1lJgbIDPVJzywaBsam9zFMPPyn28g1grsK9xyA,22336
|
|
60
61
|
helm/benchmark/metrics/copyright_metrics.py,sha256=8sk85mLTasWIgHIXxOho0z_nQYyLqtzSWHSAwd5ayAQ,7560
|
|
@@ -68,7 +69,7 @@ helm/benchmark/metrics/ranking_metrics.py,sha256=b3qxTRnr62zz1Gr1dsVDYtdwB8WBIb-
|
|
|
68
69
|
helm/benchmark/metrics/statistic.py,sha256=9VM5JA1-M_iYCNziWm2qeDZaAQqPQ_ySdaSMcqAeYdM,3048
|
|
69
70
|
helm/benchmark/metrics/summarization_metrics.py,sha256=hHNWGYA1bNfgCg7o1RSiTo7E-SJujHhkKh9G204icoo,16083
|
|
70
71
|
helm/benchmark/metrics/test_bias_metrics.py,sha256=brut1rdnKNtTVJoe6qkllmJwZTFBZkLcyI_4qmqZ_vA,6264
|
|
71
|
-
helm/benchmark/metrics/test_classification_metrics.py,sha256=
|
|
72
|
+
helm/benchmark/metrics/test_classification_metrics.py,sha256=usW5ciUYu2ZUUqVjFk4NfZTGNIoBArwia_-8uGOvFpw,5475
|
|
72
73
|
helm/benchmark/metrics/test_metric.py,sha256=S7LGHNCHuhMk582eHylw1tOasUBEf_7F0T4u3tey7b4,757
|
|
73
74
|
helm/benchmark/metrics/test_numeracy_metrics.py,sha256=ls1ZIHDePKpHMoqAbf4HmJ1SIBjLFuLIzGbfg6OiZvM,4162
|
|
74
75
|
helm/benchmark/metrics/test_statistic.py,sha256=WQv9i8wSNTCzlw-L1wir0lmW0g3D4CM_ebpii7IB9Lw,406
|
|
@@ -89,12 +90,14 @@ helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py,sha256=gPqoYNI
|
|
|
89
90
|
helm/benchmark/metrics/tokens/token_cost_estimator.py,sha256=fTGUfhHV6yMwpTkCEMTGMxKO8jskqJz4sAtwXT6M_C8,425
|
|
90
91
|
helm/benchmark/presentation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
91
92
|
helm/benchmark/presentation/contamination.py,sha256=5wLwq266sCxT62MdzAXT9V-au6b07HaL44DLj_2qiSk,2788
|
|
93
|
+
helm/benchmark/presentation/create_plots.py,sha256=-YyrhEmfVOMnESJ8m2yk7RWAOYdZkVrLAt2K8XnpNF0,28442
|
|
92
94
|
helm/benchmark/presentation/run_display.py,sha256=HSvV71ZRshMIhHZHGtlbYfRxK9xx1GQgn6YmGPVncME,9892
|
|
93
95
|
helm/benchmark/presentation/run_entry.py,sha256=J1QgLOP99N7N4bs7nzXWxyU3pOd-a1j8xwL9ag1nP_Y,1158
|
|
94
96
|
helm/benchmark/presentation/schema.py,sha256=i1utCqiNkbTK9CcDcOFwi7e91KPaKkpFU07ZcBfWXTc,8753
|
|
95
|
-
helm/benchmark/presentation/summarize.py,sha256=
|
|
97
|
+
helm/benchmark/presentation/summarize.py,sha256=BUXog2m_UPbftyzFHx_U4mE2FrG56iv9mvcCdXoZVmI,45071
|
|
96
98
|
helm/benchmark/presentation/table.py,sha256=VzVMwsgP3kItAM6FPRUaTphzJ-ZjriiuFbWlO1rJUMU,2879
|
|
97
99
|
helm/benchmark/presentation/test_contamination.py,sha256=8mnzUzxUW9pXUOuLpU4BBBg0V7Mn1d1s4AQgwy6_kl4,459
|
|
100
|
+
helm/benchmark/presentation/test_create_plots.py,sha256=2q3v2Qdh_hBKCEX9toygXFLIryu1FlcLMt2PXprx7j8,1251
|
|
98
101
|
helm/benchmark/presentation/test_run_entry.py,sha256=M5z4dnVb7fM3PWrZWIZNlG8CT4KnDxjnEE4FBb1ZFNU,621
|
|
99
102
|
helm/benchmark/scenarios/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
100
103
|
helm/benchmark/scenarios/babi_qa_scenario.py,sha256=SkIJjqBuMLYxZR8l9epu9arBeirvJPtQsIBXv4bzkx4,5030
|
|
@@ -107,7 +110,7 @@ helm/benchmark/scenarios/civil_comments_scenario.py,sha256=vXq6KxyS5C0-tD8xUmkG5
|
|
|
107
110
|
helm/benchmark/scenarios/code_scenario.py,sha256=Q_TP_vWewkClvibPFHXpsOjR-CWexYgu5kl4OpfZXNc,11355
|
|
108
111
|
helm/benchmark/scenarios/code_scenario_helper.py,sha256=EbQNfHqhQXaMMPmYT2mG2dRjzYaI2FvcPb9j6NlNHDU,5853
|
|
109
112
|
helm/benchmark/scenarios/commonsense_scenario.py,sha256=9roSJS3iGSNgqxTbLI87xuZGB8IxJkbbtzr-ep0HUn0,10661
|
|
110
|
-
helm/benchmark/scenarios/copyright_scenario.py,sha256=
|
|
113
|
+
helm/benchmark/scenarios/copyright_scenario.py,sha256=APYQPC-esq3oM2qQxW6JNxa4pkv_yHDKfePjpvvi6nQ,3660
|
|
111
114
|
helm/benchmark/scenarios/covid_dialog_scenario.py,sha256=FmYIuRr81xD_d0iyRa5blPC8OTqpfv8XGTz5XXUOd2E,3958
|
|
112
115
|
helm/benchmark/scenarios/dialogue_scenarios.py,sha256=SPwo1iYiLbPwNtOgAVkTr-dO8FQLshmrfXdjPcayW5A,5616
|
|
113
116
|
helm/benchmark/scenarios/disinformation_scenario.py,sha256=Ff66LxBm8APuMziLfGvTM0WIatrAty5_q_8ObaLW5lo,8491
|
|
@@ -116,11 +119,12 @@ helm/benchmark/scenarios/entity_data_imputation_scenario.py,sha256=oChQzEptlf731
|
|
|
116
119
|
helm/benchmark/scenarios/entity_matching_scenario.py,sha256=gtrLSCw2JSNnBgFQFsUm4EcICsjVWtp9wsOdqcyBU4k,6863
|
|
117
120
|
helm/benchmark/scenarios/gsm_scenario.py,sha256=PmX0zutkGqnqGirWidUdk166cWv_23RtaTFcVQGBpzc,2619
|
|
118
121
|
helm/benchmark/scenarios/ice_scenario.py,sha256=smrpTOwtMDL-m40zfKfNz9btOGoINZNv3_2oBcLBMmk,16156
|
|
122
|
+
helm/benchmark/scenarios/imdb_listdir.json,sha256=eczxp9gslYYwx5XR86ATnZorIxuujFMDTfzR4h5NCpo,1015402
|
|
119
123
|
helm/benchmark/scenarios/imdb_scenario.py,sha256=VTD0Ur6ATyY9NWxcnkGzn9Iw5vl4d94o0FbFm61ZZTA,6057
|
|
120
124
|
helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py,sha256=wYaivSqqYYZPjPHTKaS6D7j960dcLIVTfmuZ8awd1Zs,2192
|
|
121
125
|
helm/benchmark/scenarios/legal_support_scenario.py,sha256=K9HfTyHJnnLFvGKNwYQayu7JV4uyNT5wH48wc9ixRa0,3912
|
|
122
|
-
helm/benchmark/scenarios/lex_glue_scenario.py,sha256=
|
|
123
|
-
helm/benchmark/scenarios/lextreme_scenario.py,sha256=
|
|
126
|
+
helm/benchmark/scenarios/lex_glue_scenario.py,sha256=r89KevvM1Kifu3ZkUIXAV8jXktclNtL0-JL9T6qOx_Q,10224
|
|
127
|
+
helm/benchmark/scenarios/lextreme_scenario.py,sha256=tejeKE08YX5MYFXrJRY48rIzO5V5fdwHDWEzg883K2E,20300
|
|
124
128
|
helm/benchmark/scenarios/lsat_qa_scenario.py,sha256=2-GrPB8whoBFcQ5608jrwlAcRJgpkT1P2UehcR6-EYY,5977
|
|
125
129
|
helm/benchmark/scenarios/math_scenario.py,sha256=5PespNtseDOnPgAwtdP0vMkXz1CaJM0BkJsWdeG5gUM,13825
|
|
126
130
|
helm/benchmark/scenarios/me_q_sum_scenario.py,sha256=ZIn7tlBC4baV8CYcU_-mYe2RbYaQ-8dX1Ca_hOvZTfI,3988
|
|
@@ -134,32 +138,34 @@ helm/benchmark/scenarios/narrativeqa_scenario.py,sha256=r5TwYkZH_YaCws6BSnjbiDQk
|
|
|
134
138
|
helm/benchmark/scenarios/natural_qa_scenario.py,sha256=nuL5Qlh26xq7Q_lvzK2sKVN7eNYh7F5SjbIjahKaMNg,12527
|
|
135
139
|
helm/benchmark/scenarios/newsqa_scenario.py,sha256=vMuIZyYxufH2AqhDoIZzzllfq8ScJIhSDH6lM3IUxGM,7242
|
|
136
140
|
helm/benchmark/scenarios/numeracy_scenario.py,sha256=Iwtypyb0249zKYyV6p4YUX4bYke6uIyL6R3aopiGUb8,30552
|
|
141
|
+
helm/benchmark/scenarios/opinions_qa_scenario.py,sha256=dtIYKL9ZiccX_F3-5OrnHXJdNBBTxCoTHc2Kc-XX79E,7380
|
|
137
142
|
helm/benchmark/scenarios/pubmed_qa_scenario.py,sha256=cpMmQWwCDXIYO0btGLhevMT-Mhs9-5Es9cDvQYkIlL0,7493
|
|
138
143
|
helm/benchmark/scenarios/quac_scenario.py,sha256=46nqmeVgkWu6jDGCHl-KHu351bmJj7jx_1p5kPwcOjc,6615
|
|
139
144
|
helm/benchmark/scenarios/raft_scenario.py,sha256=RzewlMVkHJ2XbZ9_9FzBvbdV__BuBRgtX2HyhCnmH1o,4500
|
|
140
145
|
helm/benchmark/scenarios/real_toxicity_prompts_scenario.py,sha256=8DQD7KkilAfxwzHUwCPrPPHeYEz_dEOWSls8yZo15do,2387
|
|
141
|
-
helm/benchmark/scenarios/scenario.py,sha256=
|
|
146
|
+
helm/benchmark/scenarios/scenario.py,sha256=bdRcv-YoLkxjlpNcq4MXiu8HQgjByHkkLWOdih4ahsM,7365
|
|
142
147
|
helm/benchmark/scenarios/simple_scenarios.py,sha256=rcHzukhjBgvNRqkjcg0cms_zWtAsLPk0xiFN9I25_hI,1947
|
|
143
148
|
helm/benchmark/scenarios/summarization_scenario.py,sha256=fKeRSkXrH26WyfeIhn43_fZxnAO4bIX1Xh5HoKcjOQM,6550
|
|
144
149
|
helm/benchmark/scenarios/synthetic_efficiency_scenario.py,sha256=dtMMeULjw6pcobBRp0r6f9N4VCUKamx1Jy-6xPu85q0,3083
|
|
145
150
|
helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py,sha256=YvEgO5qVZ4hLpnvjer4CG0Ct1upssZRjZWxnNi1ZUtY,16308
|
|
146
151
|
helm/benchmark/scenarios/synthetic_reasoning_scenario.py,sha256=Pm63shscbGuigg4zWfcw3rOIPP8LgxBCnpcQKAA_CX0,8327
|
|
147
152
|
helm/benchmark/scenarios/test_scenario.py,sha256=o8w8ElDPF-RzeCmecwyvie_nRMYj01b38BufXS-igqY,1612
|
|
148
|
-
helm/benchmark/scenarios/the_pile_scenario.py,sha256=
|
|
153
|
+
helm/benchmark/scenarios/the_pile_scenario.py,sha256=aCcjZp0wabu8lpPVNAdCr1x6m_3QvgKe7dIGS2qgGm4,4981
|
|
149
154
|
helm/benchmark/scenarios/truthful_qa_scenario.py,sha256=zQgv_qUU5h5ODoQE06rpJa6O_8FwF695cIie3PG7bx8,5969
|
|
150
155
|
helm/benchmark/scenarios/twitter_aae_scenario.py,sha256=_qWotvPagNj3ATnyaww3U1XZtxN-wgueUf34fFmKXQI,2083
|
|
151
156
|
helm/benchmark/scenarios/wikifact_scenario.py,sha256=pucEuLYz5N9qIobnEbJnRUnK6PrkWFdkl7yPuCJj3SE,5778
|
|
152
157
|
helm/benchmark/scenarios/wikitext_103_scenario.py,sha256=rXVbUzOZi4eWM-_HP1gzY5SBmMwOX1vk12WrLkR3NHo,3074
|
|
153
158
|
helm/benchmark/scenarios/wmt_14_scenario.py,sha256=u24E_w0AOXpl3PzEFLmiBcl8qyJEy-1Yc-i4YHgU99M,4356
|
|
154
|
-
helm/benchmark/static/benchmarking.css,sha256=
|
|
155
|
-
helm/benchmark/static/benchmarking.js,sha256=
|
|
159
|
+
helm/benchmark/static/benchmarking.css,sha256=DGC4Huh4tVD2o9wEeUf3YOc3MYcq2fmJQXvhTjVDumE,2057
|
|
160
|
+
helm/benchmark/static/benchmarking.js,sha256=qiXAY_9fiWZ4ydzhBQAUhylzxNCoIN8ciLlEFMg33uE,49107
|
|
156
161
|
helm/benchmark/static/contamination.yaml,sha256=LbISh56ORvfkkWptm7ZWmlPvWxtls6pBF1TbGiWD7hk,3096
|
|
157
162
|
helm/benchmark/static/general.js,sha256=L3S4CBUED0k7RsjLHCeWjO29ZMFJckZgNTAYAARzaEg,3029
|
|
158
|
-
helm/benchmark/static/index.html,sha256=
|
|
163
|
+
helm/benchmark/static/index.html,sha256=yzf_VEGW15CPZOW9a2Z6opoQ0Gg2YQCtj182niUb0fk,3130
|
|
159
164
|
helm/benchmark/static/info-icon.png,sha256=P-PW3Ek3NGiRAW5BXOjJRPBfMVqprjAqtQheGWu7zNI,3428
|
|
160
165
|
helm/benchmark/static/json-urls-root.js,sha256=G3qenwLgBojh3ukzp_gyMUaZja83ZFqvT1WQ_Rg11BU,98
|
|
161
|
-
helm/benchmark/static/json-urls.js,sha256=
|
|
162
|
-
helm/benchmark/static/
|
|
166
|
+
helm/benchmark/static/json-urls.js,sha256=wvsG2Lrz2XArwwMOl_tGXL9y4mjqzjod4gcqlvVCiQA,1750
|
|
167
|
+
helm/benchmark/static/plot-captions.js,sha256=gTBn-IPPD4BkzryVYj3KkGqLhWqWvBbvufeJaDygQxk,3010
|
|
168
|
+
helm/benchmark/static/schema.yaml,sha256=TQdzlpOPvTwELiU0w0HBAZqKQBmjj1LOkEGM85oa8e4,95657
|
|
163
169
|
helm/benchmark/static/utils.js,sha256=H2PKYjuXZ392DlALCPJ1XRwGxBDRFjL9eTFiTd4vBU8,7338
|
|
164
170
|
helm/benchmark/static/images/crfm-logo.png,sha256=dDkauL_wJR_Luu7L7pltphS3a9HSLjDkpVLa6C9vcA4,62712
|
|
165
171
|
helm/benchmark/static/images/helm-logo-simple.png,sha256=LtVAC4OgcWgMAob53rTrf7cRDu-O0z85ZOGGj9wR9hw,86133
|
|
@@ -184,11 +190,13 @@ helm/benchmark/window_services/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5
|
|
|
184
190
|
helm/benchmark/window_services/ai21_window_service.py,sha256=Mo0Zzj2a9hDiUg6hTuJWb3ABhBUPlOYS-kHhaE-pHUE,12672
|
|
185
191
|
helm/benchmark/window_services/anthropic_window_service.py,sha256=mA9aWKfc-dbSnpt37k6zDFdNNMfbevAoleCzSaar-uE,799
|
|
186
192
|
helm/benchmark/window_services/bloom_window_service.py,sha256=o7MVedt6khdoj8zikLDuVraEzuoBZk7j4Fzjsas0sD4,1023
|
|
187
|
-
helm/benchmark/window_services/cohere_window_service.py,sha256=
|
|
193
|
+
helm/benchmark/window_services/cohere_window_service.py,sha256=3SJT97CaxNxtUNS9_qKvKMVCA6lvNavS-xo6jAsxPbM,7070
|
|
188
194
|
helm/benchmark/window_services/encoder_decoder_window_service.py,sha256=Dg_rD0RtKIALtpvT4Wi4Am3zgFcIvgxfzdglw5fbbTU,2478
|
|
195
|
+
helm/benchmark/window_services/flan_t5_window_service.py,sha256=39IZX89_tay3bpSGVoWDoekmhW-RUNATQuT-bNyFRTs,915
|
|
189
196
|
helm/benchmark/window_services/gpt2_window_service.py,sha256=d-ys9FHoVI1u1GZOH734JPAOc8W6QZb1N3ZjooKmwz8,990
|
|
190
197
|
helm/benchmark/window_services/gptj_window_service.py,sha256=C2OEl-3ZatwxYVoaQgyvAZ5SBS2TK-2PnM02zUNjhiA,1103
|
|
191
198
|
helm/benchmark/window_services/gptneox_window_service.py,sha256=FD0NDlBxDu1fZ8vlaTsOBh2IKYfrMV73qNGetkrp1P4,944
|
|
199
|
+
helm/benchmark/window_services/huggingface_window_service.py,sha256=tDbhjn81Aw3ZSYMKh4aQU3e8JX1IVYCu_2gPOLaIWD8,1440
|
|
192
200
|
helm/benchmark/window_services/ice_window_service.py,sha256=5z52rP-xAF_jckIPoogyoNFW6FQXqaq3SHybCaBuRn0,2005
|
|
193
201
|
helm/benchmark/window_services/local_window_service.py,sha256=wgNBB-p9Zk-uFLRXYNxrz16_DhWyQ8x9sltSLBhzUh4,4247
|
|
194
202
|
helm/benchmark/window_services/luminous_window_service.py,sha256=0w-nyfXXDwnIDBzU3Y84LZPnQyJGiSbvf5Y_MQh-hCg,1791
|
|
@@ -202,6 +210,7 @@ helm/benchmark/window_services/test_ai21_window_service.py,sha256=U_n2mQ5GqD8oYv
|
|
|
202
210
|
helm/benchmark/window_services/test_bloom_window_service.py,sha256=xeYpsvdR8Ug31BAb4a-PU4Sc0oihpCIMJ5OzTfgUhM0,4221
|
|
203
211
|
helm/benchmark/window_services/test_cohere_window_service.py,sha256=6WpIiuEyfgTZS3BIeAGOR8AAr2djpVIfGbItQRI19Ck,3205
|
|
204
212
|
helm/benchmark/window_services/test_cohere_window_service_utils.py,sha256=sf25f9MeXzoqsbDzZ7d7le13hm8RkDe54nhLtKF2pqo,158150
|
|
213
|
+
helm/benchmark/window_services/test_flan_t5_window_service.py,sha256=xv_EXbiRklveJPQtThYCSYF7qBYwjL7K4wH3Xu5z2Fg,591
|
|
205
214
|
helm/benchmark/window_services/test_gpt2_window_service.py,sha256=3k25pLa_z__g4yoQL40DEXj-T4dGtrgif13N2NXs59U,2568
|
|
206
215
|
helm/benchmark/window_services/test_gptj_window_service.py,sha256=sxsTpozKv9N-wZXtGl1prkQr9Md_q-tnCjO9zt226Co,2267
|
|
207
216
|
helm/benchmark/window_services/test_gptneox_window_service.py,sha256=MCYHZIoulJf_WCx6de7rWB3nqku6wCyAokA-SWIPEks,4140
|
|
@@ -216,15 +225,16 @@ helm/benchmark/window_services/test_utils.py,sha256=1k2TlPdDIRjum669jpH3O7UOqm4G
|
|
|
216
225
|
helm/benchmark/window_services/test_yalm_window_service.py,sha256=NdunSxq-qDzfzYMBYZ-0my6LaU2qUxtm7Ii0c4fyKnY,4273
|
|
217
226
|
helm/benchmark/window_services/tokenizer_service.py,sha256=RNznJBAxcCUMCurb7mbraZULx_ZtB0G7IxbrnUe0Urk,865
|
|
218
227
|
helm/benchmark/window_services/ul2_window_service.py,sha256=R_VEzOb59zQE9mmbTLunQeIvLAtK3-97h-B2_oc0Uxs,1021
|
|
228
|
+
helm/benchmark/window_services/wider_ai21_window_service.py,sha256=VZ6EERN48FSYsmJ_aiwI30SEbobLt27c1QqL29Zg_8M,414
|
|
219
229
|
helm/benchmark/window_services/wider_openai_window_service.py,sha256=cpm4mDEGTY-cmrECcDCL0flONBdh4g40uWNLI-v46BU,539
|
|
220
230
|
helm/benchmark/window_services/window_service.py,sha256=aV4YnbXl7T23runB8xmSWRwV7YtliUlXrveEejOMJ1Y,2885
|
|
221
|
-
helm/benchmark/window_services/window_service_factory.py,sha256=
|
|
231
|
+
helm/benchmark/window_services/window_service_factory.py,sha256=UDrXRUXGwBMEiO7Cw_nNbr0XM6mzzXrXiyJsvCmW9wg,5845
|
|
222
232
|
helm/benchmark/window_services/yalm_window_service.py,sha256=g7NeXvlRq3FXf2HwRStBMTkbWDxTdEITkrVRe-fv3mg,1805
|
|
223
233
|
helm/common/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
224
234
|
helm/common/authentication.py,sha256=RlMx29_TSrfU7ujE7dJkxmFub5EqLj2NswV5lAVFFDk,179
|
|
225
235
|
helm/common/cache.py,sha256=ustgsRHX0W3zoLPN05W3mFl9m9JYp9Ppq5cjMbdmm6Y,13116
|
|
226
236
|
helm/common/codec.py,sha256=zm8MP9Aqfh64D2HMZiCPEMoPkkiJxEzvzmuupGvkRh0,5499
|
|
227
|
-
helm/common/general.py,sha256=
|
|
237
|
+
helm/common/general.py,sha256=7vFw10h_hTrxUCfxE0LH13hp8Lunp77lbVmucgoeq2Y,10181
|
|
228
238
|
helm/common/hierarchical_logger.py,sha256=EnKLnfbQftca08EJfjGEQb4tcnCKbx-JtwLnoCnhMQs,2908
|
|
229
239
|
helm/common/object_spec.py,sha256=COMd4RpYgfulW940a5M_npbsfRBvLkmhjfwDIq4Gpqs,1833
|
|
230
240
|
helm/common/perspective_api_request.py,sha256=WAVwtajNVmi5XJNsPcorGEAVrqkpPSk-Kd3b0hJghbA,2427
|
|
@@ -237,7 +247,7 @@ helm/proxy/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
|
237
247
|
helm/proxy/accounts.py,sha256=xq-zVggvueB4D5QK58mFWGPxZe-hnIUAT4D341yd0ac,13503
|
|
238
248
|
helm/proxy/cli.py,sha256=2SOxIF55PDjzXXcDOYRT8m-oyQM_2VyZheKRC3fXDVw,8094
|
|
239
249
|
helm/proxy/example_queries.py,sha256=p1wH-tp1pRUslkAwaJYrrG5aDfmFWK3KYn4M6WQfPqQ,4120
|
|
240
|
-
helm/proxy/models.py,sha256=
|
|
250
|
+
helm/proxy/models.py,sha256=sfdp7DasJXbun-VMZz5p7iD6uhoUW1mFsy64BWfDoV4,29681
|
|
241
251
|
helm/proxy/query.py,sha256=eftbiUICMh8QIHVs-7cLtv_rDXKeKdRPmwjLMu0TDxQ,645
|
|
242
252
|
helm/proxy/retry.py,sha256=GLLDW1iGCwHfgTle8YK7ZB3vV-7XqsHcqeruKoVdsxE,1953
|
|
243
253
|
helm/proxy/server.py,sha256=uBispGXfn39s_Pskd9Xjud0rijTjqXtSKU_2YvE6zGE,7356
|
|
@@ -245,31 +255,34 @@ helm/proxy/test_models.py,sha256=hWeDcBw1GkPvyJUd-ABxRVe1FhSUfz8bzyrKYdsqmyY,726
|
|
|
245
255
|
helm/proxy/test_retry.py,sha256=8h398auzjW9VnlTJWllxR-bdpub-XFp8EN8LWDEnEHM,1049
|
|
246
256
|
helm/proxy/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
247
257
|
helm/proxy/clients/ai21_client.py,sha256=c4D64-1mLaFcGsqOb5CYkaXTmAWEJYk9cIjtr6DbgZQ,7770
|
|
248
|
-
helm/proxy/clients/aleph_alpha_client.py,sha256=
|
|
258
|
+
helm/proxy/clients/aleph_alpha_client.py,sha256=VV7dbgh7sYqoSWfNkxxAiQ7i3yPW33rBPnBig9EXL10,7707
|
|
249
259
|
helm/proxy/clients/anthropic_client.py,sha256=vrfc8lS9bxQbUxMxbElV2z5cMDq-JD6yFDTS2cJdFO0,15526
|
|
250
|
-
helm/proxy/clients/auto_client.py,sha256=
|
|
260
|
+
helm/proxy/clients/auto_client.py,sha256=oktJQ4rCntp7id4vj-d3x_EA3RWGNZyXUMFwzRse--c,11063
|
|
251
261
|
helm/proxy/clients/chat_gpt_client.py,sha256=nG3opHbnzX50r9Ialh3RaRErOZo6k4Q7gVzRHeGQgj8,5312
|
|
252
262
|
helm/proxy/clients/client.py,sha256=bh6FvYFjw6MoHp5n7-KN1asXrIrOC-jfYsg3aW4xMgo,4570
|
|
253
263
|
helm/proxy/clients/cohere_client.py,sha256=KF21m7qUjuhrpEEQv68FNeX0rsWSmxZgw52Oa7CZ5pI,11362
|
|
254
264
|
helm/proxy/clients/google_client.py,sha256=sGGxDWD22c9a9KMzLAFL3vAEDHxp5jSY2W3RDpVDIak,3334
|
|
255
265
|
helm/proxy/clients/goose_ai_client.py,sha256=2tqJK_AhD2-ScXtOTdt9S9khzVjal5pm38BJWiFhwq8,4217
|
|
256
|
-
helm/proxy/clients/huggingface_client.py,sha256=
|
|
257
|
-
helm/proxy/clients/
|
|
266
|
+
helm/proxy/clients/huggingface_client.py,sha256=LypY3YfyoaGFH83UkYwDARGuoN2JOUk2S-nEaJ6GemI,11813
|
|
267
|
+
helm/proxy/clients/huggingface_model_registry.py,sha256=0WHyWPxxBI4KtTs2Yt6-Cw16FC4XBEe6yqUc0-YSn1Y,3891
|
|
268
|
+
helm/proxy/clients/huggingface_tokenizer.py,sha256=ujtsBupMMrE9efds2205c8NiPTcxHX8XM0UoV9spLK0,4591
|
|
258
269
|
helm/proxy/clients/ice_tokenizer_client.py,sha256=Ui8YhAXoY1Q0vC3icoeFs6X9xAcESF6Tl2EGERGWVGU,2325
|
|
259
270
|
helm/proxy/clients/microsoft_client.py,sha256=-VC8IrgrpSp1_FvRSI_8MSxhNp5I6dMc4qWSHc4Oulg,8237
|
|
260
|
-
helm/proxy/clients/openai_client.py,sha256=
|
|
271
|
+
helm/proxy/clients/openai_client.py,sha256=frW9fOjYWkRXdfzE88ppxaLhVl9pCnXfheUqeANW6QQ,9415
|
|
261
272
|
helm/proxy/clients/perspective_api_client.py,sha256=-L8IwokuktWPoOu7nXwsfoab_U1QRGCt8xT1SrcGfYE,5491
|
|
262
273
|
helm/proxy/clients/simple_client.py,sha256=GXHTCRB58XAxnUVqgpynidc7h6kaDBOP7TedVHrOpD4,2915
|
|
263
274
|
helm/proxy/clients/test_client.py,sha256=bvkFob_Yoy8bALrVeQ0h757g9RU687JYI0g3AISPFQ8,1268
|
|
264
275
|
helm/proxy/clients/test_huggingface_client.py,sha256=n-6D-RXqwQyxPxCLCSqHxqfK5JA-PdP5ffP17XwTe2I,3520
|
|
276
|
+
helm/proxy/clients/test_huggingface_model_registry.py,sha256=zMboFlwMtDEV7hkd9SZFuItye-vkzz3CE5mWQrw--W4,2554
|
|
265
277
|
helm/proxy/clients/test_huggingface_tokenizer.py,sha256=KmlAXezQ6R7DAEpV85_JRdTRrOJoJxfmtylqybWn5VA,2189
|
|
266
278
|
helm/proxy/clients/test_ice_tokenizer_client.py,sha256=Ugmn5a7QdAPEAbLtreLS5-sji8yrzxy5mhFPAl3rOuI,2404
|
|
267
279
|
helm/proxy/clients/test_yalm_tokenizer_client.py,sha256=tnrYl7T1DcZ9FN09nBWV1gesjWQ3osiUpsGHSm_IypQ,2336
|
|
268
|
-
helm/proxy/clients/together_client.py,sha256=
|
|
280
|
+
helm/proxy/clients/together_client.py,sha256=epyiYElD0BfAZgUSu4zZKC5Oe8yIVVyJn2RtTwjPMzM,6334
|
|
269
281
|
helm/proxy/clients/yalm_tokenizer_client.py,sha256=cpBoc8eHQoBGQguZsDaVnGWLdZnPgkHjqLSO0B94O0U,2420
|
|
270
282
|
helm/proxy/clients/yalm_tokenizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
271
283
|
helm/proxy/clients/yalm_tokenizer/test_yalm_tokenizer.py,sha256=W9p5QNn1GSm-y85yVEQe_82zn5CVK_vR6jvhk7JTs_k,869
|
|
272
|
-
helm/proxy/clients/yalm_tokenizer/
|
|
284
|
+
helm/proxy/clients/yalm_tokenizer/voc_100b.sp,sha256=LmPD0_OIOXi8dWuNjXUYOSPhf8kPp2xhvK-g3bXcwrQ,2815034
|
|
285
|
+
helm/proxy/clients/yalm_tokenizer/yalm_tokenizer.py,sha256=7Y4_nCZptFWzifCJ5aPmM3_OOxhtomIAQVjpJGV1D8g,5954
|
|
273
286
|
helm/proxy/services/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
274
287
|
helm/proxy/services/remote_service.py,sha256=xKS-0P-EqKTPn7odTXDoQPjn9FliQRLaMFUnCEsUmCU,6965
|
|
275
288
|
helm/proxy/services/server_service.py,sha256=gps7PwXqCi8b0yGYC0nQwFdKbfxCriSHt5CD1N1kkJs,5696
|
|
@@ -286,9 +299,9 @@ helm/proxy/token_counters/openai_token_counter.py,sha256=gPo_VrkEH07xmprzdfIhmJ_
|
|
|
286
299
|
helm/proxy/token_counters/test_ai21_token_counter.py,sha256=42J1fCi20kQUwAD18bIa6h9TaP7KZnlgF-mLbvKURro,5508
|
|
287
300
|
helm/proxy/token_counters/test_openai_token_counter.py,sha256=EovaVCZSr9moITZ9-AKiv_YM-D3OUsUDs4iQhEvpazQ,4823
|
|
288
301
|
helm/proxy/token_counters/token_counter.py,sha256=x8KyTR82EedgCQUuneQiVq9AiU1B3_CHPmKPNumClHc,429
|
|
289
|
-
crfm_helm-0.2.
|
|
290
|
-
crfm_helm-0.2.
|
|
291
|
-
crfm_helm-0.2.
|
|
292
|
-
crfm_helm-0.2.
|
|
293
|
-
crfm_helm-0.2.
|
|
294
|
-
crfm_helm-0.2.
|
|
302
|
+
crfm_helm-0.2.2.dist-info/LICENSE,sha256=bJiay7Nn5SHQ2n_4ZIT3AE0W1RGq4O7pxOApgBsaT64,11349
|
|
303
|
+
crfm_helm-0.2.2.dist-info/METADATA,sha256=_OlkKmj1P7vaZvlpvOnNnzzm3w1IEW6de75SK7TmuPw,2066
|
|
304
|
+
crfm_helm-0.2.2.dist-info/WHEEL,sha256=pkctZYzUS4AYVn6dJ-7367OJZivF2e8RA9b_ZBjif18,92
|
|
305
|
+
crfm_helm-0.2.2.dist-info/entry_points.txt,sha256=AvH9soAH3uey9xffisWewd0yrmPWGASC036jHd1SFyg,300
|
|
306
|
+
crfm_helm-0.2.2.dist-info/top_level.txt,sha256=s9yl-XmuTId6n_W_xRjCS99MHTwPXOlkKxmTr8xZUNY,5
|
|
307
|
+
crfm_helm-0.2.2.dist-info/RECORD,,
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
[console_scripts]
|
|
2
2
|
crfm-proxy-cli = helm.proxy.cli:main
|
|
3
3
|
crfm-proxy-server = helm.proxy.server:main
|
|
4
|
+
helm-create-plots = helm.benchmark.presentation.create_plots:main
|
|
4
5
|
helm-run = helm.benchmark.run:main
|
|
5
6
|
helm-server = helm.benchmark.server:main
|
|
6
7
|
helm-summarize = helm.benchmark.presentation.summarize:main
|
helm/benchmark/__init__.py
CHANGED
|
@@ -42,6 +42,8 @@ from .scenarios import legal_support_scenario # noqa
|
|
|
42
42
|
from .scenarios import entity_matching_scenario # noqa
|
|
43
43
|
from .scenarios import entity_data_imputation_scenario # noqa
|
|
44
44
|
from .scenarios import big_bench_scenario # noqa
|
|
45
|
+
from .scenarios import opinions_qa_scenario # noqa
|
|
46
|
+
|
|
45
47
|
|
|
46
48
|
# Biomedical
|
|
47
49
|
from .scenarios import covid_dialog_scenario # noqa
|
|
@@ -68,6 +68,9 @@ class AdapterSpec:
|
|
|
68
68
|
# set of training instances. Used to compute error bars.
|
|
69
69
|
num_train_trials: int = 1
|
|
70
70
|
|
|
71
|
+
# If true, randomly sample N training examples; if false, select N consecutive training examples
|
|
72
|
+
sample_train: bool = True
|
|
73
|
+
|
|
71
74
|
# Decoding parameters (inherited by `Request`)
|
|
72
75
|
|
|
73
76
|
# Model to make the request to (need to fill in)
|
|
@@ -23,7 +23,7 @@ class InContextLearningAdapter(Adapter, ABC):
|
|
|
23
23
|
@htrack(None)
|
|
24
24
|
def adapt(self, instances: List[Instance], parallelism: int) -> ScenarioState:
|
|
25
25
|
"""
|
|
26
|
-
Takes a
|
|
26
|
+
Takes a list of `Instance`s and builds a list of corresponding `RequestState`s.
|
|
27
27
|
The reason we don't do this per eval instance is that we create a common set of
|
|
28
28
|
training instances which is shared across all eval instances.
|
|
29
29
|
"""
|
|
@@ -65,7 +65,9 @@ class InContextLearningAdapter(Adapter, ABC):
|
|
|
65
65
|
parallelism: int,
|
|
66
66
|
) -> List[RequestState]:
|
|
67
67
|
self.train_trial_index: int = train_trial_index
|
|
68
|
-
self.train_instances: List[Instance] = self.sample_examples(
|
|
68
|
+
self.train_instances: List[Instance] = self.sample_examples(
|
|
69
|
+
all_train_instances, seed=train_trial_index, sample_train=self.adapter_spec.sample_train
|
|
70
|
+
)
|
|
69
71
|
hlog(f"Sampled {len(self.train_instances)} examples for trial #{self.train_trial_index}.")
|
|
70
72
|
|
|
71
73
|
# Generate request_states
|
|
@@ -93,7 +95,9 @@ class InContextLearningAdapter(Adapter, ABC):
|
|
|
93
95
|
|
|
94
96
|
return [request_state for result in results for request_state in result]
|
|
95
97
|
|
|
96
|
-
def sample_examples(
|
|
98
|
+
def sample_examples(
|
|
99
|
+
self, all_train_instances: List[Instance], seed: int, sample_train: bool = True
|
|
100
|
+
) -> List[Instance]:
|
|
97
101
|
"""
|
|
98
102
|
Sample a random set of train instances to use as examples by following the steps below:
|
|
99
103
|
1. Sort the class labels (i.e., correct References) by the number of Instances that belong to the
|
|
@@ -121,9 +125,14 @@ class InContextLearningAdapter(Adapter, ABC):
|
|
|
121
125
|
random.seed(seed)
|
|
122
126
|
num_instances_to_sample: int = min(len(all_train_instances), self.adapter_spec.max_train_instances)
|
|
123
127
|
|
|
128
|
+
examples: List[Instance] = []
|
|
129
|
+
if not sample_train:
|
|
130
|
+
# Select sequentially from the train set
|
|
131
|
+
examples = all_train_instances[num_instances_to_sample * seed : num_instances_to_sample * (seed + 1)]
|
|
132
|
+
return examples
|
|
133
|
+
|
|
124
134
|
unlabeled_instances: List[Instance] = []
|
|
125
135
|
label_to_instances: Dict[str, List[Instance]] = defaultdict(list)
|
|
126
|
-
|
|
127
136
|
for instance in all_train_instances:
|
|
128
137
|
if instance.first_correct_reference:
|
|
129
138
|
label_to_instances[instance.first_correct_reference.output.text].append(instance)
|
|
@@ -145,7 +154,6 @@ class InContextLearningAdapter(Adapter, ABC):
|
|
|
145
154
|
sorted_labels.extend(labels)
|
|
146
155
|
|
|
147
156
|
labels_iterable = cycle(sorted_labels)
|
|
148
|
-
examples: List[Instance] = []
|
|
149
157
|
while num_instances_to_sample > 0:
|
|
150
158
|
next_label: Optional[str] = next(labels_iterable, None)
|
|
151
159
|
if not next_label:
|
|
@@ -218,10 +226,15 @@ class InContextLearningAdapter(Adapter, ABC):
|
|
|
218
226
|
|
|
219
227
|
# References (optionally) and output
|
|
220
228
|
output: str
|
|
229
|
+
|
|
230
|
+
delimiter = ","
|
|
221
231
|
if reference_index is None:
|
|
222
232
|
# Put only the correct reference as the output
|
|
223
|
-
|
|
224
|
-
|
|
233
|
+
correct_references: List[Reference] = instance.all_correct_references
|
|
234
|
+
if not correct_references:
|
|
235
|
+
output = "n/a"
|
|
236
|
+
else:
|
|
237
|
+
output = delimiter.join([correct_reference.output.text for correct_reference in correct_references])
|
|
225
238
|
else:
|
|
226
239
|
reference = instance.references[reference_index]
|
|
227
240
|
output = reference.output.text
|
|
File without changes
|
|
@@ -1,6 +1,7 @@
|
|
|
1
|
-
from typing import List
|
|
1
|
+
from typing import List, Optional
|
|
2
2
|
|
|
3
3
|
from sklearn.metrics import f1_score
|
|
4
|
+
from sklearn.preprocessing import MultiLabelBinarizer
|
|
4
5
|
|
|
5
6
|
from helm.benchmark.adaptation.request_state import RequestState
|
|
6
7
|
from helm.benchmark.metrics.basic_metrics import normalize_text
|
|
@@ -20,8 +21,7 @@ class ClassificationMetric(Metric):
|
|
|
20
21
|
|
|
21
22
|
Note:
|
|
22
23
|
- The set of classes is derived from the correct references from all the instances.
|
|
23
|
-
This means that classes may be omitted if they
|
|
24
|
-
reference.
|
|
24
|
+
This means that classes may be omitted if they are never used as a correct reference.
|
|
25
25
|
- Generations that are not in any of the known classes are counted as a
|
|
26
26
|
negative prediction for every class.
|
|
27
27
|
- Perturbed classes are considered different classes from unperturbed
|
|
@@ -29,10 +29,16 @@ class ClassificationMetric(Metric):
|
|
|
29
29
|
- Currently, multi-label classification is not supported.
|
|
30
30
|
"""
|
|
31
31
|
|
|
32
|
+
def __init__(self, delimiter: Optional[str] = None):
|
|
33
|
+
self.delimiter = delimiter
|
|
34
|
+
|
|
35
|
+
def is_multi_label(self) -> bool:
|
|
36
|
+
return bool(self.delimiter)
|
|
37
|
+
|
|
32
38
|
def evaluate_instances(self, request_states: List[RequestState]) -> List[Stat]:
|
|
33
|
-
y_pred: List[str] = []
|
|
34
|
-
y_true: List[str] = []
|
|
35
|
-
for request_state in request_states:
|
|
39
|
+
y_pred: List[List[str]] = []
|
|
40
|
+
y_true: List[List[str]] = []
|
|
41
|
+
for request_state in request_states: # one request state per instance
|
|
36
42
|
# Only the generation adapter is supported.
|
|
37
43
|
# TODO: Support multiple_choice_* adapters.
|
|
38
44
|
if request_state.reference_index is not None:
|
|
@@ -42,24 +48,23 @@ class ClassificationMetric(Metric):
|
|
|
42
48
|
assert request_state.result is not None
|
|
43
49
|
if len(request_state.result.completions) != 1:
|
|
44
50
|
raise ValueError("Result must contain exactly one completion")
|
|
45
|
-
|
|
46
|
-
num_correct = 0
|
|
47
|
-
for reference in request_state.instance.references:
|
|
48
|
-
if reference.is_correct:
|
|
49
|
-
num_correct += 1
|
|
50
|
-
y_true.append(normalize_text(reference.output.text))
|
|
51
|
-
if num_correct != 1:
|
|
52
|
-
# TODO: Support multi-label classification.
|
|
53
|
-
raise ValueError("ClassificationMetric does not support multi-label classification")
|
|
54
51
|
if request_state.output_mapping:
|
|
55
52
|
raise ValueError("ClassificationMetric does not support multiple choice adapters")
|
|
56
|
-
|
|
57
|
-
|
|
53
|
+
|
|
54
|
+
references = request_state.instance.all_correct_references
|
|
55
|
+
if not self.is_multi_label():
|
|
56
|
+
assert len(references) == 1
|
|
57
|
+
correct_ref_texts = [normalize_text(ref.output.text) for ref in references if ref.output.text]
|
|
58
|
+
y_true.append(correct_ref_texts)
|
|
59
|
+
|
|
60
|
+
input_text = request_state.result.completions[0].text
|
|
61
|
+
predictions = input_text.split(self.delimiter) if self.is_multi_label() else [input_text]
|
|
62
|
+
y_pred.append([normalize_text(pred) for pred in predictions if pred])
|
|
63
|
+
labels: List[str] = list(set(y for ys in y_true for y in ys))
|
|
64
|
+
mlb = MultiLabelBinarizer().fit([labels])
|
|
65
|
+
y_true = mlb.transform(y_true)
|
|
66
|
+
y_pred = mlb.transform(y_pred)
|
|
58
67
|
return [
|
|
59
|
-
Stat(MetricName("classification_macro_f1")).add(
|
|
60
|
-
|
|
61
|
-
),
|
|
62
|
-
Stat(MetricName("classification_micro_f1")).add(
|
|
63
|
-
f1_score(y_pred=y_pred, y_true=y_true, labels=list(labels), average="micro")
|
|
64
|
-
),
|
|
68
|
+
Stat(MetricName("classification_macro_f1")).add(f1_score(y_pred=y_pred, y_true=y_true, average="macro")),
|
|
69
|
+
Stat(MetricName("classification_micro_f1")).add(f1_score(y_pred=y_pred, y_true=y_true, average="micro")),
|
|
65
70
|
]
|
|
@@ -63,7 +63,8 @@ def _expected_stats(all_classes_counts: Dict[str, Dict[str, int]]):
|
|
|
63
63
|
|
|
64
64
|
|
|
65
65
|
def test_evaluate_instances_binary_generation():
|
|
66
|
-
metric = ClassificationMetric()
|
|
66
|
+
metric = ClassificationMetric(delimiter=None)
|
|
67
|
+
|
|
67
68
|
request_states = [
|
|
68
69
|
_request_state("yes", [_Option("yes", True)]),
|
|
69
70
|
_request_state("yes", [_Option("yes", True)]),
|
|
@@ -86,20 +87,21 @@ def test_evaluate_instances_binary_generation():
|
|
|
86
87
|
|
|
87
88
|
|
|
88
89
|
def test_evaluate_instances_multi_class():
|
|
89
|
-
|
|
90
|
+
# Note: no "a" because it would get filtered out by normalize_text()
|
|
91
|
+
metric = ClassificationMetric(delimiter=None)
|
|
90
92
|
|
|
91
93
|
def _options(correct: str):
|
|
92
|
-
return [_Option(text, text == correct) for text in ["
|
|
94
|
+
return [_Option(text, text == correct) for text in ["d", "b", "c"]]
|
|
93
95
|
|
|
94
96
|
request_states = [
|
|
95
|
-
_request_state("
|
|
96
|
-
_request_state("
|
|
97
|
-
_request_state("
|
|
98
|
-
_request_state("
|
|
97
|
+
_request_state("d", _options("d")),
|
|
98
|
+
_request_state("d", _options("d")),
|
|
99
|
+
_request_state("d", _options("d")),
|
|
100
|
+
_request_state("d", _options("b")),
|
|
99
101
|
_request_state("b", _options("b")),
|
|
100
102
|
_request_state("b", _options("b")),
|
|
101
103
|
_request_state("b", _options("c")),
|
|
102
|
-
_request_state("c", _options("
|
|
104
|
+
_request_state("c", _options("d")),
|
|
103
105
|
_request_state("c", _options("c")),
|
|
104
106
|
_request_state("invalid", _options("c")),
|
|
105
107
|
]
|
|
@@ -107,9 +109,42 @@ def test_evaluate_instances_multi_class():
|
|
|
107
109
|
metric.evaluate_instances(request_states),
|
|
108
110
|
_expected_stats(
|
|
109
111
|
{
|
|
110
|
-
"
|
|
112
|
+
"d": {"tp": 3, "fp": 1, "tn": 5, "fn": 1},
|
|
111
113
|
"b": {"tp": 2, "fp": 1, "tn": 6, "fn": 1},
|
|
112
114
|
"c": {"tp": 1, "fp": 1, "tn": 6, "fn": 2},
|
|
113
115
|
}
|
|
114
116
|
),
|
|
115
117
|
)
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def test_evaluate_instances_multilabel():
|
|
121
|
+
# Note: no "a" because it would get filtered out by normalize_text()
|
|
122
|
+
metric = ClassificationMetric(delimiter=",")
|
|
123
|
+
|
|
124
|
+
def _options(correct: List[str]):
|
|
125
|
+
return [_Option(text, text in correct) for text in ["d", "b", "c"]]
|
|
126
|
+
|
|
127
|
+
request_states = [
|
|
128
|
+
_request_state("d,b", _options(["d", "b"])),
|
|
129
|
+
_request_state("d,b", _options(["d", "c"])),
|
|
130
|
+
_request_state("d", _options(["d"])),
|
|
131
|
+
_request_state("c", _options(["b"])),
|
|
132
|
+
_request_state("b", _options(["b", "c"])),
|
|
133
|
+
_request_state("d,b", _options(["c"])),
|
|
134
|
+
_request_state("d,c", _options(["d"])),
|
|
135
|
+
_request_state("d,b,c", _options(["d", "b", "c"])),
|
|
136
|
+
_request_state("", []),
|
|
137
|
+
_request_state("n/a", []),
|
|
138
|
+
_request_state("invalid", _options(["c"])),
|
|
139
|
+
]
|
|
140
|
+
|
|
141
|
+
assert_stats_equal(
|
|
142
|
+
metric.evaluate_instances(request_states),
|
|
143
|
+
_expected_stats(
|
|
144
|
+
{
|
|
145
|
+
"d": {"tp": 5, "fp": 1, "tn": 5, "fn": 0},
|
|
146
|
+
"b": {"tp": 3, "fp": 2, "tn": 5, "fn": 1},
|
|
147
|
+
"c": {"tp": 1, "fp": 2, "tn": 4, "fn": 4},
|
|
148
|
+
}
|
|
149
|
+
),
|
|
150
|
+
)
|