crfm-helm 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/METADATA +11 -8
- {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/RECORD +67 -38
- {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/WHEEL +1 -1
- {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/entry_points.txt +2 -1
- helm/benchmark/__init__.py +13 -0
- helm/benchmark/adaptation/adapter_spec.py +3 -0
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +20 -7
- helm/benchmark/augmentations/correct_to_misspelling.json +1 -0
- helm/benchmark/contamination/__init__.py +0 -0
- helm/benchmark/metrics/classification_metrics.py +70 -0
- helm/benchmark/metrics/machine_translation_metrics.py +36 -0
- helm/benchmark/metrics/summarization_metrics.py +7 -8
- helm/benchmark/metrics/test_classification_metrics.py +150 -0
- helm/benchmark/presentation/create_plots.py +617 -0
- helm/benchmark/presentation/run_display.py +7 -48
- helm/benchmark/presentation/summarize.py +4 -2
- helm/benchmark/presentation/test_create_plots.py +32 -0
- helm/benchmark/run.py +144 -48
- helm/benchmark/run_expander.py +164 -47
- helm/benchmark/run_specs.py +346 -39
- helm/benchmark/runner.py +34 -6
- helm/benchmark/scenarios/copyright_scenario.py +1 -1
- helm/benchmark/scenarios/covid_dialog_scenario.py +84 -0
- helm/benchmark/scenarios/imdb_listdir.json +50014 -0
- helm/benchmark/scenarios/lex_glue_scenario.py +253 -0
- helm/benchmark/scenarios/lextreme_scenario.py +458 -0
- helm/benchmark/scenarios/me_q_sum_scenario.py +86 -0
- helm/benchmark/scenarios/med_dialog_scenario.py +132 -0
- helm/benchmark/scenarios/med_mcqa_scenario.py +102 -0
- helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +119 -0
- helm/benchmark/scenarios/med_qa_scenario.py +96 -0
- helm/benchmark/scenarios/opinions_qa_scenario.py +194 -0
- helm/benchmark/scenarios/scenario.py +5 -0
- helm/benchmark/scenarios/the_pile_scenario.py +1 -1
- helm/benchmark/scenarios/wmt_14_scenario.py +96 -0
- helm/benchmark/static/benchmarking.css +14 -0
- helm/benchmark/static/benchmarking.js +43 -0
- helm/benchmark/static/index.html +2 -0
- helm/benchmark/static/json-urls.js +4 -0
- helm/benchmark/static/plot-captions.js +16 -0
- helm/benchmark/static/schema.yaml +154 -1
- helm/benchmark/window_services/cohere_window_service.py +20 -0
- helm/benchmark/window_services/flan_t5_window_service.py +29 -0
- helm/benchmark/window_services/huggingface_window_service.py +39 -0
- helm/benchmark/window_services/santacoder_window_service.py +27 -0
- helm/benchmark/window_services/test_flan_t5_window_service.py +12 -0
- helm/benchmark/window_services/wider_ai21_window_service.py +13 -0
- helm/benchmark/window_services/window_service_factory.py +34 -7
- helm/common/codec.py +123 -0
- helm/common/general.py +12 -5
- helm/common/test_codec.py +144 -0
- helm/proxy/clients/aleph_alpha_client.py +47 -28
- helm/proxy/clients/auto_client.py +32 -24
- helm/proxy/clients/google_client.py +88 -0
- helm/proxy/clients/huggingface_client.py +32 -16
- helm/proxy/clients/huggingface_model_registry.py +111 -0
- helm/proxy/clients/huggingface_tokenizer.py +25 -7
- helm/proxy/clients/openai_client.py +60 -2
- helm/proxy/clients/test_huggingface_model_registry.py +57 -0
- helm/proxy/clients/test_huggingface_tokenizer.py +3 -0
- helm/proxy/clients/together_client.py +17 -2
- helm/proxy/clients/yalm_tokenizer/voc_100b.sp +0 -0
- helm/proxy/clients/yalm_tokenizer/yalm_tokenizer.py +8 -2
- helm/proxy/models.py +115 -7
- helm/proxy/test_models.py +1 -1
- helm/benchmark/presentation/present.py +0 -249
- {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/LICENSE +0 -0
- {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: crfm-helm
|
|
3
|
-
Version: 0.2.
|
|
3
|
+
Version: 0.2.2
|
|
4
4
|
Summary: Benchmark for language models
|
|
5
5
|
Home-page: https://github.com/stanford-crfm/helm
|
|
6
6
|
Author: Stanford CRFM
|
|
@@ -12,15 +12,11 @@ Classifier: Programming Language :: Python :: 3.8
|
|
|
12
12
|
Classifier: License :: OSI Approved :: Apache Software License
|
|
13
13
|
Requires-Python: ~=3.8
|
|
14
14
|
License-File: LICENSE
|
|
15
|
-
Requires-Dist: pytest (~=7.2.0)
|
|
16
|
-
Requires-Dist: black (~=22.10.0)
|
|
17
|
-
Requires-Dist: mypy (~=0.982)
|
|
18
|
-
Requires-Dist: pre-commit (~=2.20.0)
|
|
19
|
-
Requires-Dist: flake8 (~=5.0.4)
|
|
20
15
|
Requires-Dist: zstandard (~=0.18.0)
|
|
21
16
|
Requires-Dist: tqdm (~=4.64.1)
|
|
22
17
|
Requires-Dist: pyhocon (~=0.3.59)
|
|
23
18
|
Requires-Dist: dacite (~=1.6.0)
|
|
19
|
+
Requires-Dist: aleph-alpha-client (~=2.14.0)
|
|
24
20
|
Requires-Dist: bottle (~=0.12.23)
|
|
25
21
|
Requires-Dist: gunicorn (~=20.1.0)
|
|
26
22
|
Requires-Dist: Mako (~=1.2.3)
|
|
@@ -28,8 +24,9 @@ Requires-Dist: sqlitedict (~=1.7.0)
|
|
|
28
24
|
Requires-Dist: pymongo (~=4.2.0)
|
|
29
25
|
Requires-Dist: retrying (~=1.3.3)
|
|
30
26
|
Requires-Dist: websocket-client (~=1.3.2)
|
|
31
|
-
Requires-Dist: openai (~=0.
|
|
32
|
-
Requires-Dist: transformers (~=4.
|
|
27
|
+
Requires-Dist: openai (~=0.27.0)
|
|
28
|
+
Requires-Dist: transformers (~=4.26.1)
|
|
29
|
+
Requires-Dist: tokenizers (~=0.13.2)
|
|
33
30
|
Requires-Dist: icetk (~=0.0.4)
|
|
34
31
|
Requires-Dist: protobuf (~=3.20.2)
|
|
35
32
|
Requires-Dist: google-api-python-client (~=2.64.0)
|
|
@@ -40,6 +37,8 @@ Requires-Dist: jsonlines (~=3.1.0)
|
|
|
40
37
|
Requires-Dist: sympy (~=1.11.1)
|
|
41
38
|
Requires-Dist: sentencepiece (~=0.1.97)
|
|
42
39
|
Requires-Dist: numba (~=0.56.4)
|
|
40
|
+
Requires-Dist: cattrs (~=22.2.0)
|
|
41
|
+
Requires-Dist: xlrd (~=2.0.1)
|
|
43
42
|
Requires-Dist: importlib-resources (~=5.10.0)
|
|
44
43
|
Requires-Dist: nltk (~=3.7)
|
|
45
44
|
Requires-Dist: scipy (~=1.9.1)
|
|
@@ -53,5 +52,9 @@ Requires-Dist: spacy (~=3.2.4)
|
|
|
53
52
|
Requires-Dist: summ-eval (~=0.892)
|
|
54
53
|
Requires-Dist: torch (~=1.12.1)
|
|
55
54
|
Requires-Dist: torchvision (~=0.13.1)
|
|
55
|
+
Requires-Dist: colorcet (~=3.0.1)
|
|
56
|
+
Requires-Dist: matplotlib (~=3.6.0)
|
|
57
|
+
Requires-Dist: numpy (~=1.23.3)
|
|
58
|
+
Requires-Dist: seaborn (~=0.11.0)
|
|
56
59
|
|
|
57
60
|
Benchmark for language models
|
|
@@ -1,15 +1,15 @@
|
|
|
1
1
|
helm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
-
helm/benchmark/__init__.py,sha256=
|
|
2
|
+
helm/benchmark/__init__.py,sha256=haJrJawd2zOTaxV_nkk6-V05vnePuHwCi0DytuJ0898,4450
|
|
3
3
|
helm/benchmark/data_preprocessor.py,sha256=aNdM-o2t4qkLIQHiQeWUFg03DjjJ8HTBIphYCK8pXVo,2173
|
|
4
4
|
helm/benchmark/executor.py,sha256=Vkmc4wmar2MRIavfiUOa2mu8Pp-zXsguYOevbjog4-4,3299
|
|
5
|
-
helm/benchmark/run.py,sha256=
|
|
6
|
-
helm/benchmark/run_expander.py,sha256=
|
|
7
|
-
helm/benchmark/run_specs.py,sha256=
|
|
8
|
-
helm/benchmark/runner.py,sha256=
|
|
5
|
+
helm/benchmark/run.py,sha256=AWa862BtEh5aOTjKZ9OkSv3be2ZrU4R1qiwJtRTQwfk,9402
|
|
6
|
+
helm/benchmark/run_expander.py,sha256=vnq-zRmuXLzgr3sS3XYaXJFarNC7-QKc0_DtPjwXq3Q,32952
|
|
7
|
+
helm/benchmark/run_specs.py,sha256=ssBJYMZVMF4XGk6lvCSlQJh6A-Pmh2_ndi_JAwgW0CQ,71441
|
|
8
|
+
helm/benchmark/runner.py,sha256=zYDe8UeB1LFmbpChmRdRqEIZo-X0xWMenOCp2NnZ9Ws,8802
|
|
9
9
|
helm/benchmark/server.py,sha256=HsuVsch1SPjQ4YyZi60kjr3JZeL82h8jgkxTUlfb130,1620
|
|
10
10
|
helm/benchmark/test_data_preprocessor.py,sha256=adT-pgVeWvmZXLUUehxH0C-lMhXhtdxsvYdr69o1BD4,2047
|
|
11
11
|
helm/benchmark/adaptation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
12
|
-
helm/benchmark/adaptation/adapter_spec.py,sha256=
|
|
12
|
+
helm/benchmark/adaptation/adapter_spec.py,sha256=YoxMyN4RJM8GG_DeZ-k0edyARZ69hHLkvOlvKCk-u2o,2811
|
|
13
13
|
helm/benchmark/adaptation/prompt.py,sha256=MATerIUIhFp_BMGvK7bLpNtWH6Oi4kknjBjOkr2bHv4,1948
|
|
14
14
|
helm/benchmark/adaptation/request_state.py,sha256=o3OpZbB0TJFiZ2Nmhvg3vWmByaUSYTffT_WnoNb7w68,2712
|
|
15
15
|
helm/benchmark/adaptation/scenario_state.py,sha256=ZflBuNgvN0JqUhshFcy0kTweO1WJs6j5UCaTxWTMe0o,1747
|
|
@@ -18,7 +18,7 @@ helm/benchmark/adaptation/adapters/adapter.py,sha256=8wK28jISxW8rUfXP-_-FfQJRRzc
|
|
|
18
18
|
helm/benchmark/adaptation/adapters/adapter_factory.py,sha256=N2n-xIoGt_DxlN0LT4GUgVvdoaqhyUU8rSWr_nyfb80,2318
|
|
19
19
|
helm/benchmark/adaptation/adapters/binary_ranking_adapter.py,sha256=3j24nFQuZE0Zl6DMAB4aYUpjieerdSMLsJbpMT9Nzfw,5646
|
|
20
20
|
helm/benchmark/adaptation/adapters/generation_adapter.py,sha256=-on4QAo8hhzJVgAnM6G8lFFqaoiSiVF-KxwwfHwE61A,1927
|
|
21
|
-
helm/benchmark/adaptation/adapters/in_context_learning_adapter.py,sha256=
|
|
21
|
+
helm/benchmark/adaptation/adapters/in_context_learning_adapter.py,sha256=q5K4Hag8LOpfpkeEzwIMPLNpBqMThcB1LXLGr_n_Xfo,13118
|
|
22
22
|
helm/benchmark/adaptation/adapters/language_modeling_adapter.py,sha256=vPo2EVgbMfzmwPPcljoXdDfBW3c80LLKaUhA-RefU2w,11967
|
|
23
23
|
helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py,sha256=pV14yvmH_mRQpeXF0teAxGpJcouSQViipr-aMkNE-AM,1711
|
|
24
24
|
helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py,sha256=4CALqc--PUaEl3cLzmjP9nFSuarCZMKBwrPQxde5TYM,3471
|
|
@@ -30,6 +30,7 @@ helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py,sha256=
|
|
|
30
30
|
helm/benchmark/augmentations/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
31
31
|
helm/benchmark/augmentations/contraction_expansion_perturbation.py,sha256=bqcXlazuoss4hOxOarQptNEh52SU4Gy58Zph1PZ34W4,4803
|
|
32
32
|
helm/benchmark/augmentations/contrast_sets_perturbation.py,sha256=fpLtZDgQY8beQLBlCId3gSyvCoPHVq3J7PGzcNdR0kM,3454
|
|
33
|
+
helm/benchmark/augmentations/correct_to_misspelling.json,sha256=L44RiJXlJCa6zQzTLf0MFHCOhFyRDRKfLQNXH-n3XIs,213429
|
|
33
34
|
helm/benchmark/augmentations/data_augmenter.py,sha256=57LA6h7z1tVMy_xGcW46F1KRi3D4wnv0fi8XeJjsi2c,3849
|
|
34
35
|
helm/benchmark/augmentations/dialect_perturbation.py,sha256=zy3SJtYAxHf2fMB7w-u5gsEC6q8g-94sKnwNLVp0pFc,6227
|
|
35
36
|
helm/benchmark/augmentations/extra_space_perturbation.py,sha256=9_pmthcyFfuYu6GsJB03hKkhvDZqqfH7hOeSNZRohSg,835
|
|
@@ -45,6 +46,7 @@ helm/benchmark/augmentations/space_perturbation.py,sha256=7OdpoibdizoPDBPpLc1ENy
|
|
|
45
46
|
helm/benchmark/augmentations/synonym_perturbation.py,sha256=2qFx7xparhEPd82tvs59HkAr1hwQWv7asWtmNCbcQrQ,4209
|
|
46
47
|
helm/benchmark/augmentations/test_perturbation.py,sha256=v_U5CmBpA5aXqg4EJUYZrSfGsNbZTwCP0inxz1XNGq0,9991
|
|
47
48
|
helm/benchmark/augmentations/typos_perturbation.py,sha256=nfF1Zw2REKZEnnyPVFWD87MP8L5ANbaZXeI2n70Sonw,2790
|
|
49
|
+
helm/benchmark/contamination/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
48
50
|
helm/benchmark/efficiency_data/inference_denoised_runtimes.json,sha256=ios_dt-_8wtXvkVAx0iI2zwCxqHvk3XKTx31qHPalsI,4203
|
|
49
51
|
helm/benchmark/efficiency_data/inference_idealized_runtimes.json,sha256=5w7reeZc0yc4cjH8kJGxQQSoe8yaRVX2SSlSrx0QWFQ,12348
|
|
50
52
|
helm/benchmark/efficiency_data/training_efficiency.json,sha256=aH2moiBLStOLVi8Ci2KTK5ZkWlTBLK-B3fRfNZwhoSg,9763
|
|
@@ -53,18 +55,21 @@ helm/benchmark/metrics/basic_metrics.py,sha256=xBCTgLVdiWVxdpB08MbWFc2nvwCJTgD1-
|
|
|
53
55
|
helm/benchmark/metrics/bbq_metrics.py,sha256=H44mwKXLJ0PXo-sVKgRHCEKjZGnCahDD6GOQMWpnbOQ,6061
|
|
54
56
|
helm/benchmark/metrics/bias_metrics.py,sha256=uVJFQvDSzvPR1ELu0FNYyExjRy2ThaJRCw8beEMDqJs,11309
|
|
55
57
|
helm/benchmark/metrics/bias_word_lists.py,sha256=mx5JjW3mHffXIqo4GcQN-zENUEttBqQnEjPTz3J3J_4,13909
|
|
58
|
+
helm/benchmark/metrics/classification_metrics.py,sha256=1q7gPnWRrx4QwE8T0m269vFJWg_bKfVx21a5spDBbjU,3701
|
|
56
59
|
helm/benchmark/metrics/code_metrics.py,sha256=uWdigk0QyEsfVHQzq9KxkOc-LROvcqWXeui42Mr0YF4,5119
|
|
57
60
|
helm/benchmark/metrics/code_metrics_helper.py,sha256=byyuI1lJgbIDPVJzywaBsam9zFMPPyn28g1grsK9xyA,22336
|
|
58
61
|
helm/benchmark/metrics/copyright_metrics.py,sha256=8sk85mLTasWIgHIXxOho0z_nQYyLqtzSWHSAwd5ayAQ,7560
|
|
59
62
|
helm/benchmark/metrics/disinformation_metrics.py,sha256=YH8QJ5s8LvIRzp49_O20UvOm8_z7PKyleoOX3hdX0HE,10499
|
|
63
|
+
helm/benchmark/metrics/machine_translation_metrics.py,sha256=Ki7wBa9Odko_wVg1ec1MIoY2fHn0oY_vYrT3r7Ya6tc,1559
|
|
60
64
|
helm/benchmark/metrics/metric.py,sha256=zF3IHmjGxRXrUoIOEIb2wRbsTPldzgclF8uPiGsZv4g,18789
|
|
61
65
|
helm/benchmark/metrics/metric_name.py,sha256=hk5WQ6uj_9EjgKKFawPenL2-XOMf-aKvNRkOxlu4nCo,1355
|
|
62
66
|
helm/benchmark/metrics/metric_service.py,sha256=aJ21wWdc1Spfi3mjrj8JEnsANL45P7wr7fk_EdDObko,709
|
|
63
67
|
helm/benchmark/metrics/numeracy_metrics.py,sha256=panMWD3a1NPerg3Ix7l6NhR7jGOIQOQV9i_KysBeDA8,2818
|
|
64
68
|
helm/benchmark/metrics/ranking_metrics.py,sha256=b3qxTRnr62zz1Gr1dsVDYtdwB8WBIb-v98yoRB3Wtvs,17231
|
|
65
69
|
helm/benchmark/metrics/statistic.py,sha256=9VM5JA1-M_iYCNziWm2qeDZaAQqPQ_ySdaSMcqAeYdM,3048
|
|
66
|
-
helm/benchmark/metrics/summarization_metrics.py,sha256=
|
|
70
|
+
helm/benchmark/metrics/summarization_metrics.py,sha256=hHNWGYA1bNfgCg7o1RSiTo7E-SJujHhkKh9G204icoo,16083
|
|
67
71
|
helm/benchmark/metrics/test_bias_metrics.py,sha256=brut1rdnKNtTVJoe6qkllmJwZTFBZkLcyI_4qmqZ_vA,6264
|
|
72
|
+
helm/benchmark/metrics/test_classification_metrics.py,sha256=usW5ciUYu2ZUUqVjFk4NfZTGNIoBArwia_-8uGOvFpw,5475
|
|
68
73
|
helm/benchmark/metrics/test_metric.py,sha256=S7LGHNCHuhMk582eHylw1tOasUBEf_7F0T4u3tey7b4,757
|
|
69
74
|
helm/benchmark/metrics/test_numeracy_metrics.py,sha256=ls1ZIHDePKpHMoqAbf4HmJ1SIBjLFuLIzGbfg6OiZvM,4162
|
|
70
75
|
helm/benchmark/metrics/test_statistic.py,sha256=WQv9i8wSNTCzlw-L1wir0lmW0g3D4CM_ebpii7IB9Lw,406
|
|
@@ -85,13 +90,14 @@ helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py,sha256=gPqoYNI
|
|
|
85
90
|
helm/benchmark/metrics/tokens/token_cost_estimator.py,sha256=fTGUfhHV6yMwpTkCEMTGMxKO8jskqJz4sAtwXT6M_C8,425
|
|
86
91
|
helm/benchmark/presentation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
87
92
|
helm/benchmark/presentation/contamination.py,sha256=5wLwq266sCxT62MdzAXT9V-au6b07HaL44DLj_2qiSk,2788
|
|
88
|
-
helm/benchmark/presentation/
|
|
89
|
-
helm/benchmark/presentation/run_display.py,sha256=
|
|
93
|
+
helm/benchmark/presentation/create_plots.py,sha256=-YyrhEmfVOMnESJ8m2yk7RWAOYdZkVrLAt2K8XnpNF0,28442
|
|
94
|
+
helm/benchmark/presentation/run_display.py,sha256=HSvV71ZRshMIhHZHGtlbYfRxK9xx1GQgn6YmGPVncME,9892
|
|
90
95
|
helm/benchmark/presentation/run_entry.py,sha256=J1QgLOP99N7N4bs7nzXWxyU3pOd-a1j8xwL9ag1nP_Y,1158
|
|
91
96
|
helm/benchmark/presentation/schema.py,sha256=i1utCqiNkbTK9CcDcOFwi7e91KPaKkpFU07ZcBfWXTc,8753
|
|
92
|
-
helm/benchmark/presentation/summarize.py,sha256=
|
|
97
|
+
helm/benchmark/presentation/summarize.py,sha256=BUXog2m_UPbftyzFHx_U4mE2FrG56iv9mvcCdXoZVmI,45071
|
|
93
98
|
helm/benchmark/presentation/table.py,sha256=VzVMwsgP3kItAM6FPRUaTphzJ-ZjriiuFbWlO1rJUMU,2879
|
|
94
99
|
helm/benchmark/presentation/test_contamination.py,sha256=8mnzUzxUW9pXUOuLpU4BBBg0V7Mn1d1s4AQgwy6_kl4,459
|
|
100
|
+
helm/benchmark/presentation/test_create_plots.py,sha256=2q3v2Qdh_hBKCEX9toygXFLIryu1FlcLMt2PXprx7j8,1251
|
|
95
101
|
helm/benchmark/presentation/test_run_entry.py,sha256=M5z4dnVb7fM3PWrZWIZNlG8CT4KnDxjnEE4FBb1ZFNU,621
|
|
96
102
|
helm/benchmark/scenarios/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
97
103
|
helm/benchmark/scenarios/babi_qa_scenario.py,sha256=SkIJjqBuMLYxZR8l9epu9arBeirvJPtQsIBXv4bzkx4,5030
|
|
@@ -104,7 +110,8 @@ helm/benchmark/scenarios/civil_comments_scenario.py,sha256=vXq6KxyS5C0-tD8xUmkG5
|
|
|
104
110
|
helm/benchmark/scenarios/code_scenario.py,sha256=Q_TP_vWewkClvibPFHXpsOjR-CWexYgu5kl4OpfZXNc,11355
|
|
105
111
|
helm/benchmark/scenarios/code_scenario_helper.py,sha256=EbQNfHqhQXaMMPmYT2mG2dRjzYaI2FvcPb9j6NlNHDU,5853
|
|
106
112
|
helm/benchmark/scenarios/commonsense_scenario.py,sha256=9roSJS3iGSNgqxTbLI87xuZGB8IxJkbbtzr-ep0HUn0,10661
|
|
107
|
-
helm/benchmark/scenarios/copyright_scenario.py,sha256=
|
|
113
|
+
helm/benchmark/scenarios/copyright_scenario.py,sha256=APYQPC-esq3oM2qQxW6JNxa4pkv_yHDKfePjpvvi6nQ,3660
|
|
114
|
+
helm/benchmark/scenarios/covid_dialog_scenario.py,sha256=FmYIuRr81xD_d0iyRa5blPC8OTqpfv8XGTz5XXUOd2E,3958
|
|
108
115
|
helm/benchmark/scenarios/dialogue_scenarios.py,sha256=SPwo1iYiLbPwNtOgAVkTr-dO8FQLshmrfXdjPcayW5A,5616
|
|
109
116
|
helm/benchmark/scenarios/disinformation_scenario.py,sha256=Ff66LxBm8APuMziLfGvTM0WIatrAty5_q_8ObaLW5lo,8491
|
|
110
117
|
helm/benchmark/scenarios/dyck_language_scenario.py,sha256=Yua5S2gwLX2C-odJY3LeL-Yj47H1xXChdlI8cratVz4,9300
|
|
@@ -112,42 +119,53 @@ helm/benchmark/scenarios/entity_data_imputation_scenario.py,sha256=oChQzEptlf731
|
|
|
112
119
|
helm/benchmark/scenarios/entity_matching_scenario.py,sha256=gtrLSCw2JSNnBgFQFsUm4EcICsjVWtp9wsOdqcyBU4k,6863
|
|
113
120
|
helm/benchmark/scenarios/gsm_scenario.py,sha256=PmX0zutkGqnqGirWidUdk166cWv_23RtaTFcVQGBpzc,2619
|
|
114
121
|
helm/benchmark/scenarios/ice_scenario.py,sha256=smrpTOwtMDL-m40zfKfNz9btOGoINZNv3_2oBcLBMmk,16156
|
|
122
|
+
helm/benchmark/scenarios/imdb_listdir.json,sha256=eczxp9gslYYwx5XR86ATnZorIxuujFMDTfzR4h5NCpo,1015402
|
|
115
123
|
helm/benchmark/scenarios/imdb_scenario.py,sha256=VTD0Ur6ATyY9NWxcnkGzn9Iw5vl4d94o0FbFm61ZZTA,6057
|
|
116
124
|
helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py,sha256=wYaivSqqYYZPjPHTKaS6D7j960dcLIVTfmuZ8awd1Zs,2192
|
|
117
125
|
helm/benchmark/scenarios/legal_support_scenario.py,sha256=K9HfTyHJnnLFvGKNwYQayu7JV4uyNT5wH48wc9ixRa0,3912
|
|
126
|
+
helm/benchmark/scenarios/lex_glue_scenario.py,sha256=r89KevvM1Kifu3ZkUIXAV8jXktclNtL0-JL9T6qOx_Q,10224
|
|
127
|
+
helm/benchmark/scenarios/lextreme_scenario.py,sha256=tejeKE08YX5MYFXrJRY48rIzO5V5fdwHDWEzg883K2E,20300
|
|
118
128
|
helm/benchmark/scenarios/lsat_qa_scenario.py,sha256=2-GrPB8whoBFcQ5608jrwlAcRJgpkT1P2UehcR6-EYY,5977
|
|
119
129
|
helm/benchmark/scenarios/math_scenario.py,sha256=5PespNtseDOnPgAwtdP0vMkXz1CaJM0BkJsWdeG5gUM,13825
|
|
130
|
+
helm/benchmark/scenarios/me_q_sum_scenario.py,sha256=ZIn7tlBC4baV8CYcU_-mYe2RbYaQ-8dX1Ca_hOvZTfI,3988
|
|
131
|
+
helm/benchmark/scenarios/med_dialog_scenario.py,sha256=w_s0s5TY6VjnCdmJ5BcSbDAYKZtnb5c7KSP1wYd9z9A,7282
|
|
132
|
+
helm/benchmark/scenarios/med_mcqa_scenario.py,sha256=OgxZbyqzNnADTjYMlejV2I54G7tK3awGecTaxSgO9W8,5022
|
|
133
|
+
helm/benchmark/scenarios/med_paragraph_simplification_scenario.py,sha256=nw6EhUtZq6ABFXxNU0Uh2mG5yiCUDesKqLQdZo7Kr90,7578
|
|
134
|
+
helm/benchmark/scenarios/med_qa_scenario.py,sha256=rHSoIfb50Uq8T7l37m9Wn_fC16otifaP77qAEctf1oI,4404
|
|
120
135
|
helm/benchmark/scenarios/mmlu_scenario.py,sha256=pzIRmLGikWTgB0AD2VFj64Q3GUsQg7nJzUqTSo-7pZo,3777
|
|
121
136
|
helm/benchmark/scenarios/msmarco_scenario.py,sha256=_EaKsppb2Ax8f_ETc3cBy27i5w5ajTtIbF3xNWO8lUA,33669
|
|
122
137
|
helm/benchmark/scenarios/narrativeqa_scenario.py,sha256=r5TwYkZH_YaCws6BSnjbiDQkZH_YM2BtxtDFVl4jj5s,5595
|
|
123
138
|
helm/benchmark/scenarios/natural_qa_scenario.py,sha256=nuL5Qlh26xq7Q_lvzK2sKVN7eNYh7F5SjbIjahKaMNg,12527
|
|
124
139
|
helm/benchmark/scenarios/newsqa_scenario.py,sha256=vMuIZyYxufH2AqhDoIZzzllfq8ScJIhSDH6lM3IUxGM,7242
|
|
125
140
|
helm/benchmark/scenarios/numeracy_scenario.py,sha256=Iwtypyb0249zKYyV6p4YUX4bYke6uIyL6R3aopiGUb8,30552
|
|
141
|
+
helm/benchmark/scenarios/opinions_qa_scenario.py,sha256=dtIYKL9ZiccX_F3-5OrnHXJdNBBTxCoTHc2Kc-XX79E,7380
|
|
126
142
|
helm/benchmark/scenarios/pubmed_qa_scenario.py,sha256=cpMmQWwCDXIYO0btGLhevMT-Mhs9-5Es9cDvQYkIlL0,7493
|
|
127
143
|
helm/benchmark/scenarios/quac_scenario.py,sha256=46nqmeVgkWu6jDGCHl-KHu351bmJj7jx_1p5kPwcOjc,6615
|
|
128
144
|
helm/benchmark/scenarios/raft_scenario.py,sha256=RzewlMVkHJ2XbZ9_9FzBvbdV__BuBRgtX2HyhCnmH1o,4500
|
|
129
145
|
helm/benchmark/scenarios/real_toxicity_prompts_scenario.py,sha256=8DQD7KkilAfxwzHUwCPrPPHeYEz_dEOWSls8yZo15do,2387
|
|
130
|
-
helm/benchmark/scenarios/scenario.py,sha256=
|
|
146
|
+
helm/benchmark/scenarios/scenario.py,sha256=bdRcv-YoLkxjlpNcq4MXiu8HQgjByHkkLWOdih4ahsM,7365
|
|
131
147
|
helm/benchmark/scenarios/simple_scenarios.py,sha256=rcHzukhjBgvNRqkjcg0cms_zWtAsLPk0xiFN9I25_hI,1947
|
|
132
148
|
helm/benchmark/scenarios/summarization_scenario.py,sha256=fKeRSkXrH26WyfeIhn43_fZxnAO4bIX1Xh5HoKcjOQM,6550
|
|
133
149
|
helm/benchmark/scenarios/synthetic_efficiency_scenario.py,sha256=dtMMeULjw6pcobBRp0r6f9N4VCUKamx1Jy-6xPu85q0,3083
|
|
134
150
|
helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py,sha256=YvEgO5qVZ4hLpnvjer4CG0Ct1upssZRjZWxnNi1ZUtY,16308
|
|
135
151
|
helm/benchmark/scenarios/synthetic_reasoning_scenario.py,sha256=Pm63shscbGuigg4zWfcw3rOIPP8LgxBCnpcQKAA_CX0,8327
|
|
136
152
|
helm/benchmark/scenarios/test_scenario.py,sha256=o8w8ElDPF-RzeCmecwyvie_nRMYj01b38BufXS-igqY,1612
|
|
137
|
-
helm/benchmark/scenarios/the_pile_scenario.py,sha256=
|
|
153
|
+
helm/benchmark/scenarios/the_pile_scenario.py,sha256=aCcjZp0wabu8lpPVNAdCr1x6m_3QvgKe7dIGS2qgGm4,4981
|
|
138
154
|
helm/benchmark/scenarios/truthful_qa_scenario.py,sha256=zQgv_qUU5h5ODoQE06rpJa6O_8FwF695cIie3PG7bx8,5969
|
|
139
155
|
helm/benchmark/scenarios/twitter_aae_scenario.py,sha256=_qWotvPagNj3ATnyaww3U1XZtxN-wgueUf34fFmKXQI,2083
|
|
140
156
|
helm/benchmark/scenarios/wikifact_scenario.py,sha256=pucEuLYz5N9qIobnEbJnRUnK6PrkWFdkl7yPuCJj3SE,5778
|
|
141
157
|
helm/benchmark/scenarios/wikitext_103_scenario.py,sha256=rXVbUzOZi4eWM-_HP1gzY5SBmMwOX1vk12WrLkR3NHo,3074
|
|
142
|
-
helm/benchmark/
|
|
143
|
-
helm/benchmark/static/benchmarking.
|
|
158
|
+
helm/benchmark/scenarios/wmt_14_scenario.py,sha256=u24E_w0AOXpl3PzEFLmiBcl8qyJEy-1Yc-i4YHgU99M,4356
|
|
159
|
+
helm/benchmark/static/benchmarking.css,sha256=DGC4Huh4tVD2o9wEeUf3YOc3MYcq2fmJQXvhTjVDumE,2057
|
|
160
|
+
helm/benchmark/static/benchmarking.js,sha256=qiXAY_9fiWZ4ydzhBQAUhylzxNCoIN8ciLlEFMg33uE,49107
|
|
144
161
|
helm/benchmark/static/contamination.yaml,sha256=LbISh56ORvfkkWptm7ZWmlPvWxtls6pBF1TbGiWD7hk,3096
|
|
145
162
|
helm/benchmark/static/general.js,sha256=L3S4CBUED0k7RsjLHCeWjO29ZMFJckZgNTAYAARzaEg,3029
|
|
146
|
-
helm/benchmark/static/index.html,sha256=
|
|
163
|
+
helm/benchmark/static/index.html,sha256=yzf_VEGW15CPZOW9a2Z6opoQ0Gg2YQCtj182niUb0fk,3130
|
|
147
164
|
helm/benchmark/static/info-icon.png,sha256=P-PW3Ek3NGiRAW5BXOjJRPBfMVqprjAqtQheGWu7zNI,3428
|
|
148
165
|
helm/benchmark/static/json-urls-root.js,sha256=G3qenwLgBojh3ukzp_gyMUaZja83ZFqvT1WQ_Rg11BU,98
|
|
149
|
-
helm/benchmark/static/json-urls.js,sha256=
|
|
150
|
-
helm/benchmark/static/
|
|
166
|
+
helm/benchmark/static/json-urls.js,sha256=wvsG2Lrz2XArwwMOl_tGXL9y4mjqzjod4gcqlvVCiQA,1750
|
|
167
|
+
helm/benchmark/static/plot-captions.js,sha256=gTBn-IPPD4BkzryVYj3KkGqLhWqWvBbvufeJaDygQxk,3010
|
|
168
|
+
helm/benchmark/static/schema.yaml,sha256=TQdzlpOPvTwELiU0w0HBAZqKQBmjj1LOkEGM85oa8e4,95657
|
|
151
169
|
helm/benchmark/static/utils.js,sha256=H2PKYjuXZ392DlALCPJ1XRwGxBDRFjL9eTFiTd4vBU8,7338
|
|
152
170
|
helm/benchmark/static/images/crfm-logo.png,sha256=dDkauL_wJR_Luu7L7pltphS3a9HSLjDkpVLa6C9vcA4,62712
|
|
153
171
|
helm/benchmark/static/images/helm-logo-simple.png,sha256=LtVAC4OgcWgMAob53rTrf7cRDu-O0z85ZOGGj9wR9hw,86133
|
|
@@ -172,23 +190,27 @@ helm/benchmark/window_services/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5
|
|
|
172
190
|
helm/benchmark/window_services/ai21_window_service.py,sha256=Mo0Zzj2a9hDiUg6hTuJWb3ABhBUPlOYS-kHhaE-pHUE,12672
|
|
173
191
|
helm/benchmark/window_services/anthropic_window_service.py,sha256=mA9aWKfc-dbSnpt37k6zDFdNNMfbevAoleCzSaar-uE,799
|
|
174
192
|
helm/benchmark/window_services/bloom_window_service.py,sha256=o7MVedt6khdoj8zikLDuVraEzuoBZk7j4Fzjsas0sD4,1023
|
|
175
|
-
helm/benchmark/window_services/cohere_window_service.py,sha256=
|
|
193
|
+
helm/benchmark/window_services/cohere_window_service.py,sha256=3SJT97CaxNxtUNS9_qKvKMVCA6lvNavS-xo6jAsxPbM,7070
|
|
176
194
|
helm/benchmark/window_services/encoder_decoder_window_service.py,sha256=Dg_rD0RtKIALtpvT4Wi4Am3zgFcIvgxfzdglw5fbbTU,2478
|
|
195
|
+
helm/benchmark/window_services/flan_t5_window_service.py,sha256=39IZX89_tay3bpSGVoWDoekmhW-RUNATQuT-bNyFRTs,915
|
|
177
196
|
helm/benchmark/window_services/gpt2_window_service.py,sha256=d-ys9FHoVI1u1GZOH734JPAOc8W6QZb1N3ZjooKmwz8,990
|
|
178
197
|
helm/benchmark/window_services/gptj_window_service.py,sha256=C2OEl-3ZatwxYVoaQgyvAZ5SBS2TK-2PnM02zUNjhiA,1103
|
|
179
198
|
helm/benchmark/window_services/gptneox_window_service.py,sha256=FD0NDlBxDu1fZ8vlaTsOBh2IKYfrMV73qNGetkrp1P4,944
|
|
199
|
+
helm/benchmark/window_services/huggingface_window_service.py,sha256=tDbhjn81Aw3ZSYMKh4aQU3e8JX1IVYCu_2gPOLaIWD8,1440
|
|
180
200
|
helm/benchmark/window_services/ice_window_service.py,sha256=5z52rP-xAF_jckIPoogyoNFW6FQXqaq3SHybCaBuRn0,2005
|
|
181
201
|
helm/benchmark/window_services/local_window_service.py,sha256=wgNBB-p9Zk-uFLRXYNxrz16_DhWyQ8x9sltSLBhzUh4,4247
|
|
182
202
|
helm/benchmark/window_services/luminous_window_service.py,sha256=0w-nyfXXDwnIDBzU3Y84LZPnQyJGiSbvf5Y_MQh-hCg,1791
|
|
183
203
|
helm/benchmark/window_services/mt_nlg_window_service.py,sha256=7zvEEZhqfdefUUFiDaqbJqrBu3Pt7BBuT_Si-4swV5s,838
|
|
184
204
|
helm/benchmark/window_services/openai_window_service.py,sha256=Oguy_ewlL2Uydq-B1QrqqHWKTenL4mtwtY24RNqhFCM,466
|
|
185
205
|
helm/benchmark/window_services/opt_window_service.py,sha256=ilj1G_pslwYeRZ2fhRMXsg2WjQ4rCfTUwuSmjItH-t4,1050
|
|
206
|
+
helm/benchmark/window_services/santacoder_window_service.py,sha256=sfbuAAdhEkQkFON_bHxDI9Ek3jZiwCF6upa41bsjUO4,674
|
|
186
207
|
helm/benchmark/window_services/t0pp_window_service.py,sha256=oa1vJRiyFPbkTb8eYnfjZNyKFJSvceipOq0U3Ys5e04,1196
|
|
187
208
|
helm/benchmark/window_services/t511b_window_service.py,sha256=8CvkSfuG_Bg17gAEre_bmM4tUwoi9fAWpPxx6qCJwAE,1005
|
|
188
209
|
helm/benchmark/window_services/test_ai21_window_service.py,sha256=U_n2mQ5GqD8oYv7e9vfNkACDic2-zDVuUIHN4meSC-I,8177
|
|
189
210
|
helm/benchmark/window_services/test_bloom_window_service.py,sha256=xeYpsvdR8Ug31BAb4a-PU4Sc0oihpCIMJ5OzTfgUhM0,4221
|
|
190
211
|
helm/benchmark/window_services/test_cohere_window_service.py,sha256=6WpIiuEyfgTZS3BIeAGOR8AAr2djpVIfGbItQRI19Ck,3205
|
|
191
212
|
helm/benchmark/window_services/test_cohere_window_service_utils.py,sha256=sf25f9MeXzoqsbDzZ7d7le13hm8RkDe54nhLtKF2pqo,158150
|
|
213
|
+
helm/benchmark/window_services/test_flan_t5_window_service.py,sha256=xv_EXbiRklveJPQtThYCSYF7qBYwjL7K4wH3Xu5z2Fg,591
|
|
192
214
|
helm/benchmark/window_services/test_gpt2_window_service.py,sha256=3k25pLa_z__g4yoQL40DEXj-T4dGtrgif13N2NXs59U,2568
|
|
193
215
|
helm/benchmark/window_services/test_gptj_window_service.py,sha256=sxsTpozKv9N-wZXtGl1prkQr9Md_q-tnCjO9zt226Co,2267
|
|
194
216
|
helm/benchmark/window_services/test_gptneox_window_service.py,sha256=MCYHZIoulJf_WCx6de7rWB3nqku6wCyAokA-SWIPEks,4140
|
|
@@ -203,57 +225,64 @@ helm/benchmark/window_services/test_utils.py,sha256=1k2TlPdDIRjum669jpH3O7UOqm4G
|
|
|
203
225
|
helm/benchmark/window_services/test_yalm_window_service.py,sha256=NdunSxq-qDzfzYMBYZ-0my6LaU2qUxtm7Ii0c4fyKnY,4273
|
|
204
226
|
helm/benchmark/window_services/tokenizer_service.py,sha256=RNznJBAxcCUMCurb7mbraZULx_ZtB0G7IxbrnUe0Urk,865
|
|
205
227
|
helm/benchmark/window_services/ul2_window_service.py,sha256=R_VEzOb59zQE9mmbTLunQeIvLAtK3-97h-B2_oc0Uxs,1021
|
|
228
|
+
helm/benchmark/window_services/wider_ai21_window_service.py,sha256=VZ6EERN48FSYsmJ_aiwI30SEbobLt27c1QqL29Zg_8M,414
|
|
206
229
|
helm/benchmark/window_services/wider_openai_window_service.py,sha256=cpm4mDEGTY-cmrECcDCL0flONBdh4g40uWNLI-v46BU,539
|
|
207
230
|
helm/benchmark/window_services/window_service.py,sha256=aV4YnbXl7T23runB8xmSWRwV7YtliUlXrveEejOMJ1Y,2885
|
|
208
|
-
helm/benchmark/window_services/window_service_factory.py,sha256=
|
|
231
|
+
helm/benchmark/window_services/window_service_factory.py,sha256=UDrXRUXGwBMEiO7Cw_nNbr0XM6mzzXrXiyJsvCmW9wg,5845
|
|
209
232
|
helm/benchmark/window_services/yalm_window_service.py,sha256=g7NeXvlRq3FXf2HwRStBMTkbWDxTdEITkrVRe-fv3mg,1805
|
|
210
233
|
helm/common/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
211
234
|
helm/common/authentication.py,sha256=RlMx29_TSrfU7ujE7dJkxmFub5EqLj2NswV5lAVFFDk,179
|
|
212
235
|
helm/common/cache.py,sha256=ustgsRHX0W3zoLPN05W3mFl9m9JYp9Ppq5cjMbdmm6Y,13116
|
|
213
|
-
helm/common/
|
|
236
|
+
helm/common/codec.py,sha256=zm8MP9Aqfh64D2HMZiCPEMoPkkiJxEzvzmuupGvkRh0,5499
|
|
237
|
+
helm/common/general.py,sha256=7vFw10h_hTrxUCfxE0LH13hp8Lunp77lbVmucgoeq2Y,10181
|
|
214
238
|
helm/common/hierarchical_logger.py,sha256=EnKLnfbQftca08EJfjGEQb4tcnCKbx-JtwLnoCnhMQs,2908
|
|
215
239
|
helm/common/object_spec.py,sha256=COMd4RpYgfulW940a5M_npbsfRBvLkmhjfwDIq4Gpqs,1833
|
|
216
240
|
helm/common/perspective_api_request.py,sha256=WAVwtajNVmi5XJNsPcorGEAVrqkpPSk-Kd3b0hJghbA,2427
|
|
217
241
|
helm/common/request.py,sha256=QBpkZBpU1nCTqtEM8Ekki3bZgxpASuVRC3sQ_4RYaRE,5793
|
|
218
242
|
helm/common/test_cache.py,sha256=XqboYHQAkFWIHPsuIjuageRSLeN7QoATKF7wwxggPqE,7054
|
|
243
|
+
helm/common/test_codec.py,sha256=igL--k-2DwAy0eoMr8D9Xs8MOjBoT0LutbMPzDlTNkM,5885
|
|
219
244
|
helm/common/test_general.py,sha256=zOxSwWNgWnWHsXKcG4NZ50GkWicn4uZ4jPVypSwFaQE,1672
|
|
220
245
|
helm/common/tokenization_request.py,sha256=aDyf4A6QlTgISXy4IyXJVQytrOLwYVX9-TCa2CK2h1M,3226
|
|
221
246
|
helm/proxy/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
222
247
|
helm/proxy/accounts.py,sha256=xq-zVggvueB4D5QK58mFWGPxZe-hnIUAT4D341yd0ac,13503
|
|
223
248
|
helm/proxy/cli.py,sha256=2SOxIF55PDjzXXcDOYRT8m-oyQM_2VyZheKRC3fXDVw,8094
|
|
224
249
|
helm/proxy/example_queries.py,sha256=p1wH-tp1pRUslkAwaJYrrG5aDfmFWK3KYn4M6WQfPqQ,4120
|
|
225
|
-
helm/proxy/models.py,sha256=
|
|
250
|
+
helm/proxy/models.py,sha256=sfdp7DasJXbun-VMZz5p7iD6uhoUW1mFsy64BWfDoV4,29681
|
|
226
251
|
helm/proxy/query.py,sha256=eftbiUICMh8QIHVs-7cLtv_rDXKeKdRPmwjLMu0TDxQ,645
|
|
227
252
|
helm/proxy/retry.py,sha256=GLLDW1iGCwHfgTle8YK7ZB3vV-7XqsHcqeruKoVdsxE,1953
|
|
228
253
|
helm/proxy/server.py,sha256=uBispGXfn39s_Pskd9Xjud0rijTjqXtSKU_2YvE6zGE,7356
|
|
229
|
-
helm/proxy/test_models.py,sha256
|
|
254
|
+
helm/proxy/test_models.py,sha256=hWeDcBw1GkPvyJUd-ABxRVe1FhSUfz8bzyrKYdsqmyY,726
|
|
230
255
|
helm/proxy/test_retry.py,sha256=8h398auzjW9VnlTJWllxR-bdpub-XFp8EN8LWDEnEHM,1049
|
|
231
256
|
helm/proxy/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
232
257
|
helm/proxy/clients/ai21_client.py,sha256=c4D64-1mLaFcGsqOb5CYkaXTmAWEJYk9cIjtr6DbgZQ,7770
|
|
233
|
-
helm/proxy/clients/aleph_alpha_client.py,sha256=
|
|
258
|
+
helm/proxy/clients/aleph_alpha_client.py,sha256=VV7dbgh7sYqoSWfNkxxAiQ7i3yPW33rBPnBig9EXL10,7707
|
|
234
259
|
helm/proxy/clients/anthropic_client.py,sha256=vrfc8lS9bxQbUxMxbElV2z5cMDq-JD6yFDTS2cJdFO0,15526
|
|
235
|
-
helm/proxy/clients/auto_client.py,sha256=
|
|
260
|
+
helm/proxy/clients/auto_client.py,sha256=oktJQ4rCntp7id4vj-d3x_EA3RWGNZyXUMFwzRse--c,11063
|
|
236
261
|
helm/proxy/clients/chat_gpt_client.py,sha256=nG3opHbnzX50r9Ialh3RaRErOZo6k4Q7gVzRHeGQgj8,5312
|
|
237
262
|
helm/proxy/clients/client.py,sha256=bh6FvYFjw6MoHp5n7-KN1asXrIrOC-jfYsg3aW4xMgo,4570
|
|
238
263
|
helm/proxy/clients/cohere_client.py,sha256=KF21m7qUjuhrpEEQv68FNeX0rsWSmxZgw52Oa7CZ5pI,11362
|
|
264
|
+
helm/proxy/clients/google_client.py,sha256=sGGxDWD22c9a9KMzLAFL3vAEDHxp5jSY2W3RDpVDIak,3334
|
|
239
265
|
helm/proxy/clients/goose_ai_client.py,sha256=2tqJK_AhD2-ScXtOTdt9S9khzVjal5pm38BJWiFhwq8,4217
|
|
240
|
-
helm/proxy/clients/huggingface_client.py,sha256=
|
|
241
|
-
helm/proxy/clients/
|
|
266
|
+
helm/proxy/clients/huggingface_client.py,sha256=LypY3YfyoaGFH83UkYwDARGuoN2JOUk2S-nEaJ6GemI,11813
|
|
267
|
+
helm/proxy/clients/huggingface_model_registry.py,sha256=0WHyWPxxBI4KtTs2Yt6-Cw16FC4XBEe6yqUc0-YSn1Y,3891
|
|
268
|
+
helm/proxy/clients/huggingface_tokenizer.py,sha256=ujtsBupMMrE9efds2205c8NiPTcxHX8XM0UoV9spLK0,4591
|
|
242
269
|
helm/proxy/clients/ice_tokenizer_client.py,sha256=Ui8YhAXoY1Q0vC3icoeFs6X9xAcESF6Tl2EGERGWVGU,2325
|
|
243
270
|
helm/proxy/clients/microsoft_client.py,sha256=-VC8IrgrpSp1_FvRSI_8MSxhNp5I6dMc4qWSHc4Oulg,8237
|
|
244
|
-
helm/proxy/clients/openai_client.py,sha256=
|
|
271
|
+
helm/proxy/clients/openai_client.py,sha256=frW9fOjYWkRXdfzE88ppxaLhVl9pCnXfheUqeANW6QQ,9415
|
|
245
272
|
helm/proxy/clients/perspective_api_client.py,sha256=-L8IwokuktWPoOu7nXwsfoab_U1QRGCt8xT1SrcGfYE,5491
|
|
246
273
|
helm/proxy/clients/simple_client.py,sha256=GXHTCRB58XAxnUVqgpynidc7h6kaDBOP7TedVHrOpD4,2915
|
|
247
274
|
helm/proxy/clients/test_client.py,sha256=bvkFob_Yoy8bALrVeQ0h757g9RU687JYI0g3AISPFQ8,1268
|
|
248
275
|
helm/proxy/clients/test_huggingface_client.py,sha256=n-6D-RXqwQyxPxCLCSqHxqfK5JA-PdP5ffP17XwTe2I,3520
|
|
249
|
-
helm/proxy/clients/
|
|
276
|
+
helm/proxy/clients/test_huggingface_model_registry.py,sha256=zMboFlwMtDEV7hkd9SZFuItye-vkzz3CE5mWQrw--W4,2554
|
|
277
|
+
helm/proxy/clients/test_huggingface_tokenizer.py,sha256=KmlAXezQ6R7DAEpV85_JRdTRrOJoJxfmtylqybWn5VA,2189
|
|
250
278
|
helm/proxy/clients/test_ice_tokenizer_client.py,sha256=Ugmn5a7QdAPEAbLtreLS5-sji8yrzxy5mhFPAl3rOuI,2404
|
|
251
279
|
helm/proxy/clients/test_yalm_tokenizer_client.py,sha256=tnrYl7T1DcZ9FN09nBWV1gesjWQ3osiUpsGHSm_IypQ,2336
|
|
252
|
-
helm/proxy/clients/together_client.py,sha256=
|
|
280
|
+
helm/proxy/clients/together_client.py,sha256=epyiYElD0BfAZgUSu4zZKC5Oe8yIVVyJn2RtTwjPMzM,6334
|
|
253
281
|
helm/proxy/clients/yalm_tokenizer_client.py,sha256=cpBoc8eHQoBGQguZsDaVnGWLdZnPgkHjqLSO0B94O0U,2420
|
|
254
282
|
helm/proxy/clients/yalm_tokenizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
255
283
|
helm/proxy/clients/yalm_tokenizer/test_yalm_tokenizer.py,sha256=W9p5QNn1GSm-y85yVEQe_82zn5CVK_vR6jvhk7JTs_k,869
|
|
256
|
-
helm/proxy/clients/yalm_tokenizer/
|
|
284
|
+
helm/proxy/clients/yalm_tokenizer/voc_100b.sp,sha256=LmPD0_OIOXi8dWuNjXUYOSPhf8kPp2xhvK-g3bXcwrQ,2815034
|
|
285
|
+
helm/proxy/clients/yalm_tokenizer/yalm_tokenizer.py,sha256=7Y4_nCZptFWzifCJ5aPmM3_OOxhtomIAQVjpJGV1D8g,5954
|
|
257
286
|
helm/proxy/services/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
258
287
|
helm/proxy/services/remote_service.py,sha256=xKS-0P-EqKTPn7odTXDoQPjn9FliQRLaMFUnCEsUmCU,6965
|
|
259
288
|
helm/proxy/services/server_service.py,sha256=gps7PwXqCi8b0yGYC0nQwFdKbfxCriSHt5CD1N1kkJs,5696
|
|
@@ -270,9 +299,9 @@ helm/proxy/token_counters/openai_token_counter.py,sha256=gPo_VrkEH07xmprzdfIhmJ_
|
|
|
270
299
|
helm/proxy/token_counters/test_ai21_token_counter.py,sha256=42J1fCi20kQUwAD18bIa6h9TaP7KZnlgF-mLbvKURro,5508
|
|
271
300
|
helm/proxy/token_counters/test_openai_token_counter.py,sha256=EovaVCZSr9moITZ9-AKiv_YM-D3OUsUDs4iQhEvpazQ,4823
|
|
272
301
|
helm/proxy/token_counters/token_counter.py,sha256=x8KyTR82EedgCQUuneQiVq9AiU1B3_CHPmKPNumClHc,429
|
|
273
|
-
crfm_helm-0.2.
|
|
274
|
-
crfm_helm-0.2.
|
|
275
|
-
crfm_helm-0.2.
|
|
276
|
-
crfm_helm-0.2.
|
|
277
|
-
crfm_helm-0.2.
|
|
278
|
-
crfm_helm-0.2.
|
|
302
|
+
crfm_helm-0.2.2.dist-info/LICENSE,sha256=bJiay7Nn5SHQ2n_4ZIT3AE0W1RGq4O7pxOApgBsaT64,11349
|
|
303
|
+
crfm_helm-0.2.2.dist-info/METADATA,sha256=_OlkKmj1P7vaZvlpvOnNnzzm3w1IEW6de75SK7TmuPw,2066
|
|
304
|
+
crfm_helm-0.2.2.dist-info/WHEEL,sha256=pkctZYzUS4AYVn6dJ-7367OJZivF2e8RA9b_ZBjif18,92
|
|
305
|
+
crfm_helm-0.2.2.dist-info/entry_points.txt,sha256=AvH9soAH3uey9xffisWewd0yrmPWGASC036jHd1SFyg,300
|
|
306
|
+
crfm_helm-0.2.2.dist-info/top_level.txt,sha256=s9yl-XmuTId6n_W_xRjCS99MHTwPXOlkKxmTr8xZUNY,5
|
|
307
|
+
crfm_helm-0.2.2.dist-info/RECORD,,
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
[console_scripts]
|
|
2
2
|
crfm-proxy-cli = helm.proxy.cli:main
|
|
3
3
|
crfm-proxy-server = helm.proxy.server:main
|
|
4
|
-
helm-
|
|
4
|
+
helm-create-plots = helm.benchmark.presentation.create_plots:main
|
|
5
|
+
helm-run = helm.benchmark.run:main
|
|
5
6
|
helm-server = helm.benchmark.server:main
|
|
6
7
|
helm-summarize = helm.benchmark.presentation.summarize:main
|
helm/benchmark/__init__.py
CHANGED
|
@@ -42,12 +42,24 @@ from .scenarios import legal_support_scenario # noqa
|
|
|
42
42
|
from .scenarios import entity_matching_scenario # noqa
|
|
43
43
|
from .scenarios import entity_data_imputation_scenario # noqa
|
|
44
44
|
from .scenarios import big_bench_scenario # noqa
|
|
45
|
+
from .scenarios import opinions_qa_scenario # noqa
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
# Biomedical
|
|
49
|
+
from .scenarios import covid_dialog_scenario # noqa
|
|
50
|
+
from .scenarios import me_q_sum_scenario # noqa
|
|
51
|
+
from .scenarios import med_dialog_scenario # noqa
|
|
52
|
+
from .scenarios import med_mcqa_scenario # noqa
|
|
53
|
+
from .scenarios import med_paragraph_simplification_scenario # noqa
|
|
54
|
+
from .scenarios import med_qa_scenario # noqa
|
|
45
55
|
from .scenarios import pubmed_qa_scenario # noqa
|
|
56
|
+
from .scenarios import wmt_14_scenario # noqa
|
|
46
57
|
|
|
47
58
|
# Metrics
|
|
48
59
|
from .metrics import basic_metrics # noqa
|
|
49
60
|
from .metrics import bbq_metrics # noqa
|
|
50
61
|
from .metrics import bias_metrics # noqa
|
|
62
|
+
from .metrics import classification_metrics # noqa
|
|
51
63
|
from .metrics import code_metrics # noqa
|
|
52
64
|
from .metrics import copyright_metrics # noqa
|
|
53
65
|
from .metrics import disinformation_metrics # noqa
|
|
@@ -56,6 +68,7 @@ from .metrics import ranking_metrics # noqa
|
|
|
56
68
|
from .metrics import summarization_metrics # noqa
|
|
57
69
|
from .metrics import toxicity_metrics # noqa
|
|
58
70
|
from .metrics import tokens_metric # noqa
|
|
71
|
+
from .metrics import machine_translation_metrics # noqa
|
|
59
72
|
|
|
60
73
|
# Perturbations for data augmentation
|
|
61
74
|
from .augmentations.extra_space_perturbation import ExtraSpacePerturbation # noqa
|
|
@@ -68,6 +68,9 @@ class AdapterSpec:
|
|
|
68
68
|
# set of training instances. Used to compute error bars.
|
|
69
69
|
num_train_trials: int = 1
|
|
70
70
|
|
|
71
|
+
# If true, randomly sample N training examples; if false, select N consecutive training examples
|
|
72
|
+
sample_train: bool = True
|
|
73
|
+
|
|
71
74
|
# Decoding parameters (inherited by `Request`)
|
|
72
75
|
|
|
73
76
|
# Model to make the request to (need to fill in)
|
|
@@ -23,7 +23,7 @@ class InContextLearningAdapter(Adapter, ABC):
|
|
|
23
23
|
@htrack(None)
|
|
24
24
|
def adapt(self, instances: List[Instance], parallelism: int) -> ScenarioState:
|
|
25
25
|
"""
|
|
26
|
-
Takes a
|
|
26
|
+
Takes a list of `Instance`s and builds a list of corresponding `RequestState`s.
|
|
27
27
|
The reason we don't do this per eval instance is that we create a common set of
|
|
28
28
|
training instances which is shared across all eval instances.
|
|
29
29
|
"""
|
|
@@ -65,7 +65,9 @@ class InContextLearningAdapter(Adapter, ABC):
|
|
|
65
65
|
parallelism: int,
|
|
66
66
|
) -> List[RequestState]:
|
|
67
67
|
self.train_trial_index: int = train_trial_index
|
|
68
|
-
self.train_instances: List[Instance] = self.sample_examples(
|
|
68
|
+
self.train_instances: List[Instance] = self.sample_examples(
|
|
69
|
+
all_train_instances, seed=train_trial_index, sample_train=self.adapter_spec.sample_train
|
|
70
|
+
)
|
|
69
71
|
hlog(f"Sampled {len(self.train_instances)} examples for trial #{self.train_trial_index}.")
|
|
70
72
|
|
|
71
73
|
# Generate request_states
|
|
@@ -93,7 +95,9 @@ class InContextLearningAdapter(Adapter, ABC):
|
|
|
93
95
|
|
|
94
96
|
return [request_state for result in results for request_state in result]
|
|
95
97
|
|
|
96
|
-
def sample_examples(
|
|
98
|
+
def sample_examples(
|
|
99
|
+
self, all_train_instances: List[Instance], seed: int, sample_train: bool = True
|
|
100
|
+
) -> List[Instance]:
|
|
97
101
|
"""
|
|
98
102
|
Sample a random set of train instances to use as examples by following the steps below:
|
|
99
103
|
1. Sort the class labels (i.e., correct References) by the number of Instances that belong to the
|
|
@@ -121,9 +125,14 @@ class InContextLearningAdapter(Adapter, ABC):
|
|
|
121
125
|
random.seed(seed)
|
|
122
126
|
num_instances_to_sample: int = min(len(all_train_instances), self.adapter_spec.max_train_instances)
|
|
123
127
|
|
|
128
|
+
examples: List[Instance] = []
|
|
129
|
+
if not sample_train:
|
|
130
|
+
# Select sequentially from the train set
|
|
131
|
+
examples = all_train_instances[num_instances_to_sample * seed : num_instances_to_sample * (seed + 1)]
|
|
132
|
+
return examples
|
|
133
|
+
|
|
124
134
|
unlabeled_instances: List[Instance] = []
|
|
125
135
|
label_to_instances: Dict[str, List[Instance]] = defaultdict(list)
|
|
126
|
-
|
|
127
136
|
for instance in all_train_instances:
|
|
128
137
|
if instance.first_correct_reference:
|
|
129
138
|
label_to_instances[instance.first_correct_reference.output.text].append(instance)
|
|
@@ -145,7 +154,6 @@ class InContextLearningAdapter(Adapter, ABC):
|
|
|
145
154
|
sorted_labels.extend(labels)
|
|
146
155
|
|
|
147
156
|
labels_iterable = cycle(sorted_labels)
|
|
148
|
-
examples: List[Instance] = []
|
|
149
157
|
while num_instances_to_sample > 0:
|
|
150
158
|
next_label: Optional[str] = next(labels_iterable, None)
|
|
151
159
|
if not next_label:
|
|
@@ -218,10 +226,15 @@ class InContextLearningAdapter(Adapter, ABC):
|
|
|
218
226
|
|
|
219
227
|
# References (optionally) and output
|
|
220
228
|
output: str
|
|
229
|
+
|
|
230
|
+
delimiter = ","
|
|
221
231
|
if reference_index is None:
|
|
222
232
|
# Put only the correct reference as the output
|
|
223
|
-
|
|
224
|
-
|
|
233
|
+
correct_references: List[Reference] = instance.all_correct_references
|
|
234
|
+
if not correct_references:
|
|
235
|
+
output = "n/a"
|
|
236
|
+
else:
|
|
237
|
+
output = delimiter.join([correct_reference.output.text for correct_reference in correct_references])
|
|
225
238
|
else:
|
|
226
239
|
reference = instance.references[reference_index]
|
|
227
240
|
output = reference.output.text
|