crfm-helm 0.5.2__py3-none-any.whl → 0.5.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crfm-helm might be problematic. Click here for more details.
- {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/METADATA +29 -55
- {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/RECORD +146 -134
- {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +12 -5
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +12 -12
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +8 -8
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +77 -9
- helm/benchmark/adaptation/common_adapter_specs.py +2 -0
- helm/benchmark/annotation/anthropic_red_team_annotator.py +70 -0
- helm/benchmark/annotation/call_center_annotator.py +247 -0
- helm/benchmark/annotation/financebench_annotator.py +79 -0
- helm/benchmark/annotation/harm_bench_annotator.py +68 -0
- helm/benchmark/annotation/{image2structure → image2struct}/latex_compiler_annotator.py +2 -2
- helm/benchmark/annotation/{image2structure → image2struct}/lilypond_compiler_annotator.py +5 -3
- helm/benchmark/annotation/{image2structure → image2struct}/webpage_compiler_annotator.py +5 -5
- helm/benchmark/annotation/live_qa_annotator.py +32 -45
- helm/benchmark/annotation/medication_qa_annotator.py +31 -44
- helm/benchmark/annotation/model_as_judge.py +45 -0
- helm/benchmark/annotation/simple_safety_tests_annotator.py +64 -0
- helm/benchmark/annotation/xstest_annotator.py +110 -0
- helm/benchmark/metrics/annotation_metrics.py +108 -0
- helm/benchmark/metrics/bhasa_metrics.py +188 -0
- helm/benchmark/metrics/bhasa_metrics_specs.py +10 -0
- helm/benchmark/metrics/code_metrics_helper.py +11 -1
- helm/benchmark/metrics/safety_metrics.py +57 -0
- helm/benchmark/metrics/summac/model_summac.py +3 -3
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -2
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +4 -4
- helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
- helm/benchmark/metrics/vision_language/image_utils.py +1 -1
- helm/benchmark/model_metadata_registry.py +3 -3
- helm/benchmark/presentation/test_run_entry.py +1 -0
- helm/benchmark/run.py +15 -0
- helm/benchmark/run_expander.py +56 -30
- helm/benchmark/run_specs/bhasa_run_specs.py +638 -0
- helm/benchmark/run_specs/call_center_run_specs.py +152 -0
- helm/benchmark/run_specs/decodingtrust_run_specs.py +8 -8
- helm/benchmark/run_specs/experimental_run_specs.py +52 -0
- helm/benchmark/run_specs/finance_run_specs.py +78 -1
- helm/benchmark/run_specs/safety_run_specs.py +154 -0
- helm/benchmark/run_specs/vlm_run_specs.py +92 -21
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +71 -0
- helm/benchmark/scenarios/banking77_scenario.py +51 -0
- helm/benchmark/scenarios/bhasa_scenario.py +1798 -0
- helm/benchmark/scenarios/call_center_scenario.py +84 -0
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +2 -1
- helm/benchmark/scenarios/ewok_scenario.py +116 -0
- helm/benchmark/scenarios/fin_qa_scenario.py +2 -0
- helm/benchmark/scenarios/financebench_scenario.py +53 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +59 -0
- helm/benchmark/scenarios/scenario.py +1 -1
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +33 -0
- helm/benchmark/scenarios/test_commonsense_scenario.py +21 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +25 -0
- helm/benchmark/scenarios/test_financebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_gsm_scenario.py +31 -0
- helm/benchmark/scenarios/test_legalbench_scenario.py +30 -0
- helm/benchmark/scenarios/test_math_scenario.py +2 -8
- helm/benchmark/scenarios/test_med_qa_scenario.py +30 -0
- helm/benchmark/scenarios/test_mmlu_scenario.py +33 -0
- helm/benchmark/scenarios/test_narrativeqa_scenario.py +73 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +4 -4
- helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/bingo_scenario.py +2 -2
- helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +2 -1
- helm/benchmark/scenarios/vision_language/exams_v_scenario.py +104 -0
- helm/benchmark/scenarios/vision_language/fair_face_scenario.py +136 -0
- helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/gqa_scenario.py +2 -2
- helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/chart2csv_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/latex_scenario.py +3 -3
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/musicsheet_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/utils_latex.py +31 -39
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/driver.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/utils.py +1 -1
- helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage_scenario.py +41 -12
- helm/benchmark/scenarios/vision_language/math_vista_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/mementos_scenario.py +3 -3
- helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +2 -2
- helm/benchmark/scenarios/vision_language/mme_scenario.py +21 -18
- helm/benchmark/scenarios/vision_language/mmmu_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/pairs_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/pope_scenario.py +2 -1
- helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py +57 -0
- helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +7 -5
- helm/benchmark/scenarios/vision_language/unicorn_scenario.py +2 -2
- helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +6 -3
- helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/vqa_scenario.py +3 -1
- helm/benchmark/scenarios/xstest_scenario.py +35 -0
- helm/benchmark/server.py +1 -6
- helm/benchmark/static/schema_air_bench.yaml +750 -750
- helm/benchmark/static/schema_bhasa.yaml +709 -0
- helm/benchmark/static/schema_call_center.yaml +232 -0
- helm/benchmark/static/schema_cleva.yaml +768 -0
- helm/benchmark/static/schema_decodingtrust.yaml +444 -0
- helm/benchmark/static/schema_ewok.yaml +367 -0
- helm/benchmark/static/schema_finance.yaml +55 -9
- helm/benchmark/static/{schema_image2structure.yaml → schema_image2struct.yaml} +231 -90
- helm/benchmark/static/schema_safety.yaml +247 -0
- helm/benchmark/static/schema_tables.yaml +124 -7
- helm/benchmark/static/schema_thai.yaml +21 -0
- helm/benchmark/static/schema_vhelm.yaml +96 -91
- helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
- helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
- helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
- helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
- helm/benchmark/static_build/assets/index-05c76bb1.css +1 -0
- helm/benchmark/static_build/assets/index-58f97dcd.js +10 -0
- helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
- helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
- helm/benchmark/static_build/index.html +2 -2
- helm/benchmark/window_services/test_openai_window_service.py +8 -8
- helm/clients/ai21_client.py +71 -1
- helm/clients/anthropic_client.py +7 -19
- helm/clients/huggingface_client.py +38 -37
- helm/clients/nvidia_nim_client.py +35 -0
- helm/clients/openai_client.py +2 -3
- helm/clients/palmyra_client.py +25 -0
- helm/clients/perspective_api_client.py +11 -6
- helm/clients/test_client.py +4 -6
- helm/clients/vision_language/open_flamingo_client.py +1 -2
- helm/clients/vision_language/palmyra_vision_client.py +28 -13
- helm/common/images_utils.py +6 -0
- helm/common/mongo_key_value_store.py +2 -1
- helm/common/request.py +16 -0
- helm/config/model_deployments.yaml +315 -332
- helm/config/model_metadata.yaml +384 -110
- helm/config/tokenizer_configs.yaml +116 -11
- helm/proxy/example_queries.py +14 -21
- helm/proxy/services/server_service.py +1 -2
- helm/proxy/token_counters/test_auto_token_counter.py +2 -2
- helm/tokenizers/ai21_tokenizer.py +51 -59
- helm/tokenizers/cohere_tokenizer.py +0 -75
- helm/tokenizers/huggingface_tokenizer.py +0 -1
- helm/tokenizers/test_ai21_tokenizer.py +48 -0
- helm/benchmark/static/benchmarking.css +0 -156
- helm/benchmark/static/benchmarking.js +0 -1705
- helm/benchmark/static/config.js +0 -3
- helm/benchmark/static/general.js +0 -122
- helm/benchmark/static/images/crfm-logo.png +0 -0
- helm/benchmark/static/images/helm-logo-simple.png +0 -0
- helm/benchmark/static/images/helm-logo.png +0 -0
- helm/benchmark/static/images/language-model-helm.png +0 -0
- helm/benchmark/static/images/organizations/ai21.png +0 -0
- helm/benchmark/static/images/organizations/anthropic.png +0 -0
- helm/benchmark/static/images/organizations/bigscience.png +0 -0
- helm/benchmark/static/images/organizations/cohere.png +0 -0
- helm/benchmark/static/images/organizations/eleutherai.png +0 -0
- helm/benchmark/static/images/organizations/google.png +0 -0
- helm/benchmark/static/images/organizations/meta.png +0 -0
- helm/benchmark/static/images/organizations/microsoft.png +0 -0
- helm/benchmark/static/images/organizations/nvidia.png +0 -0
- helm/benchmark/static/images/organizations/openai.png +0 -0
- helm/benchmark/static/images/organizations/together.png +0 -0
- helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
- helm/benchmark/static/images/organizations/yandex.png +0 -0
- helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
- helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
- helm/benchmark/static/index.html +0 -68
- helm/benchmark/static/info-icon.png +0 -0
- helm/benchmark/static/json-urls.js +0 -69
- helm/benchmark/static/plot-captions.js +0 -27
- helm/benchmark/static/utils.js +0 -285
- helm/benchmark/static_build/assets/index-30dbceba.js +0 -10
- helm/benchmark/static_build/assets/index-66b02d40.css +0 -1
- helm/benchmark/window_services/ai21_window_service.py +0 -247
- helm/benchmark/window_services/cohere_window_service.py +0 -101
- helm/benchmark/window_services/test_ai21_window_service.py +0 -163
- helm/benchmark/window_services/test_cohere_window_service.py +0 -75
- helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -8328
- helm/benchmark/window_services/test_ice_window_service.py +0 -327
- helm/tokenizers/ice_tokenizer.py +0 -30
- helm/tokenizers/test_ice_tokenizer.py +0 -57
- {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/LICENSE +0 -0
- {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.2.dist-info → crfm_helm-0.5.3.dist-info}/top_level.txt +0 -0
- /helm/benchmark/annotation/{image2structure → image2struct}/__init__.py +0 -0
- /helm/benchmark/annotation/{image2structure → image2struct}/image_compiler_annotator.py +0 -0
- /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/__init__.py +0 -0
- /helm/benchmark/scenarios/vision_language/{image2structure/image2structure_scenario.py → image2struct/image2struct_scenario.py} +0 -0
- /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/__init__.py +0 -0
- /helm/benchmark/scenarios/vision_language/{image2structure → image2struct}/webpage/jekyll_server.py +0 -0
|
@@ -1,10 +0,0 @@
|
|
|
1
|
-
import{r as a,a as Ms,L as E,O as Rs,d as ks,u as Ce,f as Pe,H as Ls,h as As,i as O,R as Ts}from"./react-d4a0b69b.js";import{g as Q,b as K,m as me,s as Ie,a as _s,d as ke,y as Cs,c as Le,e as xe,l as fe}from"./tremor-54a99cc4.js";import"./recharts-6d337683.js";(function(){const t=document.createElement("link").relList;if(t&&t.supports&&t.supports("modulepreload"))return;for(const l of document.querySelectorAll('link[rel="modulepreload"]'))r(l);new MutationObserver(l=>{for(const c of l)if(c.type==="childList")for(const i of c.addedNodes)i.tagName==="LINK"&&i.rel==="modulepreload"&&r(i)}).observe(document,{childList:!0,subtree:!0});function n(l){const c={};return l.integrity&&(c.integrity=l.integrity),l.referrerPolicy&&(c.referrerPolicy=l.referrerPolicy),l.crossOrigin==="use-credentials"?c.credentials="include":l.crossOrigin==="anonymous"?c.credentials="omit":c.credentials="same-origin",c}function r(l){if(l.ep)return;l.ep=!0;const c=n(l);fetch(l.href,c)}})();var Ue={exports:{}},ae={};/**
|
|
2
|
-
* @license React
|
|
3
|
-
* react-jsx-runtime.production.min.js
|
|
4
|
-
*
|
|
5
|
-
* Copyright (c) Facebook, Inc. and its affiliates.
|
|
6
|
-
*
|
|
7
|
-
* This source code is licensed under the MIT license found in the
|
|
8
|
-
* LICENSE file in the root directory of this source tree.
|
|
9
|
-
*/var Ps=a,Is=Symbol.for("react.element"),Us=Symbol.for("react.fragment"),$s=Object.prototype.hasOwnProperty,Ds=Ps.__SECRET_INTERNALS_DO_NOT_USE_OR_YOU_WILL_BE_FIRED.ReactCurrentOwner,Os={key:!0,ref:!0,__self:!0,__source:!0};function $e(s,t,n){var r,l={},c=null,i=null;n!==void 0&&(c=""+n),t.key!==void 0&&(c=""+t.key),t.ref!==void 0&&(i=t.ref);for(r in t)$s.call(t,r)&&!Os.hasOwnProperty(r)&&(l[r]=t[r]);if(s&&s.defaultProps)for(r in t=s.defaultProps,t)l[r]===void 0&&(l[r]=t[r]);return{$$typeof:Is,type:s,key:c,ref:i,props:l,_owner:Ds.current}}ae.Fragment=Us;ae.jsx=$e;ae.jsxs=$e;Ue.exports=ae;var e=Ue.exports,ue={},Ae=Ms;ue.createRoot=Ae.createRoot,ue.hydrateRoot=Ae.hydrateRoot;function Hs({title:s,titleId:t,...n},r){return a.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",fill:"none",viewBox:"0 0 24 24",strokeWidth:1.5,stroke:"currentColor","aria-hidden":"true",ref:r,"aria-labelledby":t},n),s?a.createElement("title",{id:t},s):null,a.createElement("path",{strokeLinecap:"round",strokeLinejoin:"round",d:"M3.75 6.75h16.5M3.75 12h16.5m-16.5 5.25h16.5"}))}const Bs=a.forwardRef(Hs),De=Bs;function Fs({title:s,titleId:t,...n},r){return a.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",fill:"none",viewBox:"0 0 24 24",strokeWidth:1.5,stroke:"currentColor","aria-hidden":"true",ref:r,"aria-labelledby":t},n),s?a.createElement("title",{id:t},s):null,a.createElement("path",{strokeLinecap:"round",strokeLinejoin:"round",d:"M9 12.75L11.25 15 15 9.75M21 12a9 9 0 11-18 0 9 9 0 0118 0z"}))}const zs=a.forwardRef(Fs),qs=zs;function Gs({title:s,titleId:t,...n},r){return a.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",fill:"none",viewBox:"0 0 24 24",strokeWidth:1.5,stroke:"currentColor","aria-hidden":"true",ref:r,"aria-labelledby":t},n),s?a.createElement("title",{id:t},s):null,a.createElement("path",{strokeLinecap:"round",strokeLinejoin:"round",d:"M9.75 9.75l4.5 4.5m0-4.5l-4.5 4.5M21 12a9 9 0 11-18 0 9 9 0 0118 0z"}))}const Ws=a.forwardRef(Gs),Js=Ws,Oe=""+new URL("crfm-logo-74391ab8.png",import.meta.url).href,He=""+new URL("helm-logo-simple-2ed5400b.png",import.meta.url).href;function Vs({title:s,titleId:t,...n},r){return a.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 24 24",fill:"currentColor","aria-hidden":"true",ref:r,"aria-labelledby":t},n),s?a.createElement("title",{id:t},s):null,a.createElement("path",{fillRule:"evenodd",d:"M12 2.25a.75.75 0 01.75.75v11.69l3.22-3.22a.75.75 0 111.06 1.06l-4.5 4.5a.75.75 0 01-1.06 0l-4.5-4.5a.75.75 0 111.06-1.06l3.22 3.22V3a.75.75 0 01.75-.75zm-9 13.5a.75.75 0 01.75.75v2.25a1.5 1.5 0 001.5 1.5h13.5a1.5 1.5 0 001.5-1.5V16.5a.75.75 0 011.5 0v2.25a3 3 0 01-3 3H5.25a3 3 0 01-3-3V16.5a.75.75 0 01.75-.75z",clipRule:"evenodd"}))}const Zs=a.forwardRef(Vs),Be=Zs;function Ks({title:s,titleId:t,...n},r){return a.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 24 24",fill:"currentColor","aria-hidden":"true",ref:r,"aria-labelledby":t},n),s?a.createElement("title",{id:t},s):null,a.createElement("path",{fillRule:"evenodd",d:"M15.75 2.25H21a.75.75 0 01.75.75v5.25a.75.75 0 01-1.5 0V4.81L8.03 17.03a.75.75 0 01-1.06-1.06L19.19 3.75h-3.44a.75.75 0 010-1.5zm-10.5 4.5a1.5 1.5 0 00-1.5 1.5v10.5a1.5 1.5 0 001.5 1.5h10.5a1.5 1.5 0 001.5-1.5V10.5a.75.75 0 011.5 0v8.25a3 3 0 01-3 3H5.25a3 3 0 01-3-3V8.25a3 3 0 013-3h8.25a.75.75 0 010 1.5H5.25z",clipRule:"evenodd"}))}const Ys=a.forwardRef(Ks),Xs=Ys;function Qs({title:s,titleId:t,...n},r){return a.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 24 24",fill:"currentColor","aria-hidden":"true",ref:r,"aria-labelledby":t},n),s?a.createElement("title",{id:t},s):null,a.createElement("path",{fillRule:"evenodd",d:"M15 3.75a.75.75 0 01.75-.75h4.5a.75.75 0 01.75.75v4.5a.75.75 0 01-1.5 0V5.56l-3.97 3.97a.75.75 0 11-1.06-1.06l3.97-3.97h-2.69a.75.75 0 01-.75-.75zm-12 0A.75.75 0 013.75 3h4.5a.75.75 0 010 1.5H5.56l3.97 3.97a.75.75 0 01-1.06 1.06L4.5 5.56v2.69a.75.75 0 01-1.5 0v-4.5zm11.47 11.78a.75.75 0 111.06-1.06l3.97 3.97v-2.69a.75.75 0 011.5 0v4.5a.75.75 0 01-.75.75h-4.5a.75.75 0 010-1.5h2.69l-3.97-3.97zm-4.94-1.06a.75.75 0 010 1.06L5.56 19.5h2.69a.75.75 0 010 1.5h-4.5a.75.75 0 01-.75-.75v-4.5a.75.75 0 011.5 0v2.69l3.97-3.97a.75.75 0 011.06 0z",clipRule:"evenodd"}))}const et=a.forwardRef(Qs),st=et;function tt({title:s,titleId:t,...n},r){return a.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 24 24",fill:"currentColor","aria-hidden":"true",ref:r,"aria-labelledby":t},n),s?a.createElement("title",{id:t},s):null,a.createElement("path",{fillRule:"evenodd",d:"M12.53 16.28a.75.75 0 01-1.06 0l-7.5-7.5a.75.75 0 011.06-1.06L12 14.69l6.97-6.97a.75.75 0 111.06 1.06l-7.5 7.5z",clipRule:"evenodd"}))}const nt=a.forwardRef(tt),Fe=nt;function rt({title:s,titleId:t,...n},r){return a.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 24 24",fill:"currentColor","aria-hidden":"true",ref:r,"aria-labelledby":t},n),s?a.createElement("title",{id:t},s):null,a.createElement("path",{fillRule:"evenodd",d:"M11.47 4.72a.75.75 0 011.06 0l3.75 3.75a.75.75 0 01-1.06 1.06L12 6.31 8.78 9.53a.75.75 0 01-1.06-1.06l3.75-3.75zm-3.75 9.75a.75.75 0 011.06 0L12 17.69l3.22-3.22a.75.75 0 111.06 1.06l-3.75 3.75a.75.75 0 01-1.06 0l-3.75-3.75a.75.75 0 010-1.06z",clipRule:"evenodd"}))}const at=a.forwardRef(rt),ze=at;function lt({title:s,titleId:t,...n},r){return a.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 24 24",fill:"currentColor","aria-hidden":"true",ref:r,"aria-labelledby":t},n),s?a.createElement("title",{id:t},s):null,a.createElement("path",{fillRule:"evenodd",d:"M10.5 3.75a6.75 6.75 0 100 13.5 6.75 6.75 0 000-13.5zM2.25 10.5a8.25 8.25 0 1114.59 5.28l4.69 4.69a.75.75 0 11-1.06 1.06l-4.69-4.69A8.25 8.25 0 012.25 10.5z",clipRule:"evenodd"}))}const it=a.forwardRef(lt),ot=it;function pe(s,t){return t?t==="home"?"https://crfm.stanford.edu/helm/":s?`https://crfm.stanford.edu/helm/${t}/${s}/`:`https://crfm.stanford.edu/helm/${t}/latest/`:"#"}function qe(){const[s,t]=a.useState([]),[n,r]=a.useState();return a.useEffect(()=>{fetch("https://raw.githubusercontent.com/stanford-crfm/helm/main/helm-frontend/project_metadata.json").then(l=>l.json()).then(l=>{if(t(l),window.PROJECT_ID){const c=l.find(i=>i.id===window.PROJECT_ID);r(c)}else{const c=l.find(i=>i.id==="lite");r(c)}}).catch(l=>{console.error("Error fetching JSON:",l)})},[]),n===void 0||n.title===void 0?null:e.jsxs("div",{className:"dropdown z-50",children:[e.jsxs("div",{tabIndex:0,role:"button",className:"btn normal-case bg-white font-bold p-2 border-0 text-lg block whitespace-nowrap z-40","aria-haspopup":"true","aria-controls":"menu",children:[n.title," ",e.jsx(Fe,{fill:"black",color:"black",className:"text w-4 h-4 inline"})]}),e.jsx("ul",{tabIndex:0,className:"-translate-x-36 dropdown-content z-[1] menu p-1 shadow-lg bg-base-100 rounded-box w-max text-base",role:"menu",children:s.map((l,c)=>e.jsx("li",{className:"z-40",children:e.jsxs("a",{href:pe(void 0,l.id),className:"block",role:"menuitem",children:[e.jsx("strong",{className:n.title===l.title?"underline":"",children:l.title}),": ",l.description]})},c))})]})}function _(s){return`${window.BENCHMARK_OUTPUT_BASE_URL.replace(/\/$/,"")}/${s.replace(/^\//,"")}`}function V(){return window.RELEASE?`/releases/${window.RELEASE}`:`/runs/${window.SUITE}`}async function ct(s){try{return await(await fetch(_(`${V()}/summary.json`),{signal:s})).json()}catch(t){return console.log(t),{release:void 0,suites:void 0,suite:void 0,date:""}}}function dt(){const[s,t]=a.useState({release:void 0,suites:void 0,suite:void 0,date:""}),[n,r]=a.useState();a.useEffect(()=>{fetch("https://raw.githubusercontent.com/stanford-crfm/helm/main/helm-frontend/project_metadata.json").then(m=>m.json()).then(m=>{if(window.PROJECT_ID){const h=m.find(N=>N.id===window.PROJECT_ID);r(h)}else{const h=m.find(N=>N.id==="lite");r(h)}}).catch(m=>{console.error("Error fetching JSON:",m)})},[]);function l(){return n!==void 0&&n.releases!==void 0?n.releases:["v1.0.0"]}a.useEffect(()=>{const m=new AbortController;async function h(){const N=await ct(m.signal);t(N)}return h(),()=>m.abort()},[]);const c=l();if(!s.release&&!s.suite)return null;const i=`Release ${s.release||s.suite} (${s.date})`;return c.length<=1?e.jsx("div",{children:i}):e.jsxs("div",{className:"dropdown",children:[e.jsxs("div",{tabIndex:0,role:"button",className:"normal-case bg-white border-0 block whitespace-nowrap","aria-haspopup":"true","aria-controls":"menu",children:[i," ",e.jsx(Fe,{fill:"black",color:"black",className:"inline text w-4 h-4"})]}),e.jsx("ul",{tabIndex:0,className:"dropdown-content z-[1] menu p-1 shadow-lg bg-base-100 rounded-box w-max text-base",role:"menu",children:c.map(m=>e.jsx("li",{children:e.jsx("a",{href:pe(m,n?n.id:"lite"),className:"block",role:"menuitem",children:m})}))})]})}function mt(){return e.jsxs("nav",{className:"navbar h-24 px-8 md:px-12 bg-base-100 max-w[1500]px",children:[e.jsx("div",{children:e.jsxs("div",{className:"dropdown md:hidden mr-4",children:[e.jsx("label",{tabIndex:0,className:"btn btn-ghost hover:bg-transparent btn-lg px-0",children:e.jsx(De,{className:"w-16 h-16"})}),e.jsxs("ul",{tabIndex:0,className:"menu menu-lg dropdown-content mt-3 z-[1] p-2 bg-base-100 shadow",children:[e.jsx("li",{children:e.jsx(E,{to:"leaderboard",children:"Leaderboard"})}),e.jsx("li",{children:e.jsx(E,{to:"models",children:"Models"})}),e.jsx("li",{children:e.jsx(E,{to:"scenarios",children:"Scenarios"})}),e.jsx("li",{children:e.jsx(E,{to:"runs",className:"whitespace-nowrap",children:"Predictions"})}),e.jsx("li",{children:e.jsx(E,{to:"https://github.com/stanford-crfm/helm",children:"GitHub"})})]})]})}),e.jsxs("div",{className:"flex-1 items-center",children:[e.jsx(E,{to:"https://crfm.stanford.edu/",className:"w-24",children:e.jsx("img",{src:Oe,className:"object-contain"})}),e.jsx(E,{to:"/",className:"mx-2 w-32",children:e.jsx("img",{src:He,className:"object-contain"})}),e.jsx(qe,{})]}),e.jsx("div",{className:"flex-none hidden md:block",children:e.jsxs("ul",{className:"flex flex-row gap-6 px-1",children:[e.jsx("li",{children:e.jsx(E,{to:"leaderboard",children:"Leaderboard"})}),e.jsx("li",{children:e.jsx(E,{to:"models",children:"Models"})}),e.jsx("li",{children:e.jsx(E,{to:"scenarios",children:"Scenarios"})}),e.jsx("li",{children:e.jsx(E,{to:"runs",children:"Predictions"})}),e.jsx("li",{children:e.jsx(E,{to:"https://github.com/stanford-crfm/helm",children:"GitHub"})}),e.jsx("li",{className:"hidden lg:flex",children:e.jsx(dt,{})})]})})]})}function ut(){return e.jsxs("nav",{className:"navbar h-24 px-8 md:px-12 bg-base-100 max-w[1500]px",children:[e.jsx("div",{children:e.jsx("div",{className:"dropdown md:hidden mr-4",children:e.jsx("label",{tabIndex:0,className:"btn btn-ghost hover:bg-transparent btn-lg px-0",children:e.jsx(De,{className:"w-16 h-16"})})})}),e.jsxs("div",{className:"flex-1 items-center",children:[e.jsx(E,{to:"https://crfm.stanford.edu/",className:"w-24",children:e.jsx("img",{src:Oe,className:"object-contain"})}),e.jsx(E,{to:"/",className:"mx-2 w-32",children:e.jsx("img",{src:He,className:"object-contain"})}),e.jsx(qe,{})]})]})}function ht(){return e.jsxs(e.Fragment,{children:[window.PROJECT_ID==="home"?e.jsx(ut,{}):e.jsx(mt,{}),e.jsx("main",{className:"p-8 pt-0",children:e.jsx("div",{className:"mx-auto max-w-[1500]px",children:e.jsx(Rs,{})})})]})}async function H(s){try{return await(await fetch(_(`${V()}/schema.json`),{signal:s})).json()}catch(t){return console.log(t),{adapter:[],metric_groups:[],metrics:[],models:[],perturbations:[],run_groups:[]}}}function xt({href:s,children:t}){return e.jsx("a",{href:s,className:"link link-primary link-hover",target:"_blank",rel:"noreferrer",children:t})}function J({value:s}){return e.jsx("span",{children:e.jsx(ks,{components:{a:xt},children:s})})}function $({title:s,subtitle:t,markdown:n=!1}){return e.jsxs("header",{className:"m-4 ml-0",children:[e.jsx("h1",{className:"text-4xl",children:s}),n&&t!==void 0?e.jsx("h2",{className:"mt-2 text-neutral",children:e.jsx(J,{value:t})}):t!==void 0&&e.jsx("h2",{className:"mt-2 text-neutral",children:t})]})}const ft={open:"green",limited:"yellow",closed:"red"},pt={open:"Open",limited:"Limited",closed:"Closed"};function gt({level:s}){return e.jsx(Q,{color:ft[s],children:pt[s]})}function F(){return e.jsx("div",{className:"w-full",children:e.jsx("div",{className:"block mx-auto my-24 loading loading-spinner loading-lg"})})}function jt(){const[s,t]=a.useState([]);a.useEffect(()=>{const i=new AbortController;async function m(){const h=await H(i.signal);t(h.models)}return m(),()=>i.abort()},[]);const[n,r,l]=s.reduce((i,m)=>{switch(m.access){case"open":i[0]+=1;break;case"limited":i[1]+=1;break;case"closed":i[2]+=1;break}return i},[0,0,0]),c=Object.values(s.reduce((i,m)=>{const h=m.creator_organization;return i[h]===void 0?(i[h]={name:h,models:1},i):(i[h].models+=1,i)},{}));return s.length===0?e.jsx(F,{}):e.jsxs(e.Fragment,{children:[e.jsx($,{title:"Models"}),e.jsxs("div",{className:"overflow-x-auto mt-12",children:[e.jsxs("table",{className:"table",children:[e.jsx("thead",{children:e.jsxs("tr",{children:[e.jsx("th",{children:"Creator"}),e.jsx("th",{children:"Model"}),e.jsx("th",{children:"Description"}),e.jsx("th",{children:"Access"})]})}),e.jsx("tbody",{children:s.map(i=>e.jsxs("tr",{children:[e.jsx("td",{className:"text-lg",children:i.creator_organization}),e.jsxs("td",{children:[e.jsx("span",{className:"text-xl",children:i.display_name}),e.jsx("br",{}),e.jsx("span",{children:i.name})]}),e.jsx("td",{children:e.jsx(J,{value:i.description})}),e.jsx("td",{children:e.jsx(gt,{level:i.access})})]}))})]}),e.jsx($,{title:"Analysis"}),e.jsxs("div",{className:"grid md:grid-cols-3 grid-cols-1 gap-8",children:[e.jsxs(K,{className:"flex flex-col justify-between",children:[e.jsx(me,{children:"Models"}),e.jsx(Ie,{className:"text-6xl md:!text-[96px]",children:s.length}),e.jsx(_s,{values:[n,r,l],colors:["green","yellow","red"]}),e.jsx(ke,{categories:["Open","Limited","Closed"],colors:["green","yellow","red"]})]}),e.jsxs(K,{className:"md:col-span-2",children:[e.jsx(me,{children:"Creator Organizations"}),e.jsxs("div",{className:"flex justify-between mt-4",children:[e.jsx(Cs,{data:c,category:"models",index:"name",variant:"pie",className:"basis-5/12"}),e.jsx(ke,{categories:c.map(i=>i.name),className:"basis-7/12"})]})]})]})]})]})}function re({to:s,children:t,inTable:n=!1,title:r=""}){return n?e.jsx(E,{className:"link link-hover",to:s,title:r,children:t}):e.jsx(E,{className:"link link-primary link-hover",to:s,children:t})}function bt(){const[s,t]=a.useState([]);a.useEffect(()=>{const r=new AbortController;async function l(){const c=await H(r.signal);t(c.run_groups.filter(i=>!i.todo&&i.taxonomy&&!i.display_name.includes("CLEVA")))}return l(),()=>r.abort()},[]);const n=Object.values(s.reduce((r,l)=>{var i;const c=((i=l.taxonomy)==null?void 0:i.task)||"Unknown";return r[c]===void 0?(r[c]={name:c,value:1},r):(r[c].value+=1,r)},{}));return s.length===0?e.jsx(F,{}):(console.log(s),e.jsxs(e.Fragment,{children:[e.jsx($,{title:"Scenarios",subtitle:"A scenario represents a use case and consists of a dataset of instances."}),e.jsxs("div",{className:"overflow-x-auto mt-12",children:[e.jsxs("table",{className:"table",children:[e.jsx("thead",{children:e.jsxs("tr",{children:[e.jsx("th",{children:"Scenario"}),e.jsx("th",{children:"Task"}),e.jsx("th",{children:"What"}),e.jsx("th",{children:"Who"}),e.jsx("th",{children:"When"}),e.jsx("th",{children:"Language"}),e.jsx("th",{children:"Description"})]})}),e.jsx("tbody",{children:s.map(r=>{var l,c,i,m,h;return e.jsxs("tr",{children:[e.jsxs("td",{children:[e.jsx(re,{to:`/groups/${r.name}`,children:e.jsx("span",{className:"text-lg",children:r.display_name})}),e.jsx("span",{className:"block",children:r.name})]}),e.jsx("td",{children:((l=r.taxonomy)==null?void 0:l.task)||""}),e.jsx("td",{children:((c=r.taxonomy)==null?void 0:c.what)||""}),e.jsx("td",{children:((i=r.taxonomy)==null?void 0:i.who)||""}),e.jsx("td",{children:((m=r.taxonomy)==null?void 0:m.when)||""}),e.jsx("td",{children:((h=r.taxonomy)==null?void 0:h.language)||""}),e.jsx("td",{children:e.jsx(J,{value:r.description})})]})})})]}),e.jsx($,{title:"Analysis"}),e.jsxs("div",{className:"grid md:grid-cols-4 gap-8",children:[e.jsxs(K,{className:"flex flex-col",children:[e.jsx(me,{children:"Total scenarios"}),e.jsx(Ie,{className:"mx-auto my-6 md:mt-16 !text-[72px] md:!text-[96px]",children:s.length})]}),e.jsx(K,{className:"col-span-3",children:e.jsxs("div",{className:"grid md:grid-cols-2 gap-x-12",children:[e.jsx(Le,{data:n.slice(0,Math.floor(n.length/2))}),e.jsx(Le,{data:n.slice(Math.ceil(n.length/2))})]})})]})]})]}))}function Ge(){return _(`${V()}/groups.json`)}async function ge(s){try{return await(await fetch(Ge(),{signal:s})).json()}catch(t){return console.log(t),[]}}function le({children:s}){return e.jsx("div",{role:"navigation",className:"tabs flex-nowrap border-b-2 border-gray-2 overflow-x-auto overflow-y-hidden",children:s})}function Y({active:s=!1,onClick:t=()=>{},size:n="md",children:r}){return e.jsx("div",{onClick:t,className:`whitespace-nowrap text-${n} mb-[-2px] text-md tab tab-bordered${s?" border-2 border-grey-500 rounded":" border-none"}`,children:r})}function vt({title:s,titleId:t,...n},r){return a.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 20 20",fill:"currentColor","aria-hidden":"true",ref:r,"aria-labelledby":t},n),s?a.createElement("title",{id:t},s):null,a.createElement("path",{fillRule:"evenodd",d:"M4.25 5.5a.75.75 0 00-.75.75v8.5c0 .414.336.75.75.75h8.5a.75.75 0 00.75-.75v-4a.75.75 0 011.5 0v4A2.25 2.25 0 0112.75 17h-8.5A2.25 2.25 0 012 14.75v-8.5A2.25 2.25 0 014.25 4h5a.75.75 0 010 1.5h-5z",clipRule:"evenodd"}),a.createElement("path",{fillRule:"evenodd",d:"M6.194 12.753a.75.75 0 001.06.053L16.5 4.44v2.81a.75.75 0 001.5 0v-4.5a.75.75 0 00-.75-.75h-4.5a.75.75 0 000 1.5h2.553l-9.056 8.194a.75.75 0 00-.053 1.06z",clipRule:"evenodd"}))}const wt=a.forwardRef(vt),Te=wt;function W(s){return Number.isNaN(Number(s))?String(s):String(Math.round(Number(s)*1e3)/1e3)}function X({value:s,title:t,hideIcon:n}){if(typeof s.value=="string"&&s.value.includes("⚠")&&(s.value=s.value.replace("⚠","")),s.value===void 0)return"-";if(s.run_spec_names){const r=(()=>{if(s.run_spec_names.length==1)return"/runs/"+s.run_spec_names[0];if(s.run_spec_names.length>1){const l="/runs/?q="+s.run_spec_names.map(i=>`^${i}$`).join("|");return encodeURI(l)}})();return r?e.jsx(re,{to:r,inTable:!0,title:t,children:e.jsxs("div",{className:"flex items-center ",children:[W(s.value),!n&&e.jsx(Te,{className:"w-3 h-3 ml-1",stroke:"#cbcbcb",fill:"#cbcbcb"})]})}):t?e.jsx("a",{title:t,children:W(s.value)}):e.jsx(e.Fragment,{children:W(s.value)})}return s.href?e.jsx(re,{to:s.href,inTable:!0,title:t,children:e.jsxs("div",{className:"flex items-center",children:[W(s.value),!n&&e.jsx(Te,{className:"w-3 h-3 ml-1",stroke:"#cbcbcb",fill:"#cbcbcb"})]})}):s.markdown?e.jsx(J,{value:String(s.value)}):t?e.jsx("a",{title:t,children:W(s.value)}):e.jsx(e.Fragment,{children:W(s.value)})}function We({groupsTables:s,activeGroup:t,ignoreHref:n=!1,sortable:r=!0,sortFirstMetric:l=!0}){const[c,i]=a.useState(l?1:void 0),[m,h]=a.useState({...s[t]}),[N,S]=a.useState(1);a.useEffect(()=>{h({...s[t]})},[t,s]);const k=v=>{let w=N;c===v?w=w*-1:w=1,i(v),S(w),h(f=>{const b={...f};return b.rows.sort((M,u)=>{var g,o;const y=(g=M[v])==null?void 0:g.value,d=(o=u[v])==null?void 0:o.value;return y!==void 0&&d===void 0?-1:d!==void 0&&y===void 0?1:typeof y=="number"&&typeof d=="number"?(y-d)*w:typeof y=="string"&&typeof d=="string"?w===1?y.localeCompare(d):d.localeCompare(y):0}),b})};return a.useEffect(()=>{l&&c&&k(c)},[l,c]),e.jsx("div",{children:e.jsxs("table",{className:"rounded-lg shadow-md table",children:[e.jsx("thead",{children:e.jsx("tr",{children:m.header.map((v,w)=>e.jsx("th",{className:`${w===c?"bg-gray-100 ":"bg-white"} ${w===0?"left-0 z-10":""} whitespace-nowrap sticky top-0`,children:e.jsxs("div",{className:"flex gap-2 items-center",children:[e.jsx("span",{children:v.value}),r?e.jsx("button",{className:"link",onClick:()=>k(w),children:e.jsx(ze,{className:"w-6 h-6"})}):null]})},`${t}-${w}`))})}),e.jsx("tbody",{children:m.rows.map((v,w)=>e.jsx("tr",{children:v.map((f,b)=>e.jsx("td",{className:`${b==0?"text-lg sticky left-0":""} ${c===b?"bg-gray-100":"bg-white"}`,children:e.jsx("div",{className:f&&f.style&&f.style["font-weight"]&&f.style["font-weight"]==="bold"?"font-bold":"",children:e.jsx(X,{ignoreHref:n&&b===0,value:f})})},`${t}-${b}`))},`${t}-${w}`))})]})})}function yt(){const[s,t]=a.useState(0),[n,r]=a.useState([]),[l,c]=a.useState([]);return a.useEffect(()=>{const i=new AbortController;async function m(){const h=await ge(i.signal);c(h),r(h.map(N=>N.title))}return m(),()=>i.abort()},[]),l.length===0?e.jsx(F,{}):e.jsxs(e.Fragment,{children:[e.jsxs("div",{className:"flex justify-between",children:[e.jsx($,{title:"Results",subtitle:"Groupings of the processes, methods, and metrics involved in evaluating models, particularly in the context of natural language understanding and question answering.",className:"mb-16"}),e.jsxs("a",{className:"flex link-primary space-between items-center self-end link link-hover block",href:Ge(),download:"true",target:"_blank",children:[e.jsx(Be,{className:"w-6 h-6 mr-2"})," JSON"]})]}),e.jsx("div",{children:e.jsx(le,{children:n.map((i,m)=>e.jsx(Y,{onClick:()=>t(m),active:s===m,size:"lg",children:i},m))})}),e.jsx("div",{className:"mt-8",children:e.jsx(We,{sortable:!1,groupsTables:l,activeGroup:s})})]})}async function je(s,t){try{return await(await fetch(_(`${V()}/groups/${s}.json`),{signal:t})).json()}catch(n){return console.log(n),[]}}async function be(s){try{return await(await fetch(_(`${V()}/groups_metadata.json`),{signal:s})).json()}catch(t){return console.log(t),{}}}function Nt(){const{groupName:s}=Ce(),[t,n]=a.useState([]),[r,l]=a.useState(),[c,i]=a.useState(!0),[m,h]=a.useState(0);return a.useEffect(()=>{const N=new AbortController;async function S(){if(s===void 0)return;const[k,v]=await Promise.all([je(s,N.signal),be(N.signal)]);n(k),l(v[s]),i(!1)}return S(),()=>N.abort()},[s]),c||r===void 0?e.jsx(F,{}):t.length===0?e.jsxs(e.Fragment,{children:[e.jsx($,{title:r.display_name,subtitle:r.description,markdown:!0,className:"mr-8"}),e.jsx("div",{className:"divider"}),e.jsx("p",{className:"text-center mt-8",children:"Group currently has no results."})]}):e.jsxs(e.Fragment,{children:[e.jsx("div",{className:"flex flex-row justify-between",children:e.jsx($,{title:r.display_name,subtitle:r.description,markdown:!0,className:"mr-8 mb-16"})}),e.jsx("div",{className:"overflow-x-auto",children:t.length>1?e.jsx(le,{children:t.map((N,S)=>e.jsx(Y,{active:S===m,onClick:()=>h(S),children:N.title},S))}):null}),e.jsx(We,{groupsTables:t,activeGroup:m,ignoreHref:!0})]})}async function Je(s){try{return await(await fetch(_(`${V()}/run_specs.json`),{signal:s})).json()}catch(t){return console.log(t),[]}}function he({currentPage:s,totalPages:t,onNextPage:n,onPrevPage:r,className:l}){let c="join";return l!==void 0&&(c=`join ${l}`),e.jsxs("div",{className:c,children:[e.jsx("button",{onClick:r,className:"join-item btn",children:"«"}),e.jsxs("button",{className:"join-item btn",children:["Page ",s," of ",t]}),e.jsx("button",{onClick:n,className:"join-item btn",children:"»"})]})}const oe=100;function St(){const[s,t]=Pe(),[n,r]=a.useState([]),[l,c]=a.useState(Number(s.get("page")||1)),[i,m]=a.useState(1),[h,N]=a.useState([]),[S,k]=a.useState(!0),[v,w]=a.useState(s.get("q")||"");a.useEffect(()=>{const u=new AbortController;async function y(){const d=await Je(u.signal);r(d),f(v,d)}return y(),()=>u.abort()},[v]),a.useEffect(()=>{f(v,n)},[n,v]);function f(u,y){const d=S?new RegExp(u):null,g=y.filter(o=>d?d.test(o.name):o.name.includes(u));N(g),m(Math.ceil(g.length/oe))}const b=u=>{u.preventDefault();const d=u.target.q.value;w(d),t({q:d,page:"1"}),f(d,n)},M=h.slice((l-1)*oe,l*oe);return n.length===0?e.jsx(F,{}):e.jsxs(e.Fragment,{children:[e.jsx($,{title:"Predictions",subtitle:"All benchmark predictions"}),e.jsxs("form",{className:"flex mb-8",onSubmit:b,children:[e.jsxs("div",{className:"form-control",children:[e.jsx("input",{type:"text",name:"q",placeholder:"Search",className:"input input-bordered",value:v,onChange:u=>w(u.target.value)}),e.jsxs("label",{className:"label",children:[e.jsxs("span",{className:"label-text-alt flex item-center",children:[e.jsx("input",{type:"checkbox",className:"toggle toggle-xs",checked:S,onChange:()=>k(!S)}),e.jsx("span",{className:"ml-2",children:"Regex"})]}),e.jsx("span",{className:"label-text-alt",children:`${h.length} results`})]})]}),e.jsx("div",{className:"form-control ml-4",children:e.jsx("button",{className:"btn",children:e.jsx(ot,{className:"w-6 h-6"})})})]}),e.jsx("div",{className:"overflow-x-auto",children:e.jsxs("table",{className:"table",children:[e.jsx("thead",{children:e.jsxs("tr",{children:[e.jsx("th",{children:"Run"}),e.jsx("th",{children:"Model"}),e.jsx("th",{children:"Groups"}),e.jsx("th",{children:"Adapter method"}),e.jsx("th",{children:"Subject / Task"})]})}),e.jsx("tbody",{children:M.map((u,y)=>e.jsxs("tr",{children:[e.jsx("td",{children:e.jsx(re,{to:`/runs/${u.name}`,children:u.name})}),e.jsx("td",{children:u.adapter_spec.model}),e.jsx("td",{children:u.groups.join(", ")}),e.jsx("td",{children:u.adapter_spec.method}),e.jsx("td",{children:u.scenario_spec.args.subject||u.scenario_spec.args.task||"-"})]},`${u.name}-${y}`))})]})}),i>0?e.jsx(he,{className:"flex justify-center my-8",onNextPage:()=>{const u=Math.min(l+1,i);c(u),s.set("page",String(u)),t(s)},onPrevPage:()=>{const u=Math.max(l-1,1);c(u),s.set("page",String(u)),t(s)},currentPage:l,totalPages:i}):e.jsx("div",{className:"my-8 text-center",children:"No results"})]})}function q(){return window.SUITE!==void 0?window.SUITE:void 0}async function Et(s,t,n){try{return await(await fetch(_(`/runs/${n||q()}/${s}/instances.json`),{signal:t})).json()}catch(r){return console.log(r),[]}}async function Mt(s,t,n){try{return await(await fetch(_(`/runs/${n||q()}/${s}/stats.json`),{signal:t})).json()}catch(r){return console.log(r),[]}}async function Rt(s,t,n){try{return await(await fetch(_(`/runs/${n||q()}/${s}/display_requests.json`),{signal:t})).json()}catch(r){return console.log(r),[]}}async function kt(s,t,n){try{return await(await fetch(_(`/runs/${n||q()}/${s}/display_predictions.json`),{signal:t})).json()}catch(r){return console.log(r),[]}}async function Lt(s,t,n){try{return await(await fetch(_(`/runs/${n||q()}/${s}/scenario.json`),{signal:t})).json()}catch(r){console.log(r);return}}function At(s,t){return _(`/runs/${t||q()}/${s}/run_spec.json`)}function Tt(s,t){return _(`/runs/${t||q()}/${s}/scenario_state.json`)}function ce({value:s}){return e.jsx(Q,{icon:qs,color:"green",children:s})}function de({value:s}){return e.jsx(Q,{icon:Js,color:"red",children:s})}function _t({stats:s}){if(s.quasi_exact_match!==void 0)return s.quasi_exact_match?e.jsx(ce,{value:`quasi exact match: ${s.quasi_exact_match}`}):e.jsx(de,{value:`quasi exact match: ${s.quasi_exact_match}`});if(s.toxic_frac!==void 0)return s.toxic_frac>0?e.jsx(de,{value:`toxic frac: ${s.toxic_frac}`}):e.jsx(ce,{value:`toxic frac: ${s.toxic_frac}`});if(s.exact_match!==void 0)return s.exact_match>0?e.jsx(ce,{value:`exact match: ${s.exact_match}`}):e.jsx(de,{value:`exact match: ${s.exact_match}`})}function z({value:s}){const[t,n]=a.useState(!1),[r,l]=a.useState(!1);return e.jsxs(e.Fragment,{children:[e.jsxs("div",{onMouseOver:()=>n(!0),onMouseOut:()=>n(!1),className:"relative",children:[e.jsx("div",{className:"bg-base-200 p-2 block overflow-auto w-full max-h-[36rem] mb-2 whitespace-pre-wrap",children:s}),t?e.jsx("button",{className:"bg-white absolute p-2 leading-none height-fit min-h-none right-1 bottom-1 shadow",onClick:()=>l(!0),children:e.jsx(st,{fill:"black",color:"black",className:"text w-4 h-4"})}):null]}),e.jsx("dialog",{open:r,className:"modal p-16 bg-opacity-80 bg-white",onClick:()=>l(!1),children:e.jsx("div",{className:"modal-box max-w-none p-4 whitespace-pre-wrap bg-base-200",children:s})})]})}function Ve({mediaObject:s}){if(s.content_type.includes("image")){if(s.location===void 0)return null;const t=_(s.location.replace("benchmark_output/","").replace("prod_env/","../"));return e.jsxs("div",{children:[e.jsx("img",{src:t}),e.jsx("br",{})]})}else return s.text&&s.content_type&&s.content_type==="text/plain"&&s.text.length>1?e.jsxs("div",{children:[s.text,e.jsx("br",{}),e.jsx("br",{})]}):e.jsx("div",{})}function Ze({multimediaObject:s}){return e.jsx("div",{children:s.media_objects.map(t=>e.jsx(Ve,{mediaObject:t}))})}function Ct(s){return Array.isArray(s)?s.length==0?"[]":`[${s.map(t=>String(t).replace(/\n/,"\\n")).join(", ")}]`:String(s)}function Pt({request:s}){return e.jsxs("div",{children:[s.request.prompt.length>0?e.jsxs("div",{children:[e.jsxs("h3",{className:"block text text-gray-400",children:["Prompt (",s.request.prompt.length," Chars)"]}),e.jsx(z,{value:s.request.prompt})]}):s.request.multimodal_prompt?e.jsxs("div",{children:[e.jsx("h3",{className:"block text text-gray-400",children:"Prompt"}),e.jsx(Ze,{multimediaObject:s.request.multimodal_prompt})]}):e.jsx("h3",{className:"block text text-gray-400",children:"Empty Prompt"}),e.jsx(xe,{children:Object.keys(s.request).filter(t=>t!=="prompt").map((t,n)=>e.jsxs(fe,{children:[e.jsxs("span",{children:[t,":"]}),s.request&&s.request[t]?e.jsx("span",{children:Ct(s.request[t])}):"null"]},n+1))})]})}function It(s){return e.jsx("div",{children:s.map((t,n)=>e.jsxs("div",{children:[t.error&&e.jsxs("div",{children:[e.jsx("h3",{className:"ml-1",children:"Error"}),e.jsx(z,{value:t.error})," "]}),t.text&&e.jsxs("div",{children:[e.jsx("h3",{className:"ml-1",children:"Text"}),e.jsx(z,{value:t.text})," "]}),t.media_object&&e.jsx(Ve,{mediaObject:t.media_object})]},n))})}function Ut(s){return e.jsx("div",{children:Object.entries(s).map(([t,n])=>e.jsxs("div",{children:[e.jsx("h3",{className:"ml-1",children:t}),e.jsx(z,{value:n.toString()})]}))})}function $t({predictionAnnotations:s}){return e.jsx("div",{children:s&&s!==void 0?Object.entries(s).map(([t,n])=>e.jsxs("div",{children:[e.jsx("h3",{children:e.jsx("strong",{children:t})}),Array.isArray(n)?It(n):Ut(n)]},t)):null})}function Dt({predictions:s,requests:t,metricFieldMap:n}){return s.length<1?null:e.jsx("div",{children:e.jsx("div",{className:"flex flex-wrap justify-start items-start",children:s.map((r,l)=>e.jsxs("div",{className:"w-full",children:[s.length>1?e.jsxs("h2",{children:["Trial ",l]}):null,e.jsx("div",{className:"mt-2 w-full",children:r.base64_images&&r.base64_images.length>0?e.jsxs(e.Fragment,{children:[e.jsx("h3",{className:"mr-4",children:"Prediction image"}),r.base64_images.map(c=>e.jsx("img",{src:"data:image;base64,"+c,alt:"Base64 Image"}))]}):e.jsxs(e.Fragment,{children:[e.jsxs("h3",{children:[e.jsx("span",{className:"mr-4",children:"Prediction raw text"}),e.jsx(_t,{stats:r.stats})]}),e.jsx(z,{value:r.predicted_text}),r.mapped_output?e.jsxs(e.Fragment,{children:[e.jsx("h3",{children:"Prediction mapped output"}),e.jsx(z,{value:String(r.mapped_output)})]}):null]})}),e.jsx($t,{predictionAnnotations:r.annotations}),e.jsx("h3",{children:"Metrics"}),e.jsx(xe,{children:Object.keys(r.stats).map((c,i)=>e.jsxs(fe,{children:[n[c]?e.jsx("span",{title:n[c].description,children:n[c].display_name}):e.jsx("span",{children:c}),e.jsx("span",{children:String(r.stats[c])})]},i))}),e.jsxs("details",{className:"collapse collapse-arrow border rounded-md bg-white",children:[e.jsx("summary",{className:"collapse-title",children:"Request details"}),e.jsx("div",{className:"collapse-content",children:e.jsx(Pt,{request:t[l]})})]})]},l))})})}const Ot="correct";function Ht({references:s}){return e.jsxs("span",{children:[e.jsx("h3",{children:"References"}),e.jsx("ul",{children:s.map((t,n)=>e.jsxs("li",{className:"bg-base-200 p-2 block overflow-auto w-full max-h-72 mb-2 whitespace-pre-wrap",children:[t.output.text,t.tags.map(r=>e.jsx(Q,{className:"mx-2",color:r===Ot?"green":void 0,children:r}))]},n))})]})}function Bt({instance:s,requests:t,predictions:n,metricFieldMap:r}){return e.jsxs("div",{className:"border p-4",children:[e.jsx("h3",{className:"text-xl mb-4",children:`Instance id: ${s.id} [split: ${s.split}]`}),e.jsx("h3",{children:"Input"}),s.input.multimedia_content!==void 0?e.jsx(Ze,{multimediaObject:s.input.multimedia_content}):s.input.text.includes('<br><img src="data:image;base64')?e.jsx("div",{dangerouslySetInnerHTML:{__html:s.input.text}}):e.jsx(z,{value:s.input.text}),e.jsx("div",{children:s.references&&s.references.length>0?e.jsx(Ht,{references:s.references}):null}),e.jsx("div",{children:n&&t?e.jsx(Dt,{predictions:n,requests:t,metricFieldMap:r}):null})]})}function Ft({stat:s,metricFieldMap:t}){const n=`${s.name.split!==void 0?` on ${s.name.split}`:""}${s.name.sub_split!==void 0?`/${s.name.sub_split}`:""}${s.name.perturbation!==void 0?` with ${s.name.perturbation.name}`:" original"}`;return t[s.name.name]?e.jsxs("span",{title:t[s.name.name].description,children:[e.jsx("strong",{children:t[s.name.name].display_name||s.name.name}),n]}):e.jsxs("span",{children:[e.jsx("strong",{children:s.name.name}),n]})}function Ke(){return window.RELEASE!==void 0?window.RELEASE:void 0}async function zt(s){try{return await(await fetch(_(`/releases/${Ke()}/runs_to_run_suites.json`),{signal:s})).json()}catch(t){return console.log(t),{}}}function qt(s,t){return Ke()?s[t]:window.SUITE}const te=10,ne=50;function Gt(){const{runName:s}=Ce(),[t,n]=Pe(),[r,l]=a.useState(0),[c,i]=a.useState(),[m,h]=a.useState(),[N,S]=a.useState([]),[k,v]=a.useState([]),[w,f]=a.useState(),[b,M]=a.useState(),[u,y]=a.useState(1),[d,g]=a.useState(1),[o,p]=a.useState(1),[x,P]=a.useState(1),[D,T]=a.useState(),[L,I]=a.useState(),[C,se]=a.useState({}),[Z,ps]=a.useState({}),[ye,gs]=a.useState("");if(a.useEffect(()=>{const j=new AbortController;async function B(){const U=j.signal;if(s===void 0)return()=>j.abort();const G=window.SUITE?window.SUITE:qt(await zt(U),s);h(G);const[Ne,Se,Ee,vs,ws,ys]=await Promise.all([Je(U),Et(s,U,G),Mt(s,U,G),Lt(s,U,G),kt(s,U,G),Rt(s,U,G)]);i(Ne.find(R=>R.name===s)),S(Se);const Me=Math.ceil(Se.length/te),Ns=Number(t.get("instancesPage")||1);g(Me),y(Math.max(Math.min(Ns,Me),1)),v(Ee),I(vs);const Re=Math.floor(Ee.length/ne),Ss=Number(t.get("metricsPage")||1);P(Re),p(Math.max(Math.min(Ss,Re),1)),f(ws.reduce((R,A)=>(R[A.instance_id]===void 0&&(R[A.instance_id]=[]),R[A.instance_id].push(A),R),{})),M(ys.reduce((R,A)=>(R[A.instance_id]===void 0&&(R[A.instance_id]=[]),R[A.instance_id].push(A),R),{}));const ie=await H(U);ps(ie.metrics.reduce((R,A)=>(R[A.name]=A,R),{})),se(ie.adapter.reduce((R,A)=>(R[A.name]=A,R),{})),T(ie.models.find(R=>{var A;return R.name===((A=Ne.find(Es=>Es.name===s))==null?void 0:A.adapter_spec.model)}))}return B(),()=>j.abort()},[s,t]),c===void 0||w===void 0||b===void 0||L===void 0)return e.jsx(F,{});const js=N.slice((u-1)*te,(u-1)*te+te),bs=k.slice((o-1)*ne,(o-1)*ne+ne);return e.jsxs(e.Fragment,{children:[e.jsx("div",{className:"flex justify-between gap-8 mb-12",children:e.jsxs("div",{children:[e.jsxs("h1",{className:"text-3xl flex items-center",children:[L.name,e.jsx("a",{href:"/#/groups/"+L.name,children:e.jsx(Xs,{className:"w-6 h-6 ml-2"})})]}),e.jsx("h3",{className:"text-xl",children:e.jsx(J,{value:L.description})}),e.jsx("h1",{className:"text-3xl mt-2",children:c.adapter_spec.model}),e.jsx("h3",{className:"text-xl",children:e.jsx(J,{value:(D==null?void 0:D.description)||""})}),e.jsx("div",{className:"mt-2 flex gap-2",children:L.tags.map(j=>e.jsx(Q,{size:"xs",color:"gray",children:e.jsx("span",{className:"text text-md",children:j})}))})]})}),e.jsxs(K,{children:[e.jsxs("div",{className:"flex justify-between",children:[e.jsx("h3",{className:"text-lg mb-1",children:"Adapter Specification"}),e.jsxs("div",{className:"flex gap-2",children:[e.jsx(Be,{className:"w-6 h-6 mr-1 text text-primary"}),e.jsx("a",{className:"link link-primary link-hover",href:At(c.name,m),download:"true",target:"_blank",children:"Spec JSON"}),e.jsx("a",{className:"link link-primary link-hover",href:Tt(c.name,m),download:"true",target:"_blank",children:"Full JSON"})]})]}),e.jsx("div",{children:e.jsx(xe,{className:"grid md:grid-cols-2 lg:grid-cols-3 gap-x-8",children:Object.entries(c.adapter_spec).map(([j,B],U)=>e.jsxs(fe,{className:U<3?"!border-0":"",children:[e.jsx("strong",{className:"mr-1",title:C[j]?C[j].description:void 0,children:`${j}: `}),e.jsx("span",{className:"overflow-x-auto",children:B})]}))})})]}),e.jsx("div",{className:"mt-16 mb-8",children:e.jsxs(le,{children:[e.jsx(Y,{size:"lg",active:r===0,onClick:()=>l(0),children:"Instances + Predictions"}),e.jsx(Y,{size:"lg",active:r===1,onClick:()=>l(1),children:"All metrics"})]})}),r===0?e.jsxs(e.Fragment,{children:[e.jsx("div",{className:"grid gap-8",children:js.map((j,B)=>e.jsx(Bt,{instance:j,requests:b[j.id],predictions:w[j.id],metricFieldMap:Z},`${j.id}-${B}`))}),e.jsx(he,{className:"flex justify-center my-8",onNextPage:()=>{const j=Math.min(u+1,d);y(j),t.set("instancesPage",String(j)),n(t)},onPrevPage:()=>{const j=Math.max(u-1,1);y(j),t.set("instancesPage",String(j)),n(t)},currentPage:u,totalPages:d})]}):e.jsxs("div",{children:[e.jsx("div",{className:"flex justify-start my-4",children:e.jsx("input",{type:"text",className:"input input-bordered w-full max-w-xs",placeholder:"Search for a metric",onChange:j=>gs(j.target.value)})}),e.jsx("div",{className:"overflow-x-auto",children:e.jsxs("table",{className:"table",children:[e.jsx("thead",{children:e.jsx("tr",{children:Object.keys(k[0]).map(j=>e.jsx("th",{children:j},j))})}),e.jsx("tbody",{children:bs.filter(j=>!ye||j.name.name.toLowerCase().includes(ye.toLowerCase())).map(j=>e.jsx("tr",{children:Object.entries(j).map(([B,U])=>B==="name"?e.jsx("td",{children:e.jsx(Ft,{stat:j,metricFieldMap:Z})},B):e.jsx("td",{children:U}))}))})]})}),e.jsx(he,{className:"flex justify-center my-8",onNextPage:()=>{const j=Math.min(o+1,x);p(j),t.set("metricsPage",String(j)),n(t)},onPrevPage:()=>{const j=Math.max(o-1,1);p(j),t.set("metricsPage",String(j)),n(t)},currentPage:o,totalPages:x})]})]})}function Wt({groupsTables:s,activeGroup:t,sortable:n=!0,sortFirstMetric:r=!0}){const[l,c]=a.useState(r?1:void 0),[i,m]=a.useState({...s[t]}),[h,N]=a.useState(1);function S(d){return d.length>30?d.substring(0,27)+"...":d}const k=d=>{const g=["AIRBench 2024 -","-book"];if(d.value==="Model/adapter")return"Model";if(g.some(o=>d.value.includes(o))){let o=d.value;return g.forEach(p=>{o=o.replace(p,"")}),S(o)}else return S(d.value)},[v,w]=a.useState(void 0);a.useEffect(()=>{const d=new AbortController;async function g(){const o=await H(d.signal);w(o)}return g(),()=>d.abort()},[]);const f=d=>{if(v){const g=v.models.find(o=>o.display_name===d);if(g){let o=g.description;return o.includes("/")&&(o=o.replace("/","_")),o}}return""},b=d=>{if(v){const g=v.models.find(o=>o.display_name===d);if(g){let o=g.name;return o.includes("/")&&(o=o.replace("/","_")),o}}return""};function M(d){const g=d.lastIndexOf(" - ");return g===-1?d:d.substring(0,g)+"*"+d.substring(g+1)}const u=d=>{const o=M(d).split("*")[0].trim();if(v){const p=v.run_groups.find(x=>x.display_name===o||x.short_display_name===o);if(p)return p.name}return""};a.useEffect(()=>{m({...s[t]})},[t,s]);const y=(d,g=!1)=>{let o=h;l===d?o=o*-1:o=1,g&&(o=o*-1),c(d),N(o),m(p=>{const x={...p};return x.rows.sort((P,D)=>{var I,C;const T=(I=P[d])==null?void 0:I.value,L=(C=D[d])==null?void 0:C.value;return T!==void 0&&L===void 0?-1:L!==void 0&&T===void 0?1:typeof T=="number"&&typeof L=="number"?(T-L)*o:typeof T=="string"&&typeof L=="string"?o===1?T.localeCompare(L):L.localeCompare(T):0}),x})};return a.useEffect(()=>{r&&l&&y(l,i.header[l].lower_is_better)},[r,l]),e.jsx(e.Fragment,{children:e.jsx("div",{children:e.jsx("div",{children:e.jsxs("table",{className:"rounded-lg shadow-md table",children:[e.jsx("thead",{children:e.jsx("tr",{children:i.header.map((d,g)=>e.jsx("th",{className:`${g===l?"bg-gray-100":"bg-white"} ${g===0?"left-0 z-40":""} ${d.description?"underline decoration-dashed decoration-gray-300 ":""} whitespace-nowrap px-4 sticky top-0`,title:d.description?d.description:"",children:e.jsxs("div",{className:"z-20 flex justify-between items-center min-w-48 w-48 max-w-48 text-wrap",children:[e.jsx("span",{className:"inline-block w-full break-words",children:k(d)}),n?e.jsx("button",{className:"link",onClick:()=>y(g,d.lower_is_better),children:e.jsx(ze,{className:"w-6 h-6"})}):null]})},`${t}-${g}`))})}),e.jsx("tbody",{children:i.rows.map((d,g)=>e.jsx("tr",{children:d.map((o,p)=>e.jsx("td",{className:`${p===0?"z-20 text-lg sticky left-0":"z-0"} ${g%2===0?"bg-gray-50":"bg-white"}`,children:p==1?e.jsx("div",{className:`${o&&o.style&&o.style["font-weight"]&&o.style["font-weight"]==="bold"?"font-bold":""}`,children:e.jsx(X,{value:{...o,href:"/runs/?q="+b(String(d[0].value))},title:`Click value to see all predictions for: ${b(String(d[0].value))}`})}):e.jsx("div",{className:`${o&&o.style&&o.style["font-weight"]&&o.style["font-weight"]==="bold"?"font-bold":""} ${p===0?"underline decoration-dashed decoration-gray-300 z-10":"z-0"}`,children:e.jsx(X,{value:{...o},title:String(d[0].value)===o.value?f(String(d[0].value)):`Click value to see predictions for ${u(k(i.header[p]))}: ${b(String(d[0].value))}`})})},`${t}-${p}`))},`${t}-${g}`))})]})})})})}function Jt(){const[s,t]=a.useState([]),[n,r]=a.useState(),[l,c]=a.useState([]),[i,m]=a.useState(),[h,N]=a.useState(!0),[S,k]=a.useState(0);function v(f,b){console.log(f,b);const M=f.find(u=>u.title===b);return M??f[0]}function w(f,b){r(v(f,b))}return a.useEffect(()=>{const f=new AbortController;async function b(){const M=await ge(f.signal),u=[];if(M.forEach(o=>{o.rows.forEach(p=>{u.push({title:String(p[0].value),name:p[0].href.replace("?group=","")})})}),t(u),u.length===0)throw new Error("Could not find any groups!");const y=n?n.name:u[0].name,[d,g]=await Promise.all([je(y,f.signal),be(f.signal)]);c(d),m(g[y]),N(!1)}return b(),()=>f.abort()},[n]),h||i===void 0?e.jsx(F,{}):l.length===0?e.jsxs(e.Fragment,{children:[e.jsx($,{title:i.display_name,subtitle:i.description,markdown:!0}),e.jsx("div",{className:"divider"}),e.jsx("p",{className:"text-center mt-8",children:"Group currently has no results."})]}):e.jsx(e.Fragment,{children:e.jsxs(e.Fragment,{children:[e.jsxs("div",{className:"flex flex-row justify-between",children:[e.jsx($,{title:"HELM Leaderboard",subtitle:"The HELM leaderboard shows how the various models perform across different scenarios and metrics.",markdown:!0}),e.jsxs("div",{className:"w-64 pt-8",children:[e.jsx("label",{htmlFor:"group",className:"block text-sm font-medium text-gray-700",children:"Select a group:"}),e.jsx("select",{id:"group",name:"group",value:n?n.title:s[0].title,onChange:f=>w(s,f.target.value),className:"mt-1 block w-full pl-3 pr-10 py-2 text-base border-gray-300 focus:outline-none focus:ring focus:border-blue-300 rounded-md",children:s.map((f,b)=>e.jsx("option",{value:f.title,children:f.title},b))})]})]}),e.jsx("div",{className:"overflow-x-auto",children:l.length>1?e.jsx(le,{children:l.map((f,b)=>e.jsx(Y,{active:b===S,onClick:()=>k(b),children:f.title},b))}):null}),e.jsx(Wt,{groupsTables:l,activeGroup:S,ignoreHref:!0})]})})}const Vt=""+new URL("instruct-flowchart-48854f7c.svg",import.meta.url).href,Zt=""+new URL("instruct-graph-0a57d7d2.svg",import.meta.url).href;function Kt(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl mt-16 font-bold text-center",children:"HELM Instruct: A Multidimensional Instruction Following Evaluation Framework with Absolute Ratings"}),e.jsxs("div",{className:"flex flex-col sm:flex-row justify-center gap-2 sm:gap-8 md:gap-32 my-8",children:[e.jsx("a",{className:"px-10 btn rounded-md",href:"https://crfm.stanford.edu/2024/02/18/helm-instruct.html",children:"Blog Post"}),e.jsx("a",{className:"px-10 btn rounded-md",href:"https://github.com/stanford-crfm/helm",children:"Github"})]}),e.jsxs("p",{children:["We introduce ",e.jsx("em",{children:"HELM Instruct"}),", a multidimensional evaluation framework for instruction-following LLMs with absolute ratings. The framework takes an instruction, a model, an evaluator, and a criterion to generate a score. In our study, we use HELM Instruct to compare 4 instruction-following models on 7 scenarios based on 4 Human/LM evaluators and 5 criteria. Check out the blog post for more details."]}),e.jsxs("div",{className:"grid my-16 grid-cols-1 md:mx-32 md:grid-cols-2 md:gap-2",children:[e.jsx("img",{src:Vt,alt:"Evaluation flowchart",className:"mx-auto block",sizes:"100vw"}),e.jsx("img",{src:Zt,alt:"7 scenarios, 4 models, 4 evaluators and 5 criteria",className:"mx-auto block",sizes:"100vw"})]}),e.jsxs("table",{className:"rounded-lg shadow-md table",children:[e.jsx("thead",{children:e.jsxs("tr",{children:[e.jsx("th",{children:"Model"}),e.jsx("th",{children:"Average"}),e.jsx("th",{children:"Helpfulness"}),e.jsx("th",{children:"Understandability"}),e.jsx("th",{children:"Completeness"}),e.jsx("th",{children:"Conciseness"}),e.jsx("th",{children:"Harmlessness"})]})}),e.jsxs("tbody",{children:[e.jsxs("tr",{children:[e.jsx("td",{children:"openai_gpt-4-0314"}),e.jsx("td",{children:"4.63"}),e.jsx("td",{children:"4.42"}),e.jsx("td",{children:"4.85"}),e.jsx("td",{children:"4.50"}),e.jsx("td",{children:"4.42"}),e.jsx("td",{children:"4.95"})]}),e.jsxs("tr",{children:[e.jsx("td",{children:"openai_gpt-3.5-turbo-0613"}),e.jsx("td",{children:"4.60"}),e.jsx("td",{children:"4.34"}),e.jsx("td",{children:"4.86"}),e.jsx("td",{children:"4.42"}),e.jsx("td",{children:"4.41"}),e.jsx("td",{children:"4.97"})]}),e.jsxs("tr",{children:[e.jsx("td",{children:"anthropic_claude-v1.3"}),e.jsx("td",{children:"4.56"}),e.jsx("td",{children:"4.25"}),e.jsx("td",{children:"4.87"}),e.jsx("td",{children:"4.32"}),e.jsx("td",{children:"4.40"}),e.jsx("td",{children:"4.97"})]}),e.jsxs("tr",{children:[e.jsx("td",{children:"cohere_command-xlarge-beta"}),e.jsx("td",{children:"4.31"}),e.jsx("td",{children:"3.90"}),e.jsx("td",{children:"4.73"}),e.jsx("td",{children:"3.88"}),e.jsx("td",{children:"4.31"}),e.jsx("td",{children:"4.72"})]})]})]})]})}function ve({models:s}){return e.jsxs("section",{children:[e.jsxs("h3",{className:"text-3xl",children:[s.length," models"]}),e.jsx("ul",{children:s.map((t,n)=>t.todo?e.jsxs("li",{className:"text-slate-300 mt-1",children:[t.creator_organization," / ",t.display_name]},n):e.jsx(E,{to:"models",children:e.jsxs("li",{className:"text-black mt-1",children:[t.creator_organization," / ",t.display_name]},n)}))})]})}function we({runGroups:s}){const t=new Map(s.filter(l=>l.metric_groups!==void 0&&(l.subgroups===void 0||l.subgroups.length===0)).map(l=>[l.name,l])),n=new Set,r=[];return s.forEach(l=>{const c=l.subgroups?l.subgroups:[],i=[];c.forEach(m=>{const h=t.get(m);h&&(i.push(h),n.add(h.name))}),i.length>0&&r.push([l,i])}),e.jsxs("section",{children:[e.jsxs("h3",{className:"text-3xl",children:[n.size," scenarios"]}),e.jsx("ul",{children:r.map(([l,c])=>e.jsxs("li",{className:"my-3",children:[e.jsx(E,{className:"text-black",to:"groups/"+l.name,children:e.jsx("h2",{children:l.display_name})}),e.jsx("ul",{className:"list-disc list-inside",children:c.map(i=>i.todo?e.jsx("li",{className:`${i.todo?"ml-4 text-slate-300":"ml-4"}`,children:i.display_name},i.name):e.jsx(E,{className:"text-black",to:"groups/"+i.name,children:e.jsx("li",{className:`${i.todo?"ml-4 text-slate-300":"ml-4"}`,children:i.display_name},i.name)}))})]},l.name))})]})}const Ye=""+new URL("helmhero-28e90f4d.png",import.meta.url).href;function Yt({groupsTables:s,activeGroup:t,sortFirstMetric:n=!0,filteredCols:r=[],modelsToFilter:l=[],numModelsToAutoFilter:c=0}){const[i,m]=a.useState(n?1:void 0),[h,N]=a.useState({...s[t]}),[S,k]=a.useState(1),[v,w]=a.useState(l);function f(o){return o.length>30?o.substring(0,27)+"...":o}const b=o=>o.value==="Model/adapter"?"Model":o.value.includes("-book")?f(o.value.replace("-book","")):f(o.value),[M,u]=a.useState(void 0);a.useEffect(()=>{const o=new AbortController;async function p(){const x=await H(o.signal);u(x)}return p(),()=>o.abort()},[]);const y=o=>{if(M){const p=M.models.find(x=>x.display_name===o);if(p){let x=p.description;return x.includes("/")&&(x=x.replace("/","_")),x}}return""},d=o=>{if(M){const p=M.models.find(x=>x.display_name===o);if(p){let x=p.name;return x.includes("/")&&(x=x.replace("/","_")),x}}return""};a.useEffect(()=>{N({...s[t]});const o=h.header[1].lower_is_better;if(c){const D=s[0].rows.sort((T,L)=>o?Number(T[1].value)-Number(L[1].value):Number(L[1].value)-Number(T[1].value)).slice(0,c).map(T=>String(T[0].value));w(D)}},[t,h,s,c]);const g=(o,p=!1)=>{let x=S;i===o?x=x*-1:x=1,p&&(x=x*-1),m(o),k(x),N(P=>{const D={...P};return D.rows.sort((T,L)=>{var se,Z;const I=(se=T[o])==null?void 0:se.value,C=(Z=L[o])==null?void 0:Z.value;return I!==void 0&&C===void 0?-1:C!==void 0&&I===void 0?1:typeof I=="number"&&typeof C=="number"?(I-C)*x:typeof I=="string"&&typeof C=="string"?x===1?I.localeCompare(C):C.localeCompare(I):0}),D})};return a.useEffect(()=>{n&&i&&g(i,h.header[i].lower_is_better)},[n,i]),e.jsx(e.Fragment,{children:e.jsx("div",{className:"rounded-2xl overflow-hidden border-2 bg-white p-1 mx-2 my-0",style:{overflow:"auto",justifyContent:"space-between"},children:e.jsx("div",{className:"overflow-x-auto",children:e.jsxs("table",{className:"table w-full",children:[e.jsx("thead",{children:e.jsx("tr",{children:h.header.filter((o,p)=>r.length===0||r.includes(p)).map((o,p)=>e.jsx("th",{className:`${p===i?"bg-gray-100":""} ${o.description?"underline decoration-dashed":""} whitespace-nowrap px-4 `,title:o.description?o.description:"",children:e.jsx("div",{className:"flex gap-2 items-center",children:e.jsx("span",{children:b(o)})})},`${t}-${p}`))})}),e.jsx("tbody",{children:h.rows.filter(o=>v.includes(String(o[0].value))).map((o,p)=>e.jsx("tr",{className:`${p%2===0?"bg-gray-50":""}`,children:o.filter((x,P)=>r.length===0||r.includes(P)).map((x,P)=>e.jsx("td",{className:`${P===0?"text-lg":""}`,children:e.jsx("div",{className:x&&x.style&&x.style["font-weight"]&&x.style["font-weight"]==="bold"?"font-bold":"",children:P===0?e.jsx(X,{value:{...x},title:y(String(o[0].value)),hideIcon:!0}):e.jsx(X,{value:{...x,href:"/runs/?q="+d(String(o[0].value))},title:`Click value to see all predictions for: ${d(String(o[0].value))}`})})},`${t}-${P}`))},`${t}-${p}`))})]})})})})}function ee({numModelsToAutoFilter:s=6}){const[t,n]=a.useState([]),[r,l]=a.useState([]),[c,i]=a.useState(),[m,h]=a.useState(!0),N=0;return console.log(t),a.useEffect(()=>{const S=new AbortController;async function k(){const v=await ge(S.signal),w=[];if(v.forEach(u=>{u.rows.forEach(y=>{w.push({title:String(y[0].value),name:y[0].href.replace("?group=","")})})}),n(w),w.length===0)throw new Error("Could not find any groups!");const f=w[0].name,[b,M]=await Promise.all([je(f,S.signal),be(S.signal)]);l(b),i(M[f]),h(!1)}return k(),()=>S.abort()},[]),m||c===void 0?e.jsx(F,{}):r.length===0?e.jsxs(e.Fragment,{children:[e.jsx($,{title:c.display_name,subtitle:c.description,markdown:!0}),e.jsx("div",{className:"divider"}),e.jsx("p",{className:"text-center mt-8",children:"Group currently has no results."})]}):e.jsx(e.Fragment,{children:e.jsx(e.Fragment,{children:e.jsx(Yt,{groupsTables:r,activeGroup:N,numModelsToAutoFilter:s,filteredCols:[0,1]})})})}function Xt(){return e.jsxs("div",{className:"flex flex-col px-4 sm:px-6 py-100 sm:py-10 sm:mb-96 md:mb-96 lg:mb-0 xl:mb-0 2xl:mb-0",children:[e.jsx("div",{className:"flex flex-col text-center mb-10 justify-start",children:e.jsx("h1",{className:"text-3xl sm:text-4xl mb-3 sm:mb-4 mx-2 mt-2",children:e.jsx("strong",{children:"A holistic framework for evaluating foundation models."})})}),e.jsxs("div",{className:"flex flex-col md:flex-col lg:flex-row lg:justify-center",style:{height:"525px",transform:"scale(0.9)"},children:[e.jsx("div",{className:"w-full lg:w-1/2 flex justify-center mb-4 lg:mb-0 h-full py-10",children:e.jsx("img",{src:Ye,alt:"HELM Hero",className:"object-cover h-full",style:{maxWidth:"100%"}})}),e.jsx("div",{className:"w-full lg:w-1/2 flex justify-center h-full py-10",children:e.jsxs("div",{className:"py-2 pb-6 rounded-3xl bg-gray-100 h-full",style:{maxWidth:"100%"},children:[e.jsx(ee,{}),e.jsx("div",{className:"flex justify-end",children:e.jsx(E,{to:"leaderboard",children:e.jsx("button",{className:"px-4 mx-3 mt-1 btn bg-white rounded-md",children:e.jsx("span",{children:"See More"})})})})]})})]})]})}const Xe=""+new URL("ai21-0eb91ec3.png",import.meta.url).href,Qe=""+new URL("aleph-alpha-7ce10034.png",import.meta.url).href,es=""+new URL("anthropic-70d8bc39.png",import.meta.url).href,ss=""+new URL("bigscience-7f0400c0.png",import.meta.url).href,ts=""+new URL("cohere-3550c6cb.png",import.meta.url).href,ns=""+new URL("eleutherai-b9451114.png",import.meta.url).href,rs=""+new URL("google-06d997ad.png",import.meta.url).href,as=""+new URL("meta-5580e9f1.png",import.meta.url).href,ls=""+new URL("microsoft-f5ee5016.png",import.meta.url).href,is=""+new URL("mistral-18e1be23.png",import.meta.url).href,os=""+new URL("nvidia-86fa75c1.png",import.meta.url).href,cs=""+new URL("openai-3f8653e4.png",import.meta.url).href,ds=""+new URL("tii-24de195c.png",import.meta.url).href,ms=""+new URL("together-a665a35b.png",import.meta.url).href,us=""+new URL("tsinghua-keg-97d4b395.png",import.meta.url).href,hs="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAASYAAABfCAYAAABFnmpnAAAACXBIWXMAAAsTAAALEwEAmpwYAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAAlhSURBVHgB7d3xddtGEgbw7+np/8gVeFLBqYOwg/gqOKaC+CqIU0GcCqxUcOnAvAruUoEnFZxcwUUjEDYMA8QCmN0dgN/vPUggAHFpejGcXSyWQON/T8v/HZa3SOdR3vvEsg5O5dnyZuD5xfH5lywfzu/FL0/L8fx61njj+Lr63js9d87lPYZFfb1DDpVf0wd8rpOvMLNO3p5/Pz4td1jvm8TjPMoygvIU8ch5OXS2nZ6Wn8+/iUoTfK6Tr8/bHtDUSZ3645vz78kDE71IPM4rMKUS+FFswwHNJ9Y71AngRH1HNJmUZVEXY0AbmP6Ej5eJxwl8iPNxKR6xLUc0AUpAFINlUP/BhTrpnTGlZkKeGVNqMPSi2B4BgxPFIrhQJ70DkyQet+Wm3NYyppagqQil33uiMfK0/GtoRxuYPE+2lIov8PNtwjFeWdV/sW2Cpn1PFMU9Bq50e2dMJuXKnOentqCcrWZLXUd8efWOqLYf0TuPc2RMJTOYVAIfin34CURxWKLyurshR8YkCceUzphSjknhdfWytgPY10Sx/AOdOnnT2aHwced0TKqp7MuzrD005VpHEMVh5+mr9kE3MHmddKU7v6d4BibFfnwPoli+a1e6gekP+EjpPyrZlJvaP8eeMqZ7EMVyaFdyZExTt6UIfAnK2fpwgS77cGBwokgE56QlRx/TVMYk8DWVfQn87CljMgxMFI3Yj1p9TJ5KBSbF/jAwUTTPdbLGVTmBv0tZmlegVOxP6jQ1RKU8B6bbzgaFj7vzMpaBCcryCkwfUc7v+Pr9a/uEBH6YMVGqUnXy+cMyR2Bqn3wsMOVo6tlo87HBj16jzBXl/IrxCd4e0AxG81BjkOU/Z5R7hN+/9Tc0712KEn2JD0/Lv7EdrzF8jgmaG3G9PuTEftz2NnrNZGlX5sYCRY6TQS7s21tTziqIjUHy+HfVCExzrmwe4EcRazZPC0oP2D59Wn5AM7+SB7EfNwOFuD35iBz3yd0t3DeHIgb78Jhzcl9SIzDR/lh9VDjKFZhKBIrU5xT4iDRUwGswrCl9QzXtk1ez9KtxTMarg1cW7ltq7OTyDIIfEIeCKBbX4UZ7yZjGRpsL/ESaWSBS9kZkXOtkrsA0Nj4mV59G7vIURFRMPzB5RT2ZuT1XeXsNTKUn9iMq6WU/MHld7ZGR7bkypruZ2+cqObgyBZtyFI3CUa6MaSwgCPLIXZ4iFl7mp2gEfj4OBSaP4JQ7gxnyMmN5ilj2erWRyDzeDG2Ej5yBYsiLxNewhCIWZkwUjWudHApMCh9DgUKQjwxs83qzovXpCPxE6z+jbfobfDyfa0OByWu8jgxsyznK+C5x2xLRmjtelcCwI5083MOH2o+cTbkIE8YJfEQaXCnwu7nV6yosXbcDnFsnOZtyOQNFankeb1a0jMLzyyqZLZEHr6lpzPN9oDkzJknc5qXfTPSK4JGyCgtKR/jxvBmYro+dY7/At04+n2+3Yzsc9G8Tyd2063e2C3zU6By2ibeGZgv0fg9PIEpzGtiWo06OBiavjKkfKHIHplwZk6K8HP/hQ06gWgQ+fYV2IpdokgvyU1wITAof/UAhyOtu4vFSe+2HOYF9TDX9BJ/+QsF+/h9P7crNyAGK9XIFitTyBD4U+/QbiGL5VCfHApNHBJaJx976zZ8tN+VyU+xjvmnaD2vCndoHY4HJ62pNjkBxSbfDXeBjj82dn0EUy9+7D3JmTKYbKEoEphcjZa+xt0GI9tVQDyCKwz4otbshZx+T6U5CVmLSe+ms73Fw5VrWhn8NojgsKL3pb8ydMUlnvUTG5N10VOyHZUpHEMVgMca+/PTN0M7bkT9S+OgGB0F+MrK+1B4yphOaT6UTiGI4ofmSTB07oGRgYsZU3gMG2u9EFVmW9Dsm6mTuply3X6nkVTmBj0izCixxRDNlyzuUyViJpti9dZN18lJg8ghO7VUyQRlteRzD9KUjmu+WfwWiGI5Py3uMzON0c+EPPQJTmzEJymjLY2D6mr0ndnPwPYhiEDTBSfo7LgUmj/E7awLEksDYlifwscfBlRacSjSriVJYXXzX33gpMHlM97EmUCwZfS69ctdS7I+AY5kolgN6My1cCkyK9dr71wTzLc3Y2jLX8upnW8KuWjyguayaY+T59yCa5wGf66TC3xczLdxeOFDhw66ULQkUel4E83wDn6acoh7LaLpXBAXNQDSvKUzvz0uOoEf79EPvscC3Th7QxInROb9bXtnCt1gemJa8BivP4/aXSP1LiuYqxq/wcwDRcoqmTp7g59CulMiYLCgtCRQWGKyfae5VpD33L719Wn6ED8+vgKL5TvCpY7W/F9AG8B7gw85168YoFpiWBIulfTxL+7T6Ig6uVCxr3g7hsIG67IbqB2yfZ3fApwRmqinnNWGcYD7FsuAo2PftKF5NTA4ZIA9WHxU+pF25wXSha1kUnHsSPGJdxsTANE1AFIu0K1OBSbHe0mbc0vJdv0OdiMqbCkwe/SwHzKe937nLu/QaiKiwEk25JRnTR8fyl2LGRFRJiabcEnr+XWv0NQceElVUImNa4nFkvUb5RFTYVGCqlTnoyHqN8omosKgZk3bWawx03PrMlUSbFrWP6bHya2BTjqiiqcBkFOVpZ52d30TX4dO5nhKYanc+M3vJg+8rRTMrMCnK0t7jGtnLNWRMDEwUVkpgKt0RrL3HHC5AdGUiNuX688soylIQUVVbaMqNbStZPhEVFDEwPSZuy6X2jIBEV48Z09c4VICosoh9TDqwrWQWw45vospSA1PJk3WoLEU5CiKq6jbxOAsWHtPVptDEbbkwY6IS7EtHBX7eYkd1NzUwWb+LoIzand8fQJTfq/Pi5R12FJhSmnKmVB+PztyeA2cWIKosNTApytCZ20uVT0QFRQtMY5lZqRRVQUTVpQamCIHh0j4vHFxJFEC0jOlx4T4vCiKqbksZ0x+oWz4RFbKVzm/DjInoSqQGJqPI7zFw+URUyJzAVDtjKVE+B1cSBTAnMJXo46mdMV3T4EpmhxSNtiuRMiZduX+taztRGZgorEh9TDqxP/eJxHmYiIKI1sc0VX7O18DBlURBzAlMuTMKTTgmZ2BSEFEIkTKmlI7nnMGRfS5EQWypj8nkbG6xj4koiNSJ4lon5JswThOOseDxHfJYmzEpyvjTqayp+xIV6ynW8Xod7XOtpdgOz/duisLHp8TjLzZXn0YvVQJMAAAAAElFTkSuQmCC",xs=""+new URL("yandex-38e09d70.png",import.meta.url).href,fs=""+new URL("01-694cb9b7.png",import.meta.url).href,Qt=[Xe,Qe,es,ss,ts,ns,rs,as,ls,is,os,cs,ds,ms,us,hs,xs,fs];function _e(){const[s,t]=a.useState(void 0);return a.useEffect(()=>{const n=new AbortController;async function r(){const l=await H(n.signal);t(l)}return r(),()=>n.abort()},[]),s?e.jsxs(e.Fragment,{children:[e.jsx(Xt,{}),e.jsxs("div",{className:"mx-auto text-lg px-16",children:[e.jsxs("div",{className:"container mb-12 mx-auto text-lg px-16",children:[e.jsxs("div",{className:"flex flex-col sm:flex-row justify-center mt-10 mb-10 flex gap-2 sm:gap-8 md:gap-32",children:[" ",e.jsx("h1",{className:"text-4xl mx-4 mt-40",children:e.jsx("strong",{children:"Our Partners"})})]}),e.jsx("ol",{className:"my-8 flex flex-col gap-32",children:e.jsx("li",{children:e.jsx("div",{className:"flex flex-wrap justify-center max-w-[1100px] mx-auto w-auto",children:Qt.map((n,r)=>e.jsx("div",{className:"w-24 h-24 flex items-center m-6",children:e.jsx("img",{src:n,alt:"Logo",className:"mx-auto block",sizes:"100vw"})},r))})})})]}),e.jsx("div",{className:"container mx-auto",children:e.jsxs("div",{className:"grid grid-cols-1 sm:grid-cols-2 gap-8",children:[e.jsx(ve,{models:s.models}),e.jsx(we,{runGroups:s.run_groups})]})})]})]}):null}function en(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl mt-16 my-8 font-bold text-center",children:"Massive Multitask Language Understanding (MMLU) on HELM"}),e.jsxs("div",{className:"flex flex-col lg:flex-row items-center gap-8",children:[e.jsxs("div",{className:"flex-1 text-xl",children:[e.jsxs("p",{children:[e.jsx("strong",{children:"Massive Multitask Language Understanding (MMLU)"})," ",e.jsx("a",{href:"https://arxiv.org/pdf/2009.03300.pdf",className:"link",children:"(Hendrycks et al, 2020)"})," ","is a multiple-choice question answering test that covers 57 tasks including elementary mathematics, US history, computer science, law, and more. We publish evaluation results from evaluating various models on MMLU using HELM. Our evaluation results include the following:"]}),e.jsxs("ul",{className:"my-2 list-disc list-inside",children:[e.jsx("li",{children:"Simple, standardized prompts"}),e.jsx("li",{children:"Accuracy breakdown for each of the 57 subjects"}),e.jsx("li",{children:"Full transparency of all raw prompts and predictions"})]}),e.jsx("div",{className:"flex flex-row justify-center mt-8",children:e.jsx("a",{className:"px-10 btn rounded-md",href:"#/leaderboard",children:"Full Leaderboard"})})]}),e.jsx("div",{className:"flex-1",children:e.jsx(ee,{numModelsToAutoFilter:10})})]})]})}const sn=""+new URL("air-overview-d2e6c49f.png",import.meta.url).href;function tn(){const s={fontVariant:"small-caps",fontWeight:"bold"},t=e.jsx("span",{style:s,children:"AIR-Bench 2024"});return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl mt-16 my-8 font-bold text-center",children:t}),e.jsxs("div",{className:"flex flex-col lg:flex-row gap-8",children:[e.jsxs("div",{className:"flex-1 text-l",children:[e.jsx("img",{src:sn,alt:"AIR 2024 Categories",className:"mx-auto my-4 block w-3/4",sizes:"100vw"}),e.jsxs("p",{children:["We introduce ",t,", the first AI safety benchmark aligned with emerging government regulations and company policies, following the regulation-based safety categories grounded in our AI Risks study, AIR 2024. AIR 2024 decomposes 8 government regulations and 16 company policies into a four-tiered safety taxonomy with 314 granular risk categories in the lowest tier. ",t," contains 5,694 diverse prompts spanning these categories, with manual curation and human auditing to ensure quality. We evaluate leading language models on ",t,", uncovering insights into their alignment with specified safety concerns. By bridging the gap between public benchmarks and practical AI risks, ",t," ","provides a foundation for assessing model safety across jurisdictions, fostering the development of safer and more responsible AI systems."]}),e.jsxs("div",{className:"flex flex-row justify-center mt-4",children:[e.jsx("a",{className:"px-10 btn rounded-md mx-4",href:"#/leaderboard",style:{display:"none"},children:"Paper (TBD)"}),e.jsx("a",{className:"px-10 btn rounded-md mx-4",href:"#/leaderboard",children:"Leaderboard"})]})]}),e.jsxs("div",{className:"py-2 pb-6 rounded-3xl bg-gray-100 h-full",style:{maxWidth:"100%"},children:[e.jsx(ee,{numModelsToAutoFilter:10}),e.jsx("div",{className:"flex justify-end",children:e.jsx(E,{to:"leaderboard",children:e.jsx("button",{className:"px-4 mx-3 mt-1 btn bg-white rounded-md",children:e.jsx("span",{children:"See More"})})})})]})]})]})}const nn=""+new URL("heim-logo-3e5e3aa4.png",import.meta.url).href;function rn({metricFieldMap:s,metricGroups:t}){const n=new Set,r=[];return t.forEach(l=>{const c=[];l.metrics.forEach(i=>{const m=s[i.name];m&&(c.push(m),n.add(m.name))}),c.length>0&&r.push([l,c])}),e.jsxs("section",{children:[e.jsxs("h3",{className:"text-3xl",children:[n.size," metrics"]}),e.jsx("ul",{children:r.map(([l,c])=>e.jsxs("li",{className:"my-3",children:[e.jsx("h4",{children:l.display_name}),e.jsx("ul",{className:"list-disc list-inside",children:c.map(i=>e.jsx("li",{className:"ml-4",children:i.display_name},i.name))})]},l.name))})]})}function an(){const[s,t]=a.useState(void 0);a.useEffect(()=>{const r=new AbortController;async function l(){const c=await H(r.signal);t(c)}return l(),()=>r.abort()},[]);const n=s?s.metrics.reduce((r,l)=>(r[l.name]=l,r),{}):void 0;return e.jsxs("div",{className:"container mx-auto px-16 text-base",children:[e.jsx("div",{className:"container max-w-screen-lg mx-auto",children:e.jsx("img",{className:"mx-auto w-96",src:nn,alt:"HEIM Logo"})}),e.jsx("h1",{className:"text-3xl my-8 font-bold text-center",children:"Holistic Evaluation of Text-To-Image Models"}),e.jsxs("div",{className:"flex flex-col sm:flex-row justify-center gap-2 sm:gap-8 md:gap-32 my-8",children:[e.jsx("a",{className:"px-10 btn rounded-md",href:"https://arxiv.org/abs/2311.04287",children:"Paper"}),e.jsx("a",{className:"px-10 btn rounded-md",href:"https://github.com/stanford-crfm/helm",children:"Github"})]}),e.jsxs("p",{className:"my-2",children:["Significant effort has recently been made in developing text-to-image generation models, which take textual prompts as input and generate images. As these models are widely used in real-world applications, there is an urgent need to comprehensively understand their capabilities and risks. However, existing evaluations primarily focus on image-text alignment and image quality. To address this limitation, we introduce a new benchmark,"," ",e.jsx("strong",{children:"Holistic Evaluation of Text-To-Image Models (HEIM)"}),"."]}),e.jsx("p",{className:"my-2",children:"We identify 12 different aspects that are important in real-world model deployment, including:"}),e.jsxs("ul",{className:"my-2 list-disc list-inside unreset",children:[e.jsx("li",{children:"image-text alignment"}),e.jsx("li",{children:"image quality"}),e.jsx("li",{children:"aesthetics"}),e.jsx("li",{children:"originality"}),e.jsx("li",{children:"reasoning"}),e.jsx("li",{children:"knowledge"}),e.jsx("li",{children:"bias"}),e.jsx("li",{children:"toxicity"}),e.jsx("li",{children:"fairness"}),e.jsx("li",{children:"robustness"}),e.jsx("li",{children:"multilinguality"}),e.jsx("li",{children:"efficiency"})]}),e.jsx("p",{className:"my-2",children:"By curating scenarios encompassing these aspects, we evaluate state-of-the-art text-to-image models using this benchmark. Unlike previous evaluations that focused on alignment and quality, HEIM significantly improves coverage by evaluating all models across all aspects. Our results reveal that no single model excels in all aspects, with different models demonstrating strengths in different aspects."}),e.jsx("p",{className:"my-2",children:"For full transparency, this website contains all the prompts, generated images and the results for the automated and human evaluation metrics."}),e.jsx("p",{className:"my-2",children:"Inspired by HELM, we decompose the model evaluation into four key components: aspect, scenario, adaptation, and metric:"}),e.jsx("div",{className:"container max-w-screen-lg mx-auto my-8",children:e.jsx("img",{src:"https://crfm.stanford.edu/heim/latest/images/heim-main.png",alt:"HEIM scenarios, prompts, images and metrics"})}),s&&n?e.jsxs("div",{className:"grid grid-cols-1 md:grid-cols-3 gap-8",children:[e.jsx(ve,{models:s.models}),e.jsx(we,{runGroups:s.run_groups}),e.jsx(rn,{metricFieldMap:n,metricGroups:s.metric_groups})]}):null]})}const ln=""+new URL("vhelm-framework-cde7618a.png",import.meta.url).href,on=""+new URL("vhelm-model-6d812526.png",import.meta.url).href;function cn(){const[s,t]=a.useState(void 0);return a.useEffect(()=>{const n=new AbortController;async function r(){const l=await H(n.signal);t(l)}return r(),()=>n.abort()},[]),e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl mt-16 my-8 font-bold text-center",children:"The First Steps to Holistic Evaluation of Vision-Language Models"}),e.jsxs("p",{className:"my-4",children:["To better understand VLMs, we introduce the first version of"," ",e.jsx("em",{children:"Holistic Evaluation of Vision-Language Models (VHELM)"})," by extending the ",e.jsx("a",{href:"https://arxiv.org/abs/2211.09110",children:"HELM"})," ","framework with the necessary adaptation methods to assess the performance of 6 prominent VLMs on 3 standard VLM benchmarks."]}),e.jsx("p",{className:"my-4 font-bold",children:"This is ongoing work to achieve holistic evaluation for vision-language models, so please stay tuned!"}),e.jsxs("div",{className:"my-16 flex flex-col lg:flex-row items-center gap-8",children:[e.jsxs("div",{className:"flex-1 text-xl",children:[e.jsx("img",{src:ln,alt:"An image of a helm and the text 'This helm is a' is sent to a Vision-Language Model, which produces the text 'wheel for steering a ship...'",className:""}),e.jsx("img",{src:on,alt:"An example of an evaluation for an Aspect (Knowledge) - a Scenario (MMMU) undergoes Adaptation (multimodal multiple choice) for a Model (GPT-4 Vision), then Metrics (Exact match) are computed",className:""})]}),e.jsxs("div",{className:"flex-1",children:[e.jsx(ee,{numModelsToAutoFilter:10}),e.jsx(E,{to:"leaderboard",className:"px-4 mx-3 mt-1 btn bg-white rounded-md",children:e.jsx("span",{children:"See More"})})]})]}),s===void 0?null:e.jsxs("div",{className:"grid grid-cols-1 sm:grid-cols-2 gap-8",children:[e.jsx(ve,{models:s.models}),e.jsx(we,{runGroups:s.run_groups})]})]})}const dn=({id:s,title:t,text:n})=>(t.includes("HE")||(t="HELM "+t),e.jsx("div",{className:"max-w-sm rounded overflow-hidden bg-gray-100 hover:scale-105 transition-transform duration-300",children:e.jsx("a",{href:pe(void 0,s),children:e.jsxs("div",{className:"px-6 py-4",children:[e.jsxs("div",{className:"font-bold text-xl mb-2",children:[e.jsx("div",{className:"py-3",children:e.jsx("svg",{fill:"#000000",width:"20px",height:"20px",viewBox:"0 0 24 24",xmlns:"http://www.w3.org/2000/svg",children:e.jsx("path",{d:"M22,7H16.333V4a1,1,0,0,0-1-1H8.667a1,1,0,0,0-1,1v7H2a1,1,0,0,0-1,1v8a1,1,0,0,0,1,1H22a1,1,0,0,0,1-1V8A1,1,0,0,0,22,7ZM7.667,19H3V13H7.667Zm6.666,0H9.667V5h4.666ZM21,19H16.333V9H21Z"})})}),t+" →"]}),e.jsx("p",{className:"text-gray-700 text-base",children:n})]})})}));function mn(){const[s,t]=a.useState();return a.useEffect(()=>{fetch("https://raw.githubusercontent.com/stanford-crfm/helm/main/helm-frontend/project_metadata.json").then(n=>n.json()).then(n=>{t(n)}).catch(n=>{console.error("Error fetching JSON:",n)})},[]),e.jsx("div",{className:"p-10 mb-20",children:e.jsx("div",{className:"grid grid-cols-3 gap-4",children:s&&s.map((n,r)=>n.id==="home"?null:e.jsx(dn,{id:n.id,title:n.title,text:n.description},r))})})}function un(){return e.jsxs("div",{className:"flex flex-col md:flex-row px-6 py-32",children:[e.jsxs("div",{className:"flex-1 p-4 flex flex-col justify-center",children:[e.jsx("div",{className:"flex justify-start",children:e.jsxs("div",{children:[e.jsx("h1",{className:"text-4xl mb-4 mx-4 mt-2",children:e.jsx("strong",{children:"A reproducible and transparent framework for evaluating foundation models."})}),e.jsx("h3",{className:`text-xl
|
|
10
|
-
mb-4 mx-4 mt-2`,children:"Find leaderboards with many scenarios, metrics, and models with support for multimodality and model-graded evaluation."})]})}),e.jsxs("div",{className:"flex flex-col md:flex-row justify-start mt-6 ml-4",children:[e.jsx("button",{className:"px-6 btn btn-grey rounded-md mb-4 md:mb-0",onClick:()=>window.scrollTo({top:760,behavior:"smooth"}),children:e.jsx("div",{children:"Leaderboards ↓"})}),e.jsx("button",{className:"px-6 btn btn-grey rounded-md md:ml-4",children:e.jsx("a",{href:"https://github.com/stanford-crfm/helm",children:"Github"})})]})]}),e.jsx("div",{className:"mx-4 mt-6 md:mt-0 md:w-1/3",children:e.jsx("img",{src:Ye,alt:"HELM Hero",className:"object-cover w-full h-full"})})]})}const hn=[Xe,Qe,es,ss,ts,ns,rs,as,ls,is,os,cs,ds,ms,us,hs,xs,fs];function xn(){const[s,t]=a.useState(void 0);return a.useEffect(()=>{const n=new AbortController;async function r(){const l=await H(n.signal);t(l)}return r(),()=>n.abort()},[]),s?e.jsxs(e.Fragment,{children:[e.jsx(un,{}),e.jsx("div",{className:"container py-5 mx-auto text-lg",children:e.jsx("div",{className:"flex flex-col sm:flex-row justify-center mb-10 flex sm:gap-8 md:gap-32",children:e.jsx("h1",{className:"text-4xl mx-4 ",children:e.jsx("strong",{children:"HELM Leaderboards"})})})}),e.jsx(mn,{}),e.jsx("div",{className:"mx-auto text-lg px-16",children:e.jsxs("div",{className:"container mb-12 mx-auto text-lg px-16",children:[e.jsxs("div",{className:"flex flex-col sm:flex-row justify-center mt-10 mb-10 flex gap-2 sm:gap-8 md:gap-32",children:[" ",e.jsx("h1",{className:"text-4xl mx-4 mt-40",children:e.jsx("strong",{children:"Our Partners"})})]}),e.jsx("ol",{className:"my-8 flex flex-col gap-32",children:e.jsx("li",{children:e.jsx("div",{className:"flex flex-wrap justify-center max-w-[1100px] mx-auto w-auto",children:hn.map((n,r)=>e.jsx("div",{className:"w-24 h-24 flex items-center m-6",children:e.jsx("img",{src:n,alt:"Logo",className:"mx-auto block",sizes:"100vw"})},r))})})})]})})]}):null}const fn=""+new URL("overview-74aea3d8.png",import.meta.url).href,pn=""+new URL("process-flow-bd2eba96.png",import.meta.url).href;function gn(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl mt-16 my-8 font-bold text-center",children:"Image2Struct: A Benchmark for Evaluating Vision-Language Models in Extracting Structured Information from Images"}),e.jsxs("div",{className:"flex flex-col sm:flex-row justify-center gap-2 sm:gap-2 md:gap-8 my-8",children:[e.jsx("a",{className:"px-10 btn rounded-md",href:"TODO",children:"Paper"}),e.jsx("a",{className:"px-10 btn rounded-md",href:"https://github.com/stanford-crfm/helm",children:"Github"}),e.jsx("a",{className:"px-10 btn rounded-md",href:"https://huggingface.co/datasets/stanford-crfm/i2s-latex",children:"Latex dataset"}),e.jsx("a",{className:"px-5 btn rounded-md",href:"https://huggingface.co/datasets/stanford-crfm/i2s-webpage",children:"Webpage dataset"}),e.jsx("a",{className:"px-10 btn rounded-md",href:"https://huggingface.co/datasets/stanford-crfm/i2s-musicsheet",children:"Music sheet dataset"})]}),e.jsxs("div",{className:"flex flex-col lg:flex-row items-center gap-8",children:[e.jsxs("div",{className:"flex-1 text-xl",children:[e.jsxs("p",{children:[e.jsx("strong",{children:"Image2struct"})," is a benchmark for evaluating vision-language models in practical tasks of extracting structured information from images."]}),e.jsx("br",{}),e.jsx("p",{children:"In our tasks, VLMs are prompted to generate the underlying structured information (i.e., code) from an input image. The code can be compiled, and the output image is evaluated against the input image to produce a score. This round-trip evaluation allows us to quantitatively evaluate VLMs on complex tasks with multiple correct answers. We create a pipeline that downloads fresh, user-submitted data from active online communities upon execution, evaluates the VLMs shortly, and produces a leaderboard."}),e.jsx("br",{}),e.jsx("img",{src:fn,alt:"Evaluation flowchart",className:"mx-auto block w-full",sizes:"100vw"}),e.jsx("br",{}),e.jsx("p",{children:"We introduce 3 tasks:"}),e.jsxs("ul",{className:"my-2 list-disc list-inside",children:[e.jsx("li",{children:"LaTex: equations, tables, plots and algorithms form ArXiV papers"}),e.jsx("li",{children:"Webpages: pages from GitHub written in HTML, CSS and Javascript, ..."}),e.jsx("li",{children:"Music sheets: crops of measures from music sheets from IMSLP"})]}),e.jsx("div",{className:"flex flex-row justify-center mt-8",children:e.jsx("a",{className:"px-10 btn rounded-md",href:"#/leaderboard",children:"Full Leaderboard"})})]}),e.jsx("div",{className:"flex-1",children:e.jsx(ee,{numModelsToAutoFilter:12})})]}),e.jsx("br",{}),e.jsxs("div",{className:"flex flex-col lg:flex-row items-center gap-8",children:[e.jsxs("div",{className:"flex-1 text-xl",children:[e.jsx("p",{children:"We provide an automated process for collecting new fresh data from online communities, evaluating the models and producing a leaderboard. The pipeline is designed to be executed on a regular basis to keep the leaderboard up-to-date."}),e.jsx("br",{}),e.jsxs("p",{children:["In addition to the automated data collection, we also provide a"," ",e.jsx("i",{children:"wild"})," subset for the LaTeX and webpage tasks that are collected from Wikipedia and various popular websites. These instances do not have a corresponding code, and the evaluation is done by our proposed metric: block EMD (Earth Mover Distance)."]})]}),e.jsx("div",{className:"flex-1",children:e.jsx("img",{src:pn,alt:"7 scenarios, 4 models, 4 evaluators and 5 criteria",className:"mx-auto block w-full",sizes:"200vw"})})]})]})}function jn(){return window.PROJECT_ID==="lite"?e.jsx(_e,{}):window.PROJECT_ID==="instruct"?e.jsx(Kt,{}):window.PROJECT_ID==="image2struct"?e.jsx(gn,{}):window.PROJECT_ID==="heim"?e.jsx(an,{}):window.PROJECT_ID==="mmlu"?e.jsx(en,{}):window.PROJECT_ID==="vhelm"?e.jsx(cn,{}):window.PROJECT_ID==="air-bench"?e.jsx(tn,{}):window.PROJECT_ID==="home"?e.jsx(xn,{}):e.jsx(_e,{})}function bn(){return e.jsx(Ls,{children:e.jsx(As,{children:e.jsxs(O,{path:"/",element:e.jsx(ht,{}),children:[e.jsx(O,{index:!0,element:e.jsx(jn,{})}),e.jsx(O,{path:"leaderboard",element:e.jsx(Jt,{})}),e.jsx(O,{path:"models",element:e.jsx(jt,{})}),e.jsx(O,{path:"scenarios",element:e.jsx(bt,{})}),e.jsx(O,{path:"groups",element:e.jsx(yt,{})}),e.jsx(O,{path:"groups/:groupName",element:e.jsx(Nt,{})}),e.jsx(O,{path:"runs",element:e.jsx(St,{})}),e.jsx(O,{path:"runs/:runName",element:e.jsx(Gt,{})})]})})})}ue.createRoot(document.getElementById("root")).render(e.jsx(Ts.StrictMode,{children:e.jsx(bn,{})}));
|