crfm-helm 0.5.0__py3-none-any.whl → 0.5.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crfm-helm might be problematic. Click here for more details.
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/METADATA +19 -5
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/RECORD +121 -76
- helm/benchmark/adaptation/adapter_spec.py +32 -31
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +7 -0
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +2 -0
- helm/benchmark/annotation/air_bench_annotator.py +64 -0
- helm/benchmark/annotation/annotator_factory.py +6 -0
- helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +1 -1
- helm/benchmark/annotation/live_qa_annotator.py +84 -0
- helm/benchmark/annotation/medication_qa_annotator.py +81 -0
- helm/benchmark/augmentations/perturbation.py +17 -1
- helm/benchmark/augmentations/test_perturbation.py +30 -0
- helm/benchmark/augmentations/translate_perturbation.py +1 -0
- helm/benchmark/huggingface_registration.py +16 -6
- helm/benchmark/metrics/air_bench_metrics.py +56 -0
- helm/benchmark/metrics/efficiency_metrics.py +9 -2
- helm/benchmark/metrics/evaluate_reference_metrics.py +16 -0
- helm/benchmark/metrics/fin_qa_metrics.py +60 -0
- helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
- helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
- helm/benchmark/metrics/instruction_following_critique_metrics.py +1 -0
- helm/benchmark/metrics/live_qa_metrics.py +23 -0
- helm/benchmark/metrics/medication_qa_metrics.py +23 -0
- helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
- helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
- helm/benchmark/metrics/unitxt_metrics.py +20 -10
- helm/benchmark/metrics/vision_language/emd_utils.py +4 -0
- helm/benchmark/metrics/vision_language/image_metrics.py +104 -21
- helm/benchmark/model_metadata_registry.py +5 -1
- helm/benchmark/presentation/schema.py +54 -4
- helm/benchmark/presentation/test_schema.py +11 -0
- helm/benchmark/run.py +16 -2
- helm/benchmark/run_expander.py +112 -63
- helm/benchmark/run_spec_factory.py +15 -10
- helm/benchmark/run_specs/air_bench_run_specs.py +40 -0
- helm/benchmark/run_specs/classic_run_specs.py +15 -11
- helm/benchmark/run_specs/decodingtrust_run_specs.py +3 -1
- helm/benchmark/run_specs/experimental_run_specs.py +33 -0
- helm/benchmark/run_specs/finance_run_specs.py +33 -0
- helm/benchmark/run_specs/vlm_run_specs.py +444 -65
- helm/benchmark/scenarios/air_bench_scenario.py +50 -0
- helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +8 -2
- helm/benchmark/scenarios/fin_qa_scenario.py +117 -0
- helm/benchmark/scenarios/legalbench_scenario.py +6 -2
- helm/benchmark/scenarios/math_scenario.py +1 -1
- helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
- helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
- helm/benchmark/scenarios/vision_language/bingo_scenario.py +3 -3
- helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +134 -0
- helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
- helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
- helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +4 -2
- helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +13 -2
- helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +1 -5
- helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +1 -5
- helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +5 -3
- helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
- helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
- helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
- helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
- helm/benchmark/scenarios/vision_language/pairs_scenario.py +247 -0
- helm/benchmark/scenarios/vision_language/unicorn_scenario.py +3 -3
- helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +95 -0
- helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +2 -2
- helm/benchmark/scenarios/vision_language/vqa_scenario.py +4 -2
- helm/benchmark/static/schema_air_bench.yaml +3149 -0
- helm/benchmark/static/schema_classic.yaml +3 -59
- helm/benchmark/static/schema_finance.yaml +143 -0
- helm/benchmark/static/schema_image2structure.yaml +447 -0
- helm/benchmark/static/schema_instruction_following.yaml +3 -52
- helm/benchmark/static/schema_lite.yaml +3 -61
- helm/benchmark/static/schema_medical.yaml +255 -0
- helm/benchmark/static/schema_mmlu.yaml +3 -61
- helm/benchmark/static/schema_tables.yaml +200 -0
- helm/benchmark/static/schema_thai.yaml +223 -0
- helm/benchmark/static/schema_unitxt.yaml +3 -61
- helm/benchmark/static/schema_vhelm.yaml +824 -0
- helm/benchmark/static/schema_vhelm_lite.yaml +109 -0
- helm/benchmark/static_build/assets/air-overview-d2e6c49f.png +0 -0
- helm/benchmark/static_build/assets/index-30dbceba.js +10 -0
- helm/benchmark/static_build/assets/index-66b02d40.css +1 -0
- helm/benchmark/static_build/assets/overview-74aea3d8.png +0 -0
- helm/benchmark/static_build/assets/process-flow-bd2eba96.png +0 -0
- helm/benchmark/static_build/index.html +2 -2
- helm/clients/anthropic_client.py +78 -14
- helm/clients/auto_client.py +11 -0
- helm/clients/client.py +24 -7
- helm/clients/cohere_client.py +98 -3
- helm/clients/huggingface_client.py +71 -12
- helm/clients/openai_client.py +11 -5
- helm/clients/reka_client.py +189 -0
- helm/clients/test_client.py +3 -3
- helm/clients/test_huggingface_client.py +19 -3
- helm/clients/test_together_client.py +72 -2
- helm/clients/together_client.py +199 -2
- helm/clients/vertexai_client.py +117 -64
- helm/clients/vision_language/huggingface_vision2seq_client.py +145 -0
- helm/clients/vision_language/huggingface_vlm_client.py +12 -4
- helm/clients/vision_language/idefics_client.py +2 -2
- helm/clients/vision_language/paligemma_client.py +146 -0
- helm/clients/vision_language/palmyra_vision_client.py +84 -0
- helm/clients/yi_client.py +31 -0
- helm/common/critique_request.py +10 -1
- helm/common/images_utils.py +29 -3
- helm/config/model_deployments.yaml +504 -12
- helm/config/model_metadata.yaml +579 -52
- helm/config/tokenizer_configs.yaml +100 -1
- helm/proxy/critique/model_critique_client.py +32 -4
- helm/proxy/services/server_service.py +1 -1
- helm/tokenizers/auto_tokenizer.py +1 -1
- helm/tokenizers/cohere_tokenizer.py +44 -2
- helm/tokenizers/huggingface_tokenizer.py +36 -13
- helm/tokenizers/test_cohere_tokenizer.py +39 -0
- helm/tokenizers/test_huggingface_tokenizer.py +5 -1
- helm/benchmark/static/schema_vlm.yaml +0 -576
- helm/benchmark/static_build/assets/index-5088afcb.css +0 -1
- helm/benchmark/static_build/assets/index-d839df55.js +0 -9
- helm/benchmark/test_model_deployment_definition.py +0 -90
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/LICENSE +0 -0
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/WHEEL +0 -0
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/top_level.txt +0 -0
|
@@ -1,9 +0,0 @@
|
|
|
1
|
-
import{r as l,a as Ms,L as N,O as Rs,d as ks,u as Te,f as Ue,H as Ls,h as Ps,i as O,R as _s}from"./react-d4a0b69b.js";import{g as ee,b as X,m as he,s as Ie,a as Cs,d as Pe,y as As,c as _e,e as pe,l as ge}from"./tremor-54a99cc4.js";import"./recharts-6d337683.js";(function(){const t=document.createElement("link").relList;if(t&&t.supports&&t.supports("modulepreload"))return;for(const a of document.querySelectorAll('link[rel="modulepreload"]'))n(a);new MutationObserver(a=>{for(const o of a)if(o.type==="childList")for(const i of o.addedNodes)i.tagName==="LINK"&&i.rel==="modulepreload"&&n(i)}).observe(document,{childList:!0,subtree:!0});function r(a){const o={};return a.integrity&&(o.integrity=a.integrity),a.referrerPolicy&&(o.referrerPolicy=a.referrerPolicy),a.crossOrigin==="use-credentials"?o.credentials="include":a.crossOrigin==="anonymous"?o.credentials="omit":o.credentials="same-origin",o}function n(a){if(a.ep)return;a.ep=!0;const o=r(a);fetch(a.href,o)}})();var $e={exports:{}},ie={};/**
|
|
2
|
-
* @license React
|
|
3
|
-
* react-jsx-runtime.production.min.js
|
|
4
|
-
*
|
|
5
|
-
* Copyright (c) Facebook, Inc. and its affiliates.
|
|
6
|
-
*
|
|
7
|
-
* This source code is licensed under the MIT license found in the
|
|
8
|
-
* LICENSE file in the root directory of this source tree.
|
|
9
|
-
*/var Ts=l,Us=Symbol.for("react.element"),Is=Symbol.for("react.fragment"),$s=Object.prototype.hasOwnProperty,Os=Ts.__SECRET_INTERNALS_DO_NOT_USE_OR_YOU_WILL_BE_FIRED.ReactCurrentOwner,Ds={key:!0,ref:!0,__self:!0,__source:!0};function Oe(s,t,r){var n,a={},o=null,i=null;r!==void 0&&(o=""+r),t.key!==void 0&&(o=""+t.key),t.ref!==void 0&&(i=t.ref);for(n in t)$s.call(t,n)&&!Ds.hasOwnProperty(n)&&(a[n]=t[n]);if(s&&s.defaultProps)for(n in t=s.defaultProps,t)a[n]===void 0&&(a[n]=t[n]);return{$$typeof:Us,type:s,key:o,ref:i,props:a,_owner:Os.current}}ie.Fragment=Is;ie.jsx=Oe;ie.jsxs=Oe;$e.exports=ie;var e=$e.exports,xe={},Ce=Ms;xe.createRoot=Ce.createRoot,xe.hydrateRoot=Ce.hydrateRoot;function Bs({title:s,titleId:t,...r},n){return l.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",fill:"none",viewBox:"0 0 24 24",strokeWidth:1.5,stroke:"currentColor","aria-hidden":"true",ref:n,"aria-labelledby":t},r),s?l.createElement("title",{id:t},s):null,l.createElement("path",{strokeLinecap:"round",strokeLinejoin:"round",d:"M3.75 6.75h16.5M3.75 12h16.5m-16.5 5.25h16.5"}))}const Fs=l.forwardRef(Bs),De=Fs;function Hs({title:s,titleId:t,...r},n){return l.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",fill:"none",viewBox:"0 0 24 24",strokeWidth:1.5,stroke:"currentColor","aria-hidden":"true",ref:n,"aria-labelledby":t},r),s?l.createElement("title",{id:t},s):null,l.createElement("path",{strokeLinecap:"round",strokeLinejoin:"round",d:"M9 12.75L11.25 15 15 9.75M21 12a9 9 0 11-18 0 9 9 0 0118 0z"}))}const zs=l.forwardRef(Hs),Gs=zs;function qs({title:s,titleId:t,...r},n){return l.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",fill:"none",viewBox:"0 0 24 24",strokeWidth:1.5,stroke:"currentColor","aria-hidden":"true",ref:n,"aria-labelledby":t},r),s?l.createElement("title",{id:t},s):null,l.createElement("path",{strokeLinecap:"round",strokeLinejoin:"round",d:"M9.75 9.75l4.5 4.5m0-4.5l-4.5 4.5M21 12a9 9 0 11-18 0 9 9 0 0118 0z"}))}const 
Vs=l.forwardRef(qs),Js=Vs,Be=""+new URL("crfm-logo-74391ab8.png",import.meta.url).href,Fe=""+new URL("helm-logo-simple-2ed5400b.png",import.meta.url).href;function Ws({title:s,titleId:t,...r},n){return l.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 24 24",fill:"currentColor","aria-hidden":"true",ref:n,"aria-labelledby":t},r),s?l.createElement("title",{id:t},s):null,l.createElement("path",{fillRule:"evenodd",d:"M12 2.25a.75.75 0 01.75.75v11.69l3.22-3.22a.75.75 0 111.06 1.06l-4.5 4.5a.75.75 0 01-1.06 0l-4.5-4.5a.75.75 0 111.06-1.06l3.22 3.22V3a.75.75 0 01.75-.75zm-9 13.5a.75.75 0 01.75.75v2.25a1.5 1.5 0 001.5 1.5h13.5a1.5 1.5 0 001.5-1.5V16.5a.75.75 0 011.5 0v2.25a3 3 0 01-3 3H5.25a3 3 0 01-3-3V16.5a.75.75 0 01.75-.75z",clipRule:"evenodd"}))}const Ks=l.forwardRef(Ws),He=Ks;function Zs({title:s,titleId:t,...r},n){return l.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 24 24",fill:"currentColor","aria-hidden":"true",ref:n,"aria-labelledby":t},r),s?l.createElement("title",{id:t},s):null,l.createElement("path",{fillRule:"evenodd",d:"M15 3.75a.75.75 0 01.75-.75h4.5a.75.75 0 01.75.75v4.5a.75.75 0 01-1.5 0V5.56l-3.97 3.97a.75.75 0 11-1.06-1.06l3.97-3.97h-2.69a.75.75 0 01-.75-.75zm-12 0A.75.75 0 013.75 3h4.5a.75.75 0 010 1.5H5.56l3.97 3.97a.75.75 0 01-1.06 1.06L4.5 5.56v2.69a.75.75 0 01-1.5 0v-4.5zm11.47 11.78a.75.75 0 111.06-1.06l3.97 3.97v-2.69a.75.75 0 011.5 0v4.5a.75.75 0 01-.75.75h-4.5a.75.75 0 010-1.5h2.69l-3.97-3.97zm-4.94-1.06a.75.75 0 010 1.06L5.56 19.5h2.69a.75.75 0 010 1.5h-4.5a.75.75 0 01-.75-.75v-4.5a.75.75 0 011.5 0v2.69l3.97-3.97a.75.75 0 011.06 0z",clipRule:"evenodd"}))}const Ys=l.forwardRef(Zs),Xs=Ys;function Qs({title:s,titleId:t,...r},n){return l.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 24 
24",fill:"currentColor","aria-hidden":"true",ref:n,"aria-labelledby":t},r),s?l.createElement("title",{id:t},s):null,l.createElement("path",{fillRule:"evenodd",d:"M12.53 16.28a.75.75 0 01-1.06 0l-7.5-7.5a.75.75 0 011.06-1.06L12 14.69l6.97-6.97a.75.75 0 111.06 1.06l-7.5 7.5z",clipRule:"evenodd"}))}const et=l.forwardRef(Qs),ze=et;function st({title:s,titleId:t,...r},n){return l.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 24 24",fill:"currentColor","aria-hidden":"true",ref:n,"aria-labelledby":t},r),s?l.createElement("title",{id:t},s):null,l.createElement("path",{fillRule:"evenodd",d:"M11.47 4.72a.75.75 0 011.06 0l3.75 3.75a.75.75 0 01-1.06 1.06L12 6.31 8.78 9.53a.75.75 0 01-1.06-1.06l3.75-3.75zm-3.75 9.75a.75.75 0 011.06 0L12 17.69l3.22-3.22a.75.75 0 111.06 1.06l-3.75 3.75a.75.75 0 01-1.06 0l-3.75-3.75a.75.75 0 010-1.06z",clipRule:"evenodd"}))}const tt=l.forwardRef(st),je=tt;function nt({title:s,titleId:t,...r},n){return l.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 24 24",fill:"currentColor","aria-hidden":"true",ref:n,"aria-labelledby":t},r),s?l.createElement("title",{id:t},s):null,l.createElement("path",{fillRule:"evenodd",d:"M19.902 4.098a3.75 3.75 0 00-5.304 0l-4.5 4.5a3.75 3.75 0 001.035 6.037.75.75 0 01-.646 1.353 5.25 5.25 0 01-1.449-8.45l4.5-4.5a5.25 5.25 0 117.424 7.424l-1.757 1.757a.75.75 0 11-1.06-1.06l1.757-1.757a3.75 3.75 0 000-5.304zm-7.389 4.267a.75.75 0 011-.353 5.25 5.25 0 011.449 8.45l-4.5 4.5a5.25 5.25 0 11-7.424-7.424l1.757-1.757a.75.75 0 111.06 1.06l-1.757 1.757a3.75 3.75 0 105.304 5.304l4.5-4.5a3.75 3.75 0 00-1.035-6.037.75.75 0 01-.354-1z",clipRule:"evenodd"}))}const rt=l.forwardRef(nt),at=rt;function lt({title:s,titleId:t,...r},n){return l.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 24 
24",fill:"currentColor","aria-hidden":"true",ref:n,"aria-labelledby":t},r),s?l.createElement("title",{id:t},s):null,l.createElement("path",{fillRule:"evenodd",d:"M10.5 3.75a6.75 6.75 0 100 13.5 6.75 6.75 0 000-13.5zM2.25 10.5a8.25 8.25 0 1114.59 5.28l4.69 4.69a.75.75 0 11-1.06 1.06l-4.69-4.69A8.25 8.25 0 012.25 10.5z",clipRule:"evenodd"}))}const it=l.forwardRef(lt),ot=it;function we(s,t){return t?s?`https://crfm.stanford.edu/helm/${t}/${s}/`:`https://crfm.stanford.edu/helm/${t}/latest/`:"#"}function Ge(){const[s,t]=l.useState([]),[r,n]=l.useState();return l.useEffect(()=>{fetch("https://storage.googleapis.com/crfm-helm-public/config/project_metadata.json").then(a=>a.json()).then(a=>{if(t(a),window.PROJECT_ID)if(window.PROJECT_ID==="global")n({id:"global",title:"All Projects",description:"description",releases:["releases"],imageUrl:"imageUrl"});else{const o=a.find(i=>i.id===window.PROJECT_ID);n(o)}else{const o=a.find(i=>i.id==="lite");n(o)}}).catch(a=>{console.error("Error fetching JSON:",a)})},[]),r===void 0||r.title===void 0?null:e.jsxs("div",{className:"dropdown",children:[e.jsxs("div",{tabIndex:0,role:"button",className:"btn normal-case bg-white font-bold p-2 border-0 text-lg block whitespace-nowrap","aria-haspopup":"true","aria-controls":"menu",children:[r.title," ",e.jsx(ze,{fill:"black",color:"black",className:"text w-4 h-4 inline"})]}),e.jsx("ul",{tabIndex:0,className:"-translate-x-36 dropdown-content z-[1] menu p-1 shadow-lg bg-base-100 rounded-box w-max text-base",role:"menu",children:s.map((a,o)=>e.jsx("li",{children:e.jsxs("a",{href:we(void 0,a.id),className:"block",role:"menuitem",children:[e.jsxs("strong",{children:[a.title,":"]})," ",a.description]})},o))})]})}function P(s){return`${window.BENCHMARK_OUTPUT_BASE_URL.replace(/\/$/,"")}/${s.replace(/^\//,"")}`}function W(){return window.RELEASE?`/releases/${window.RELEASE}`:`/runs/${window.SUITE}`}async function ct(s){try{return await(await 
fetch(P(`${W()}/summary.json`),{signal:s})).json()}catch(t){return console.log(t),{release:void 0,suites:void 0,suite:void 0,date:""}}}function dt(){const[s,t]=l.useState({release:void 0,suites:void 0,suite:void 0,date:""}),[r,n]=l.useState();l.useEffect(()=>{fetch("https://storage.googleapis.com/crfm-helm-public/config/project_metadata.json").then(c=>c.json()).then(c=>{if(window.PROJECT_ID){const u=c.find(f=>f.id===window.PROJECT_ID);n(u)}else{const u=c.find(f=>f.id==="lite");n(u)}}).catch(c=>{console.error("Error fetching JSON:",c)})},[]);function a(){return r!==void 0&&r.releases!==void 0?r.releases:["v1.0.0"]}l.useEffect(()=>{const c=new AbortController;async function u(){const f=await ct(c.signal);t(f)}return u(),()=>c.abort()},[]);const o=a();if(!s.release&&!s.suite)return null;const i=`Release ${s.release||s.suite} (${s.date})`;return o.length<=1?e.jsx("div",{children:i}):e.jsxs("div",{className:"dropdown",children:[e.jsxs("div",{tabIndex:0,role:"button",className:"normal-case bg-white border-0 block whitespace-nowrap","aria-haspopup":"true","aria-controls":"menu",children:[i," ",e.jsx(ze,{fill:"black",color:"black",className:"inline text w-4 h-4"})]}),e.jsx("ul",{tabIndex:0,className:"dropdown-content z-[1] menu p-1 shadow-lg bg-base-100 rounded-box w-max text-base",role:"menu",children:o.map(c=>e.jsx("li",{children:e.jsx("a",{href:we(c,r?r.id:"lite"),className:"block",role:"menuitem",children:c})}))})]})}function mt(){return e.jsxs("nav",{className:"navbar h-24 px-8 md:px-12 bg-base-100 max-w[1500]px",children:[e.jsx("div",{children:e.jsxs("div",{className:"dropdown md:hidden mr-4",children:[e.jsx("label",{tabIndex:0,className:"btn btn-ghost hover:bg-transparent btn-lg px-0",children:e.jsx(De,{className:"w-16 h-16"})}),e.jsxs("ul",{tabIndex:0,className:"menu menu-lg dropdown-content mt-3 z-[1] p-2 bg-base-100 
shadow",children:[e.jsx("li",{children:e.jsx(N,{to:"leaderboard",children:"Leaderboard"})}),e.jsx("li",{children:e.jsx(N,{to:"models",children:"Models"})}),e.jsx("li",{children:e.jsx(N,{to:"scenarios",children:"Scenarios"})}),e.jsx("li",{children:e.jsx(N,{to:"runs",className:"whitespace-nowrap",children:"Predictions"})}),e.jsx("li",{children:e.jsx(N,{to:"https://github.com/stanford-crfm/helm",children:"GitHub"})})]})]})}),e.jsxs("div",{className:"flex-1 items-center",children:[e.jsx(N,{to:"https://crfm.stanford.edu/",className:"w-24",children:e.jsx("img",{src:Be,className:"object-contain"})}),e.jsx(N,{to:"/",className:"mx-2 w-32",children:e.jsx("img",{src:Fe,className:"object-contain"})}),e.jsx(Ge,{})]}),e.jsx("div",{className:"flex-none hidden md:block",children:e.jsxs("ul",{className:"flex flex-row gap-6 px-1",children:[e.jsx("li",{children:e.jsx(N,{to:"leaderboard",children:"Leaderboard"})}),e.jsx("li",{children:e.jsx(N,{to:"models",children:"Models"})}),e.jsx("li",{children:e.jsx(N,{to:"scenarios",children:"Scenarios"})}),e.jsx("li",{children:e.jsx(N,{to:"runs",children:"Predictions"})}),e.jsx("li",{children:e.jsx(N,{to:"https://github.com/stanford-crfm/helm",children:"GitHub"})}),e.jsx("li",{className:"hidden lg:flex",children:e.jsx(dt,{})})]})})]})}function ut(){return e.jsxs("nav",{className:"navbar h-24 px-8 md:px-12 bg-base-100 max-w[1500]px",children:[e.jsx("div",{children:e.jsx("div",{className:"dropdown md:hidden mr-4",children:e.jsx("label",{tabIndex:0,className:"btn btn-ghost hover:bg-transparent btn-lg px-0",children:e.jsx(De,{className:"w-16 h-16"})})})}),e.jsxs("div",{className:"flex-1 items-center",children:[e.jsx(N,{to:"https://crfm.stanford.edu/",className:"w-24",children:e.jsx("img",{src:Be,className:"object-contain"})}),e.jsx(N,{to:"/",className:"mx-2 w-32",children:e.jsx("img",{src:Fe,className:"object-contain"})}),e.jsx(Ge,{})]})]})}function ht(){return 
e.jsxs(e.Fragment,{children:[window.PROJECT_ID==="global"?e.jsx(ut,{}):e.jsx(mt,{}),e.jsx("main",{className:"p-8 pt-0",children:e.jsx("div",{className:"mx-auto max-w-[1500]px",children:e.jsx(Rs,{})})})]})}async function F(s){try{return await(await fetch(P(`${W()}/schema.json`),{signal:s})).json()}catch(t){return console.log(t),{adapter:[],metric_groups:[],metrics:[],models:[],perturbations:[],run_groups:[]}}}function xt({href:s,children:t}){return e.jsx("a",{href:s,className:"link link-primary link-hover",target:"_blank",rel:"noreferrer",children:t})}function V({value:s}){return e.jsx("span",{children:e.jsx(ks,{components:{a:xt},children:s})})}function U({title:s,subtitle:t,markdown:r=!1}){return e.jsxs("header",{className:"m-4 ml-0",children:[e.jsx("h1",{className:"text-4xl",children:s}),r&&t!==void 0?e.jsx("h2",{className:"mt-2 text-neutral",children:e.jsx(V,{value:t})}):t!==void 0&&e.jsx("h2",{className:"mt-2 text-neutral",children:t})]})}const jt={open:"green",limited:"yellow",closed:"red"},ft={open:"Open",limited:"Limited",closed:"Closed"};function pt({level:s}){return e.jsx(ee,{color:jt[s],children:ft[s]})}function H(){return e.jsx("div",{className:"w-full",children:e.jsx("div",{className:"block mx-auto my-24 loading loading-spinner loading-lg"})})}function gt(){const[s,t]=l.useState([]);l.useEffect(()=>{const i=new AbortController;async function c(){const u=await F(i.signal);t(u.models)}return c(),()=>i.abort()},[]);const[r,n,a]=s.reduce((i,c)=>{switch(c.access){case"open":i[0]+=1;break;case"limited":i[1]+=1;break;case"closed":i[2]+=1;break}return i},[0,0,0]),o=Object.values(s.reduce((i,c)=>{const u=c.creator_organization;return i[u]===void 0?(i[u]={name:u,models:1},i):(i[u].models+=1,i)},{}));return s.length===0?e.jsx(H,{}):e.jsxs(e.Fragment,{children:[e.jsx(U,{title:"Models"}),e.jsxs("div",{className:"overflow-x-auto 
mt-12",children:[e.jsxs("table",{className:"table",children:[e.jsx("thead",{children:e.jsxs("tr",{children:[e.jsx("th",{children:"Creator"}),e.jsx("th",{children:"Model"}),e.jsx("th",{children:"Description"}),e.jsx("th",{children:"Access"})]})}),e.jsx("tbody",{children:s.map(i=>e.jsxs("tr",{children:[e.jsx("td",{className:"text-lg",children:i.creator_organization}),e.jsxs("td",{children:[e.jsx("span",{className:"text-xl",children:i.display_name}),e.jsx("br",{}),e.jsx("span",{children:i.name})]}),e.jsx("td",{children:e.jsx(V,{value:i.description})}),e.jsx("td",{children:e.jsx(pt,{level:i.access})})]}))})]}),e.jsx(U,{title:"Analysis"}),e.jsxs("div",{className:"grid md:grid-cols-3 grid-cols-1 gap-8",children:[e.jsxs(X,{className:"flex flex-col justify-between",children:[e.jsx(he,{children:"Models"}),e.jsx(Ie,{className:"text-6xl md:!text-[96px]",children:s.length}),e.jsx(Cs,{values:[r,n,a],colors:["green","yellow","red"]}),e.jsx(Pe,{categories:["Open","Limited","Closed"],colors:["green","yellow","red"]})]}),e.jsxs(X,{className:"md:col-span-2",children:[e.jsx(he,{children:"Creator Organizations"}),e.jsxs("div",{className:"flex justify-between mt-4",children:[e.jsx(As,{data:o,category:"models",index:"name",variant:"pie",className:"basis-5/12"}),e.jsx(Pe,{categories:o.map(i=>i.name),className:"basis-7/12"})]})]})]})]})]})}function le({to:s,children:t,inTable:r=!1,title:n=""}){return r?e.jsx(N,{className:"link link-hover",to:s,title:n,children:t}):e.jsx(N,{className:"link link-primary link-hover",to:s,children:t})}function wt(){const[s,t]=l.useState([]);l.useEffect(()=>{const n=new AbortController;async function a(){const o=await F(n.signal);t(o.run_groups.filter(i=>!i.todo&&i.taxonomy&&!i.display_name.includes("CLEVA")))}return a(),()=>n.abort()},[]);const r=Object.values(s.reduce((n,a)=>{var i;const o=((i=a.taxonomy)==null?void 0:i.task)||"Unknown";return n[o]===void 0?(n[o]={name:o,value:1},n):(n[o].value+=1,n)},{}));return 
s.length===0?e.jsx(H,{}):(console.log(s),e.jsxs(e.Fragment,{children:[e.jsx(U,{title:"Scenarios",subtitle:"A scenario represents a use case and consists of a dataset of instances."}),e.jsxs("div",{className:"overflow-x-auto mt-12",children:[e.jsxs("table",{className:"table",children:[e.jsx("thead",{children:e.jsxs("tr",{children:[e.jsx("th",{children:"Scenario"}),e.jsx("th",{children:"Task"}),e.jsx("th",{children:"What"}),e.jsx("th",{children:"Who"}),e.jsx("th",{children:"When"}),e.jsx("th",{children:"Language"}),e.jsx("th",{children:"Description"})]})}),e.jsx("tbody",{children:s.map(n=>{var a,o,i,c,u;return e.jsxs("tr",{children:[e.jsxs("td",{children:[e.jsx(le,{to:`/groups/${n.name}`,children:e.jsx("span",{className:"text-lg",children:n.display_name})}),e.jsx("span",{className:"block",children:n.name})]}),e.jsx("td",{children:((a=n.taxonomy)==null?void 0:a.task)||""}),e.jsx("td",{children:((o=n.taxonomy)==null?void 0:o.what)||""}),e.jsx("td",{children:((i=n.taxonomy)==null?void 0:i.who)||""}),e.jsx("td",{children:((c=n.taxonomy)==null?void 0:c.when)||""}),e.jsx("td",{children:((u=n.taxonomy)==null?void 0:u.language)||""}),e.jsx("td",{children:e.jsx(V,{value:n.description})})]})})})]}),e.jsx(U,{title:"Analysis"}),e.jsxs("div",{className:"grid md:grid-cols-4 gap-8",children:[e.jsxs(X,{className:"flex flex-col",children:[e.jsx(he,{children:"Total scenarios"}),e.jsx(Ie,{className:"mx-auto my-6 md:mt-16 !text-[72px] md:!text-[96px]",children:s.length})]}),e.jsx(X,{className:"col-span-3",children:e.jsxs("div",{className:"grid md:grid-cols-2 gap-x-12",children:[e.jsx(_e,{data:r.slice(0,Math.floor(r.length/2))}),e.jsx(_e,{data:r.slice(Math.ceil(r.length/2))})]})})]})]})]}))}function qe(){return P(`${W()}/groups.json`)}async function be(s){try{return await(await fetch(qe(),{signal:s})).json()}catch(t){return console.log(t),[]}}function oe({children:s}){return e.jsx("div",{role:"navigation",className:"tabs flex-nowrap border-b-2 border-gray-2 overflow-x-auto 
overflow-y-hidden",children:s})}function Q({active:s=!1,onClick:t=()=>{},size:r="md",children:n}){return e.jsx("div",{onClick:t,className:`whitespace-nowrap text-${r} mb-[-2px] text-md tab tab-bordered${s?" border-2 border-grey-500 rounded":" border-none"}`,children:n})}function q(s){return Number.isNaN(Number(s))?String(s):String(Math.round(Number(s)*1e3)/1e3)}function ae({value:s,title:t}){if(typeof s.value=="string"&&s.value.includes("⚠")&&(s.value=s.value.replace("⚠","")),s.value===void 0)return"-";if(s.run_spec_names){const r=(()=>{if(s.run_spec_names.length==1)return"/runs/"+s.run_spec_names[0];if(s.run_spec_names.length>1){const n="/runs/?q="+s.run_spec_names.map(o=>`^${o}$`).join("|");return encodeURI(n)}})();return r?e.jsx(le,{to:r,inTable:!0,title:t,children:q(s.value)}):t?e.jsx("a",{title:t,children:q(s.value)}):e.jsx(e.Fragment,{children:q(s.value)})}return s.href?e.jsx(le,{to:s.href,inTable:!0,title:t,children:q(s.value)}):s.markdown?e.jsx(V,{value:String(s.value)}):t?e.jsx("a",{title:t,children:q(s.value)}):e.jsx(e.Fragment,{children:q(s.value)})}function Ve({groupsTables:s,activeGroup:t,ignoreHref:r=!1,sortable:n=!0,sortFirstMetric:a=!0}){const[o,i]=l.useState(a?1:void 0),[c,u]=l.useState({...s[t]}),[f,S]=l.useState(1);l.useEffect(()=>{u({...s[t]})},[t,s]);const M=w=>{let v=f;o===w?v=v*-1:v=1,i(w),S(v),u(p=>{const g={...p};return g.rows.sort((_,d)=>{var A,L;const b=(A=_[w])==null?void 0:A.value,y=(L=d[w])==null?void 0:L.value;return b!==void 0&&y===void 0?-1:y!==void 0&&b===void 0?1:typeof b=="number"&&typeof y=="number"?(b-y)*v:typeof b=="string"&&typeof y=="string"?v===1?b.localeCompare(y):y.localeCompare(b):0}),g})};return l.useEffect(()=>{a&&o&&M(o)},[a,o]),e.jsx("div",{children:e.jsxs("table",{className:"rounded-lg shadow-md table",children:[e.jsx("thead",{children:e.jsx("tr",{children:c.header.map((w,v)=>e.jsx("th",{className:`${v===o?"bg-gray-100 ":"bg-white"} ${v===0?"left-0 z-10":""} whitespace-nowrap sticky 
top-0`,children:e.jsxs("div",{className:"flex gap-2 items-center",children:[e.jsx("span",{children:w.value}),n?e.jsx("button",{className:"link",onClick:()=>M(v),children:e.jsx(je,{className:"w-6 h-6"})}):null]})},`${t}-${v}`))})}),e.jsx("tbody",{children:c.rows.map((w,v)=>e.jsx("tr",{children:w.map((p,g)=>e.jsx("td",{className:`${g==0?"text-lg sticky left-0":""} ${o===g?"bg-gray-100":"bg-white"}`,children:e.jsx(ae,{ignoreHref:r&&g===0,value:p})},`${t}-${g}`))},`${t}-${v}`))})]})})}function bt(){const[s,t]=l.useState(0),[r,n]=l.useState([]),[a,o]=l.useState([]);return l.useEffect(()=>{const i=new AbortController;async function c(){const u=await be(i.signal);o(u),n(u.map(f=>f.title))}return c(),()=>i.abort()},[]),a.length===0?e.jsx(H,{}):e.jsxs(e.Fragment,{children:[e.jsxs("div",{className:"flex justify-between",children:[e.jsx(U,{title:"Results",subtitle:"Groupings of the processes, methods, and metrics involved in evaluating models, particularly in the context of natural language understanding and question answering.",className:"mb-16"}),e.jsxs("a",{className:"flex link-primary space-between items-center self-end link link-hover block",href:qe(),download:"true",target:"_blank",children:[e.jsx(He,{className:"w-6 h-6 mr-2"})," JSON"]})]}),e.jsx("div",{children:e.jsx(oe,{children:r.map((i,c)=>e.jsx(Q,{onClick:()=>t(c),active:s===c,size:"lg",children:i},c))})}),e.jsx("div",{className:"mt-8",children:e.jsx(Ve,{sortable:!1,groupsTables:a,activeGroup:s})})]})}async function ve(s,t){try{return await(await fetch(P(`${W()}/groups/${s}.json`),{signal:t})).json()}catch(r){return console.log(r),[]}}async function ye(s){try{return await(await fetch(P(`${W()}/groups_metadata.json`),{signal:s})).json()}catch(t){return console.log(t),{}}}function vt(){const{groupName:s}=Te(),[t,r]=l.useState([]),[n,a]=l.useState(),[o,i]=l.useState(!0),[c,u]=l.useState(0);return l.useEffect(()=>{const f=new AbortController;async function S(){if(s===void 0)return;const[M,w]=await 
Promise.all([ve(s,f.signal),ye(f.signal)]);r(M),a(w[s]),i(!1)}return S(),()=>f.abort()},[s]),o||n===void 0?e.jsx(H,{}):t.length===0?e.jsxs(e.Fragment,{children:[e.jsx(U,{title:n.display_name,subtitle:n.description,markdown:!0,className:"mr-8"}),e.jsx("div",{className:"divider"}),e.jsx("p",{className:"text-center mt-8",children:"Group currently has no results."})]}):e.jsxs(e.Fragment,{children:[e.jsx("div",{className:"flex flex-row justify-between",children:e.jsx(U,{title:n.display_name,subtitle:n.description,markdown:!0,className:"mr-8 mb-16"})}),e.jsx("div",{className:"overflow-x-auto",children:t.length>1?e.jsx(oe,{children:t.map((f,S)=>e.jsx(Q,{active:S===c,onClick:()=>u(S),children:f.title},S))}):null}),e.jsx(Ve,{groupsTables:t,activeGroup:c,ignoreHref:!0})]})}async function Je(s){try{return await(await fetch(P(`${W()}/run_specs.json`),{signal:s})).json()}catch(t){return console.log(t),[]}}function fe({currentPage:s,totalPages:t,onNextPage:r,onPrevPage:n,className:a}){let o="join";return a!==void 0&&(o=`join ${a}`),e.jsxs("div",{className:o,children:[e.jsx("button",{onClick:n,className:"join-item btn",children:"«"}),e.jsxs("button",{className:"join-item btn",children:["Page ",s," of ",t]}),e.jsx("button",{onClick:r,className:"join-item btn",children:"»"})]})}const de=100;function yt(){const[s,t]=Ue(),[r,n]=l.useState([]),[a,o]=l.useState(Number(s.get("page")||1)),[i,c]=l.useState(1),[u,f]=l.useState([]),[S,M]=l.useState(!0),[w,v]=l.useState(s.get("q")||"");l.useEffect(()=>{const d=new AbortController;async function b(){const y=await Je(d.signal);n(y),p(w,y)}return b(),()=>d.abort()},[w]),l.useEffect(()=>{p(w,r)},[r,w]);function p(d,b){const y=S?new RegExp(d):null,A=b.filter(L=>y?y.test(L.name):L.name.includes(d));f(A),c(Math.ceil(A.length/de))}const g=d=>{d.preventDefault();const y=d.target.q.value;v(y),t({q:y,page:"1"}),p(y,r)},_=u.slice((a-1)*de,a*de);return r.length===0?e.jsx(H,{}):e.jsxs(e.Fragment,{children:[e.jsx(U,{title:"Predictions",subtitle:"All 
benchmark predictions"}),e.jsxs("form",{className:"flex mb-8",onSubmit:g,children:[e.jsxs("div",{className:"form-control",children:[e.jsx("input",{type:"text",name:"q",placeholder:"Search",className:"input input-bordered",value:w,onChange:d=>v(d.target.value)}),e.jsxs("label",{className:"label",children:[e.jsxs("span",{className:"label-text-alt flex item-center",children:[e.jsx("input",{type:"checkbox",className:"toggle toggle-xs",checked:S,onChange:()=>M(!S)}),e.jsx("span",{className:"ml-2",children:"Regex"})]}),e.jsx("span",{className:"label-text-alt",children:`${u.length} results`})]})]}),e.jsx("div",{className:"form-control ml-4",children:e.jsx("button",{className:"btn",children:e.jsx(ot,{className:"w-6 h-6"})})})]}),e.jsx("div",{className:"overflow-x-auto",children:e.jsxs("table",{className:"table",children:[e.jsx("thead",{children:e.jsxs("tr",{children:[e.jsx("th",{children:"Run"}),e.jsx("th",{children:"Model"}),e.jsx("th",{children:"Groups"}),e.jsx("th",{children:"Adapter method"}),e.jsx("th",{children:"Subject / Task"})]})}),e.jsx("tbody",{children:_.map((d,b)=>e.jsxs("tr",{children:[e.jsx("td",{children:e.jsx(le,{to:`/runs/${d.name}`,children:d.name})}),e.jsx("td",{children:d.adapter_spec.model}),e.jsx("td",{children:d.groups.join(", ")}),e.jsx("td",{children:d.adapter_spec.method}),e.jsx("td",{children:d.scenario_spec.args.subject||d.scenario_spec.args.task||"-"})]},`${d.name}-${b}`))})]})}),i>0?e.jsx(fe,{className:"flex justify-center my-8",onNextPage:()=>{const d=Math.min(a+1,i);o(d),s.set("page",String(d)),t(s)},onPrevPage:()=>{const d=Math.max(a-1,1);o(d),s.set("page",String(d)),t(s)},currentPage:a,totalPages:i}):e.jsx("div",{className:"my-8 text-center",children:"No results"})]})}function z(){return window.SUITE!==void 0?window.SUITE:void 0}async function Nt(s,t,r){try{return await(await fetch(P(`/runs/${r||z()}/${s}/instances.json`),{signal:t})).json()}catch(n){return console.log(n),[]}}async function St(s,t,r){try{return await(await 
fetch(P(`/runs/${r||z()}/${s}/stats.json`),{signal:t})).json()}catch(n){return console.log(n),[]}}async function Et(s,t,r){try{return await(await fetch(P(`/runs/${r||z()}/${s}/display_requests.json`),{signal:t})).json()}catch(n){return console.log(n),[]}}async function Mt(s,t,r){try{return await(await fetch(P(`/runs/${r||z()}/${s}/display_predictions.json`),{signal:t})).json()}catch(n){return console.log(n),[]}}async function Rt(s,t,r){try{return await(await fetch(P(`/runs/${r||z()}/${s}/scenario.json`),{signal:t})).json()}catch(n){console.log(n);return}}function kt(s,t){return P(`/runs/${t||z()}/${s}/run_spec.json`)}function Lt(s,t){return P(`/runs/${t||z()}/${s}/scenario_state.json`)}function me({value:s}){return e.jsx(ee,{icon:Gs,color:"green",children:s})}function ue({value:s}){return e.jsx(ee,{icon:Js,color:"red",children:s})}function Pt({stats:s}){if(s.quasi_exact_match!==void 0)return s.quasi_exact_match?e.jsx(me,{value:`quasi exact match: ${s.quasi_exact_match}`}):e.jsx(ue,{value:`quasi exact match: ${s.quasi_exact_match}`});if(s.toxic_frac!==void 0)return s.toxic_frac>0?e.jsx(ue,{value:`toxic frac: ${s.toxic_frac}`}):e.jsx(me,{value:`toxic frac: ${s.toxic_frac}`});if(s.exact_match!==void 0)return s.exact_match>0?e.jsx(me,{value:`exact match: ${s.exact_match}`}):e.jsx(ue,{value:`exact match: ${s.exact_match}`})}function J({value:s}){const[t,r]=l.useState(!1),[n,a]=l.useState(!1);return e.jsxs(e.Fragment,{children:[e.jsxs("div",{onMouseOver:()=>r(!0),onMouseOut:()=>r(!1),className:"relative",children:[e.jsx("div",{className:"bg-base-200 p-2 block overflow-auto w-full max-h-[36rem] mb-2 whitespace-pre-wrap",children:s}),t?e.jsx("button",{className:"bg-white absolute p-2 leading-none height-fit min-h-none right-1 bottom-1 shadow",onClick:()=>a(!0),children:e.jsx(Xs,{fill:"black",color:"black",className:"text w-4 h-4"})}):null]}),e.jsx("dialog",{open:n,className:"modal p-16 bg-opacity-80 bg-white",onClick:()=>a(!1),children:e.jsx("div",{className:"modal-box 
max-w-none p-4 whitespace-pre-wrap bg-base-200",children:s})})]})}function We({mediaObject:s}){if(s.content_type.includes("image")){if(s.location===void 0)return null;const t=P(s.location.replace("benchmark_output/","").replace("prod_env/","../"));return e.jsxs("div",{children:[e.jsx("img",{src:t}),e.jsx("br",{})]})}else return s.text&&s.content_type&&s.content_type==="text/plain"&&s.text.length>1?e.jsxs("div",{children:[s.text,e.jsx("br",{}),e.jsx("br",{})]}):e.jsx("div",{})}function Ke({multimediaObject:s}){return e.jsx("div",{children:s.media_objects.map(t=>e.jsx(We,{mediaObject:t}))})}function _t(s){return Array.isArray(s)?s.length==0?"[]":`[${s.map(t=>String(t).replace(/\n/,"\\n")).join(", ")}]`:String(s)}function Ct({request:s}){return e.jsxs("div",{children:[s.request.prompt.length>0?e.jsxs("div",{children:[e.jsxs("h3",{className:"block text text-gray-400",children:["Prompt (",s.request.prompt.length," Chars)"]}),e.jsx(J,{value:s.request.prompt})]}):s.request.multimodal_prompt?e.jsxs("div",{children:[e.jsx("h3",{className:"block text text-gray-400",children:"Prompt"}),e.jsx(Ke,{multimediaObject:s.request.multimodal_prompt})]}):e.jsx("h3",{className:"block text text-gray-400",children:"Empty Prompt"}),e.jsx(pe,{children:Object.keys(s.request).filter(t=>t!=="prompt").map((t,r)=>e.jsxs(ge,{children:[e.jsxs("span",{children:[t,":"]}),s.request&&s.request[t]?e.jsx("span",{children:_t(s.request[t])}):"null"]},r+1))})]})}function At({predictionAnnotations:s}){return e.jsx("div",{children:s&&s!==void 0?Object.entries(s).map(([t,r])=>e.jsxs("div",{children:[e.jsx("h3",{children:e.jsx("strong",{children:t})}),r.map((n,a)=>e.jsxs("div",{children:[n.error&&e.jsxs("div",{children:[e.jsx("h3",{className:"ml-1",children:"Error"}),e.jsx(J,{value:n.error})," "]}),n.text&&e.jsxs("div",{children:[e.jsx("h3",{className:"ml-1",children:"Text"}),e.jsx(J,{value:n.text})," "]}),n.media_object&&e.jsx(We,{mediaObject:n.media_object})]},a))]},t)):null})}function 
Tt({predictions:s,requests:t,metricFieldMap:r}){return s.length<1?null:e.jsx("div",{children:e.jsx("div",{className:"flex flex-wrap justify-start items-start",children:s.map((n,a)=>e.jsxs("div",{className:"w-full",children:[s.length>1?e.jsxs("h2",{children:["Trial ",a]}):null,e.jsx("div",{className:"mt-2 w-full",children:n.base64_images&&n.base64_images.length>0?e.jsxs(e.Fragment,{children:[e.jsx("h3",{className:"mr-4",children:"Prediction image"}),n.base64_images.map(o=>e.jsx("img",{src:"data:image;base64,"+o,alt:"Base64 Image"}))]}):e.jsxs(e.Fragment,{children:[e.jsxs("h3",{children:[e.jsx("span",{className:"mr-4",children:"Prediction raw text"}),e.jsx(Pt,{stats:n.stats})]}),e.jsx(J,{value:n.predicted_text}),n.mapped_output?e.jsxs(e.Fragment,{children:[e.jsx("h3",{children:"Prediction mapped output"}),e.jsx(J,{value:String(n.mapped_output)})]}):null]})}),e.jsx(At,{predictionAnnotations:n.annotations}),e.jsx("h3",{children:"Metrics"}),e.jsx(pe,{children:Object.keys(n.stats).map((o,i)=>e.jsxs(ge,{children:[r[o]?e.jsx("span",{title:r[o].description,children:r[o].display_name}):e.jsx("span",{children:o}),e.jsx("span",{children:String(n.stats[o])})]},i))}),e.jsxs("details",{className:"collapse collapse-arrow border rounded-md bg-white",children:[e.jsx("summary",{className:"collapse-title",children:"Request details"}),e.jsx("div",{className:"collapse-content",children:e.jsx(Ct,{request:t[a]})})]})]},a))})})}const Ut="correct";function It({references:s}){return e.jsxs("span",{children:[e.jsx("h3",{children:"References"}),e.jsx("ul",{children:s.map((t,r)=>e.jsxs("li",{className:"bg-base-200 p-2 block overflow-auto w-full max-h-72 mb-2 whitespace-pre-wrap",children:[t.output.text,t.tags.map(n=>e.jsx(ee,{className:"mx-2",color:n===Ut?"green":void 0,children:n}))]},r))})]})}function $t({instance:s,requests:t,predictions:r,metricFieldMap:n}){return e.jsxs("div",{className:"border p-4",children:[e.jsx("h3",{className:"text-xl mb-4",children:`Instance id: ${s.id} [split: 
${s.split}]`}),e.jsx("h3",{children:"Input"}),s.input.multimedia_content!==void 0?e.jsx(Ke,{multimediaObject:s.input.multimedia_content}):s.input.text.includes('<br><img src="data:image;base64')?e.jsx("div",{dangerouslySetInnerHTML:{__html:s.input.text}}):e.jsx(J,{value:s.input.text}),e.jsx("div",{children:s.references&&s.references.length>0?e.jsx(It,{references:s.references}):null}),e.jsx("div",{children:r&&t?e.jsx(Tt,{predictions:r,requests:t,metricFieldMap:n}):null})]})}function Ot({stat:s,metricFieldMap:t}){const r=`${s.name.split!==void 0?` on ${s.name.split}`:""}${s.name.sub_split!==void 0?`/${s.name.sub_split}`:""}${s.name.perturbation!==void 0?` with ${s.name.perturbation.name}`:" original"}`;return t[s.name.name]?e.jsxs("span",{title:t[s.name.name].description,children:[e.jsx("strong",{children:t[s.name.name].display_name||s.name.name}),r]}):e.jsxs("span",{children:[e.jsx("strong",{children:s.name.name}),r]})}function Ze(){return window.RELEASE!==void 0?window.RELEASE:void 0}async function Dt(s){try{return await(await fetch(P(`/releases/${Ze()}/runs_to_run_suites.json`),{signal:s})).json()}catch(t){return console.log(t),{}}}function Bt(s,t){return Ze()?s[t]:window.SUITE}const ne=10,re=50;function Ft(){const{runName:s}=Te(),[t,r]=Ue(),[n,a]=l.useState(0),[o,i]=l.useState(),[c,u]=l.useState(),[f,S]=l.useState([]),[M,w]=l.useState([]),[v,p]=l.useState(),[g,_]=l.useState(),[d,b]=l.useState(1),[y,A]=l.useState(1),[L,D]=l.useState(1),[se,K]=l.useState(1),[m,h]=l.useState(),[x,E]=l.useState(),[T,Z]=l.useState({}),[I,$]=l.useState({}),[Y,te]=l.useState("");if(l.useEffect(()=>{const j=new AbortController;async function B(){const C=j.signal;if(s===void 0)return()=>j.abort();const G=window.SUITE?window.SUITE:Bt(await Dt(C),s);u(G);const[Ee,Me,Re,bs,vs,ys]=await Promise.all([Je(C),Nt(s,C,G),St(s,C,G),Rt(s,C,G),Mt(s,C,G),Et(s,C,G)]);i(Ee.find(R=>R.name===s)),S(Me);const 
ke=Math.ceil(Me.length/ne),Ns=Number(t.get("instancesPage")||1);A(ke),b(Math.max(Math.min(Ns,ke),1)),w(Re),E(bs);const Le=Math.floor(Re.length/re),Ss=Number(t.get("metricsPage")||1);K(Le),D(Math.max(Math.min(Ss,Le),1)),p(vs.reduce((R,k)=>(R[k.instance_id]===void 0&&(R[k.instance_id]=[]),R[k.instance_id].push(k),R),{})),_(ys.reduce((R,k)=>(R[k.instance_id]===void 0&&(R[k.instance_id]=[]),R[k.instance_id].push(k),R),{}));const ce=await F(C);$(ce.metrics.reduce((R,k)=>(R[k.name]=k,R),{})),Z(ce.adapter.reduce((R,k)=>(R[k.name]=k,R),{})),h(ce.models.find(R=>{var k;return R.name===((k=Ee.find(Es=>Es.name===s))==null?void 0:k.adapter_spec.model)}))}return B(),()=>j.abort()},[s,t]),o===void 0||v===void 0||g===void 0||x===void 0)return e.jsx(H,{});const gs=f.slice((d-1)*ne,(d-1)*ne+ne),ws=M.slice((L-1)*re,(L-1)*re+re);return e.jsxs(e.Fragment,{children:[e.jsx("div",{className:"flex justify-between gap-8 mb-12",children:e.jsxs("div",{children:[e.jsxs("h1",{className:"text-3xl flex items-center",children:[x.name,e.jsx("a",{href:"/#/groups/"+x.name,children:e.jsx(at,{className:"w-6 h-6 ml-2"})})]}),e.jsx("h3",{className:"text-xl",children:e.jsx(V,{value:x.description})}),e.jsx("h1",{className:"text-3xl mt-2",children:o.adapter_spec.model}),e.jsx("h3",{className:"text-xl",children:e.jsx(V,{value:(m==null?void 0:m.description)||""})}),e.jsx("div",{className:"mt-2 flex gap-2",children:x.tags.map(j=>e.jsx(ee,{size:"xs",color:"gray",children:e.jsx("span",{className:"text text-md",children:j})}))})]})}),e.jsxs(X,{children:[e.jsxs("div",{className:"flex justify-between",children:[e.jsx("h3",{className:"text-lg mb-1",children:"Adapter Specification"}),e.jsxs("div",{className:"flex gap-2",children:[e.jsx(He,{className:"w-6 h-6 mr-1 text text-primary"}),e.jsx("a",{className:"link link-primary link-hover",href:kt(o.name,c),download:"true",target:"_blank",children:"Spec JSON"}),e.jsx("a",{className:"link link-primary 
link-hover",href:Lt(o.name,c),download:"true",target:"_blank",children:"Full JSON"})]})]}),e.jsx("div",{children:e.jsx(pe,{className:"grid md:grid-cols-2 lg:grid-cols-3 gap-x-8",children:Object.entries(o.adapter_spec).map(([j,B],C)=>e.jsxs(ge,{className:C<3?"!border-0":"",children:[e.jsx("strong",{className:"mr-1",title:T[j]?T[j].description:void 0,children:`${j}: `}),e.jsx("span",{className:"overflow-x-auto",children:B})]}))})})]}),e.jsx("div",{className:"mt-16 mb-8",children:e.jsxs(oe,{children:[e.jsx(Q,{size:"lg",active:n===0,onClick:()=>a(0),children:"Instances + Predictions"}),e.jsx(Q,{size:"lg",active:n===1,onClick:()=>a(1),children:"All metrics"})]})}),n===0?e.jsxs(e.Fragment,{children:[e.jsx("div",{className:"grid gap-8",children:gs.map((j,B)=>e.jsx($t,{instance:j,requests:g[j.id],predictions:v[j.id],metricFieldMap:I},`${j.id}-${B}`))}),e.jsx(fe,{className:"flex justify-center my-8",onNextPage:()=>{const j=Math.min(d+1,y);b(j),t.set("instancesPage",String(j)),r(t)},onPrevPage:()=>{const j=Math.max(d-1,1);b(j),t.set("instancesPage",String(j)),r(t)},currentPage:d,totalPages:y})]}):e.jsxs("div",{children:[e.jsx("div",{className:"flex justify-start my-4",children:e.jsx("input",{type:"text",className:"input input-bordered w-full max-w-xs",placeholder:"Search for a metric",onChange:j=>te(j.target.value)})}),e.jsx("div",{className:"overflow-x-auto",children:e.jsxs("table",{className:"table",children:[e.jsx("thead",{children:e.jsx("tr",{children:Object.keys(M[0]).map(j=>e.jsx("th",{children:j},j))})}),e.jsx("tbody",{children:ws.filter(j=>!Y||j.name.name.toLowerCase().includes(Y.toLowerCase())).map(j=>e.jsx("tr",{children:Object.entries(j).map(([B,C])=>B==="name"?e.jsx("td",{children:e.jsx(Ot,{stat:j,metricFieldMap:I})},B):e.jsx("td",{children:C}))}))})]})}),e.jsx(fe,{className:"flex justify-center my-8",onNextPage:()=>{const j=Math.min(L+1,se);D(j),t.set("metricsPage",String(j)),r(t)},onPrevPage:()=>{const 
j=Math.max(L-1,1);D(j),t.set("metricsPage",String(j)),r(t)},currentPage:L,totalPages:se})]})]})}function Ye({groupsTables:s,activeGroup:t,ignoreHref:r=!1,sortable:n=!0,sortFirstMetric:a=!0,filtered:o=!1,filteredCols:i=[],modelsToFilter:c=[],numModelsToAutoFilter:u=0}){const[f,S]=l.useState(a?1:void 0),[M,w]=l.useState({...s[t]}),[v,p]=l.useState(1),[g,_]=l.useState(c),d=m=>m.value==="Model/adapter"?"Model":m.value,[b,y]=l.useState(void 0);l.useEffect(()=>{const m=new AbortController;async function h(){const x=await F(m.signal);y(x)}return h(),()=>m.abort()},[]);const A=m=>{if(b){const h=b.models.find(x=>x.display_name===m);if(h){let x=h.description;return x.includes("/")&&(x=x.replace("/","_")),x}}return""},L=m=>{if(b){const h=b.models.find(x=>x.display_name===m);if(h){let x=h.name;return x.includes("/")&&(x=x.replace("/","_")),x}}return""};function D(m){const h=m.lastIndexOf(" - ");return h===-1?m:m.substring(0,h)+"*"+m.substring(h+1)}const se=m=>{const x=D(m).split("*")[0].trim();if(b){const E=b.run_groups.find(T=>T.display_name===x||T.short_display_name===x);if(E)return E.name}return""};l.useEffect(()=>{if(w({...s[t]}),u){const E=s[0].rows.sort((T,Z)=>Number(Z[1].value)-Number(T[1].value)).slice(0,u).map(T=>String(T[0].value));_(E)}},[t,s,u]);const K=m=>{let h=v;f===m?h=h*-1:h=1,S(m),p(h),w(x=>{const E={...x};return E.rows.sort((T,Z)=>{var Y,te;const I=(Y=T[m])==null?void 0:Y.value,$=(te=Z[m])==null?void 0:te.value;return I!==void 0&&$===void 0?-1:$!==void 0&&I===void 0?1:typeof I=="number"&&typeof $=="number"?(I-$)*h:typeof I=="string"&&typeof $=="string"?h===1?I.localeCompare($):$.localeCompare(I):0}),E})};return l.useEffect(()=>{a&&f&&K(f)},[a,f]),e.jsx(e.Fragment,{children:o?e.jsx("div",{className:"rounded-2xl overflow-hidden border-2 bg-white p-1 mx-2 my-0",style:{overflow:"auto"},children:e.jsx("div",{className:"overflow-x-auto",children:e.jsxs("table",{className:"table 
w-full",children:[e.jsx("thead",{children:e.jsx("tr",{children:M.header.filter((m,h)=>i.length===0||i.includes(h)).map((m,h)=>e.jsx("th",{className:`${h===f?"bg-gray-100":""} whitespace-nowrap px-4`,title:m.description?m.description:"",children:e.jsxs("div",{className:"flex gap-2 items-center",children:[e.jsx("span",{children:d(m)}),n?e.jsx("button",{className:"link",onClick:()=>K(h),children:e.jsx(je,{className:"w-6 h-6"})}):null]})},`${t}-${h}`))})}),e.jsx("tbody",{children:M.rows.filter(m=>g.includes(String(m[0].value))).map((m,h)=>e.jsx("tr",{className:`${h%2===0?"bg-gray-50":""}`,children:m.filter((x,E)=>i.length===0||i.includes(E)).map((x,E)=>e.jsx("td",{className:`${E===0?"text-lg":""}`,children:e.jsx(ae,{ignoreHref:r&&E===0,value:x})},`${t}-${E}`))},`${t}-${h}`))})]})})}):e.jsx("div",{children:e.jsx("div",{children:e.jsxs("table",{className:"rounded-lg shadow-md table",children:[e.jsx("thead",{children:e.jsx("tr",{children:M.header.map((m,h)=>e.jsx("th",{className:`${h===f?"bg-gray-100":"bg-white"} ${h===0?"left-0 z-10":""} whitespace-nowrap px-4 sticky top-0`,title:m.description?m.description:"",children:e.jsxs("div",{className:"flex gap-2 items-center",children:[e.jsx("span",{children:d(m)}),n?e.jsx("button",{className:"link",onClick:()=>K(h),children:e.jsx(je,{className:"w-6 h-6"})}):null]})},`${t}-${h}`))})}),e.jsx("tbody",{children:M.rows.map((m,h)=>e.jsx("tr",{children:m.map((x,E)=>e.jsx("td",{className:`${E===0?"text-lg sticky left-0":""} ${h%2===0?"bg-gray-50":"bg-white"}`,children:E==1?e.jsx(ae,{value:{...x,href:"/runs/?q="+L(String(m[0].value))},title:`Click value to see all predictions for: ${L(String(m[0].value))}`}):e.jsx(ae,{value:{...x},title:String(m[0].value)===x.value?A(String(m[0].value)):`Click value to see predictions for ${se(d(M.header[E]))}: ${L(String(m[0].value))}`})},`${t}-${E}`))},`${t}-${h}`))})]})})})})}function 
Ht(){const[s,t]=l.useState([]),[r,n]=l.useState(),[a,o]=l.useState([]),[i,c]=l.useState(),[u,f]=l.useState(!0),[S,M]=l.useState(0);function w(p,g){console.log(p,g);const _=p.find(d=>d.title===g);return _??p[0]}function v(p,g){n(w(p,g))}return l.useEffect(()=>{const p=new AbortController;async function g(){const _=await be(p.signal),d=[];if(_.forEach(L=>{L.rows.forEach(D=>{d.push({title:String(D[0].value),name:D[0].href.replace("?group=","")})})}),t(d),d.length===0)throw new Error("Could not find any groups!");const b=r?r.name:d[0].name,[y,A]=await Promise.all([ve(b,p.signal),ye(p.signal)]);o(y),c(A[b]),f(!1)}return g(),()=>p.abort()},[r]),u||i===void 0?e.jsx(H,{}):a.length===0?e.jsxs(e.Fragment,{children:[e.jsx(U,{title:i.display_name,subtitle:i.description,markdown:!0}),e.jsx("div",{className:"divider"}),e.jsx("p",{className:"text-center mt-8",children:"Group currently has no results."})]}):e.jsx(e.Fragment,{children:e.jsxs(e.Fragment,{children:[e.jsxs("div",{className:"flex flex-row justify-between",children:[e.jsx(U,{title:"HELM Leaderboard",subtitle:"The HELM leaderboard shows how the various models perform across different scenarios and metrics.",markdown:!0}),e.jsxs("div",{className:"w-64 pt-8",children:[e.jsx("label",{htmlFor:"group",className:"block text-sm font-medium text-gray-700",children:"Select a group:"}),e.jsx("select",{id:"group",name:"group",value:r?r.title:s[0].title,onChange:p=>v(s,p.target.value),className:"mt-1 block w-full pl-3 pr-10 py-2 text-base border-gray-300 focus:outline-none focus:ring focus:border-blue-300 rounded-md",children:s.map((p,g)=>e.jsx("option",{value:p.title,children:p.title},g))})]})]}),e.jsx("div",{className:"overflow-x-auto",children:a.length>1?e.jsx(oe,{children:a.map((p,g)=>e.jsx(Q,{active:g===S,onClick:()=>M(g),children:p.title},g))}):null}),e.jsx(Ye,{groupsTables:a,activeGroup:S,ignoreHref:!0})]})})}const zt=""+new URL("instruct-flowchart-48854f7c.svg",import.meta.url).href,Gt=""+new 
URL("instruct-graph-0a57d7d2.svg",import.meta.url).href;function qt(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl mt-16 font-bold text-center",children:"HELM Instruct: A Multidimensional Instruction Following Evaluation Framework with Absolute Ratings"}),e.jsxs("div",{className:"flex flex-col sm:flex-row justify-center gap-2 sm:gap-8 md:gap-32 my-8",children:[e.jsx("a",{className:"px-10 btn rounded-md",href:"https://crfm.stanford.edu/2024/02/18/helm-instruct.html",children:"Blog Post"}),e.jsx("a",{className:"px-10 btn rounded-md",href:"https://github.com/stanford-crfm/helm",children:"Github"})]}),e.jsxs("p",{children:["We introduce ",e.jsx("em",{children:"HELM Instruct"}),", a multidimensional evaluation framework for instruction-following LLMs with absolute ratings. The framework takes an instruction, a model, an evaluator, and a criterion to generate a score. In our study, we use HELM Instruct to compare 4 instruction-following models on 7 scenarios based on 4 Human/LM evaluators and 5 criteria. 
Check out the blog post for more details."]}),e.jsxs("div",{className:"grid my-16 grid-cols-1 md:mx-32 md:grid-cols-2 md:gap-2",children:[e.jsx("img",{src:zt,alt:"Evaluation flowchart",className:"mx-auto block",sizes:"100vw"}),e.jsx("img",{src:Gt,alt:"7 scenarios, 4 models, 4 evaluators and 5 criteria",className:"mx-auto block",sizes:"100vw"})]}),e.jsxs("table",{className:"rounded-lg shadow-md table",children:[e.jsx("thead",{children:e.jsxs("tr",{children:[e.jsx("th",{children:"Model"}),e.jsx("th",{children:"Average"}),e.jsx("th",{children:"Helpfulness"}),e.jsx("th",{children:"Understandability"}),e.jsx("th",{children:"Completeness"}),e.jsx("th",{children:"Conciseness"}),e.jsx("th",{children:"Harmlessness"})]})}),e.jsxs("tbody",{children:[e.jsxs("tr",{children:[e.jsx("td",{children:"openai_gpt-4-0314"}),e.jsx("td",{children:"4.63"}),e.jsx("td",{children:"4.42"}),e.jsx("td",{children:"4.85"}),e.jsx("td",{children:"4.50"}),e.jsx("td",{children:"4.42"}),e.jsx("td",{children:"4.95"})]}),e.jsxs("tr",{children:[e.jsx("td",{children:"openai_gpt-3.5-turbo-0613"}),e.jsx("td",{children:"4.60"}),e.jsx("td",{children:"4.34"}),e.jsx("td",{children:"4.86"}),e.jsx("td",{children:"4.42"}),e.jsx("td",{children:"4.41"}),e.jsx("td",{children:"4.97"})]}),e.jsxs("tr",{children:[e.jsx("td",{children:"anthropic_claude-v1.3"}),e.jsx("td",{children:"4.56"}),e.jsx("td",{children:"4.25"}),e.jsx("td",{children:"4.87"}),e.jsx("td",{children:"4.32"}),e.jsx("td",{children:"4.40"}),e.jsx("td",{children:"4.97"})]}),e.jsxs("tr",{children:[e.jsx("td",{children:"cohere_command-xlarge-beta"}),e.jsx("td",{children:"4.31"}),e.jsx("td",{children:"3.90"}),e.jsx("td",{children:"4.73"}),e.jsx("td",{children:"3.88"}),e.jsx("td",{children:"4.31"}),e.jsx("td",{children:"4.72"})]})]})]})]})}function Ne({models:s}){return e.jsxs("section",{children:[e.jsxs("h3",{className:"text-3xl",children:[s.length," models"]}),e.jsx("ul",{children:s.map((t,r)=>t.todo?e.jsxs("li",{className:"text-slate-300 
mt-1",children:[t.creator_organization," / ",t.display_name]},r):e.jsx(N,{to:"models",children:e.jsxs("li",{className:"text-black mt-1",children:[t.creator_organization," / ",t.display_name]},r)}))})]})}function Se({runGroups:s}){const t=new Map(s.filter(a=>a.metric_groups!==void 0&&(a.subgroups===void 0||a.subgroups.length===0)).map(a=>[a.name,a])),r=new Set,n=[];return s.forEach(a=>{const o=a.subgroups?a.subgroups:[],i=[];o.forEach(c=>{const u=t.get(c);u&&(i.push(u),r.add(u.name))}),i.length>0&&n.push([a,i])}),e.jsxs("section",{children:[e.jsxs("h3",{className:"text-3xl",children:[r.size," scenarios"]}),e.jsx("ul",{children:n.map(([a,o])=>e.jsxs("li",{className:"my-3",children:[e.jsx(N,{className:"text-black",to:"groups/"+a.name,children:e.jsx("h2",{children:a.display_name})}),e.jsx("ul",{className:"list-disc list-inside",children:o.map(i=>i.todo?e.jsx("li",{className:`${i.todo?"ml-4 text-slate-300":"ml-4"}`,children:i.display_name},i.name):e.jsx(N,{className:"text-black",to:"groups/"+i.name,children:e.jsx("li",{className:`${i.todo?"ml-4 text-slate-300":"ml-4"}`,children:i.display_name},i.name)}))})]},a.name))})]})}const Xe=""+new URL("helmhero-28e90f4d.png",import.meta.url).href;function Qe(){const[s,t]=l.useState([]),[r,n]=l.useState([]),[a,o]=l.useState(),[i,c]=l.useState(!0),u=0;return console.log(s),l.useEffect(()=>{const f=new AbortController;async function S(){const M=await be(f.signal),w=[];if(M.forEach(_=>{_.rows.forEach(d=>{w.push({title:String(d[0].value),name:d[0].href.replace("?group=","")})})}),t(w),w.length===0)throw new Error("Could not find any groups!");const v=w[0].name,[p,g]=await Promise.all([ve(v,f.signal),ye(f.signal)]);n(p),o(g[v]),c(!1)}return S(),()=>f.abort()},[]),i||a===void 0?e.jsx(H,{}):r.length===0?e.jsxs(e.Fragment,{children:[e.jsx(U,{title:a.display_name,subtitle:a.description,markdown:!0}),e.jsx("div",{className:"divider"}),e.jsx("p",{className:"text-center mt-8",children:"Group currently has no 
results."})]}):e.jsx(e.Fragment,{children:e.jsx(e.Fragment,{children:e.jsx(Ye,{groupsTables:r,activeGroup:u,ignoreHref:!0,filtered:!0,numModelsToAutoFilter:6,filteredCols:[0,1]})})})}function Vt(){return e.jsxs("div",{className:"flex flex-col px-4 sm:px-6 py-100 sm:py-10 sm:mb-96 md:mb-96 lg:mb-0 xl:mb-0 2xl:mb-0",children:[e.jsx("div",{className:"flex flex-col text-center mb-10 justify-start",children:e.jsx("h1",{className:"text-3xl sm:text-4xl mb-3 sm:mb-4 mx-2 mt-2",children:e.jsx("strong",{children:"A holistic framework for evaluating foundation models."})})}),e.jsxs("div",{className:"flex flex-col md:flex-col lg:flex-row lg:justify-center",style:{height:"525px",transform:"scale(0.9)"},children:[e.jsx("div",{className:"w-full lg:w-1/2 flex justify-center mb-4 lg:mb-0 h-full py-10",children:e.jsx("img",{src:Xe,alt:"HELM Hero",className:"object-cover h-full",style:{maxWidth:"100%"}})}),e.jsx("div",{className:"w-full lg:w-1/2 flex justify-center h-full py-10",children:e.jsxs("div",{className:"py-2 pb-6 rounded-3xl bg-gray-100 h-full",style:{maxWidth:"100%"},children:[e.jsx(Qe,{}),e.jsx("div",{className:"flex justify-end",children:e.jsx(N,{to:"leaderboard",children:e.jsx("button",{className:"px-4 mx-3 mt-1 btn bg-white rounded-md",children:e.jsx("span",{children:"See More"})})})})]})})]})]})}const es=""+new URL("ai21-0eb91ec3.png",import.meta.url).href,ss=""+new URL("aleph-alpha-7ce10034.png",import.meta.url).href,ts=""+new URL("anthropic-70d8bc39.png",import.meta.url).href,ns=""+new URL("bigscience-7f0400c0.png",import.meta.url).href,rs=""+new URL("cohere-3550c6cb.png",import.meta.url).href,as=""+new URL("eleutherai-b9451114.png",import.meta.url).href,ls=""+new URL("google-06d997ad.png",import.meta.url).href,is=""+new URL("meta-5580e9f1.png",import.meta.url).href,os=""+new URL("microsoft-f5ee5016.png",import.meta.url).href,cs=""+new URL("mistral-18e1be23.png",import.meta.url).href,ds=""+new URL("nvidia-86fa75c1.png",import.meta.url).href,ms=""+new 
URL("openai-3f8653e4.png",import.meta.url).href,us=""+new URL("tii-24de195c.png",import.meta.url).href,hs=""+new URL("together-a665a35b.png",import.meta.url).href,xs=""+new URL("tsinghua-keg-97d4b395.png",import.meta.url).href,js="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAASYAAABfCAYAAABFnmpnAAAACXBIWXMAAAsTAAALEwEAmpwYAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAAlhSURBVHgB7d3xddtGEgbw7+np/8gVeFLBqYOwg/gqOKaC+CqIU0GcCqxUcOnAvAruUoEnFZxcwUUjEDYMA8QCmN0dgN/vPUggAHFpejGcXSyWQON/T8v/HZa3SOdR3vvEsg5O5dnyZuD5xfH5lywfzu/FL0/L8fx61njj+Lr63js9d87lPYZFfb1DDpVf0wd8rpOvMLNO3p5/Pz4td1jvm8TjPMoygvIU8ch5OXS2nZ6Wn8+/iUoTfK6Tr8/bHtDUSZ3645vz78kDE71IPM4rMKUS+FFswwHNJ9Y71AngRH1HNJmUZVEXY0AbmP6Ej5eJxwl8iPNxKR6xLUc0AUpAFINlUP/BhTrpnTGlZkKeGVNqMPSi2B4BgxPFIrhQJ70DkyQet+Wm3NYyppagqQil33uiMfK0/GtoRxuYPE+2lIov8PNtwjFeWdV/sW2Cpn1PFMU9Bq50e2dMJuXKnOentqCcrWZLXUd8efWOqLYf0TuPc2RMJTOYVAIfin34CURxWKLyurshR8YkCceUzphSjknhdfWytgPY10Sx/AOdOnnT2aHwced0TKqp7MuzrD005VpHEMVh5+mr9kE3MHmddKU7v6d4BibFfnwPoli+a1e6gekP+EjpPyrZlJvaP8eeMqZ7EMVyaFdyZExTt6UIfAnK2fpwgS77cGBwokgE56QlRx/TVMYk8DWVfQn87CljMgxMFI3Yj1p9TJ5KBSbF/jAwUTTPdbLGVTmBv0tZmlegVOxP6jQ1RKU8B6bbzgaFj7vzMpaBCcryCkwfUc7v+Pr9a/uEBH6YMVGqUnXy+cMyR2Bqn3wsMOVo6tlo87HBj16jzBXl/IrxCd4e0AxG81BjkOU/Z5R7hN+/9Tc0712KEn2JD0/Lv7EdrzF8jgmaG3G9PuTEftz2NnrNZGlX5sYCRY6TQS7s21tTziqIjUHy+HfVCExzrmwe4EcRazZPC0oP2D59Wn5AM7+SB7EfNwOFuD35iBz3yd0t3DeHIgb78Jhzcl9SIzDR/lh9VDjKFZhKBIrU5xT4iDRUwGswrCl9QzXtk1ez9KtxTMarg1cW7ltq7OTyDIIfEIeCKBbX4UZ7yZjGRpsL/ESaWSBS9kZkXOtkrsA0Nj4mV59G7vIURFRMPzB5RT2ZuT1XeXsNTKUn9iMq6WU/MHld7ZGR7bkypruZ2+cqObgyBZtyFI3CUa6MaSwgCPLIXZ4iFl7mp2gEfj4OBSaP4JQ7gxnyMmN5ilj2erWRyDzeDG2Ej5yBYsiLxNewhCIWZkwUjWudHApMCh9DgUKQjwxs83qzovXpCPxE6z+jbfobfDyfa0OByWu8jgxsyznK+C5x2xLRmjtelcCwI5083MOH2o+cTbkIE8YJfEQaXCnwu7nV6yosXbcDnFsnOZtyOQNFankeb1a0jMLzyyqZLZEHr6lpzPN9oDkzJknc5qXfTPSK4JGyCgtKR/jxvBmYro+dY7/At04+n2+3Yzsc9G8Tyd2063e2C3zU6By2ibeGZgv0fg9PIEpzGtiWo06OBiavjKkfKHIHplwZk6K8HP/hQ06gWgQ+fYV2IpdokgvyU1wITAof/UAhyOtu4vFSe+2HOYF9TDX9BJ/+QsF+/h9P7crNyAGK9XIFitT
yBD4U+/QbiGL5VCfHApNHBJaJx976zZ8tN+VyU+xjvmnaD2vCndoHY4HJ62pNjkBxSbfDXeBjj82dn0EUy9+7D3JmTKYbKEoEphcjZa+xt0GI9tVQDyCKwz4otbshZx+T6U5CVmLSe+ms73Fw5VrWhn8NojgsKL3pb8ydMUlnvUTG5N10VOyHZUpHEMVgMca+/PTN0M7bkT9S+OgGB0F+MrK+1B4yphOaT6UTiGI4ofmSTB07oGRgYsZU3gMG2u9EFVmW9Dsm6mTuply3X6nkVTmBj0izCixxRDNlyzuUyViJpti9dZN18lJg8ghO7VUyQRlteRzD9KUjmu+WfwWiGI5Py3uMzON0c+EPPQJTmzEJymjLY2D6mr0ndnPwPYhiEDTBSfo7LgUmj/E7awLEksDYlifwscfBlRacSjSriVJYXXzX33gpMHlM97EmUCwZfS69ctdS7I+AY5kolgN6My1cCkyK9dr71wTzLc3Y2jLX8upnW8KuWjyguayaY+T59yCa5wGf66TC3xczLdxeOFDhw66ULQkUel4E83wDn6acoh7LaLpXBAXNQDSvKUzvz0uOoEf79EPvscC3Th7QxInROb9bXtnCt1gemJa8BivP4/aXSP1LiuYqxq/wcwDRcoqmTp7g59CulMiYLCgtCRQWGKyfae5VpD33L719Wn6ED8+vgKL5TvCpY7W/F9AG8B7gw85168YoFpiWBIulfTxL+7T6Ig6uVCxr3g7hsIG67IbqB2yfZ3fApwRmqinnNWGcYD7FsuAo2PftKF5NTA4ZIA9WHxU+pF25wXSha1kUnHsSPGJdxsTANE1AFIu0K1OBSbHe0mbc0vJdv0OdiMqbCkwe/SwHzKe937nLu/QaiKiwEk25JRnTR8fyl2LGRFRJiabcEnr+XWv0NQceElVUImNa4nFkvUb5RFTYVGCqlTnoyHqN8omosKgZk3bWawx03PrMlUSbFrWP6bHya2BTjqiiqcBkFOVpZ52d30TX4dO5nhKYanc+M3vJg+8rRTMrMCnK0t7jGtnLNWRMDEwUVkpgKt0RrL3HHC5AdGUiNuX688soylIQUVVbaMqNbStZPhEVFDEwPSZuy6X2jIBEV48Z09c4VICosoh9TDqwrWQWw45vospSA1PJk3WoLEU5CiKq6jbxOAsWHtPVptDEbbkwY6IS7EtHBX7eYkd1NzUwWb+LoIzand8fQJTfq/Pi5R12FJhSmnKmVB+PztyeA2cWIKosNTApytCZ20uVT0QFRQtMY5lZqRRVQUTVpQamCIHh0j4vHFxJFEC0jOlx4T4vCiKqbksZ0x+oWz4RFbKVzm/DjInoSqQGJqPI7zFw+URUyJzAVDtjKVE+B1cSBTAnMJXo46mdMV3T4EpmhxSNtiuRMiZduX+taztRGZgorEh9TDqxP/eJxHmYiIKI1sc0VX7O18DBlURBzAlMuTMKTTgmZ2BSEFEIkTKmlI7nnMGRfS5EQWypj8nkbG6xj4koiNSJ4lon5JswThOOseDxHfJYmzEpyvjTqayp+xIV6ynW8Xod7XOtpdgOz/duisLHp8TjLzZXn0YvVQJMAAAAAElFTkSuQmCC",fs=""+new URL("yandex-38e09d70.png",import.meta.url).href,ps=""+new URL("01-694cb9b7.png",import.meta.url).href,Jt=[es,ss,ts,ns,rs,as,ls,is,os,cs,ds,ms,us,hs,xs,js,fs,ps];function Ae(){const[s,t]=l.useState(void 0);return l.useEffect(()=>{const r=new AbortController;async function n(){const a=await F(r.signal);t(a)}return 
n(),()=>r.abort()},[]),s?e.jsxs(e.Fragment,{children:[e.jsx(Vt,{}),e.jsxs("div",{className:"mx-auto text-lg px-16",children:[e.jsxs("div",{className:"container mb-12 mx-auto text-lg px-16",children:[e.jsxs("div",{className:"flex flex-col sm:flex-row justify-center mt-10 mb-10 flex gap-2 sm:gap-8 md:gap-32",children:[" ",e.jsx("h1",{className:"text-4xl mx-4 mt-40",children:e.jsx("strong",{children:"Our Partners"})})]}),e.jsx("ol",{className:"my-8 flex flex-col gap-32",children:e.jsx("li",{children:e.jsx("div",{className:"flex flex-wrap justify-center max-w-[1100px] mx-auto w-auto",children:Jt.map((r,n)=>e.jsx("div",{className:"w-24 h-24 flex items-center m-6",children:e.jsx("img",{src:r,alt:"Logo",className:"mx-auto block",sizes:"100vw"})},n))})})})]}),e.jsx("div",{className:"container mx-auto",children:e.jsxs("div",{className:"grid grid-cols-1 sm:grid-cols-2 gap-8",children:[e.jsx(Ne,{models:s.models}),e.jsx(Se,{runGroups:s.run_groups})]})})]})]}):null}function Wt(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl mt-16 my-8 font-bold text-center",children:"Massive Multitask Language Understanding (MMLU) on HELM"}),e.jsxs("div",{className:"grid grid-cols-1 md:grid-cols-2 gap-8",children:[e.jsxs("div",{children:[e.jsxs("p",{children:[e.jsx("strong",{children:"Massive Multitask Language Understanding (MMLU)"})," ",e.jsx("a",{href:"https://arxiv.org/pdf/2009.03300.pdf",className:"link",children:"(Hendrycks et al, 2020)"})," ","is a multiple-choice question answering test that covers 57 tasks including elementary mathematics, US history, computer science, law, and more. We publish evaluation results from evaluating various models on MMLU using HELM. 
Our evaluation results include the following:"]}),e.jsxs("ul",{className:"my-2 list-disc list-inside",children:[e.jsx("li",{children:"Simple, standardized prompts"}),e.jsx("li",{children:"Accuracy breakdown for each of the 57 subjects"}),e.jsx("li",{children:"Full transparency of all raw prompts and predictions"})]}),e.jsx("div",{className:"flex flex-row justify-center mt-8",children:e.jsx("a",{className:"px-10 btn rounded-md",href:"#/leaderboard",children:"Full Leaderboard"})})]}),e.jsx(Qe,{})]})]})}const Kt=""+new URL("heim-logo-3e5e3aa4.png",import.meta.url).href;function Zt({metricFieldMap:s,metricGroups:t}){const r=new Set,n=[];return t.forEach(a=>{const o=[];a.metrics.forEach(i=>{const c=s[i.name];c&&(o.push(c),r.add(c.name))}),o.length>0&&n.push([a,o])}),e.jsxs("section",{children:[e.jsxs("h3",{className:"text-3xl",children:[r.size," metrics"]}),e.jsx("ul",{children:n.map(([a,o])=>e.jsxs("li",{className:"my-3",children:[e.jsx("h4",{children:a.display_name}),e.jsx("ul",{className:"list-disc list-inside",children:o.map(i=>e.jsx("li",{className:"ml-4",children:i.display_name},i.name))})]},a.name))})]})}function Yt(){const[s,t]=l.useState(void 0);l.useEffect(()=>{const n=new AbortController;async function a(){const o=await F(n.signal);t(o)}return a(),()=>n.abort()},[]);const r=s?s.metrics.reduce((n,a)=>(n[a.name]=a,n),{}):void 0;return e.jsxs("div",{className:"container mx-auto px-16 text-base",children:[e.jsx("div",{className:"container max-w-screen-lg mx-auto",children:e.jsx("img",{className:"mx-auto w-96",src:Kt,alt:"HEIM Logo"})}),e.jsx("h1",{className:"text-3xl my-8 font-bold text-center",children:"Holistic Evaluation of Text-To-Image Models"}),e.jsxs("div",{className:"flex flex-col sm:flex-row justify-center gap-2 sm:gap-8 md:gap-32 my-8",children:[e.jsx("a",{className:"px-10 btn rounded-md",href:"https://arxiv.org/abs/2311.04287",children:"Paper"}),e.jsx("a",{className:"px-10 btn 
rounded-md",href:"https://github.com/stanford-crfm/helm",children:"Github"})]}),e.jsxs("p",{className:"my-2",children:["Significant effort has recently been made in developing text-to-image generation models, which take textual prompts as input and generate images. As these models are widely used in real-world applications, there is an urgent need to comprehensively understand their capabilities and risks. However, existing evaluations primarily focus on image-text alignment and image quality. To address this limitation, we introduce a new benchmark,"," ",e.jsx("strong",{children:"Holistic Evaluation of Text-To-Image Models (HEIM)"}),"."]}),e.jsx("p",{className:"my-2",children:"We identify 12 different aspects that are important in real-world model deployment, including:"}),e.jsxs("ul",{className:"my-2 list-disc list-inside unreset",children:[e.jsx("li",{children:"image-text alignment"}),e.jsx("li",{children:"image quality"}),e.jsx("li",{children:"aesthetics"}),e.jsx("li",{children:"originality"}),e.jsx("li",{children:"reasoning"}),e.jsx("li",{children:"knowledge"}),e.jsx("li",{children:"bias"}),e.jsx("li",{children:"toxicity"}),e.jsx("li",{children:"fairness"}),e.jsx("li",{children:"robustness"}),e.jsx("li",{children:"multilinguality"}),e.jsx("li",{children:"efficiency"})]}),e.jsx("p",{className:"my-2",children:"By curating scenarios encompassing these aspects, we evaluate state-of-the-art text-to-image models using this benchmark. Unlike previous evaluations that focused on alignment and quality, HEIM significantly improves coverage by evaluating all models across all aspects. 
Our results reveal that no single model excels in all aspects, with different models demonstrating strengths in different aspects."}),e.jsx("p",{className:"my-2",children:"For full transparency, this website contains all the prompts, generated images and the results for the automated and human evaluation metrics."}),e.jsx("p",{className:"my-2",children:"Inspired by HELM, we decompose the model evaluation into four key components: aspect, scenario, adaptation, and metric:"}),e.jsx("div",{className:"container max-w-screen-lg mx-auto my-8",children:e.jsx("img",{src:"https://crfm.stanford.edu/heim/latest/images/heim-main.png",alt:"HEIM scenarios, prompts, images and metrics"})}),s&&r?e.jsxs("div",{className:"grid grid-cols-1 md:grid-cols-3 gap-8",children:[e.jsx(Ne,{models:s.models}),e.jsx(Se,{runGroups:s.run_groups}),e.jsx(Zt,{metricFieldMap:r,metricGroups:s.metric_groups})]}):null]})}const Xt=""+new URL("vhelm-framework-cde7618a.png",import.meta.url).href,Qt=""+new URL("vhelm-model-6d812526.png",import.meta.url).href;function en(){const[s,t]=l.useState(void 0);return l.useEffect(()=>{const r=new AbortController;async function n(){const a=await F(r.signal);t(a)}return n(),()=>r.abort()},[]),e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl mt-16 my-8 font-bold text-center",children:"The First Steps to Holistic Evaluation of Vision-Language Models"}),e.jsxs("p",{className:"my-4",children:["To better understand VLMs, we introduce the first version of"," ",e.jsx("em",{children:"Holistic Evaluation of Vision-Language Models (VHELM)"})," by extending the ",e.jsx("a",{href:"https://arxiv.org/abs/2211.09110",children:"HELM"})," ","framework with the necessary adaptation methods to assess the performance of 11 prominent VLMs on 3 standard VLM benchmarks."]}),e.jsx("p",{className:"my-4 font-bold",children:"This is ongoing work to achieve holistic evaluation for vision-language models, so please stay 
tuned!"}),e.jsx("img",{src:Xt,alt:"An image of a helm and the text 'This helm is a' is sent to a Vision-Language Model, which produces the text 'wheel for steering a ship...'",className:"mx-auto lg:max-w-3xl block my-8"}),e.jsx("img",{src:Qt,alt:"An example of an evaluation for an Aspect (Knowledge) - a Scenario (MMMU) undergoes Adaptation (multimodal multiple choice) for a Model (GPT-4 Vision), then Metrics (Exact match) are computed",className:"mx-auto lg:max-w-3xl block my-8"}),s===void 0?null:e.jsxs("div",{className:"grid grid-cols-1 sm:grid-cols-2 gap-8",children:[e.jsx(Ne,{models:s.models}),e.jsx(Se,{runGroups:s.run_groups})]})]})}const sn=({id:s,title:t,imageUrl:r,text:n})=>(t.includes("HE")||(t="HELM "+t),e.jsxs("div",{className:"max-w-sm rounded overflow-hidden bg-gray-100 hover:scale-105 transition-transform duration-300",children:[r?e.jsx("img",{className:"w-full",src:r,alt:t}):e.jsx(e.Fragment,{}),e.jsxs("div",{className:"px-6 py-4",children:[e.jsx("div",{className:"font-bold text-xl mb-2",children:e.jsxs("a",{href:we(void 0,s),children:[" ",t+" →"]})}),e.jsx("p",{className:"text-gray-700 text-base",children:n})]})]}));function tn(){const[s,t]=l.useState();return l.useEffect(()=>{fetch("https://storage.googleapis.com/crfm-helm-public/config/release_index.json").then(r=>r.json()).then(r=>{t(r)}).catch(r=>{console.error("Error fetching JSON:",r)})},[]),e.jsx("div",{className:"p-10 mb-20",children:e.jsx("div",{className:"grid grid-cols-3 gap-4",children:s&&s.map((r,n)=>e.jsx(sn,{id:r.id,title:r.title,imageUrl:r.imageUrl!==void 0?String(r.imageUrl):void 0,text:r.description},n))})})}function nn(){return e.jsxs("div",{className:"flex px-6 py-20",children:[e.jsxs("div",{className:"flex-1 p-4 flex flex-col justify-center",children:[e.jsx("div",{className:"flex justify-start",children:e.jsx("h1",{className:"text-5xl mb-4 mx-4 mt-2",children:e.jsx("strong",{children:"A holistic framework for evaluating foundation models."})})}),e.jsxs("div",{className:"flex 
justify-start mt-6 ml-4",children:[e.jsx("button",{className:"px-6 btn btn-grey rounded-md",onClick:()=>window.scrollTo({top:700,behavior:"smooth"}),children:e.jsx("body",{children:"Projects"})}),e.jsx(N,{to:"https://github.com/stanford-crfm/helm",className:"ml-4",children:e.jsx("button",{className:"px-6 btn btn-grey rounded-md",children:"Github"})})]})]}),e.jsx("div",{className:"w-1/3 mx-4",children:e.jsx("img",{src:Xe,alt:"HELM Hero",className:"object-cover w-full h-full"})})]})}const rn=[es,ss,ts,ns,rs,as,ls,is,os,cs,ds,ms,us,hs,xs,js,fs,ps];function an(){const[s,t]=l.useState(void 0);return l.useEffect(()=>{const r=new AbortController;async function n(){const a=await F(r.signal);t(a)}return n(),()=>r.abort()},[]),s?e.jsxs(e.Fragment,{children:[e.jsx(nn,{}),e.jsxs("div",{className:"container mt-40 mx-auto text-lg",children:[e.jsx("div",{className:"flex flex-col sm:flex-row justify-center mb-10 flex sm:gap-8 md:gap-32",children:e.jsx("h1",{className:"text-4xl mx-4 ",children:e.jsx("strong",{children:"HELM Projects"})})}),e.jsx("div",{className:"flex flex-col sm:flex-row flex sm:gap-8 md:gap-32",children:e.jsx("text",{children:"HELM projects leverage the HELM framework and target particular domains, languages, or use cases."})})]}),e.jsx(tn,{}),e.jsx("div",{className:"mx-auto text-lg px-16",children:e.jsxs("div",{className:"container mb-12 mx-auto text-lg px-16",children:[e.jsxs("div",{className:"flex flex-col sm:flex-row justify-center mt-10 mb-10 flex gap-2 sm:gap-8 md:gap-32",children:[" ",e.jsx("h1",{className:"text-4xl mx-4 mt-40",children:e.jsx("strong",{children:"Our Partners"})})]}),e.jsx("ol",{className:"my-8 flex flex-col gap-32",children:e.jsx("li",{children:e.jsx("div",{className:"flex flex-wrap justify-center max-w-[1100px] mx-auto w-auto",children:rn.map((r,n)=>e.jsx("div",{className:"w-24 h-24 flex items-center m-6",children:e.jsx("img",{src:r,alt:"Logo",className:"mx-auto block",sizes:"100vw"})},n))})})})]})})]}):null}function ln(){return 
window.PROJECT_ID==="lite"?e.jsx(Ae,{}):window.PROJECT_ID==="instruct"?e.jsx(qt,{}):window.PROJECT_ID==="heim"?e.jsx(Yt,{}):window.PROJECT_ID==="mmlu"?e.jsx(Wt,{}):window.PROJECT_ID==="vhelm"?e.jsx(en,{}):window.PROJECT_ID==="global"?e.jsx(an,{}):e.jsx(Ae,{})}function on(){return e.jsx(Ls,{children:e.jsx(Ps,{children:e.jsxs(O,{path:"/",element:e.jsx(ht,{}),children:[e.jsx(O,{index:!0,element:e.jsx(ln,{})}),e.jsx(O,{path:"leaderboard",element:e.jsx(Ht,{})}),e.jsx(O,{path:"models",element:e.jsx(gt,{})}),e.jsx(O,{path:"scenarios",element:e.jsx(wt,{})}),e.jsx(O,{path:"groups",element:e.jsx(bt,{})}),e.jsx(O,{path:"groups/:groupName",element:e.jsx(vt,{})}),e.jsx(O,{path:"runs",element:e.jsx(yt,{})}),e.jsx(O,{path:"runs/:runName",element:e.jsx(Ft,{})})]})})})}xe.createRoot(document.getElementById("root")).render(e.jsx(_s.StrictMode,{children:e.jsx(on,{})}));
|
|
@@ -1,90 +0,0 @@
|
|
|
1
|
-
"""Temporary test for preserving invariants during the model / tokenizer / window service refactor.
|
|
2
|
-
|
|
3
|
-
Delete this after the refactor is done."""
|
|
4
|
-
|
|
5
|
-
from typing import Optional
|
|
6
|
-
|
|
7
|
-
import pytest
|
|
8
|
-
from tempfile import TemporaryDirectory
|
|
9
|
-
from helm.benchmark.model_deployment_registry import (
|
|
10
|
-
get_model_deployment,
|
|
11
|
-
ModelDeployment,
|
|
12
|
-
ALL_MODEL_DEPLOYMENTS,
|
|
13
|
-
)
|
|
14
|
-
from helm.benchmark.model_metadata_registry import get_model_metadata, ModelMetadata
|
|
15
|
-
from helm.benchmark.tokenizer_config_registry import TokenizerConfig, get_tokenizer_config
|
|
16
|
-
from helm.benchmark.window_services.test_utils import get_tokenizer_service
|
|
17
|
-
from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
|
|
18
|
-
from helm.clients.client import Client
|
|
19
|
-
from helm.tokenizers.tokenizer import Tokenizer
|
|
20
|
-
from helm.benchmark.window_services.window_service import WindowService
|
|
21
|
-
|
|
22
|
-
from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
|
|
23
|
-
from helm.clients.auto_client import AutoClient
|
|
24
|
-
from helm.tokenizers.auto_tokenizer import AutoTokenizer
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
# Largest signed 32-bit integer (2**31 - 1). The test below uses it as the fallback value for
# max_sequence_and_generated_tokens_length when a deployment leaves that field unset.
INT_MAX: int = (1 << 31) - 1
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
class TestModelProperties:
    """Consistency checks over every registered model deployment.

    For each deployment in ``ALL_MODEL_DEPLOYMENTS`` the test loads the client, window service
    and tokenizer, then verifies that the parameters duplicated between the ``ModelDeployment``,
    the ``TokenizerConfig`` and the ``WindowService`` agree with each other.
    """

    @pytest.mark.parametrize("deployment_name", [deployment.name for deployment in ALL_MODEL_DEPLOYMENTS])
    def test_models_has_window_service(self, deployment_name: str):
        """Load everything registered for ``deployment_name`` and compare the redundant parameters.

        Deployments that cannot be instantiated without extra dependencies or real credentials
        (lit-gpt, Llama 2, ``google/`` and ``amazon/`` deployments) return early without checks.
        """
        with TemporaryDirectory() as tmpdir:
            credentials = {"openaiApiKey": "test-openai-api-key"}
            auto_client = AutoClient(credentials, tmpdir, BlackHoleCacheBackendConfig())
            auto_tokenizer = AutoTokenizer({}, BlackHoleCacheBackendConfig())
            tokenizer_service = get_tokenizer_service(tmpdir, BlackHoleCacheBackendConfig())

            # Loading the TokenizerConfig and ModelMetadata ensures that they are valid.
            deployment: ModelDeployment = get_model_deployment(deployment_name)
            tokenizer_name: str = deployment.tokenizer_name if deployment.tokenizer_name else deployment_name
            tokenizer_config: Optional[TokenizerConfig] = get_tokenizer_config(tokenizer_name)
            assert tokenizer_config is not None
            model: ModelMetadata = get_model_metadata(
                deployment.model_name if deployment.model_name else deployment_name
            )

            # Can't test lit-gpt client because it requires manual dependencies
            if "lit-gpt" in model.name:
                return

            # Can't test Llama 2 because it requires Hugging Face credentials
            if "llama-2-" in model.name:
                return

            # Can't test Vertex AI because it requires Google credentials
            if deployment_name.startswith("google/"):
                return

            # Can't test Bedrock because it requires Amazon credentials
            if deployment_name.startswith("amazon/"):
                return

            # Loads the model, window service and tokenizer
            # which checks that the model, window service and tokenizer are all valid,
            # and that no Client, WindowService or Tokenizer are crashing.
            client: Client = auto_client._get_client(deployment_name)  # noqa: F841
            window_service: WindowService = WindowServiceFactory.get_window_service(deployment_name, tokenizer_service)
            tokenizer: Tokenizer = auto_tokenizer._get_tokenizer(tokenizer_name)  # noqa: F841

            # Verify that the parameters that are redundant between the ModelDeployment, Tokenizer and the
            # WindowService are the same.
            assert window_service.tokenizer_name == deployment.tokenizer_name
            assert window_service.max_sequence_length == deployment.max_sequence_length

            # Compute the expected value first: `a == b if cond else c` parses as
            # `(a == b) if cond else c`, which would only assert the truthiness of the fallback
            # instead of comparing it against the window service.
            expected_max_request_length = (
                deployment.max_request_length
                if deployment.max_request_length
                else deployment.max_sequence_length
            )
            assert window_service.max_request_length == expected_max_request_length

            expected_max_sequence_and_generated_tokens_length = (
                deployment.max_sequence_and_generated_tokens_length
                if deployment.max_sequence_and_generated_tokens_length
                else INT_MAX
            )
            assert (
                window_service.max_sequence_and_generated_tokens_length
                == expected_max_sequence_and_generated_tokens_length
            )

            assert tokenizer_config.end_of_text_token == window_service.end_of_text_token
            assert tokenizer_config.prefix_token == window_service.prefix_token

            # TODO: Add a dummy tokenize, decode and make_request request to each client/tokenizer
            # Do this once we have a proper Cache for tests.
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|