crfm-helm 0.5.4__py3-none-any.whl → 0.5.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crfm-helm might be problematic. Click here for more details.

Files changed (652) hide show
  1. crfm_helm-0.5.6.dist-info/METADATA +427 -0
  2. crfm_helm-0.5.6.dist-info/RECORD +941 -0
  3. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +13 -1
  5. helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
  6. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
  7. helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
  8. helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
  9. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
  10. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +4 -4
  11. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
  12. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
  13. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
  14. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
  15. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
  16. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
  17. helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
  18. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
  19. helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
  20. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
  21. helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
  22. helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
  23. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
  24. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
  25. helm/benchmark/adaptation/common_adapter_specs.py +69 -4
  26. helm/benchmark/adaptation/prompt.py +1 -1
  27. helm/benchmark/annotation/aci_bench_annotator.py +95 -0
  28. helm/benchmark/annotation/air_bench_annotator.py +21 -6
  29. helm/benchmark/annotation/annotator.py +5 -0
  30. helm/benchmark/annotation/annotator_factory.py +3 -20
  31. helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
  32. helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
  33. helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
  34. helm/benchmark/annotation/bird_sql_annotator.py +58 -0
  35. helm/benchmark/annotation/chw_care_plan_annotator.py +93 -0
  36. helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
  37. helm/benchmark/annotation/dischargeme_annotator.py +107 -0
  38. helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
  39. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
  40. helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
  41. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  42. helm/benchmark/annotation/med_dialog_annotator.py +99 -0
  43. helm/benchmark/annotation/medalign_annotator.py +100 -0
  44. helm/benchmark/annotation/medi_qa_annotator.py +98 -0
  45. helm/benchmark/annotation/medication_qa_annotator.py +87 -63
  46. helm/benchmark/annotation/mental_health_annotator.py +98 -0
  47. helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
  48. helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
  49. helm/benchmark/annotation/model_as_judge.py +214 -6
  50. helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
  51. helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
  52. helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
  53. helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
  54. helm/benchmark/annotation/omni_math_annotator.py +131 -0
  55. helm/benchmark/annotation/spider_annotator.py +18 -0
  56. helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
  57. helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
  58. helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
  59. helm/benchmark/annotation/wildbench_annotator.py +119 -0
  60. helm/benchmark/annotation_executor.py +35 -15
  61. helm/benchmark/augmentations/cleva_perturbation.py +9 -8
  62. helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
  63. helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
  64. helm/benchmark/augmentations/dialect_perturbation.py +4 -5
  65. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  66. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  67. helm/benchmark/augmentations/gender_perturbation.py +2 -2
  68. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  69. helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
  70. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  71. helm/benchmark/augmentations/person_name_perturbation.py +4 -5
  72. helm/benchmark/augmentations/perturbation.py +1 -1
  73. helm/benchmark/augmentations/space_perturbation.py +2 -2
  74. helm/benchmark/augmentations/suffix_perturbation.py +2 -2
  75. helm/benchmark/augmentations/synonym_perturbation.py +4 -3
  76. helm/benchmark/augmentations/test_perturbation.py +16 -13
  77. helm/benchmark/augmentations/translate_perturbation.py +2 -2
  78. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  79. helm/benchmark/data_preprocessor.py +2 -2
  80. helm/benchmark/executor.py +11 -12
  81. helm/benchmark/huggingface_registration.py +2 -7
  82. helm/benchmark/metrics/aci_bench_metrics.py +14 -0
  83. helm/benchmark/metrics/basic_metrics.py +6 -6
  84. helm/benchmark/metrics/bbq_metrics.py +2 -2
  85. helm/benchmark/metrics/bias_metrics.py +12 -3
  86. helm/benchmark/metrics/bias_word_lists.py +1 -1
  87. helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
  88. helm/benchmark/metrics/bird_sql_metrics.py +28 -0
  89. helm/benchmark/metrics/chw_care_plan_metrics.py +14 -0
  90. helm/benchmark/metrics/classification_metrics.py +76 -12
  91. helm/benchmark/metrics/cleva_harms_metrics.py +10 -9
  92. helm/benchmark/metrics/code_metrics.py +5 -5
  93. helm/benchmark/metrics/comet_metric.py +125 -0
  94. helm/benchmark/metrics/common_metric_specs.py +9 -2
  95. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
  96. helm/benchmark/metrics/copyright_metrics.py +4 -4
  97. helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
  98. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
  99. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
  100. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
  101. helm/benchmark/metrics/dischargeme_metrics.py +14 -0
  102. helm/benchmark/metrics/disinformation_metrics.py +4 -4
  103. helm/benchmark/metrics/dry_run_metrics.py +5 -5
  104. helm/benchmark/metrics/efficiency_metrics.py +6 -6
  105. helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
  106. helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
  107. helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
  108. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
  109. helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
  110. helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
  111. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
  112. helm/benchmark/metrics/ifeval/__init__.py +0 -0
  113. helm/benchmark/metrics/ifeval/instructions.py +1574 -0
  114. helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
  115. helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
  116. helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
  117. helm/benchmark/metrics/ifeval_metrics.py +55 -0
  118. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
  119. helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
  120. helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
  121. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
  122. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
  123. helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
  124. helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
  125. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
  126. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
  127. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
  128. helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
  129. helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
  130. helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
  131. helm/benchmark/metrics/language_modeling_metrics.py +4 -4
  132. helm/benchmark/metrics/llm_jury_metrics.py +46 -0
  133. helm/benchmark/metrics/machine_translation_metrics.py +2 -2
  134. helm/benchmark/metrics/med_dialog_metrics.py +14 -0
  135. helm/benchmark/metrics/medalign_metrics.py +14 -0
  136. helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
  137. helm/benchmark/metrics/medec_metrics.py +101 -0
  138. helm/benchmark/metrics/medi_qa_metrics.py +14 -0
  139. helm/benchmark/metrics/medication_qa_metrics.py +10 -19
  140. helm/benchmark/metrics/melt_bias_metric.py +234 -0
  141. helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
  142. helm/benchmark/metrics/melt_metric_specs.py +43 -0
  143. helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
  144. helm/benchmark/metrics/mental_health_metrics.py +14 -0
  145. helm/benchmark/metrics/metric.py +3 -3
  146. helm/benchmark/metrics/metric_service.py +11 -11
  147. helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
  148. helm/benchmark/metrics/mimic_rrs_metrics.py +14 -0
  149. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
  150. helm/benchmark/metrics/mtsamples_procedures_metrics.py +14 -0
  151. helm/benchmark/metrics/mtsamples_replicate_metrics.py +14 -0
  152. helm/benchmark/metrics/nltk_helper.py +32 -0
  153. helm/benchmark/metrics/numeracy_metrics.py +4 -4
  154. helm/benchmark/metrics/omni_math_metrics.py +32 -0
  155. helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
  156. helm/benchmark/metrics/output_processing_metric.py +60 -0
  157. helm/benchmark/metrics/output_processors.py +15 -0
  158. helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
  159. helm/benchmark/metrics/ranking_metrics.py +3 -3
  160. helm/benchmark/metrics/reference_metric.py +3 -3
  161. helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
  162. helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
  163. helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
  164. helm/benchmark/metrics/spider_metrics.py +7 -0
  165. helm/benchmark/metrics/starr_patient_instructions_metrics.py +14 -0
  166. helm/benchmark/metrics/statistic.py +1 -1
  167. helm/benchmark/metrics/summac/model_summac.py +2 -3
  168. helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
  169. helm/benchmark/metrics/summarization_metrics.py +20 -9
  170. helm/benchmark/metrics/test_bias_metrics.py +5 -1
  171. helm/benchmark/metrics/test_classification_metrics.py +140 -68
  172. helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
  173. helm/benchmark/metrics/test_metric.py +1 -1
  174. helm/benchmark/metrics/test_statistic.py +2 -2
  175. helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
  176. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
  177. helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
  178. helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
  179. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
  180. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  181. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
  182. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +3 -3
  183. helm/benchmark/metrics/toxicity_metrics.py +6 -6
  184. helm/benchmark/metrics/unitxt_metrics.py +7 -5
  185. helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
  186. helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
  187. helm/benchmark/metrics/vision_language/image_utils.py +2 -2
  188. helm/benchmark/metrics/wildbench_metrics.py +34 -0
  189. helm/benchmark/model_deployment_registry.py +6 -8
  190. helm/benchmark/model_metadata_registry.py +16 -0
  191. helm/benchmark/presentation/contamination.py +3 -3
  192. helm/benchmark/presentation/create_plots.py +33 -12
  193. helm/benchmark/presentation/run_display.py +13 -0
  194. helm/benchmark/presentation/schema.py +2 -1
  195. helm/benchmark/presentation/summarize.py +97 -67
  196. helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
  197. helm/benchmark/reeval_run.py +202 -0
  198. helm/benchmark/reeval_runner.py +355 -0
  199. helm/benchmark/run.py +86 -90
  200. helm/benchmark/run_expander.py +90 -9
  201. helm/benchmark/run_spec_factory.py +13 -0
  202. helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
  203. helm/benchmark/run_specs/audio_run_specs.py +657 -0
  204. helm/benchmark/run_specs/call_center_run_specs.py +49 -0
  205. helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
  206. helm/benchmark/run_specs/classic_run_specs.py +1 -69
  207. helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
  208. helm/benchmark/run_specs/enterprise_run_specs.py +280 -0
  209. helm/benchmark/run_specs/experimental_run_specs.py +142 -3
  210. helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
  211. helm/benchmark/run_specs/lite_run_specs.py +2 -2
  212. helm/benchmark/run_specs/long_context_run_specs.py +141 -0
  213. helm/benchmark/run_specs/medhelm_run_specs.py +1260 -0
  214. helm/benchmark/run_specs/melt_run_specs.py +783 -0
  215. helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
  216. helm/benchmark/run_specs/oab_exams_specs.py +32 -0
  217. helm/benchmark/run_specs/safety_run_specs.py +37 -0
  218. helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +44 -44
  219. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +169 -0
  220. helm/benchmark/run_specs/sql_run_specs.py +54 -0
  221. helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
  222. helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
  223. helm/benchmark/run_specs/vlm_run_specs.py +103 -2
  224. helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
  225. helm/benchmark/runner.py +5 -5
  226. helm/benchmark/scenarios/aci_bench_scenario.py +126 -0
  227. helm/benchmark/scenarios/air_bench_scenario.py +6 -1
  228. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
  229. helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
  230. helm/benchmark/scenarios/audio_language/__init__.py +0 -0
  231. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +130 -0
  232. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
  233. helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
  234. helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
  235. helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
  236. helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
  237. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
  238. helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
  239. helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
  240. helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
  241. helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
  242. helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
  243. helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
  244. helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
  245. helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
  246. helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
  247. helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
  248. helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
  249. helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
  250. helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
  251. helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
  252. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +103 -0
  253. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +110 -0
  254. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
  255. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +109 -0
  256. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +83 -0
  257. helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
  258. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +105 -0
  259. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
  260. helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
  261. helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
  262. helm/benchmark/scenarios/banking77_scenario.py +6 -1
  263. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  264. helm/benchmark/scenarios/big_bench_scenario.py +11 -1
  265. helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
  266. helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
  267. helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
  268. helm/benchmark/scenarios/blimp_scenario.py +1 -1
  269. helm/benchmark/scenarios/bold_scenario.py +1 -1
  270. helm/benchmark/scenarios/boolq_scenario.py +1 -1
  271. helm/benchmark/scenarios/casehold_scenario.py +79 -0
  272. helm/benchmark/scenarios/chw_care_plan_scenario.py +106 -0
  273. helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
  274. helm/benchmark/scenarios/clear_scenario.py +157 -0
  275. helm/benchmark/scenarios/cleva_scenario.py +2 -2
  276. helm/benchmark/scenarios/code_scenario.py +17 -4
  277. helm/benchmark/scenarios/commonsense_scenario.py +1 -1
  278. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
  279. helm/benchmark/scenarios/copyright_scenario.py +1 -1
  280. helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
  281. helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
  282. helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
  283. helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
  284. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
  285. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
  286. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
  287. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
  288. helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
  289. helm/benchmark/scenarios/dischargeme_scenario.py +172 -0
  290. helm/benchmark/scenarios/disinformation_scenario.py +10 -1
  291. helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
  292. helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
  293. helm/benchmark/scenarios/ehr_sql_scenario.py +137 -0
  294. helm/benchmark/scenarios/ehrshot_scenario.py +1519 -0
  295. helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
  296. helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
  297. helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
  298. helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
  299. helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
  300. helm/benchmark/scenarios/gpqa_scenario.py +80 -0
  301. helm/benchmark/scenarios/grammar.py +2 -2
  302. helm/benchmark/scenarios/grammar_scenario.py +2 -2
  303. helm/benchmark/scenarios/gsm_scenario.py +10 -1
  304. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
  305. helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
  306. helm/benchmark/scenarios/headqa_scenario.py +136 -0
  307. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
  308. helm/benchmark/scenarios/ice_scenario.py +8 -4
  309. helm/benchmark/scenarios/ifeval_scenario.py +53 -0
  310. helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
  311. helm/benchmark/scenarios/imdb_scenario.py +11 -2
  312. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
  313. helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +79 -0
  314. helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
  315. helm/benchmark/scenarios/koala_scenario.py +1 -1
  316. helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
  317. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
  318. helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
  319. helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
  320. helm/benchmark/scenarios/legal_support_scenario.py +11 -1
  321. helm/benchmark/scenarios/legalbench_scenario.py +22 -3
  322. helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
  323. helm/benchmark/scenarios/lextreme_scenario.py +11 -1
  324. helm/benchmark/scenarios/live_qa_scenario.py +1 -1
  325. helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
  326. helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
  327. helm/benchmark/scenarios/math_scenario.py +9 -1
  328. helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
  329. helm/benchmark/scenarios/med_dialog_scenario.py +25 -22
  330. helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
  331. helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
  332. helm/benchmark/scenarios/med_qa_scenario.py +10 -1
  333. helm/benchmark/scenarios/medalign_scenario.py +94 -0
  334. helm/benchmark/scenarios/medalign_scenario_helper.py +432 -0
  335. helm/benchmark/scenarios/medbullets_scenario.py +145 -0
  336. helm/benchmark/scenarios/medcalc_bench_scenario.py +127 -0
  337. helm/benchmark/scenarios/medec_scenario.py +125 -0
  338. helm/benchmark/scenarios/medhallu_scenario.py +72 -0
  339. helm/benchmark/scenarios/medi_qa_scenario.py +111 -0
  340. helm/benchmark/scenarios/medication_qa_scenario.py +8 -2
  341. helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
  342. helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
  343. helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
  344. helm/benchmark/scenarios/melt_scenarios.py +793 -0
  345. helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
  346. helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
  347. helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
  348. helm/benchmark/scenarios/mental_health_scenario.py +123 -0
  349. helm/benchmark/scenarios/mimic_bhc_scenario.py +103 -0
  350. helm/benchmark/scenarios/mimic_rrs_scenario.py +98 -0
  351. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +77 -0
  352. helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
  353. helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
  354. helm/benchmark/scenarios/mmlu_scenario.py +11 -1
  355. helm/benchmark/scenarios/msmarco_scenario.py +1 -1
  356. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +144 -0
  357. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +142 -0
  358. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +277 -0
  359. helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
  360. helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
  361. helm/benchmark/scenarios/newsqa_scenario.py +1 -1
  362. helm/benchmark/scenarios/numeracy_scenario.py +12 -2
  363. helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
  364. helm/benchmark/scenarios/omni_math_scenario.py +53 -0
  365. helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
  366. helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
  367. helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
  368. helm/benchmark/scenarios/pubmed_qa_scenario.py +59 -43
  369. helm/benchmark/scenarios/quac_scenario.py +10 -1
  370. helm/benchmark/scenarios/race_based_med_scenario.py +152 -0
  371. helm/benchmark/scenarios/raft_scenario.py +17 -2
  372. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
  373. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
  374. helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
  375. helm/benchmark/scenarios/scenario.py +9 -1
  376. helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +7 -2
  377. helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
  378. helm/benchmark/scenarios/shc_bmt_scenario.py +75 -0
  379. helm/benchmark/scenarios/shc_cdi_scenario.py +75 -0
  380. helm/benchmark/scenarios/shc_conf_scenario.py +76 -0
  381. helm/benchmark/scenarios/shc_ent_scenario.py +77 -0
  382. helm/benchmark/scenarios/shc_gip_scenario.py +74 -0
  383. helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
  384. helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
  385. helm/benchmark/scenarios/shc_ptbm_scenario.py +81 -0
  386. helm/benchmark/scenarios/shc_sei_scenario.py +94 -0
  387. helm/benchmark/scenarios/shc_sequoia_scenario.py +77 -0
  388. helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
  389. helm/benchmark/scenarios/spider_scenario.py +91 -0
  390. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +97 -0
  391. helm/benchmark/scenarios/summarization_scenario.py +11 -1
  392. helm/benchmark/scenarios/sumosum_scenario.py +157 -0
  393. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
  394. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
  395. helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
  396. helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
  397. helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
  398. helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
  399. helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
  400. helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
  401. helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
  402. helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
  403. helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
  404. helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
  405. helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
  406. helm/benchmark/scenarios/test_math_scenario.py +1 -0
  407. helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
  408. helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
  409. helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
  410. helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
  411. helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
  412. helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
  413. helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
  414. helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
  415. helm/benchmark/scenarios/the_pile_scenario.py +1 -1
  416. helm/benchmark/scenarios/truthful_qa_scenario.py +12 -2
  417. helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
  418. helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
  419. helm/benchmark/scenarios/unitxt_scenario.py +8 -2
  420. helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
  421. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  422. helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
  423. helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
  424. helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
  425. helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
  426. helm/benchmark/scenarios/wikifact_scenario.py +11 -1
  427. helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
  428. helm/benchmark/scenarios/wildbench_scenario.py +83 -0
  429. helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
  430. helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
  431. helm/benchmark/scenarios/xstest_scenario.py +1 -1
  432. helm/benchmark/server.py +13 -1
  433. helm/benchmark/slurm_runner.py +1 -1
  434. helm/benchmark/static/schema_audio.yaml +763 -0
  435. helm/benchmark/static/schema_autobencher.yaml +150 -0
  436. helm/benchmark/static/schema_call_center.yaml +97 -60
  437. helm/benchmark/static/{schema_medical.yaml → schema_capabilities.yaml} +100 -101
  438. helm/benchmark/static/schema_czech_bank.yaml +148 -0
  439. helm/benchmark/static/schema_enem_challenge.yaml +146 -0
  440. helm/benchmark/static/schema_enterprise.yaml +319 -0
  441. helm/benchmark/static/schema_finance.yaml +14 -12
  442. helm/benchmark/static/schema_heim.yaml +1389 -0
  443. helm/benchmark/static/schema_long_context.yaml +283 -0
  444. helm/benchmark/static/schema_medhelm.yaml +1140 -0
  445. helm/benchmark/static/schema_melt.yaml +1257 -0
  446. helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
  447. helm/benchmark/static/schema_safety.yaml +18 -1
  448. helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +30 -16
  449. helm/benchmark/static/schema_slphelm.yaml +162 -0
  450. helm/benchmark/static/schema_social_audio.yaml +224 -0
  451. helm/benchmark/static/schema_sql.yaml +171 -0
  452. helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +169 -36
  453. helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
  454. helm/benchmark/static/schema_vhelm.yaml +129 -56
  455. helm/benchmark/static/schema_video.yaml +219 -0
  456. helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
  457. helm/benchmark/static_build/assets/index-94295e78.js +10 -0
  458. helm/benchmark/static_build/assets/index-b9779128.css +1 -0
  459. helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
  460. helm/benchmark/static_build/assets/medhelm-v1-overview-3ddfcd65.png +0 -0
  461. helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
  462. helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
  463. helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-38a10867.js} +2 -2
  464. helm/benchmark/static_build/config.js +1 -1
  465. helm/benchmark/static_build/index.html +6 -6
  466. helm/benchmark/window_services/default_window_service.py +1 -1
  467. helm/benchmark/window_services/encoder_decoder_window_service.py +4 -4
  468. helm/benchmark/window_services/ice_window_service.py +1 -1
  469. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
  470. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
  471. helm/benchmark/window_services/local_window_service.py +2 -2
  472. helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
  473. helm/benchmark/window_services/test_bloom_window_service.py +3 -3
  474. helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
  475. helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
  476. helm/benchmark/window_services/test_gptj_window_service.py +8 -3
  477. helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
  478. helm/benchmark/window_services/test_openai_window_service.py +8 -3
  479. helm/benchmark/window_services/test_opt_window_service.py +3 -3
  480. helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
  481. helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
  482. helm/benchmark/window_services/test_t511b_window_service.py +3 -3
  483. helm/benchmark/window_services/test_ul2_window_service.py +3 -3
  484. helm/benchmark/window_services/test_utils.py +4 -5
  485. helm/benchmark/window_services/test_yalm_window_service.py +3 -3
  486. helm/benchmark/window_services/tokenizer_service.py +7 -8
  487. helm/benchmark/window_services/yalm_window_service.py +1 -1
  488. helm/clients/ai21_client.py +3 -3
  489. helm/clients/aleph_alpha_client.py +1 -1
  490. helm/clients/anthropic_client.py +69 -29
  491. helm/clients/audio_language/__init__.py +0 -0
  492. helm/clients/audio_language/diva_llama_client.py +120 -0
  493. helm/clients/audio_language/llama_omni_client.py +198 -0
  494. helm/clients/audio_language/qwen2_5_omni_client.py +197 -0
  495. helm/clients/audio_language/qwen2_audiolm_client.py +190 -0
  496. helm/clients/audio_language/qwen_audiolm_client.py +152 -0
  497. helm/clients/audio_language/test.py +62 -0
  498. helm/clients/auto_client.py +4 -2
  499. helm/clients/azure_openai_client.py +55 -0
  500. helm/clients/bedrock_client.py +203 -7
  501. helm/clients/bedrock_utils.py +33 -0
  502. helm/clients/client.py +7 -7
  503. helm/clients/clip_scorers/clip_scorer.py +1 -1
  504. helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
  505. helm/clients/cohere_client.py +3 -3
  506. helm/clients/google_client.py +1 -1
  507. helm/clients/grok_client.py +36 -0
  508. helm/clients/http_model_client.py +1 -1
  509. helm/clients/huggingface_client.py +52 -21
  510. helm/clients/huggingface_pipeline_client.py +138 -0
  511. helm/clients/ibm_client.py +267 -0
  512. helm/clients/image_generation/adobe_vision_client.py +1 -1
  513. helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
  514. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
  515. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
  516. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
  517. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
  518. helm/clients/image_generation/cogview2_client.py +1 -1
  519. helm/clients/image_generation/dalle2_client.py +1 -1
  520. helm/clients/image_generation/dalle3_client.py +2 -2
  521. helm/clients/image_generation/dalle_mini/__init__.py +1 -1
  522. helm/clients/image_generation/dalle_mini/data.py +1 -1
  523. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
  524. helm/clients/image_generation/dalle_mini/model/configuration.py +2 -2
  525. helm/clients/image_generation/dalle_mini/model/modeling.py +3 -3
  526. helm/clients/image_generation/dalle_mini/model/processor.py +5 -5
  527. helm/clients/image_generation/dalle_mini/model/tokenizer.py +2 -2
  528. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
  529. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
  530. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
  531. helm/clients/image_generation/dalle_mini_client.py +1 -1
  532. helm/clients/image_generation/deep_floyd_client.py +1 -1
  533. helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
  534. helm/clients/image_generation/lexica_client.py +1 -1
  535. helm/clients/image_generation/mindalle/models/__init__.py +6 -6
  536. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
  537. helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
  538. helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
  539. helm/clients/image_generation/mindalle_client.py +1 -1
  540. helm/clients/image_generation/together_image_generation_client.py +1 -1
  541. helm/clients/lit_gpt_client.py +2 -2
  542. helm/clients/mistral_client.py +62 -18
  543. helm/clients/nvidia_nim_client.py +0 -3
  544. helm/clients/openai_client.py +308 -43
  545. helm/clients/openai_responses_client.py +174 -0
  546. helm/clients/palmyra_client.py +3 -9
  547. helm/clients/reka_client.py +3 -3
  548. helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
  549. helm/clients/stanfordhealthcare_claude_client.py +31 -0
  550. helm/clients/stanfordhealthcare_google_client.py +43 -0
  551. helm/clients/stanfordhealthcare_http_model_client.py +93 -0
  552. helm/clients/stanfordhealthcare_openai_client.py +62 -0
  553. helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
  554. helm/clients/test_client.py +1 -1
  555. helm/clients/test_together_client.py +6 -1
  556. helm/clients/together_client.py +76 -9
  557. helm/clients/upstage_client.py +23 -0
  558. helm/clients/vertexai_client.py +45 -13
  559. helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
  560. helm/clients/vision_language/huggingface_vlm_client.py +2 -2
  561. helm/clients/vision_language/idefics_client.py +6 -2
  562. helm/clients/vision_language/open_flamingo/__init__.py +2 -2
  563. helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
  564. helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
  565. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
  566. helm/clients/vision_language/paligemma_client.py +2 -2
  567. helm/clients/vision_language/qwen2_vlm_client.py +188 -0
  568. helm/clients/vision_language/qwen_vlm_client.py +7 -5
  569. helm/clients/vllm_client.py +4 -6
  570. helm/clients/writer_client.py +102 -0
  571. helm/clients/yi_client.py +0 -3
  572. helm/common/audio_utils.py +111 -0
  573. helm/common/context.py +80 -0
  574. helm/common/credentials_utils.py +5 -5
  575. helm/common/file_caches/local_file_cache.py +1 -1
  576. helm/common/file_caches/test_local_file_cache.py +1 -1
  577. helm/common/general.py +9 -2
  578. helm/common/hierarchical_logger.py +46 -3
  579. helm/common/images_utils.py +2 -2
  580. helm/common/local_context.py +140 -0
  581. helm/common/media_object.py +2 -2
  582. helm/common/multimodal_request_utils.py +26 -0
  583. helm/common/reeval_parameters.py +12 -0
  584. helm/common/remote_context.py +61 -0
  585. helm/common/request.py +14 -2
  586. helm/common/response_format.py +18 -0
  587. helm/common/test_media_object.py +1 -1
  588. helm/config/model_deployments.yaml +1792 -28
  589. helm/config/model_metadata.yaml +1606 -51
  590. helm/config/tokenizer_configs.yaml +521 -4
  591. helm/proxy/cli.py +5 -3
  592. helm/proxy/critique/mechanical_turk_utils.py +1 -1
  593. helm/proxy/example_queries.py +1 -1
  594. helm/proxy/server.py +11 -4
  595. helm/proxy/services/remote_service.py +1 -1
  596. helm/proxy/services/server_service.py +22 -86
  597. helm/proxy/services/test_remote_service.py +2 -2
  598. helm/proxy/services/test_service.py +1 -1
  599. helm/proxy/static/general.js +122 -0
  600. helm/proxy/static/help.html +99 -0
  601. helm/proxy/static/index.css +57 -0
  602. helm/proxy/static/index.html +40 -0
  603. helm/proxy/static/index.js +456 -0
  604. helm/proxy/static/info-icon.png +0 -0
  605. helm/proxy/test_retry.py +1 -1
  606. helm/proxy/token_counters/auto_token_counter.py +1 -1
  607. helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
  608. helm/tokenizers/caching_tokenizer.py +2 -30
  609. helm/tokenizers/grok_tokenizer.py +53 -0
  610. helm/tokenizers/http_model_tokenizer.py +1 -1
  611. helm/tokenizers/huggingface_tokenizer.py +3 -3
  612. helm/tokenizers/lit_gpt_tokenizer.py +1 -1
  613. helm/tokenizers/test_anthropic_tokenizer.py +6 -2
  614. helm/tokenizers/test_grok_tokenizer.py +33 -0
  615. helm/tokenizers/test_huggingface_tokenizer.py +1 -1
  616. helm/tokenizers/test_yalm_tokenizer.py +1 -1
  617. helm/tokenizers/tiktoken_tokenizer.py +1 -1
  618. helm/tokenizers/tokenizer.py +3 -1
  619. helm/tokenizers/yalm_tokenizer.py +3 -3
  620. helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
  621. crfm_helm-0.5.4.dist-info/METADATA +0 -350
  622. crfm_helm-0.5.4.dist-info/RECORD +0 -697
  623. helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
  624. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  625. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  626. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  627. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  628. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  629. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  630. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  631. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  632. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  633. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  634. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  635. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  636. helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
  637. helm/benchmark/static_build/assets/index-3ee38b3d.js +0 -10
  638. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  639. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  640. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  641. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  642. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  643. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  644. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  645. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  646. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  647. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  648. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  649. helm/tokenizers/anthropic_tokenizer.py +0 -52
  650. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/entry_points.txt +0 -0
  651. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info/licenses}/LICENSE +0 -0
  652. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/top_level.txt +0 -0
helm/proxy/server.py CHANGED
@@ -6,6 +6,7 @@ Look at `index.js` to see how the functionality is invoked.
6
6
  from urllib.parse import unquote_plus
7
7
  import argparse
8
8
  import dataclasses
9
+ import importlib_resources as resources
9
10
  import json
10
11
  import os
11
12
  import sys
@@ -29,9 +30,9 @@ from helm.common.perspective_api_request import PerspectiveAPIRequest
29
30
  from helm.common.moderations_api_request import ModerationAPIRequest
30
31
  from helm.common.tokenization_request import TokenizationRequest, DecodeRequest
31
32
  from helm.proxy.services.service import CACHE_DIR
32
- from .accounts import Account
33
- from .services.server_service import ServerService
34
- from .query import Query
33
+ from helm.proxy.accounts import Account
34
+ from helm.proxy.services.server_service import ServerService
35
+ from helm.proxy.query import Query
35
36
 
36
37
  try:
37
38
  import gunicorn # noqa
@@ -86,7 +87,7 @@ def handle_root():
86
87
 
87
88
  @app.get("/static/<filename:path>")
88
89
  def handle_static_filename(filename):
89
- resp = bottle.static_file(filename, root=os.path.join(os.path.dirname(__file__), "static"))
90
+ resp = bottle.static_file(filename, root=app.config["helm.staticpath"])
90
91
  resp.add_header("Cache-Control", "no-store, must-revalidate ")
91
92
  return resp
92
93
 
@@ -284,6 +285,12 @@ def main():
284
285
  ensure_directory_exists(sqlite_cache_path)
285
286
  cache_backend_config = SqliteCacheBackendConfig(sqlite_cache_path)
286
287
 
288
+ static_package_name = "helm.proxy.static"
289
+ resource_path = resources.files(static_package_name).joinpath("index.html")
290
+ with resources.as_file(resource_path) as resource_filename:
291
+ static_path = str(resource_filename.parent)
292
+ app.config["helm.staticpath"] = static_path
293
+
287
294
  service = ServerService(base_path=args.base_path, cache_backend_config=cache_backend_config)
288
295
 
289
296
  gunicorn_args = {
@@ -24,7 +24,7 @@ from helm.common.request import Request, RequestResult
24
24
  from dacite import from_dict
25
25
  from helm.proxy.accounts import Account
26
26
  from helm.proxy.query import Query, QueryResult
27
- from .service import Service, GeneralInfo
27
+ from helm.proxy.services.service import Service, GeneralInfo
28
28
 
29
29
 
30
30
  class RemoteServiceError(Exception):
@@ -1,8 +1,8 @@
1
- import dataclasses
2
1
  import os
3
2
  import signal
4
- from typing import List, Optional
3
+ from typing import List
5
4
 
5
+ from helm.common.local_context import LocalContext
6
6
  from helm.common.cache import CacheConfig
7
7
  from helm.common.cache_backend_config import CacheBackendConfig, BlackHoleCacheBackendConfig
8
8
  from helm.common.critique_request import CritiqueRequest, CritiqueRequestResult
@@ -11,7 +11,6 @@ from helm.common.moderations_api_request import ModerationAPIRequest, Moderation
11
11
  from helm.common.clip_score_request import CLIPScoreRequest, CLIPScoreResult
12
12
  from helm.common.nudity_check_request import NudityCheckRequest, NudityCheckResult
13
13
  from helm.common.file_upload_request import FileUploadRequest, FileUploadResult
14
- from helm.common.general import ensure_directory_exists, parse_hocon, get_credentials
15
14
  from helm.common.perspective_api_request import PerspectiveAPIRequest, PerspectiveAPIRequestResult
16
15
  from helm.common.tokenization_request import (
17
16
  TokenizationRequest,
@@ -22,27 +21,13 @@ from helm.common.tokenization_request import (
22
21
  from helm.common.request import Request, RequestResult
23
22
  from helm.common.hierarchical_logger import hlog
24
23
  from helm.proxy.accounts import Accounts, Account
25
- from helm.clients.auto_client import AutoClient
26
- from helm.clients.moderation_api_client import ModerationAPIClient
27
- from helm.clients.image_generation.nudity_check_client import NudityCheckClient
28
- from helm.clients.gcs_client import GCSClient
29
- from helm.clients.clip_score_client import CLIPScoreClient
30
- from helm.clients.toxicity_classifier_client import ToxicityClassifierClient
31
- from helm.proxy.example_queries import example_queries
32
- from helm.benchmark.model_metadata_registry import ALL_MODELS_METADATA
33
24
  from helm.benchmark.model_deployment_registry import get_model_deployment_host_organization
34
25
  from helm.proxy.query import Query, QueryResult
35
- from helm.proxy.retry import retry_request
36
26
  from helm.proxy.token_counters.auto_token_counter import AutoTokenCounter
37
- from helm.tokenizers.auto_tokenizer import AutoTokenizer
38
- from .service import (
27
+ from helm.proxy.services.service import (
39
28
  Service,
40
- CACHE_DIR,
41
29
  ACCOUNTS_FILE,
42
30
  GeneralInfo,
43
- VERSION,
44
- expand_environments,
45
- synthesize_request,
46
31
  )
47
32
 
48
33
 
@@ -57,43 +42,17 @@ class ServerService(Service):
57
42
  root_mode: bool = False,
58
43
  cache_backend_config: CacheBackendConfig = BlackHoleCacheBackendConfig(),
59
44
  ):
60
- ensure_directory_exists(base_path)
61
- client_file_storage_path = os.path.join(base_path, CACHE_DIR)
62
- ensure_directory_exists(client_file_storage_path)
63
-
64
- credentials = get_credentials(base_path)
65
45
  accounts_path = os.path.join(base_path, ACCOUNTS_FILE)
66
46
 
67
- self.cache_backend_config = cache_backend_config
68
- self.client = AutoClient(credentials, client_file_storage_path, cache_backend_config)
69
- self.tokenizer = AutoTokenizer(credentials, cache_backend_config)
70
- self.token_counter = AutoTokenCounter(self.tokenizer)
47
+ self.context = LocalContext(base_path, cache_backend_config)
48
+ self.token_counter = AutoTokenCounter(self.context.tokenizer)
71
49
  self.accounts = Accounts(accounts_path, root_mode=root_mode)
72
50
 
73
- # Lazily instantiate the following clients
74
- self.moderation_api_client: Optional[ModerationAPIClient] = None
75
- self.toxicity_classifier_client: Optional[ToxicityClassifierClient] = None
76
- self.perspective_api_client: Optional[ToxicityClassifierClient] = None
77
- self.nudity_check_client: Optional[NudityCheckClient] = None
78
- self.clip_score_client: Optional[CLIPScoreClient] = None
79
- self.gcs_client: Optional[GCSClient] = None
80
-
81
51
  def get_general_info(self) -> GeneralInfo:
82
- # Can't send release_dates in ModelMetadata bacause dates cannot be round-tripped to and from JSON easily.
83
- # TODO(#2158): Either fix this or delete get_general_info.
84
- all_models = [dataclasses.replace(model_metadata, release_date=None) for model_metadata in ALL_MODELS_METADATA]
85
- return GeneralInfo(version=VERSION, example_queries=example_queries, all_models=all_models)
52
+ return self.context.get_general_info()
86
53
 
87
54
  def expand_query(self, query: Query) -> QueryResult:
88
- """Turn the `query` into requests."""
89
- prompt = query.prompt
90
- settings = query.settings
91
- environments = parse_hocon(query.environments)
92
- requests = []
93
- for environment in expand_environments(environments):
94
- request = synthesize_request(prompt, settings, environment)
95
- requests.append(request)
96
- return QueryResult(requests=requests)
55
+ return self.context.expand_query(query)
97
56
 
98
57
  def _get_model_group_for_model_deployment(self, model_deployment: str) -> str:
99
58
  if model_deployment.startswith("openai/"):
@@ -105,7 +64,11 @@ class ServerService(Service):
105
64
  return "gpt4"
106
65
  elif model_deployment.startswith("openai/gpt-3"):
107
66
  return "gpt3"
108
- elif model_deployment.startswith("openai/o1"):
67
+ elif (
68
+ model_deployment.startswith("openai/o1")
69
+ or model_deployment.startswith("openai/o3")
70
+ or model_deployment.startswith("openai/o4")
71
+ ):
109
72
  return "o1"
110
73
  else:
111
74
  return "openai"
@@ -126,7 +89,7 @@ class ServerService(Service):
126
89
  self.accounts.check_can_use(auth.api_key, model_group)
127
90
 
128
91
  # Use!
129
- request_result: RequestResult = self.client.make_request(request)
92
+ request_result: RequestResult = self.context.make_request(request)
130
93
 
131
94
  # Only deduct if not cached
132
95
  if not request_result.cached:
@@ -139,66 +102,39 @@ class ServerService(Service):
139
102
  def tokenize(self, auth: Authentication, request: TokenizationRequest) -> TokenizationRequestResult:
140
103
  """Tokenize via an API."""
141
104
  self.accounts.authenticate(auth)
142
- return self.tokenizer.tokenize(request)
105
+ return self.context.tokenize(request)
143
106
 
144
107
  def decode(self, auth: Authentication, request: DecodeRequest) -> DecodeRequestResult:
145
108
  """Decodes to text."""
146
109
  self.accounts.authenticate(auth)
147
- return self.tokenizer.decode(request)
110
+ return self.context.decode(request)
148
111
 
149
112
  def upload(self, auth: Authentication, request: FileUploadRequest) -> FileUploadResult:
150
113
  """Uploads a file to external storage."""
151
114
  self.accounts.authenticate(auth)
152
-
153
- if not self.gcs_client:
154
- self.gcs_client = self.client.get_gcs_client()
155
-
156
- assert self.gcs_client
157
- return self.gcs_client.upload(request)
115
+ return self.context.upload(request)
158
116
 
159
117
  def check_nudity(self, auth: Authentication, request: NudityCheckRequest) -> NudityCheckResult:
160
118
  """Check for nudity."""
161
119
  self.accounts.authenticate(auth)
162
-
163
- if not self.nudity_check_client:
164
- self.nudity_check_client = self.client.get_nudity_check_client()
165
-
166
- assert self.nudity_check_client
167
- return self.nudity_check_client.check_nudity(request)
120
+ return self.context.check_nudity(request)
168
121
 
169
122
  def compute_clip_score(self, auth: Authentication, request: CLIPScoreRequest) -> CLIPScoreResult:
170
123
  """Computes CLIPScore for a given caption and image."""
171
124
  self.accounts.authenticate(auth)
172
-
173
- if not self.clip_score_client:
174
- self.clip_score_client = self.client.get_clip_score_client()
175
-
176
- assert self.clip_score_client
177
- return self.clip_score_client.compute_score(request)
125
+ return self.context.compute_clip_score(request)
178
126
 
179
127
  def get_toxicity_scores(self, auth: Authentication, request: PerspectiveAPIRequest) -> PerspectiveAPIRequestResult:
180
- @retry_request
181
- def get_toxicity_scores_with_retry(request: PerspectiveAPIRequest) -> PerspectiveAPIRequestResult:
182
- if not self.toxicity_classifier_client:
183
- self.toxicity_classifier_client = self.client.get_toxicity_classifier_client()
184
- return self.toxicity_classifier_client.get_toxicity_scores(request)
185
-
186
128
  self.accounts.authenticate(auth)
187
- return get_toxicity_scores_with_retry(request)
129
+ return self.context.get_toxicity_scores(request)
188
130
 
189
131
  def get_moderation_results(self, auth: Authentication, request: ModerationAPIRequest) -> ModerationAPIRequestResult:
190
- @retry_request
191
- def get_moderation_results_with_retry(request: ModerationAPIRequest) -> ModerationAPIRequestResult:
192
- if not self.moderation_api_client:
193
- self.moderation_api_client = self.client.get_moderation_api_client()
194
- return self.moderation_api_client.get_moderation_results(request)
195
-
196
132
  self.accounts.authenticate(auth)
197
- return get_moderation_results_with_retry(request)
133
+ return self.context.get_moderation_results(request)
198
134
 
199
135
  def make_critique_request(self, auth: Authentication, request: CritiqueRequest) -> CritiqueRequestResult:
200
136
  self.accounts.authenticate(auth)
201
- return self.client.get_critique_client().make_critique_request(request)
137
+ return self.context.make_critique_request(request)
202
138
 
203
139
  def create_account(self, auth: Authentication) -> Account:
204
140
  """Creates a new account."""
@@ -233,4 +169,4 @@ class ServerService(Service):
233
169
  hlog("Done.")
234
170
 
235
171
  def get_cache_config(self, shard_name: str) -> CacheConfig:
236
- return self.cache_backend_config.get_cache_config(shard_name)
172
+ return self.context.get_cache_config(shard_name)
@@ -18,8 +18,8 @@ from helm.common.authentication import Authentication
18
18
  from helm.common.request import Request, RequestResult
19
19
  from helm.common.tokenization_request import TokenizationRequest, TokenizationRequestResult
20
20
  from helm.proxy.accounts import Account, set_default_quotas
21
- from .remote_service import RemoteService
22
- from .service import ACCOUNTS_FILE
21
+ from helm.proxy.services.remote_service import RemoteService
22
+ from helm.proxy.services.service import ACCOUNTS_FILE
23
23
 
24
24
 
25
25
  @dataclass(frozen=True)
@@ -8,7 +8,7 @@ from helm.common.authentication import Authentication
8
8
  from helm.common.request import Request
9
9
  from helm.proxy.accounts import AuthenticationError, Accounts
10
10
  from helm.proxy.query import Query
11
- from .server_service import ServerService
11
+ from helm.proxy.services.server_service import ServerService
12
12
 
13
13
 
14
14
  def get_authentication():
@@ -0,0 +1,122 @@
1
+ function assert(condition, message) {
2
+ if (!condition) {
3
+ throw message || "Assertion failed";
4
+ }
5
+ }
6
+
7
+ function encodeUrlParams(params) {
8
+ let s = "";
9
+ for (let k in params) {
10
+ if (params[k] != null) {
11
+ s += (s === "" ? "?" : "&") + k + "=" + encodeURIComponent(params[k]);
12
+ }
13
+ }
14
+ return s;
15
+ }
16
+
17
+ function decodeUrlParams(str) {
18
+ const params = {};
19
+ if (str === "") return params;
20
+ const items = str.substring(1).split(/&/);
21
+ for (let i = 0; i < items.length; i++) {
22
+ const pair = items[i].split(/=/);
23
+ params[pair[0]] = decodeURIComponent(pair[1]);
24
+ }
25
+ return params;
26
+ }
27
+
28
+ function updateBrowserLocation(params) {
29
+ // Update the address bar
30
+ window.history.pushState(
31
+ {},
32
+ "",
33
+ window.location.pathname + encodeUrlParams(params),
34
+ );
35
+ }
36
+
37
+ function createCookie(key, value, days) {
38
+ let expires = "";
39
+ if (days) {
40
+ const date = new Date();
41
+ date.setTime(date.getTime() + days * 24 * 60 * 60 * 1000);
42
+ expires = "; expires=" + date.toUTCString();
43
+ }
44
+ document.cookie = key + "=" + value + expires + "; path=/";
45
+ }
46
+
47
+ function readCookie(key) {
48
+ let tokens = document.cookie.split(";");
49
+ for (let i = 0; i < tokens.length; i++) {
50
+ const [k, v] = tokens[i].trim().split("=", 2);
51
+ if (key === k) return v;
52
+ }
53
+ return null;
54
+ }
55
+
56
+ function eraseCookie(key) {
57
+ createCookie(key, "", -1);
58
+ }
59
+
60
+ function renderTimestamp(timestamp) {
61
+ if (!timestamp) return null;
62
+ const d = new Date(timestamp * 1000);
63
+ return d.toLocaleString();
64
+ }
65
+
66
+ function renderDict(data) {
67
+ return JSON.stringify(data).substring(0, 10000);
68
+ }
69
+
70
+ function loadScript(src, onload, onerror) {
71
+ // Using jquery doesn't work, so do it in with our bare hands.
72
+ const s = document.createElement("script");
73
+ s.src = src;
74
+ s.onload = onload;
75
+ s.onerror = onerror;
76
+ document.head.appendChild(s);
77
+ }
78
+
79
+ function getRandomString() {
80
+ const vocab =
81
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789";
82
+ let text = "";
83
+ for (let i = 0; i < 6; i++)
84
+ text += vocab.charAt(Math.floor(Math.random() * vocab.length));
85
+ return text;
86
+ }
87
+
88
+ function round(x, n) {
89
+ const base = Math.pow(10, n);
90
+ return Math.round(x * base) / base;
91
+ }
92
+
93
+ function multilineHtml(s) {
94
+ return s.replace(/\n/g, "<br>");
95
+ }
96
+
97
+ function renderError(e) {
98
+ return $("<div>").addClass("alert alert-danger").append(multilineHtml(e));
99
+ }
100
+
101
+ function helpIcon(help, link) {
102
+ // Show a ?
103
+ return $("<a>", { href: link, target: "blank_", class: "help-icon" }).append(
104
+ $("<img>", { src: "info-icon.png", width: 15, title: help }),
105
+ );
106
+ }
107
+
108
+ const markdownConverter = new showdown.Converter({ optionKey: "value" });
109
+ function renderMarkdown(markdown) {
110
+ return markdown && markdownConverter.makeHtml(markdown);
111
+ }
112
+
113
+ function refreshHashLocation() {
114
+ // If we request a hash location (URL contains #foo), the problem is #foo
115
+ // might not exist (since it's generated). Call this function to jump to the
116
+ // hash location once all the anchors are generated.
117
+ if (location.hash) {
118
+ const hash = location.hash;
119
+ location.hash = "";
120
+ location.hash = hash;
121
+ }
122
+ }
@@ -0,0 +1,99 @@
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <title>CRFM Models API</title>
5
+ <meta charset="utf-8">
6
+ <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
7
+ <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/4.0.0-beta/css/bootstrap.min.css" integrity="sha384-/Y6pD6FV/Vv2HJnA6t+vslU6fwYXjCFtcEpHbNJ0lyAFsXTsjBbfaDjzALeQsN6M" crossorigin="anonymous">
8
+ <link rel="stylesheet" type="text/css" href="index.css">
9
+ </head>
10
+
11
+ <body>
12
+ <div class="container-fluid">
13
+ <nav class="navbar navbar-expand-sm navbar-light bg-faded">
14
+ <button class="navbar-toggler" type="button" data-toggle="collapse" data-target="#nav-content" aria-controls="nav-content" aria-expanded="false" aria-label="Toggle navigation">
15
+ <span class="navbar-toggler-icon"></span>
16
+ </button>
17
+
18
+ <div class="collapse navbar-collapse" id="nav-content">
19
+ <ul class="navbar-nav">
20
+ <li class="nav-item"><a class="nav-link" href="index.html">Query</a></li>
21
+ <li class="nav-item"><a class="nav-link active" href="help.html">Help</a></li>
22
+ <li class="nav-item" id="loginInfo"></li>
23
+ </ul>
24
+ </div>
25
+ </nav>
26
+
27
+ <div class="row">
28
+ <div class="col-sm-12">
29
+ <p>
30
+ This web interface and API allows you to experiment with multiple large language models using a unified interface.
31
+ To log in, you need an API key.
32
+ </p>
33
+ <h2>Query</h2>
34
+ <ul>
35
+ <li>In the web interface, you can enter a <b>query</b>, which consists of the following components:
36
+ <ul>
37
+ <li><b>prompt</b>, which is what text we want to feed into the language model. The prompt can have variables (e.g., <tt>${name}</tt>) which are filled in later.</li>
38
+ <li><b>settings</b>, which configures how we're going to call the backend API (HOCON format):
39
+ <ul>
40
+ <li><tt>model</tt>: which model to query; options are:
41
+ <div id="help-models" />
42
+ </li>
43
+ <li><tt>temperature</tt>: a non-negative number determining amount of stochasticity (e.g., <tt>1</tt> is sampling from the model, <tt>0</tt> is returning the maximum probability output)</li>
44
+ <li><tt>num_completions</tt>: number of completions (sequences, independent sampled) to return</li>
45
+ <li><tt>top_k_per_token</tt>: number of candidates per token position in each completion</li>
46
+ <li><tt>max_tokens</tt>: maximum number of tokens before generation stops</li>
47
+ <li><tt>stop_sequences</tt>: list of strings that will stop generation (e.g., <tt>'.'</tt> or <tt>'\n'</tt>)</li>
48
+ <li><tt>echo_prompt</tt>: Whether to include the prompt as a prefix of the completion</li>
49
+ <li><tt>top_p</tt>: An alternative to sampling with temperature, called nucleus sampling, where the model considers the results of the tokens with top_p probability mass. So 0.1 means only the tokens comprising the top 10% probability mass are considered.</li>
50
+ <li><tt>presence_penalty</tt>: Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics. (OpenAI only)</li>
51
+ <li><tt>frequency_penalty</tt>: Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim. (OpenAI only)</li>
52
+ </ul>
53
+ Settings can also have variables in them (e.g., <tt>temperature</tt>).
54
+ </li>
55
+ <li><b>environments</b>, which specifies for each variable, a list of values (HOCON format).
56
+ </ul>
57
+ </li>
58
+ <li>
59
+ When the query is submitted,
60
+ we consider all possible assignments of values to variables.
61
+ For example:
62
+ <ul>
63
+ <li>environments has <tt>name: [Boston, New York]</tt> and <tt>temperature: [0, 1]</tt></li>
64
+ <li>prompt is <tt>${name} is a</tt></li>
65
+ <li>settings is <tt>temperature: ${temperature}</tt></li>
66
+ </ul>
67
+ This gives rise to 4 <b>requests</b>:
68
+ <ul>
69
+ <li>prompt: <tt>Boston is a</tt>, temperature: <tt>0</tt></li>
70
+ <li>prompt: <tt>Boston is a</tt>, temperature: <tt>1</tt></li>
71
+ <li>prompt: <tt>New York is a</tt>, temperature: <tt>0</tt></li>
72
+ <li>prompt: <tt>New York is a</tt>, temperature: <tt>1</tt></li>
73
+ </ul>
74
+ </li>
75
+ </ul>
76
+
77
+ <h2><a name="quotas">Quotas</a></h2>
78
+ <p>
79
+ For each model group (e.g., <tt>gpt3</tt>) and time granularity
80
+ (e.g., <tt>daily</tt>, <tt>monthly</tt>, <tt>total</tt>), you are
81
+ given a quota of a certain number of tokens.
82
+ Once you go over that number, you won't be able to use the API.
83
+ However, note that requests that have already been made (by you or
84
+ anyone) that are cached are not counted towards your quota.
85
+ For example, if your daily quota for <tt>gpt3</tt> is <tt>10000</tt>,
86
+ that means each day, you will get 10000 tokens.
87
+ </p>
88
+ </div>
89
+ </div>
90
+ </div>
91
+
92
+ <script src="https://code.jquery.com/jquery-3.2.1.min.js"></script>
93
+ <script src="https://cdnjs.cloudflare.com/ajax/libs/popper.js/1.11.0/umd/popper.min.js" integrity="sha384-b/U6ypiBEHpOf/4+1nzFpr53nxSS+GLCkfwBdFNTxtclqqenISfwAzpKaMNFNmj4" crossorigin="anonymous"></script>
94
+ <script src="https://maxcdn.bootstrapcdn.com/bootstrap/4.0.0-beta/js/bootstrap.min.js" integrity="sha384-h0AbiXch4ZDo7tp9hKZ4TsHbi047NrKGLO3SEJAg45jXxnGIfYzk4Si90RDIqNm1" crossorigin="anonymous"></script>
95
+ <script src="https://cdnjs.cloudflare.com/ajax/libs/jquery.tablesorter/2.29.0/js/jquery.tablesorter.min.js"></script>
96
+ <script src="general.js"></script>
97
+ <script src="index.js"></script>
98
+ </body>
99
+ </html>
@@ -0,0 +1,57 @@
1
+ .warning {
2
+ color: red;
3
+ }
4
+
5
+ .active {
6
+ background-color: #f0f0f0;
7
+ }
8
+
9
+ .examples-block {
10
+ margin-left: 120px;
11
+ margin-top: 10px;
12
+ margin-bottom: 10px;
13
+ }
14
+
15
+ .request {
16
+ background-color: #f0f0f0;
17
+ margin-top: 20px;
18
+ margin-bottom: 20px;
19
+ }
20
+
21
+ .completion {
22
+ border: solid 1px;
23
+ border-color: #c0c0c0;
24
+ padding: 3px;
25
+ margin-top: 5px;
26
+ margin-bottom: 5px;
27
+ }
28
+
29
+ .completion:hover {
30
+ background-color: #e8e8e8;
31
+ }
32
+
33
+ .metadata {
34
+ float: right;
35
+ font-style: italic;
36
+ }
37
+
38
+ .token:hover {
39
+ background-color: lightgreen;
40
+ }
41
+
42
+ .block {
43
+ margin: 15px;
44
+ padding: 15px;
45
+ border: solid 1px;
46
+ border-color: lightgray;
47
+ }
48
+
49
+ td {
50
+ text-align: left;
51
+ vertical-align: top;
52
+ }
53
+
54
+ .help-icon {
55
+ margin-left: 3px;
56
+ margin-right: 3px;
57
+ }
@@ -0,0 +1,40 @@
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <title>CRFM Models API</title>
5
+ <meta charset="utf-8">
6
+ <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
7
+ <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/4.0.0-beta/css/bootstrap.min.css" integrity="sha384-/Y6pD6FV/Vv2HJnA6t+vslU6fwYXjCFtcEpHbNJ0lyAFsXTsjBbfaDjzALeQsN6M" crossorigin="anonymous">
8
+ <link rel="stylesheet" type="text/css" href="index.css">
9
+ </head>
10
+
11
+ <body>
12
+ <div class="container-fluid">
13
+ <nav class="navbar navbar-expand-sm navbar-light bg-faded">
14
+ <button class="navbar-toggler" type="button" data-toggle="collapse" data-target="#nav-content" aria-controls="nav-content" aria-expanded="false" aria-label="Toggle navigation">
15
+ <span class="navbar-toggler-icon"></span>
16
+ </button>
17
+
18
+ <div class="collapse navbar-collapse" id="nav-content">
19
+ <ul class="navbar-nav">
20
+ <li class="nav-item"><a class="nav-link active" href="index.html">Query</a></li>
21
+ <li class="nav-item"><a class="nav-link" href="help.html">Help</a></li>
22
+ <li class="nav-item" id="loginInfo"></li>
23
+ </ul>
24
+ </div>
25
+ </nav>
26
+
27
+ <div class="row">
28
+ <div class="col-sm-12" id="main">
29
+ </div>
30
+ </div>
31
+ </div>
32
+
33
+ <script src="https://code.jquery.com/jquery-3.2.1.min.js"></script>
34
+ <script src="https://cdnjs.cloudflare.com/ajax/libs/popper.js/1.11.0/umd/popper.min.js" integrity="sha384-b/U6ypiBEHpOf/4+1nzFpr53nxSS+GLCkfwBdFNTxtclqqenISfwAzpKaMNFNmj4" crossorigin="anonymous"></script>
35
+ <script src="https://maxcdn.bootstrapcdn.com/bootstrap/4.0.0-beta/js/bootstrap.min.js" integrity="sha384-h0AbiXch4ZDo7tp9hKZ4TsHbi047NrKGLO3SEJAg45jXxnGIfYzk4Si90RDIqNm1" crossorigin="anonymous"></script>
36
+ <script src="https://cdnjs.cloudflare.com/ajax/libs/jquery.tablesorter/2.29.0/js/jquery.tablesorter.min.js"></script>
37
+ <script src="general.js"></script>
38
+ <script src="index.js"></script>
39
+ </body>
40
+ </html>