crfm-helm 0.5.5__py3-none-any.whl → 0.5.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crfm-helm might be problematic. Click here for more details.
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/METADATA +27 -13
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/RECORD +203 -156
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +3 -3
- helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
- helm/benchmark/annotation/air_bench_annotator.py +1 -1
- helm/benchmark/annotation/bigcodebench_annotator.py +3 -3
- helm/benchmark/annotation/bird_sql_annotator.py +2 -2
- helm/benchmark/annotation/chw_care_plan_annotator.py +7 -12
- helm/benchmark/annotation/ehr_sql_annotator.py +2 -2
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +7 -7
- helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
- helm/benchmark/annotation/model_as_judge.py +12 -16
- helm/benchmark/annotation/omni_math_annotator.py +13 -14
- helm/benchmark/annotation/wildbench_annotator.py +9 -9
- helm/benchmark/executor.py +11 -12
- helm/benchmark/metrics/aci_bench_metrics.py +9 -29
- helm/benchmark/metrics/bias_word_lists.py +1 -1
- helm/benchmark/metrics/chw_care_plan_metrics.py +10 -30
- helm/benchmark/metrics/classification_metrics.py +3 -3
- helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +2 -2
- helm/benchmark/metrics/dischargeme_metrics.py +9 -29
- helm/benchmark/metrics/efficiency_metrics.py +3 -3
- helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
- helm/benchmark/metrics/ifeval_metrics.py +2 -2
- helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
- helm/benchmark/metrics/llm_jury_metrics.py +46 -0
- helm/benchmark/metrics/med_dialog_metrics.py +9 -29
- helm/benchmark/metrics/medalign_metrics.py +9 -29
- helm/benchmark/metrics/medi_qa_metrics.py +9 -29
- helm/benchmark/metrics/medication_qa_metrics.py +10 -30
- helm/benchmark/metrics/melt_bias_metric.py +234 -0
- helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
- helm/benchmark/metrics/melt_metric_specs.py +43 -0
- helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
- helm/benchmark/metrics/mental_health_metrics.py +9 -29
- helm/benchmark/metrics/metric_service.py +11 -11
- helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
- helm/benchmark/metrics/mimic_rrs_metrics.py +9 -29
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +9 -29
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +9 -29
- helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
- helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +9 -29
- helm/benchmark/metrics/summac/model_summac.py +1 -2
- helm/benchmark/metrics/summarization_metrics.py +2 -1
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +2 -2
- helm/benchmark/metrics/toxicity_metrics.py +2 -2
- helm/benchmark/metrics/unitxt_metrics.py +3 -4
- helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
- helm/benchmark/metrics/vision_language/image_utils.py +2 -2
- helm/benchmark/model_deployment_registry.py +6 -8
- helm/benchmark/presentation/contamination.py +3 -3
- helm/benchmark/presentation/create_plots.py +33 -12
- helm/benchmark/presentation/run_display.py +13 -0
- helm/benchmark/presentation/schema.py +2 -1
- helm/benchmark/presentation/summarize.py +76 -59
- helm/benchmark/reeval_run.py +3 -4
- helm/benchmark/reeval_runner.py +3 -3
- helm/benchmark/run.py +78 -73
- helm/benchmark/run_expander.py +12 -1
- helm/benchmark/run_spec_factory.py +7 -6
- helm/benchmark/run_specs/audio_run_specs.py +52 -8
- helm/benchmark/run_specs/enterprise_run_specs.py +20 -0
- helm/benchmark/run_specs/experimental_run_specs.py +31 -1
- helm/benchmark/run_specs/long_context_run_specs.py +67 -15
- helm/benchmark/run_specs/medhelm_run_specs.py +146 -41
- helm/benchmark/run_specs/melt_run_specs.py +783 -0
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +169 -0
- helm/benchmark/run_specs/vlm_run_specs.py +28 -0
- helm/benchmark/runner.py +5 -5
- helm/benchmark/scenarios/aci_bench_scenario.py +7 -1
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +3 -1
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +5 -5
- helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +103 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +110 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +109 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +15 -1
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +1 -2
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +2 -2
- helm/benchmark/scenarios/chw_care_plan_scenario.py +14 -13
- helm/benchmark/scenarios/clear_scenario.py +11 -7
- helm/benchmark/scenarios/dischargeme_scenario.py +36 -21
- helm/benchmark/scenarios/ehr_sql_scenario.py +7 -1
- helm/benchmark/scenarios/ehrshot_scenario.py +28 -55
- helm/benchmark/scenarios/grammar.py +2 -2
- helm/benchmark/scenarios/headqa_scenario.py +6 -1
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
- helm/benchmark/scenarios/{infinite_bench_sum_scenario.py → infinite_bench_en_sum_scenario.py} +10 -13
- helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
- helm/benchmark/scenarios/med_dialog_scenario.py +6 -1
- helm/benchmark/scenarios/medalign_scenario.py +9 -3
- helm/benchmark/scenarios/medalign_scenario_helper.py +8 -5
- helm/benchmark/scenarios/medbullets_scenario.py +7 -2
- helm/benchmark/scenarios/medcalc_bench_scenario.py +4 -2
- helm/benchmark/scenarios/medec_scenario.py +6 -1
- helm/benchmark/scenarios/medhallu_scenario.py +7 -1
- helm/benchmark/scenarios/medi_qa_scenario.py +10 -4
- helm/benchmark/scenarios/medication_qa_scenario.py +7 -1
- helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
- helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
- helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
- helm/benchmark/scenarios/melt_scenarios.py +793 -0
- helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
- helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
- helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
- helm/benchmark/scenarios/mental_health_scenario.py +16 -5
- helm/benchmark/scenarios/mimic_bhc_scenario.py +12 -7
- helm/benchmark/scenarios/mimic_rrs_scenario.py +17 -8
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +14 -8
- helm/benchmark/scenarios/mmlu_pro_scenario.py +1 -1
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +5 -2
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +3 -2
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +11 -5
- helm/benchmark/scenarios/numeracy_scenario.py +2 -1
- helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
- helm/benchmark/scenarios/pubmed_qa_scenario.py +6 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +18 -8
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +2 -2
- helm/benchmark/scenarios/ruler_qa_scenarios.py +2 -2
- helm/benchmark/scenarios/shc_bmt_scenario.py +12 -6
- helm/benchmark/scenarios/shc_cdi_scenario.py +11 -6
- helm/benchmark/scenarios/shc_conf_scenario.py +12 -6
- helm/benchmark/scenarios/shc_ent_scenario.py +11 -6
- helm/benchmark/scenarios/shc_gip_scenario.py +13 -5
- helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +12 -7
- helm/benchmark/scenarios/shc_sei_scenario.py +12 -7
- helm/benchmark/scenarios/shc_sequoia_scenario.py +13 -5
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +15 -8
- helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
- helm/benchmark/scenarios/truthful_qa_scenario.py +2 -1
- helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
- helm/benchmark/server.py +2 -1
- helm/benchmark/static/schema_audio.yaml +60 -49
- helm/benchmark/static/schema_enterprise.yaml +21 -0
- helm/benchmark/static/schema_long_context.yaml +63 -20
- helm/benchmark/static/schema_medhelm.yaml +272 -213
- helm/benchmark/static/schema_melt.yaml +1257 -0
- helm/benchmark/static/schema_slphelm.yaml +162 -0
- helm/benchmark/static/schema_vhelm.yaml +26 -26
- helm/benchmark/static/schema_video.yaml +219 -0
- helm/benchmark/static_build/assets/index-94295e78.js +10 -0
- helm/benchmark/static_build/assets/index-b9779128.css +1 -0
- helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
- helm/benchmark/static_build/assets/{tremor-9cefc3c5.js → tremor-38a10867.js} +1 -1
- helm/benchmark/static_build/index.html +4 -4
- helm/benchmark/window_services/encoder_decoder_window_service.py +3 -3
- helm/benchmark/window_services/test_utils.py +3 -4
- helm/benchmark/window_services/tokenizer_service.py +7 -8
- helm/clients/anthropic_client.py +69 -29
- helm/clients/audio_language/diva_llama_client.py +4 -2
- helm/clients/audio_language/qwen2_5_omni_client.py +197 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +8 -6
- helm/clients/audio_language/qwen_audiolm_client.py +4 -2
- helm/clients/audio_language/test.py +62 -0
- helm/clients/bedrock_client.py +3 -1
- helm/clients/client.py +7 -7
- helm/clients/grok_client.py +36 -0
- helm/clients/huggingface_client.py +42 -3
- helm/clients/huggingface_pipeline_client.py +138 -0
- helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
- helm/clients/image_generation/dalle_mini/model/modeling.py +1 -1
- helm/clients/image_generation/dalle_mini/model/processor.py +1 -1
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
- helm/clients/openai_client.py +100 -54
- helm/clients/openai_responses_client.py +174 -0
- helm/clients/palmyra_client.py +2 -5
- helm/clients/reka_client.py +2 -2
- helm/clients/together_client.py +31 -4
- helm/clients/vertexai_client.py +6 -0
- helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
- helm/clients/vision_language/huggingface_vlm_client.py +2 -2
- helm/clients/vision_language/idefics_client.py +6 -2
- helm/clients/vision_language/paligemma_client.py +2 -2
- helm/clients/vision_language/qwen2_vlm_client.py +66 -53
- helm/clients/vision_language/qwen_vlm_client.py +7 -5
- helm/clients/writer_client.py +102 -0
- helm/common/context.py +80 -0
- helm/common/credentials_utils.py +5 -5
- helm/common/general.py +9 -2
- helm/common/hierarchical_logger.py +46 -3
- helm/common/local_context.py +140 -0
- helm/common/remote_context.py +61 -0
- helm/common/request.py +8 -0
- helm/config/model_deployments.yaml +864 -193
- helm/config/model_metadata.yaml +667 -53
- helm/config/tokenizer_configs.yaml +144 -3
- helm/proxy/cli.py +3 -1
- helm/proxy/critique/mechanical_turk_utils.py +1 -1
- helm/proxy/services/server_service.py +21 -85
- helm/tokenizers/grok_tokenizer.py +53 -0
- helm/tokenizers/huggingface_tokenizer.py +1 -1
- helm/tokenizers/test_grok_tokenizer.py +33 -0
- helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +0 -46
- helm/benchmark/static_build/assets/index-262903c1.js +0 -10
- helm/benchmark/static_build/assets/index-42060d71.css +0 -1
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/licenses/LICENSE +0 -0
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/top_level.txt +0 -0
- /helm/benchmark/static_build/assets/{medhelm-overview-3ddfcd65.png → medhelm-v1-overview-3ddfcd65.png} +0 -0
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
from typing import Any, Dict, List, Mapping, Optional
|
|
2
|
+
|
|
3
|
+
from helm.clients.client import CachingClient
|
|
4
|
+
from helm.common.cache import CacheConfig
|
|
5
|
+
from helm.common.optional_dependencies import handle_module_not_found_error
|
|
6
|
+
from helm.common.request import wrap_request_time, Request, RequestResult, GeneratedOutput, Token
|
|
7
|
+
|
|
8
|
+
try:
|
|
9
|
+
from writerai import Writer
|
|
10
|
+
from writerai.types.chat_completion import ChatCompletion
|
|
11
|
+
except ModuleNotFoundError as e:
|
|
12
|
+
handle_module_not_found_error(e, ["openai"])
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class WriterClient(CachingClient):
|
|
16
|
+
def __init__(self, cache_config: CacheConfig, api_key: Optional[str] = None):
|
|
17
|
+
super().__init__(cache_config=cache_config)
|
|
18
|
+
self._writer_client = Writer(api_key=api_key)
|
|
19
|
+
|
|
20
|
+
def _get_messages_from_request(self, request: Request) -> List[Dict]:
|
|
21
|
+
if request.prompt and request.messages:
|
|
22
|
+
raise ValueError(f"Only one of `prompt` and `messages` may be set in request: {request}")
|
|
23
|
+
if request.multimodal_prompt:
|
|
24
|
+
raise ValueError("`multimodal_prompt` is not supported by WriterClient")
|
|
25
|
+
if request.messages:
|
|
26
|
+
return [{"role": message["role"], "content": message["content"]} for message in request.messages]
|
|
27
|
+
else:
|
|
28
|
+
return [{"role": "user", "content": request.prompt}]
|
|
29
|
+
|
|
30
|
+
def _convert_chat_completion_to_generated_outputs(
|
|
31
|
+
self, chat_completion: ChatCompletion, request: Request
|
|
32
|
+
) -> List[GeneratedOutput]:
|
|
33
|
+
generated_outputs: List[GeneratedOutput] = []
|
|
34
|
+
for choice in chat_completion.choices:
|
|
35
|
+
raw_completion_content = choice.message.content
|
|
36
|
+
# The Writer chat completion API doesn't support echo.
|
|
37
|
+
# If `echo_prompt` is true, combine the prompt and completion.
|
|
38
|
+
text: str = request.prompt + raw_completion_content if request.echo_prompt else raw_completion_content
|
|
39
|
+
tokens: List[Token] = []
|
|
40
|
+
if choice.logprobs and choice.logprobs.content:
|
|
41
|
+
tokens = [
|
|
42
|
+
Token(text=choice_token.token, logprob=choice_token.logprob)
|
|
43
|
+
for choice_token in choice.logprobs.content
|
|
44
|
+
]
|
|
45
|
+
generated_output = GeneratedOutput(
|
|
46
|
+
text=text,
|
|
47
|
+
logprob=sum(token.logprob for token in tokens) if tokens else 0.0,
|
|
48
|
+
tokens=tokens,
|
|
49
|
+
finish_reason={"reason": choice.finish_reason},
|
|
50
|
+
)
|
|
51
|
+
generated_outputs.append(generated_output)
|
|
52
|
+
return generated_outputs
|
|
53
|
+
|
|
54
|
+
def _convert_request_to_raw_request(self, request: Request) -> Dict:
|
|
55
|
+
raw_request = {
|
|
56
|
+
"messages": self._get_messages_from_request(request),
|
|
57
|
+
"model": request.model.split("/")[-1],
|
|
58
|
+
"logprobs": bool(request.top_k_per_token),
|
|
59
|
+
"max_tokens": request.max_tokens,
|
|
60
|
+
"n": request.num_completions,
|
|
61
|
+
"stop": request.stop_sequences,
|
|
62
|
+
"temperature": request.temperature,
|
|
63
|
+
"top_p": request.top_p,
|
|
64
|
+
}
|
|
65
|
+
if request.response_format and request.response_format.json_schema:
|
|
66
|
+
raw_request["response_format"] = {
|
|
67
|
+
"type": "json_schema",
|
|
68
|
+
"json_schema": {
|
|
69
|
+
"schema": request.response_format.json_schema,
|
|
70
|
+
},
|
|
71
|
+
}
|
|
72
|
+
return raw_request
|
|
73
|
+
|
|
74
|
+
def make_request(self, request: Request) -> RequestResult:
|
|
75
|
+
raw_request = self._convert_request_to_raw_request(request)
|
|
76
|
+
cache_key: Mapping = CachingClient.make_cache_key(raw_request, request)
|
|
77
|
+
|
|
78
|
+
def do_it() -> Dict[Any, Any]:
|
|
79
|
+
return self._writer_client.chat.chat(**raw_request).model_dump()
|
|
80
|
+
|
|
81
|
+
try:
|
|
82
|
+
raw_response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
|
|
83
|
+
chat_completion: ChatCompletion = ChatCompletion.model_validate(raw_response)
|
|
84
|
+
except Exception as error:
|
|
85
|
+
return RequestResult(
|
|
86
|
+
success=False,
|
|
87
|
+
cached=False,
|
|
88
|
+
error=str(error),
|
|
89
|
+
completions=[],
|
|
90
|
+
embedding=[],
|
|
91
|
+
)
|
|
92
|
+
|
|
93
|
+
generated_outputs = self._convert_chat_completion_to_generated_outputs(chat_completion, request)
|
|
94
|
+
|
|
95
|
+
return RequestResult(
|
|
96
|
+
success=True,
|
|
97
|
+
cached=cached,
|
|
98
|
+
request_time=raw_response["request_time"],
|
|
99
|
+
request_datetime=raw_response["request_datetime"],
|
|
100
|
+
completions=generated_outputs,
|
|
101
|
+
embedding=[],
|
|
102
|
+
)
|
helm/common/context.py
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
|
|
3
|
+
from helm.common.critique_request import CritiqueRequest, CritiqueRequestResult
|
|
4
|
+
from helm.common.clip_score_request import CLIPScoreRequest, CLIPScoreResult
|
|
5
|
+
from helm.common.file_upload_request import FileUploadResult, FileUploadRequest
|
|
6
|
+
from helm.common.nudity_check_request import NudityCheckRequest, NudityCheckResult
|
|
7
|
+
from helm.common.perspective_api_request import PerspectiveAPIRequestResult, PerspectiveAPIRequest
|
|
8
|
+
from helm.common.moderations_api_request import ModerationAPIRequest, ModerationAPIRequestResult
|
|
9
|
+
from helm.common.tokenization_request import (
|
|
10
|
+
TokenizationRequest,
|
|
11
|
+
TokenizationRequestResult,
|
|
12
|
+
DecodeRequest,
|
|
13
|
+
DecodeRequestResult,
|
|
14
|
+
)
|
|
15
|
+
from helm.common.request import Request, RequestResult
|
|
16
|
+
from helm.proxy.query import Query, QueryResult
|
|
17
|
+
from helm.common.cache import CacheConfig
|
|
18
|
+
from helm.proxy.services.service import GeneralInfo
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class Context(ABC):
|
|
22
|
+
@abstractmethod
|
|
23
|
+
def get_general_info(self) -> GeneralInfo:
|
|
24
|
+
"""Get general info."""
|
|
25
|
+
pass
|
|
26
|
+
|
|
27
|
+
@abstractmethod
|
|
28
|
+
def expand_query(self, query: Query) -> QueryResult:
|
|
29
|
+
"""Turn the `query` into requests."""
|
|
30
|
+
pass
|
|
31
|
+
|
|
32
|
+
@abstractmethod
|
|
33
|
+
def make_request(self, request: Request) -> RequestResult:
|
|
34
|
+
"""Actually make a request to an API."""
|
|
35
|
+
pass
|
|
36
|
+
|
|
37
|
+
@abstractmethod
|
|
38
|
+
def tokenize(self, request: TokenizationRequest) -> TokenizationRequestResult:
|
|
39
|
+
"""Tokenize via an API."""
|
|
40
|
+
pass
|
|
41
|
+
|
|
42
|
+
@abstractmethod
|
|
43
|
+
def decode(self, request: DecodeRequest) -> DecodeRequestResult:
|
|
44
|
+
"""Decodes to text."""
|
|
45
|
+
pass
|
|
46
|
+
|
|
47
|
+
@abstractmethod
|
|
48
|
+
def upload(self, request: FileUploadRequest) -> FileUploadResult:
|
|
49
|
+
"""Uploads a file to external storage."""
|
|
50
|
+
pass
|
|
51
|
+
|
|
52
|
+
@abstractmethod
|
|
53
|
+
def check_nudity(self, request: NudityCheckRequest) -> NudityCheckResult:
|
|
54
|
+
"""Check for nudity for a batch of images."""
|
|
55
|
+
pass
|
|
56
|
+
|
|
57
|
+
@abstractmethod
|
|
58
|
+
def compute_clip_score(self, request: CLIPScoreRequest) -> CLIPScoreResult:
|
|
59
|
+
"""Computes CLIPScore for a given caption and image."""
|
|
60
|
+
pass
|
|
61
|
+
|
|
62
|
+
@abstractmethod
|
|
63
|
+
def get_toxicity_scores(self, request: PerspectiveAPIRequest) -> PerspectiveAPIRequestResult:
|
|
64
|
+
"""Get toxicity scores for a batch of text."""
|
|
65
|
+
pass
|
|
66
|
+
|
|
67
|
+
@abstractmethod
|
|
68
|
+
def get_moderation_results(self, request: ModerationAPIRequest) -> ModerationAPIRequestResult:
|
|
69
|
+
"""Get OpenAI's moderation results for some text."""
|
|
70
|
+
pass
|
|
71
|
+
|
|
72
|
+
@abstractmethod
|
|
73
|
+
def make_critique_request(self, request: CritiqueRequest) -> CritiqueRequestResult:
|
|
74
|
+
"""Get responses to a critique request."""
|
|
75
|
+
pass
|
|
76
|
+
|
|
77
|
+
@abstractmethod
|
|
78
|
+
def get_cache_config(self, shard_name: str) -> CacheConfig:
|
|
79
|
+
"""Returns a CacheConfig"""
|
|
80
|
+
pass
|
helm/common/credentials_utils.py
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
from typing import Any, Mapping, Optional
|
|
4
4
|
|
|
5
|
-
from helm.common.hierarchical_logger import hlog
|
|
5
|
+
from helm.common.hierarchical_logger import hlog, hwarn
|
|
6
6
|
|
|
7
7
|
|
|
8
8
|
def provide_api_key(
|
|
@@ -13,16 +13,16 @@ def provide_api_key(
|
|
|
13
13
|
hlog(f"Using host_organization api key defined in credentials.conf: {api_key_name}")
|
|
14
14
|
return credentials[api_key_name]
|
|
15
15
|
if "deployments" not in credentials:
|
|
16
|
-
|
|
17
|
-
"
|
|
16
|
+
hwarn(
|
|
17
|
+
"Could not find key 'deployments' in credentials.conf, "
|
|
18
18
|
f"therefore the API key {api_key_name} should be specified."
|
|
19
19
|
)
|
|
20
20
|
return None
|
|
21
21
|
deployment_api_keys = credentials["deployments"]
|
|
22
22
|
if model is None:
|
|
23
|
-
|
|
23
|
+
hwarn(f"Could not find key '{host_organization}' in credentials.conf and no model provided")
|
|
24
24
|
return None
|
|
25
25
|
if model not in deployment_api_keys:
|
|
26
|
-
|
|
26
|
+
hwarn(f"Could not find key '{model}' under key 'deployments' in credentials.conf")
|
|
27
27
|
return None
|
|
28
28
|
return deployment_api_keys[model]
|
helm/common/general.py
CHANGED
|
@@ -42,6 +42,13 @@ def ensure_directory_exists(path: str):
|
|
|
42
42
|
os.makedirs(path, exist_ok=True)
|
|
43
43
|
|
|
44
44
|
|
|
45
|
+
def check_file_exists(path: str, msg: Optional[str] = None):
|
|
46
|
+
"""Checks that `path` exists, raises FileNotFoundError if it doesn't."""
|
|
47
|
+
if not os.path.exists(path):
|
|
48
|
+
error_msg = msg if msg else f"Required file not found: {path}"
|
|
49
|
+
raise FileNotFoundError(error_msg)
|
|
50
|
+
|
|
51
|
+
|
|
45
52
|
def parse_hocon(text: str):
|
|
46
53
|
"""Parse `text` (in HOCON format) into a dict-like object."""
|
|
47
54
|
return pyhocon.ConfigFactory.parse_string(text)
|
|
@@ -156,7 +163,7 @@ def format_split(split: str) -> str:
|
|
|
156
163
|
|
|
157
164
|
|
|
158
165
|
def asdict_without_nones(obj: Any) -> Dict[str, Any]:
|
|
159
|
-
if not is_dataclass(obj):
|
|
166
|
+
if not is_dataclass(obj) or isinstance(obj, type):
|
|
160
167
|
raise ValueError(f"Expected dataclass, got '{obj}'")
|
|
161
168
|
return asdict(obj, dict_factory=lambda x: {k: v for (k, v) in x if v is not None})
|
|
162
169
|
|
|
@@ -178,7 +185,7 @@ def binarize_dict(d: Dict[str, int]) -> Dict[str, int]:
|
|
|
178
185
|
|
|
179
186
|
def serialize(obj: Any) -> List[str]:
|
|
180
187
|
"""Takes in a dataclass and outputs all of its fields and values in a list."""
|
|
181
|
-
if not is_dataclass(obj):
|
|
188
|
+
if not is_dataclass(obj) or isinstance(obj, type):
|
|
182
189
|
raise ValueError(f"Expected dataclass, got '{obj}'")
|
|
183
190
|
return [f"{key}: {json.dumps(value)}" for key, value in asdict(obj).items()]
|
|
184
191
|
|
|
@@ -1,6 +1,8 @@
|
|
|
1
|
+
import logging
|
|
1
2
|
import sys
|
|
2
3
|
import time
|
|
3
4
|
from typing import Any, Callable, List, Optional
|
|
5
|
+
from colorlog import ColoredFormatter
|
|
4
6
|
|
|
5
7
|
|
|
6
8
|
class HierarchicalLogger(object):
|
|
@@ -20,6 +22,12 @@ class HierarchicalLogger(object):
|
|
|
20
22
|
} [0s]
|
|
21
23
|
"""
|
|
22
24
|
|
|
25
|
+
# Far too much effort to unwind every call to hlog to go via logging,
|
|
26
|
+
# And is a terrible idea to inspect the stack every time hlog is called
|
|
27
|
+
# to figure out the caller,
|
|
28
|
+
# So just log everything under "helm".
|
|
29
|
+
logger = logging.getLogger("helm")
|
|
30
|
+
|
|
23
31
|
def __init__(self) -> None:
|
|
24
32
|
self.start_times: List[float] = []
|
|
25
33
|
|
|
@@ -27,17 +35,21 @@ class HierarchicalLogger(object):
|
|
|
27
35
|
return " " * len(self.start_times)
|
|
28
36
|
|
|
29
37
|
def track_begin(self, x: Any) -> None:
|
|
30
|
-
|
|
38
|
+
self.logger.info(self.indent() + str(x) + " {")
|
|
31
39
|
sys.stdout.flush()
|
|
32
40
|
self.start_times.append(time.time())
|
|
33
41
|
|
|
34
42
|
def track_end(self) -> None:
|
|
35
43
|
t = time.time() - self.start_times.pop()
|
|
36
|
-
|
|
44
|
+
self.logger.info(self.indent() + "} [%s]" % (format_time(t)))
|
|
37
45
|
sys.stdout.flush()
|
|
38
46
|
|
|
39
47
|
def log(self, x: Any) -> None:
|
|
40
|
-
|
|
48
|
+
self.logger.info(self.indent() + str(x))
|
|
49
|
+
sys.stdout.flush()
|
|
50
|
+
|
|
51
|
+
def warn(self, x: Any) -> None:
|
|
52
|
+
self.logger.warning(self.indent() + str(x))
|
|
41
53
|
sys.stdout.flush()
|
|
42
54
|
|
|
43
55
|
|
|
@@ -61,6 +73,10 @@ def hlog(x: Any) -> None:
|
|
|
61
73
|
singleton.log(x)
|
|
62
74
|
|
|
63
75
|
|
|
76
|
+
def hwarn(x: Any) -> None:
|
|
77
|
+
singleton.warn(x)
|
|
78
|
+
|
|
79
|
+
|
|
64
80
|
class htrack_block:
|
|
65
81
|
def __init__(self, x: Any) -> None:
|
|
66
82
|
self.x = x
|
|
@@ -104,3 +120,30 @@ class htrack:
|
|
|
104
120
|
return fn(*args, **kwargs)
|
|
105
121
|
|
|
106
122
|
return wrapper
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def setup_default_logging():
|
|
126
|
+
"""
|
|
127
|
+
Set up a default logger to STDOUT for HELM via Python logging
|
|
128
|
+
"""
|
|
129
|
+
formatter = ColoredFormatter(
|
|
130
|
+
"%(bold_black)s%(asctime)s%(reset)s %(log_color)s%(levelname)-8s%(reset)s %(message)s",
|
|
131
|
+
datefmt="%Y-%m-%dT%H:%M:%S",
|
|
132
|
+
reset=True,
|
|
133
|
+
log_colors={
|
|
134
|
+
"DEBUG": "cyan",
|
|
135
|
+
"INFO": "green",
|
|
136
|
+
"WARNING": "yellow",
|
|
137
|
+
"ERROR": "red",
|
|
138
|
+
"CRITICAL": "red,bg_white",
|
|
139
|
+
},
|
|
140
|
+
secondary_log_colors={},
|
|
141
|
+
style="%",
|
|
142
|
+
)
|
|
143
|
+
|
|
144
|
+
logger = logging.getLogger("helm")
|
|
145
|
+
logger.setLevel(logging.INFO)
|
|
146
|
+
logger.propagate = False
|
|
147
|
+
handler = logging.StreamHandler(sys.stdout)
|
|
148
|
+
handler.setFormatter(formatter)
|
|
149
|
+
logger.addHandler(handler)
|
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
import dataclasses
|
|
2
|
+
import os
|
|
3
|
+
from typing import Optional
|
|
4
|
+
|
|
5
|
+
from helm.common.context import Context
|
|
6
|
+
from helm.common.cache import CacheConfig
|
|
7
|
+
from helm.common.cache_backend_config import CacheBackendConfig, BlackHoleCacheBackendConfig
|
|
8
|
+
from helm.common.critique_request import CritiqueRequest, CritiqueRequestResult
|
|
9
|
+
from helm.common.moderations_api_request import ModerationAPIRequest, ModerationAPIRequestResult
|
|
10
|
+
from helm.common.clip_score_request import CLIPScoreRequest, CLIPScoreResult
|
|
11
|
+
from helm.common.nudity_check_request import NudityCheckRequest, NudityCheckResult
|
|
12
|
+
from helm.common.file_upload_request import FileUploadRequest, FileUploadResult
|
|
13
|
+
from helm.common.general import ensure_directory_exists, parse_hocon, get_credentials
|
|
14
|
+
from helm.common.perspective_api_request import PerspectiveAPIRequest, PerspectiveAPIRequestResult
|
|
15
|
+
from helm.common.tokenization_request import (
|
|
16
|
+
TokenizationRequest,
|
|
17
|
+
TokenizationRequestResult,
|
|
18
|
+
DecodeRequest,
|
|
19
|
+
DecodeRequestResult,
|
|
20
|
+
)
|
|
21
|
+
from helm.common.request import Request, RequestResult
|
|
22
|
+
from helm.clients.auto_client import AutoClient
|
|
23
|
+
from helm.clients.moderation_api_client import ModerationAPIClient
|
|
24
|
+
from helm.clients.image_generation.nudity_check_client import NudityCheckClient
|
|
25
|
+
from helm.clients.gcs_client import GCSClient
|
|
26
|
+
from helm.clients.clip_score_client import CLIPScoreClient
|
|
27
|
+
from helm.clients.toxicity_classifier_client import ToxicityClassifierClient
|
|
28
|
+
from helm.proxy.example_queries import example_queries
|
|
29
|
+
from helm.benchmark.model_metadata_registry import ALL_MODELS_METADATA
|
|
30
|
+
from helm.proxy.query import Query, QueryResult
|
|
31
|
+
from helm.proxy.retry import retry_request
|
|
32
|
+
from helm.tokenizers.auto_tokenizer import AutoTokenizer
|
|
33
|
+
from helm.proxy.services.service import (
|
|
34
|
+
CACHE_DIR,
|
|
35
|
+
GeneralInfo,
|
|
36
|
+
VERSION,
|
|
37
|
+
expand_environments,
|
|
38
|
+
synthesize_request,
|
|
39
|
+
)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class LocalContext(Context):
|
|
43
|
+
"""
|
|
44
|
+
Main class that supports various functionality for the server.
|
|
45
|
+
"""
|
|
46
|
+
|
|
47
|
+
def __init__(
|
|
48
|
+
self,
|
|
49
|
+
base_path: str = "prod_env",
|
|
50
|
+
cache_backend_config: CacheBackendConfig = BlackHoleCacheBackendConfig(),
|
|
51
|
+
):
|
|
52
|
+
ensure_directory_exists(base_path)
|
|
53
|
+
client_file_storage_path = os.path.join(base_path, CACHE_DIR)
|
|
54
|
+
ensure_directory_exists(client_file_storage_path)
|
|
55
|
+
|
|
56
|
+
credentials = get_credentials(base_path)
|
|
57
|
+
|
|
58
|
+
self.cache_backend_config = cache_backend_config
|
|
59
|
+
self.client = AutoClient(credentials, client_file_storage_path, cache_backend_config)
|
|
60
|
+
self.tokenizer = AutoTokenizer(credentials, cache_backend_config)
|
|
61
|
+
|
|
62
|
+
# Lazily instantiate the following clients
|
|
63
|
+
self.moderation_api_client: Optional[ModerationAPIClient] = None
|
|
64
|
+
self.toxicity_classifier_client: Optional[ToxicityClassifierClient] = None
|
|
65
|
+
self.perspective_api_client: Optional[ToxicityClassifierClient] = None
|
|
66
|
+
self.nudity_check_client: Optional[NudityCheckClient] = None
|
|
67
|
+
self.clip_score_client: Optional[CLIPScoreClient] = None
|
|
68
|
+
self.gcs_client: Optional[GCSClient] = None
|
|
69
|
+
|
|
70
|
+
def get_general_info(self) -> GeneralInfo:
|
|
71
|
+
# Can't send release_dates in ModelMetadata because dates cannot be round-tripped to and from JSON easily.
|
|
72
|
+
# TODO(#2158): Either fix this or delete get_general_info.
|
|
73
|
+
all_models = [dataclasses.replace(model_metadata, release_date=None) for model_metadata in ALL_MODELS_METADATA]
|
|
74
|
+
return GeneralInfo(version=VERSION, example_queries=example_queries, all_models=all_models)
|
|
75
|
+
|
|
76
|
+
def expand_query(self, query: Query) -> QueryResult:
|
|
77
|
+
"""Turn the `query` into requests."""
|
|
78
|
+
prompt = query.prompt
|
|
79
|
+
settings = query.settings
|
|
80
|
+
environments = parse_hocon(query.environments)
|
|
81
|
+
requests = []
|
|
82
|
+
for environment in expand_environments(environments):
|
|
83
|
+
request = synthesize_request(prompt, settings, environment)
|
|
84
|
+
requests.append(request)
|
|
85
|
+
return QueryResult(requests=requests)
|
|
86
|
+
|
|
87
|
+
def make_request(self, request: Request) -> RequestResult:
|
|
88
|
+
"""Actually make a request to an API."""
|
|
89
|
+
return self.client.make_request(request)
|
|
90
|
+
|
|
91
|
+
def tokenize(self, request: TokenizationRequest) -> TokenizationRequestResult:
|
|
92
|
+
return self.tokenizer.tokenize(request)
|
|
93
|
+
|
|
94
|
+
def decode(self, request: DecodeRequest) -> DecodeRequestResult:
|
|
95
|
+
return self.tokenizer.decode(request)
|
|
96
|
+
|
|
97
|
+
def upload(self, request: FileUploadRequest) -> FileUploadResult:
|
|
98
|
+
if not self.gcs_client:
|
|
99
|
+
self.gcs_client = self.client.get_gcs_client()
|
|
100
|
+
|
|
101
|
+
assert self.gcs_client
|
|
102
|
+
return self.gcs_client.upload(request)
|
|
103
|
+
|
|
104
|
+
def check_nudity(self, request: NudityCheckRequest) -> NudityCheckResult:
|
|
105
|
+
if not self.nudity_check_client:
|
|
106
|
+
self.nudity_check_client = self.client.get_nudity_check_client()
|
|
107
|
+
|
|
108
|
+
assert self.nudity_check_client
|
|
109
|
+
return self.nudity_check_client.check_nudity(request)
|
|
110
|
+
|
|
111
|
+
def compute_clip_score(self, request: CLIPScoreRequest) -> CLIPScoreResult:
    """Compute a CLIP score, obtaining the CLIP-score client from `self.client` on first use."""
    scorer = self.clip_score_client
    if not scorer:
        # Not obtained yet: fetch it and keep it on the instance for later calls.
        scorer = self.client.get_clip_score_client()
        self.clip_score_client = scorer

    assert scorer
    return scorer.compute_score(request)
|
|
117
|
+
|
|
118
|
+
def get_toxicity_scores(self, request: PerspectiveAPIRequest) -> PerspectiveAPIRequestResult:
    """Fetch toxicity scores for `request`, with the call wrapped by `retry_request`.

    The toxicity classifier client is obtained from `self.client` the first time
    it is needed; that lookup happens inside the decorated function.
    """

    @retry_request
    def get_toxicity_scores_with_retry(request: PerspectiveAPIRequest) -> PerspectiveAPIRequestResult:
        classifier = self.toxicity_classifier_client
        if not classifier:
            classifier = self.client.get_toxicity_classifier_client()
            self.toxicity_classifier_client = classifier
        return classifier.get_toxicity_scores(request)

    return get_toxicity_scores_with_retry(request)
|
|
126
|
+
|
|
127
|
+
def get_moderation_results(self, request: ModerationAPIRequest) -> ModerationAPIRequestResult:
    """Fetch moderation results for `request`, with the call wrapped by `retry_request`.

    The moderation API client is obtained from `self.client` the first time it
    is needed; that lookup happens inside the decorated function.
    """

    @retry_request
    def get_moderation_results_with_retry(request: ModerationAPIRequest) -> ModerationAPIRequestResult:
        moderation_client = self.moderation_api_client
        if not moderation_client:
            moderation_client = self.client.get_moderation_api_client()
            self.moderation_api_client = moderation_client
        return moderation_client.get_moderation_results(request)

    return get_moderation_results_with_retry(request)
|
|
135
|
+
|
|
136
|
+
def make_critique_request(self, request: CritiqueRequest) -> CritiqueRequestResult:
    """Ask `self.client` for a critique client on every call and send `request` through it."""
    critique_client = self.client.get_critique_client()
    return critique_client.make_critique_request(request)
|
|
138
|
+
|
|
139
|
+
def get_cache_config(self, shard_name: str) -> CacheConfig:
    """Return the cache config that `self.cache_backend_config` yields for `shard_name`."""
    backend_config = self.cache_backend_config
    return backend_config.get_cache_config(shard_name)
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
from helm.common.context import Context
|
|
2
|
+
from helm.common.cache import CacheConfig
|
|
3
|
+
from helm.common.authentication import Authentication
|
|
4
|
+
from helm.common.moderations_api_request import ModerationAPIRequest, ModerationAPIRequestResult
|
|
5
|
+
from helm.common.critique_request import CritiqueRequest, CritiqueRequestResult
|
|
6
|
+
from helm.common.nudity_check_request import NudityCheckRequest, NudityCheckResult
|
|
7
|
+
from helm.common.file_upload_request import FileUploadRequest, FileUploadResult
|
|
8
|
+
from helm.common.perspective_api_request import PerspectiveAPIRequest, PerspectiveAPIRequestResult
|
|
9
|
+
from helm.common.clip_score_request import CLIPScoreRequest, CLIPScoreResult
|
|
10
|
+
from helm.common.tokenization_request import (
|
|
11
|
+
TokenizationRequest,
|
|
12
|
+
TokenizationRequestResult,
|
|
13
|
+
DecodeRequestResult,
|
|
14
|
+
DecodeRequest,
|
|
15
|
+
)
|
|
16
|
+
from helm.common.request import Request, RequestResult
|
|
17
|
+
from helm.proxy.query import Query, QueryResult
|
|
18
|
+
from helm.proxy.services.remote_service import RemoteService
|
|
19
|
+
from helm.proxy.services.service import GeneralInfo, Service
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class RemoteContext(Context):
    """A `Context` that forwards every operation to a `RemoteService`.

    The `Authentication` given at construction is stored and passed along on
    each service call that takes one.
    """

    def __init__(self, base_url: str, auth: Authentication):
        """Create the `RemoteService` for `base_url` and remember `auth`."""
        self.service: Service = RemoteService(base_url)
        self.auth = auth

    # --- Calls that do not pass authentication -----------------------------

    def get_general_info(self) -> GeneralInfo:
        """Return the service's general info."""
        service = self.service
        return service.get_general_info()

    def expand_query(self, query: Query) -> QueryResult:
        """Have the service expand `query`."""
        service = self.service
        return service.expand_query(query)

    def get_cache_config(self, shard_name: str) -> CacheConfig:
        """Return the service's cache config for `shard_name`."""
        service = self.service
        return service.get_cache_config(shard_name)

    # --- Calls that pass the stored authentication --------------------------

    def make_request(self, request: Request) -> RequestResult:
        """Send `request` to the service."""
        service, auth = self.service, self.auth
        return service.make_request(auth, request)

    def tokenize(self, request: TokenizationRequest) -> TokenizationRequestResult:
        """Send the tokenization `request` to the service."""
        service, auth = self.service, self.auth
        return service.tokenize(auth, request)

    def decode(self, request: DecodeRequest) -> DecodeRequestResult:
        """Send the decode `request` to the service."""
        service, auth = self.service, self.auth
        return service.decode(auth, request)

    def upload(self, request: FileUploadRequest) -> FileUploadResult:
        """Send the file upload `request` to the service."""
        service, auth = self.service, self.auth
        return service.upload(auth, request)

    def check_nudity(self, request: NudityCheckRequest) -> NudityCheckResult:
        """Send the nudity check `request` to the service."""
        service, auth = self.service, self.auth
        return service.check_nudity(auth, request)

    def compute_clip_score(self, request: CLIPScoreRequest) -> CLIPScoreResult:
        """Send the CLIP score `request` to the service."""
        service, auth = self.service, self.auth
        return service.compute_clip_score(auth, request)

    def get_toxicity_scores(self, request: PerspectiveAPIRequest) -> PerspectiveAPIRequestResult:
        """Send the toxicity scoring `request` to the service."""
        service, auth = self.service, self.auth
        return service.get_toxicity_scores(auth, request)

    def get_moderation_results(self, request: ModerationAPIRequest) -> ModerationAPIRequestResult:
        """Send the moderation `request` to the service."""
        service, auth = self.service, self.auth
        return service.get_moderation_results(auth, request)

    def make_critique_request(self, request: CritiqueRequest) -> CritiqueRequestResult:
        """Send the critique `request` to the service."""
        service, auth = self.service, self.auth
        return service.make_critique_request(auth, request)
|
helm/common/request.py
CHANGED
|
@@ -131,6 +131,11 @@ class Token:
|
|
|
131
131
|
]
|
|
132
132
|
|
|
133
133
|
|
|
134
|
+
@dataclass(frozen=True)
class Thinking:
    """Immutable holder for an optional piece of thinking text."""

    # The thinking text; None when there is none.
    text: Optional[str] = None
|
|
137
|
+
|
|
138
|
+
|
|
134
139
|
@dataclass(frozen=True)
|
|
135
140
|
class GeneratedOutput:
|
|
136
141
|
"""A `GeneratedOutput` is a single generated output that may contain text or multimodal content."""
|
|
@@ -150,6 +155,9 @@ class GeneratedOutput:
|
|
|
150
155
|
# Could be a sequence made up of multimedia content
|
|
151
156
|
multimodal_content: Optional[MultimediaObject] = None
|
|
152
157
|
|
|
158
|
+
# Could be reasoning
|
|
159
|
+
thinking: Optional[Thinking] = None
|
|
160
|
+
|
|
153
161
|
def __add__(self, other: "GeneratedOutput") -> "GeneratedOutput":
    """Return a new `GeneratedOutput` whose text, logprob and tokens are `self`'s `+` `other`'s.

    NOTE(review): only these three values are passed to the constructor, so
    `multimodal_content` and `thinking` on the result fall back to their
    defaults (None) rather than being carried over from either operand.
    Confirm that this is intended.
    """
    combined_text = self.text + other.text
    combined_logprob = self.logprob + other.logprob
    combined_tokens = self.tokens + other.tokens
    return GeneratedOutput(combined_text, combined_logprob, combined_tokens)
|
|
155
163
|
|