crfm-helm 0.5.0__py3-none-any.whl → 0.5.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/METADATA +19 -5
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/RECORD +121 -76
- helm/benchmark/adaptation/adapter_spec.py +32 -31
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +7 -0
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +2 -0
- helm/benchmark/annotation/air_bench_annotator.py +64 -0
- helm/benchmark/annotation/annotator_factory.py +6 -0
- helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +1 -1
- helm/benchmark/annotation/live_qa_annotator.py +84 -0
- helm/benchmark/annotation/medication_qa_annotator.py +81 -0
- helm/benchmark/augmentations/perturbation.py +17 -1
- helm/benchmark/augmentations/test_perturbation.py +30 -0
- helm/benchmark/augmentations/translate_perturbation.py +1 -0
- helm/benchmark/huggingface_registration.py +16 -6
- helm/benchmark/metrics/air_bench_metrics.py +56 -0
- helm/benchmark/metrics/efficiency_metrics.py +9 -2
- helm/benchmark/metrics/evaluate_reference_metrics.py +16 -0
- helm/benchmark/metrics/fin_qa_metrics.py +60 -0
- helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
- helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
- helm/benchmark/metrics/instruction_following_critique_metrics.py +1 -0
- helm/benchmark/metrics/live_qa_metrics.py +23 -0
- helm/benchmark/metrics/medication_qa_metrics.py +23 -0
- helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
- helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
- helm/benchmark/metrics/unitxt_metrics.py +20 -10
- helm/benchmark/metrics/vision_language/emd_utils.py +4 -0
- helm/benchmark/metrics/vision_language/image_metrics.py +104 -21
- helm/benchmark/model_metadata_registry.py +5 -1
- helm/benchmark/presentation/schema.py +54 -4
- helm/benchmark/presentation/test_schema.py +11 -0
- helm/benchmark/run.py +16 -2
- helm/benchmark/run_expander.py +112 -63
- helm/benchmark/run_spec_factory.py +15 -10
- helm/benchmark/run_specs/air_bench_run_specs.py +40 -0
- helm/benchmark/run_specs/classic_run_specs.py +15 -11
- helm/benchmark/run_specs/decodingtrust_run_specs.py +3 -1
- helm/benchmark/run_specs/experimental_run_specs.py +33 -0
- helm/benchmark/run_specs/finance_run_specs.py +33 -0
- helm/benchmark/run_specs/vlm_run_specs.py +444 -65
- helm/benchmark/scenarios/air_bench_scenario.py +50 -0
- helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +8 -2
- helm/benchmark/scenarios/fin_qa_scenario.py +117 -0
- helm/benchmark/scenarios/legalbench_scenario.py +6 -2
- helm/benchmark/scenarios/math_scenario.py +1 -1
- helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
- helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
- helm/benchmark/scenarios/vision_language/bingo_scenario.py +3 -3
- helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +134 -0
- helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
- helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
- helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +4 -2
- helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +13 -2
- helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +1 -5
- helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +1 -5
- helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +5 -3
- helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
- helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
- helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
- helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
- helm/benchmark/scenarios/vision_language/pairs_scenario.py +247 -0
- helm/benchmark/scenarios/vision_language/unicorn_scenario.py +3 -3
- helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +95 -0
- helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +2 -2
- helm/benchmark/scenarios/vision_language/vqa_scenario.py +4 -2
- helm/benchmark/static/schema_air_bench.yaml +3149 -0
- helm/benchmark/static/schema_classic.yaml +3 -59
- helm/benchmark/static/schema_finance.yaml +143 -0
- helm/benchmark/static/schema_image2structure.yaml +447 -0
- helm/benchmark/static/schema_instruction_following.yaml +3 -52
- helm/benchmark/static/schema_lite.yaml +3 -61
- helm/benchmark/static/schema_medical.yaml +255 -0
- helm/benchmark/static/schema_mmlu.yaml +3 -61
- helm/benchmark/static/schema_tables.yaml +200 -0
- helm/benchmark/static/schema_thai.yaml +223 -0
- helm/benchmark/static/schema_unitxt.yaml +3 -61
- helm/benchmark/static/schema_vhelm.yaml +824 -0
- helm/benchmark/static/schema_vhelm_lite.yaml +109 -0
- helm/benchmark/static_build/assets/air-overview-d2e6c49f.png +0 -0
- helm/benchmark/static_build/assets/index-30dbceba.js +10 -0
- helm/benchmark/static_build/assets/index-66b02d40.css +1 -0
- helm/benchmark/static_build/assets/overview-74aea3d8.png +0 -0
- helm/benchmark/static_build/assets/process-flow-bd2eba96.png +0 -0
- helm/benchmark/static_build/index.html +2 -2
- helm/clients/anthropic_client.py +78 -14
- helm/clients/auto_client.py +11 -0
- helm/clients/client.py +24 -7
- helm/clients/cohere_client.py +98 -3
- helm/clients/huggingface_client.py +71 -12
- helm/clients/openai_client.py +11 -5
- helm/clients/reka_client.py +189 -0
- helm/clients/test_client.py +3 -3
- helm/clients/test_huggingface_client.py +19 -3
- helm/clients/test_together_client.py +72 -2
- helm/clients/together_client.py +199 -2
- helm/clients/vertexai_client.py +117 -64
- helm/clients/vision_language/huggingface_vision2seq_client.py +145 -0
- helm/clients/vision_language/huggingface_vlm_client.py +12 -4
- helm/clients/vision_language/idefics_client.py +2 -2
- helm/clients/vision_language/paligemma_client.py +146 -0
- helm/clients/vision_language/palmyra_vision_client.py +84 -0
- helm/clients/yi_client.py +31 -0
- helm/common/critique_request.py +10 -1
- helm/common/images_utils.py +29 -3
- helm/config/model_deployments.yaml +504 -12
- helm/config/model_metadata.yaml +579 -52
- helm/config/tokenizer_configs.yaml +100 -1
- helm/proxy/critique/model_critique_client.py +32 -4
- helm/proxy/services/server_service.py +1 -1
- helm/tokenizers/auto_tokenizer.py +1 -1
- helm/tokenizers/cohere_tokenizer.py +44 -2
- helm/tokenizers/huggingface_tokenizer.py +36 -13
- helm/tokenizers/test_cohere_tokenizer.py +39 -0
- helm/tokenizers/test_huggingface_tokenizer.py +5 -1
- helm/benchmark/static/schema_vlm.yaml +0 -576
- helm/benchmark/static_build/assets/index-5088afcb.css +0 -1
- helm/benchmark/static_build/assets/index-d839df55.js +0 -9
- helm/benchmark/test_model_deployment_definition.py +0 -90
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/LICENSE +0 -0
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/WHEEL +0 -0
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/top_level.txt +0 -0
helm/clients/vision_language/palmyra_vision_client.py
ADDED
@@ -0,0 +1,84 @@
+from typing import Dict, List
+import json
+
+import requests
+
+from helm.common.cache import CacheConfig
+from helm.common.images_utils import encode_base64
+from helm.common.media_object import TEXT_TYPE
+from helm.common.request import Request, RequestResult, GeneratedOutput
+from helm.common.request import wrap_request_time
+from helm.clients.client import CachingClient, generate_uid_for_multimodal_prompt, truncate_and_tokenize_response_text
+from helm.tokenizers.tokenizer import Tokenizer
+
+
+class PalmyraVisionClient(CachingClient):
+    def __init__(self, tokenizer: Tokenizer, tokenizer_name: str, endpoint: str, cache_config: CacheConfig):
+        super().__init__(cache_config)
+        self.tokenizer: Tokenizer = tokenizer
+        self.tokenizer_name: str = tokenizer_name
+
+        # Currently, the Palmyra Vision model does not have a public API, so we need to use a secret endpoint
+        self.endpoint: str = endpoint
+
+    def make_request(self, request: Request) -> RequestResult:
+        assert request.multimodal_prompt is not None, "Multimodal prompt is required"
+
+        # Build the prompt
+        prompt: List[Dict[str, str]] = []
+        for media_object in request.multimodal_prompt.media_objects:
+            if media_object.is_type("image") and media_object.location:
+                prompt.append(
+                    {
+                        "type": "InlineData",
+                        "value": encode_base64(media_object.location, format="JPEG"),
+                        "contentType": "image/jpeg",
+                    }
+                )
+            elif media_object.is_type(TEXT_TYPE):
+                if media_object.text is None:
+                    raise ValueError("MediaObject of text type has missing text field value")
+                prompt.append({"type": "Text", "value": media_object.text})
+            else:
+                raise ValueError(f"Unrecognized MediaObject type {media_object.type}")
+
+        # Generate
+        try:
+
+            def do_it():
+                response = requests.post(
+                    self.endpoint, headers={"Content-Type": "application/json"}, data=json.dumps({"parts": prompt})
+                )
+                if response.status_code != 200:
+                    curl_command: str = (
+                        f"curl --location '{self.endpoint}' --header 'Content-Type: application/json' "
+                        f"--data '{json.dumps({'parts': prompt})}'"
+                    )
+                    assert False, f"Got status code {response.status_code}. Try {curl_command}"
+
+                json_response = json.loads(response.text)
+                assert (
+                    "choices" in json_response and "errors" not in json_response
+                ), f"Invalid response: {response.text}"
+                return json_response
+
+            cache_key = CachingClient.make_cache_key(
+                raw_request={"prompt": generate_uid_for_multimodal_prompt(request.multimodal_prompt)},
+                request=request,
+            )
+            result, cached = self.cache.get(cache_key, wrap_request_time(do_it))
+        except RuntimeError as ex:
+            return RequestResult(success=False, cached=False, error=str(ex), completions=[], embedding=[])
+
+        # The internal endpoint doesn't support any other parameters, so we have to truncate ourselves
+        completions: List[GeneratedOutput] = [
+            truncate_and_tokenize_response_text(choice["text"], request, self.tokenizer, self.tokenizer_name)
+            for choice in result["choices"]
+        ]
+        return RequestResult(
+            success=True,
+            cached=cached,
+            request_time=result["request_time"],
+            completions=completions,
+            embedding=[],
+        )
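
The new client serializes each multimodal prompt into a "parts" payload before POSTing it to the private endpoint. Below is a minimal sketch (not part of the package) of what that request body looks like for one image plus one text segment; the endpoint URL and image path are hypothetical placeholders.

import json

import requests

from helm.common.images_utils import encode_base64

# Hypothetical values, for illustration only.
endpoint = "https://example.internal/palmyra-vision"
image_path = "cat.jpg"

# Mirrors the "parts" structure built in PalmyraVisionClient.make_request above.
payload = {
    "parts": [
        {
            "type": "InlineData",
            "value": encode_base64(image_path, format="JPEG"),
            "contentType": "image/jpeg",
        },
        {"type": "Text", "value": "Describe this image in one sentence."},
    ]
}

response = requests.post(endpoint, headers={"Content-Type": "application/json"}, data=json.dumps(payload))
print(response.status_code, response.text)
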
helm/clients/yi_client.py
ADDED
@@ -0,0 +1,31 @@
+from typing import Optional
+
+from helm.clients.openai_client import OpenAIClient
+from helm.common.cache import CacheConfig
+from helm.tokenizers.tokenizer import Tokenizer
+
+
+class YiChatClient(OpenAIClient):
+
+    BASE_URL = "http://api.01ww.xyz/v1"
+
+    def __init__(
+        self,
+        tokenizer: Tokenizer,
+        tokenizer_name: str,
+        cache_config: CacheConfig,
+        api_key: Optional[str] = None,
+    ):
+        self.tokenizer = tokenizer
+        self.tokenizer_name = tokenizer_name
+        super().__init__(
+            tokenizer=tokenizer,
+            tokenizer_name=tokenizer_name,
+            cache_config=cache_config,
+            api_key=api_key,
+            org_id=None,
+            base_url=YiChatClient.BASE_URL,
+        )
+
+    def _is_chat_model_engine(self, model_engine: str) -> bool:
+        return True
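
YiChatClient is a thin OpenAI-compatible wrapper: it pins the base URL to the 01.AI endpoint and treats every engine as a chat model. The following is a minimal sketch of driving it through HELM's Request API; the tokenizer, cache config, API key, and model names are assumptions, not taken from the diff.

from helm.clients.yi_client import YiChatClient
from helm.common.request import Request


def run_yi_example(tokenizer, tokenizer_name, cache_config):
    # `tokenizer`, `tokenizer_name`, and `cache_config` are assumed to be a HELM
    # Tokenizer, its name, and a CacheConfig constructed elsewhere.
    client = YiChatClient(
        tokenizer=tokenizer,
        tokenizer_name=tokenizer_name,
        cache_config=cache_config,
        api_key="YOUR_01AI_API_KEY",  # placeholder
    )
    # The model names are assumptions based on the model_metadata.yaml additions in this release.
    request = Request(
        model="01-ai/yi-large",
        model_deployment="01-ai/yi-large",
        prompt="Say hello in one sentence.",
        max_tokens=32,
    )
    return client.make_request(request)
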
helm/common/critique_request.py
CHANGED
@@ -1,5 +1,6 @@
 from dataclasses import dataclass
-from typing import Dict, List, Union
+from typing import Dict, List, Union, Optional
+from helm.common.media_object import MediaObject
 
 
 class QuestionType:
@@ -34,6 +35,11 @@ class CritiqueQuestionTemplate:
 
     Can contain placeholders like {{placeholder}} that will be interpolated using the fields in CritiqueRequest."""
 
+    media_object: Optional[MediaObject] = None
+    """Path of image for multimodal input.
+
+    Image path or URL of the question."""
+
 
 @dataclass(frozen=True)
 class CritiqueTaskTemplate:
@@ -53,6 +59,9 @@ class CritiqueTaskTemplate:
     questions: List[CritiqueQuestionTemplate]
     """List of templates for the questions."""
 
+    max_tokens: Optional[int] = None
+    """Max token to be generated for the free-end generation."""
+
 
 @dataclass(frozen=True)
 class CritiqueRequest:
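
A minimal sketch of the two new optional fields. Only media_object, max_tokens, and questions are taken from the diff above; the remaining field names (name, question_type, text, options, instructions, num_respondents) and the QuestionType constant are assumptions about the surrounding 0.5.x dataclasses.

from helm.common.critique_request import CritiqueQuestionTemplate, CritiqueTaskTemplate, QuestionType
from helm.common.media_object import MediaObject

question = CritiqueQuestionTemplate(
    name="image_quality",
    question_type=QuestionType.FREE_RESPONSE,
    text="How well does the response describe the image?",
    options=[],
    # New in 0.5.2: attach an image (path or URL) to the question.
    media_object=MediaObject(location="figures/example.png", content_type="image/png"),
)

task = CritiqueTaskTemplate(
    name="vlm_originality_critique",
    instructions="Rate the model response shown below.\n{{response}}",
    num_respondents=1,
    questions=[question],
    # New in 0.5.2: cap the length of free-form critique generations.
    max_tokens=200,
)
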
helm/common/images_utils.py
CHANGED
@@ -1,8 +1,10 @@
 import base64
 import io
+import os
+
 import requests
 import shutil
-from typing import List, Optional
+from typing import List, Optional, Tuple
 from urllib.request import urlopen
 
 import numpy as np
@@ -28,6 +30,12 @@ def open_image(image_location: str) -> Image.Image:
     return image.convert("RGB")
 
 
+def get_dimensions(image_location: str) -> Tuple[int, int]:
+    """Returns the dimensions of the image."""
+    image: Image.Image = open_image(image_location)
+    return image.size
+
+
 def encode_base64(image_location: str, format="JPEG") -> str:
     """Returns the base64 representation of an image file."""
     image_file = io.BytesIO()
@@ -36,7 +44,7 @@ def encode_base64(image_location: str, format="JPEG") -> str:
     return base64.b64encode(image_file.getvalue()).decode("ascii")
 
 
-def copy_image(src: str, dest: str, width: Optional[int] = None, height: Optional[int] = None):
+def copy_image(src: str, dest: str, width: Optional[int] = None, height: Optional[int] = None) -> None:
     """
     Copies the image file from `src` path to `dest` path. If dimensions `width` and `height`
     are specified, resizes the image before copying. `src` can be a URL.
@@ -44,12 +52,30 @@ def copy_image(src: str, dest: str, width: Optional[int] = None, height: Optiona
     if (width is not None and height is not None) or is_url(src):
         image = open_image(src)
         if width is not None and height is not None:
-            image = image.resize((width, height), Image.
+            image = image.resize((width, height), Image.Resampling.LANCZOS)
         image.save(dest)
     else:
        shutil.copy(src, dest)
 
 
+def resize_image_to_max_file_size(src: str, dest: str, max_size_in_bytes: int, step=10):
+    # Open an image file
+    with Image.open(src) as img:
+        width, height = img.size
+
+        # Reduce dimensions iteratively until the file size is under the limit
+        while True:
+            # Save the image temporarily to check the file size
+            img.save(dest, quality=95)  # Start with high quality
+            if os.path.getsize(dest) < max_size_in_bytes:
+                break
+
+            # Reduce dimensions
+            width -= step
+            height -= step
+            img = img.resize((width, height), Image.Resampling.LANCZOS)
+
+
 def is_blacked_out_image(image_location: str) -> bool:
     """Returns True if the image is all black. False otherwise."""
     try:
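
The two helpers added above are self-contained, so a short usage sketch follows; the file paths and size limit are hypothetical placeholders, not from the package.

from helm.common.images_utils import get_dimensions, resize_image_to_max_file_size

# Report the dimensions of a local image.
width, height = get_dimensions("photo.jpg")
print(f"Original dimensions: {width}x{height}")

# Shrink the image in 10-pixel steps until the saved JPEG is under ~1 MB.
resize_image_to_max_file_size("photo.jpg", "photo_small.jpg", max_size_in_bytes=1_000_000)
print("Resized dimensions:", get_dimensions("photo_small.jpg"))
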