crfm-helm 0.5.0__py3-none-any.whl → 0.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crfm-helm might be problematic. (The original page links to more details here.)

Files changed (125)
  1. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/METADATA +19 -5
  2. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/RECORD +121 -76
  3. helm/benchmark/adaptation/adapter_spec.py +32 -31
  4. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
  5. helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +7 -0
  6. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +2 -0
  7. helm/benchmark/annotation/air_bench_annotator.py +64 -0
  8. helm/benchmark/annotation/annotator_factory.py +6 -0
  9. helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +1 -1
  10. helm/benchmark/annotation/live_qa_annotator.py +84 -0
  11. helm/benchmark/annotation/medication_qa_annotator.py +81 -0
  12. helm/benchmark/augmentations/perturbation.py +17 -1
  13. helm/benchmark/augmentations/test_perturbation.py +30 -0
  14. helm/benchmark/augmentations/translate_perturbation.py +1 -0
  15. helm/benchmark/huggingface_registration.py +16 -6
  16. helm/benchmark/metrics/air_bench_metrics.py +56 -0
  17. helm/benchmark/metrics/efficiency_metrics.py +9 -2
  18. helm/benchmark/metrics/evaluate_reference_metrics.py +16 -0
  19. helm/benchmark/metrics/fin_qa_metrics.py +60 -0
  20. helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
  21. helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
  22. helm/benchmark/metrics/instruction_following_critique_metrics.py +1 -0
  23. helm/benchmark/metrics/live_qa_metrics.py +23 -0
  24. helm/benchmark/metrics/medication_qa_metrics.py +23 -0
  25. helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
  26. helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
  27. helm/benchmark/metrics/unitxt_metrics.py +20 -10
  28. helm/benchmark/metrics/vision_language/emd_utils.py +4 -0
  29. helm/benchmark/metrics/vision_language/image_metrics.py +104 -21
  30. helm/benchmark/model_metadata_registry.py +5 -1
  31. helm/benchmark/presentation/schema.py +54 -4
  32. helm/benchmark/presentation/test_schema.py +11 -0
  33. helm/benchmark/run.py +16 -2
  34. helm/benchmark/run_expander.py +112 -63
  35. helm/benchmark/run_spec_factory.py +15 -10
  36. helm/benchmark/run_specs/air_bench_run_specs.py +40 -0
  37. helm/benchmark/run_specs/classic_run_specs.py +15 -11
  38. helm/benchmark/run_specs/decodingtrust_run_specs.py +3 -1
  39. helm/benchmark/run_specs/experimental_run_specs.py +33 -0
  40. helm/benchmark/run_specs/finance_run_specs.py +33 -0
  41. helm/benchmark/run_specs/vlm_run_specs.py +444 -65
  42. helm/benchmark/scenarios/air_bench_scenario.py +50 -0
  43. helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
  44. helm/benchmark/scenarios/entity_data_imputation_scenario.py +8 -2
  45. helm/benchmark/scenarios/fin_qa_scenario.py +117 -0
  46. helm/benchmark/scenarios/legalbench_scenario.py +6 -2
  47. helm/benchmark/scenarios/math_scenario.py +1 -1
  48. helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
  49. helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
  50. helm/benchmark/scenarios/vision_language/bingo_scenario.py +3 -3
  51. helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +134 -0
  52. helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
  53. helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
  54. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +4 -2
  55. helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +13 -2
  56. helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +1 -5
  57. helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +1 -5
  58. helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +5 -3
  59. helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
  60. helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
  61. helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
  62. helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
  63. helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
  64. helm/benchmark/scenarios/vision_language/pairs_scenario.py +247 -0
  65. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +3 -3
  66. helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +95 -0
  67. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +2 -2
  68. helm/benchmark/scenarios/vision_language/vqa_scenario.py +4 -2
  69. helm/benchmark/static/schema_air_bench.yaml +3149 -0
  70. helm/benchmark/static/schema_classic.yaml +3 -59
  71. helm/benchmark/static/schema_finance.yaml +143 -0
  72. helm/benchmark/static/schema_image2structure.yaml +447 -0
  73. helm/benchmark/static/schema_instruction_following.yaml +3 -52
  74. helm/benchmark/static/schema_lite.yaml +3 -61
  75. helm/benchmark/static/schema_medical.yaml +255 -0
  76. helm/benchmark/static/schema_mmlu.yaml +3 -61
  77. helm/benchmark/static/schema_tables.yaml +200 -0
  78. helm/benchmark/static/schema_thai.yaml +223 -0
  79. helm/benchmark/static/schema_unitxt.yaml +3 -61
  80. helm/benchmark/static/schema_vhelm.yaml +824 -0
  81. helm/benchmark/static/schema_vhelm_lite.yaml +109 -0
  82. helm/benchmark/static_build/assets/air-overview-d2e6c49f.png +0 -0
  83. helm/benchmark/static_build/assets/index-30dbceba.js +10 -0
  84. helm/benchmark/static_build/assets/index-66b02d40.css +1 -0
  85. helm/benchmark/static_build/assets/overview-74aea3d8.png +0 -0
  86. helm/benchmark/static_build/assets/process-flow-bd2eba96.png +0 -0
  87. helm/benchmark/static_build/index.html +2 -2
  88. helm/clients/anthropic_client.py +78 -14
  89. helm/clients/auto_client.py +11 -0
  90. helm/clients/client.py +24 -7
  91. helm/clients/cohere_client.py +98 -3
  92. helm/clients/huggingface_client.py +71 -12
  93. helm/clients/openai_client.py +11 -5
  94. helm/clients/reka_client.py +189 -0
  95. helm/clients/test_client.py +3 -3
  96. helm/clients/test_huggingface_client.py +19 -3
  97. helm/clients/test_together_client.py +72 -2
  98. helm/clients/together_client.py +199 -2
  99. helm/clients/vertexai_client.py +117 -64
  100. helm/clients/vision_language/huggingface_vision2seq_client.py +145 -0
  101. helm/clients/vision_language/huggingface_vlm_client.py +12 -4
  102. helm/clients/vision_language/idefics_client.py +2 -2
  103. helm/clients/vision_language/paligemma_client.py +146 -0
  104. helm/clients/vision_language/palmyra_vision_client.py +84 -0
  105. helm/clients/yi_client.py +31 -0
  106. helm/common/critique_request.py +10 -1
  107. helm/common/images_utils.py +29 -3
  108. helm/config/model_deployments.yaml +504 -12
  109. helm/config/model_metadata.yaml +579 -52
  110. helm/config/tokenizer_configs.yaml +100 -1
  111. helm/proxy/critique/model_critique_client.py +32 -4
  112. helm/proxy/services/server_service.py +1 -1
  113. helm/tokenizers/auto_tokenizer.py +1 -1
  114. helm/tokenizers/cohere_tokenizer.py +44 -2
  115. helm/tokenizers/huggingface_tokenizer.py +36 -13
  116. helm/tokenizers/test_cohere_tokenizer.py +39 -0
  117. helm/tokenizers/test_huggingface_tokenizer.py +5 -1
  118. helm/benchmark/static/schema_vlm.yaml +0 -576
  119. helm/benchmark/static_build/assets/index-5088afcb.css +0 -1
  120. helm/benchmark/static_build/assets/index-d839df55.js +0 -9
  121. helm/benchmark/test_model_deployment_definition.py +0 -90
  122. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/LICENSE +0 -0
  123. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/WHEEL +0 -0
  124. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/entry_points.txt +0 -0
  125. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,84 @@
1
+ from typing import Dict, List
2
+ import json
3
+
4
+ import requests
5
+
6
+ from helm.common.cache import CacheConfig
7
+ from helm.common.images_utils import encode_base64
8
+ from helm.common.media_object import TEXT_TYPE
9
+ from helm.common.request import Request, RequestResult, GeneratedOutput
10
+ from helm.common.request import wrap_request_time
11
+ from helm.clients.client import CachingClient, generate_uid_for_multimodal_prompt, truncate_and_tokenize_response_text
12
+ from helm.tokenizers.tokenizer import Tokenizer
13
+
14
+
15
class PalmyraVisionClient(CachingClient):
    """Client that sends multimodal prompts to a Palmyra Vision HTTP endpoint.

    The prompt's media objects are converted into a list of "parts" (inline
    base64 JPEG data for images, plain text for text), POSTed as JSON to
    `endpoint`, and the raw JSON response is cached.
    """

    def __init__(self, tokenizer: Tokenizer, tokenizer_name: str, endpoint: str, cache_config: CacheConfig):
        """Store the tokenizer (used to truncate/tokenize responses) and the endpoint URL."""
        super().__init__(cache_config)
        self.tokenizer: Tokenizer = tokenizer
        self.tokenizer_name: str = tokenizer_name

        # Currently, the Palmyra Vision model does not have a public API, so we need to use a secret endpoint
        self.endpoint: str = endpoint

    def make_request(self, request: Request) -> RequestResult:
        """Send `request.multimodal_prompt` to the endpoint and wrap the reply.

        Returns a successful `RequestResult` with one completion per entry in
        the response's "choices", or a failed `RequestResult` when the endpoint
        returns a non-200 status or a malformed/error payload.

        Raises:
            ValueError: if a text media object has no text, or a media object
                is neither an image with a location nor text.
        """
        assert request.multimodal_prompt is not None, "Multimodal prompt is required"

        # Build the prompt
        prompt: List[Dict[str, str]] = []
        for media_object in request.multimodal_prompt.media_objects:
            if media_object.is_type("image") and media_object.location:
                prompt.append(
                    {
                        "type": "InlineData",
                        "value": encode_base64(media_object.location, format="JPEG"),
                        "contentType": "image/jpeg",
                    }
                )
            elif media_object.is_type(TEXT_TYPE):
                if media_object.text is None:
                    raise ValueError("MediaObject of text type has missing text field value")
                prompt.append({"type": "Text", "value": media_object.text})
            else:
                raise ValueError(f"Unrecognized MediaObject type {media_object.type}")

        # Serialize once; the same payload is sent and echoed in the error message.
        payload: str = json.dumps({"parts": prompt})

        # Generate
        try:

            def do_it():
                response = requests.post(self.endpoint, headers={"Content-Type": "application/json"}, data=payload)
                # Raise instead of assert: asserts are stripped under `python -O`, which would let a
                # failed response be cached, and AssertionError is not caught by the handler below.
                if response.status_code != 200:
                    curl_command: str = (
                        f"curl --location '{self.endpoint}' --header 'Content-Type: application/json' "
                        f"--data '{payload}'"
                    )
                    raise RuntimeError(f"Got status code {response.status_code}. Try {curl_command}")

                json_response = json.loads(response.text)
                if "choices" not in json_response or "errors" in json_response:
                    raise RuntimeError(f"Invalid response: {response.text}")
                return json_response

            # The cache key uses a UID of the prompt rather than the (large) base64 payload.
            cache_key = CachingClient.make_cache_key(
                raw_request={"prompt": generate_uid_for_multimodal_prompt(request.multimodal_prompt)},
                request=request,
            )
            result, cached = self.cache.get(cache_key, wrap_request_time(do_it))
        except RuntimeError as ex:
            return RequestResult(success=False, cached=False, error=str(ex), completions=[], embedding=[])

        # The internal endpoint doesn't support any other parameters, so we have to truncate ourselves
        completions: List[GeneratedOutput] = [
            truncate_and_tokenize_response_text(choice["text"], request, self.tokenizer, self.tokenizer_name)
            for choice in result["choices"]
        ]
        return RequestResult(
            success=True,
            cached=cached,
            request_time=result["request_time"],
            completions=completions,
            embedding=[],
        )
@@ -0,0 +1,31 @@
1
+ from typing import Optional
2
+
3
+ from helm.clients.openai_client import OpenAIClient
4
+ from helm.common.cache import CacheConfig
5
+ from helm.tokenizers.tokenizer import Tokenizer
6
+
7
+
8
class YiChatClient(OpenAIClient):
    """Chat client for Yi models that reuses `OpenAIClient` against `BASE_URL`.

    NOTE(review): all request handling is inherited from `OpenAIClient`, so the
    endpoint presumably exposes an OpenAI-compatible API - confirm with the provider.
    """

    # Passed to `OpenAIClient.__init__` as `base_url`.
    # NOTE(review): this is a plain `http://` URL - confirm whether an HTTPS endpoint is available.
    BASE_URL = "http://api.01ww.xyz/v1"

    def __init__(
        self,
        tokenizer: Tokenizer,
        tokenizer_name: str,
        cache_config: CacheConfig,
        api_key: Optional[str] = None,
    ):
        # NOTE(review): the same values are also forwarded to the parent constructor below;
        # these assignments may be redundant - verify against `OpenAIClient.__init__`.
        self.tokenizer = tokenizer
        self.tokenizer_name = tokenizer_name
        super().__init__(
            tokenizer=tokenizer,
            tokenizer_name=tokenizer_name,
            cache_config=cache_config,
            api_key=api_key,
            org_id=None,
            base_url=YiChatClient.BASE_URL,
        )

    def _is_chat_model_engine(self, model_engine: str) -> bool:
        # Every engine served by this client is treated as a chat model, regardless of name.
        return True
@@ -1,5 +1,6 @@
1
1
  from dataclasses import dataclass
2
- from typing import Dict, List, Union
2
+ from typing import Dict, List, Union, Optional
3
+ from helm.common.media_object import MediaObject
3
4
 
4
5
 
5
6
  class QuestionType:
@@ -34,6 +35,11 @@ class CritiqueQuestionTemplate:
34
35
 
35
36
  Can contain placeholders like {{placeholder}} that will be interpolated using the fields in CritiqueRequest."""
36
37
 
38
+ media_object: Optional[MediaObject] = None
39
+ """Path of image for multimodal input.
40
+
41
+ Image path or URL of the question."""
42
+
37
43
 
38
44
  @dataclass(frozen=True)
39
45
  class CritiqueTaskTemplate:
@@ -53,6 +59,9 @@ class CritiqueTaskTemplate:
53
59
  questions: List[CritiqueQuestionTemplate]
54
60
  """List of templates for the questions."""
55
61
 
62
+ max_tokens: Optional[int] = None
63
+ """Max token to be generated for the free-end generation."""
64
+
56
65
 
57
66
  @dataclass(frozen=True)
58
67
  class CritiqueRequest:
@@ -1,8 +1,10 @@
1
1
  import base64
2
2
  import io
3
+ import os
4
+
3
5
  import requests
4
6
  import shutil
5
- from typing import List, Optional
7
+ from typing import List, Optional, Tuple
6
8
  from urllib.request import urlopen
7
9
 
8
10
  import numpy as np
@@ -28,6 +30,12 @@ def open_image(image_location: str) -> Image.Image:
28
30
  return image.convert("RGB")
29
31
 
30
32
 
33
def get_dimensions(image_location: str) -> Tuple[int, int]:
    """Open the image at `image_location` and return its `size` as (width, height)."""
    return open_image(image_location).size
37
+
38
+
31
39
  def encode_base64(image_location: str, format="JPEG") -> str:
32
40
  """Returns the base64 representation of an image file."""
33
41
  image_file = io.BytesIO()
@@ -36,7 +44,7 @@ def encode_base64(image_location: str, format="JPEG") -> str:
36
44
  return base64.b64encode(image_file.getvalue()).decode("ascii")
37
45
 
38
46
 
39
- def copy_image(src: str, dest: str, width: Optional[int] = None, height: Optional[int] = None):
47
+ def copy_image(src: str, dest: str, width: Optional[int] = None, height: Optional[int] = None) -> None:
40
48
  """
41
49
  Copies the image file from `src` path to `dest` path. If dimensions `width` and `height`
42
50
  are specified, resizes the image before copying. `src` can be a URL.
@@ -44,12 +52,30 @@ def copy_image(src: str, dest: str, width: Optional[int] = None, height: Optiona
44
52
  if (width is not None and height is not None) or is_url(src):
45
53
  image = open_image(src)
46
54
  if width is not None and height is not None:
47
- image = image.resize((width, height), Image.ANTIALIAS)
55
+ image = image.resize((width, height), Image.Resampling.LANCZOS)
48
56
  image.save(dest)
49
57
  else:
50
58
  shutil.copy(src, dest)
51
59
 
52
60
 
61
def resize_image_to_max_file_size(src: str, dest: str, max_size_in_bytes: int, step=10):
    """Write `src` to `dest`, shrinking it until the file is smaller than `max_size_in_bytes`.

    The image is saved to `dest` and the file size checked; while the file is
    not strictly smaller than the limit, both width and height are reduced by
    `step` pixels and the image is saved again.

    Args:
        src: Path of the image to read.
        dest: Path the (possibly shrunk) image is written to; overwritten on every attempt.
        max_size_in_bytes: Exclusive upper bound on the size of the file at `dest`.
        step: Number of pixels removed from each dimension per attempt. Must be positive.

    Raises:
        ValueError: if `step` is not positive, or if the image cannot be shrunk
            below the limit before one of its dimensions reaches zero.
    """
    # A zero step would loop forever and a negative one would grow the image.
    if step <= 0:
        raise ValueError(f"step must be a positive number of pixels, got {step}")

    # Open an image file
    with Image.open(src) as original:
        width, height = original.size
        current = original

        # Reduce dimensions iteratively until the file size is under the limit
        while True:
            # Save the image temporarily to check the file size
            current.save(dest, quality=95)  # Start with high quality
            if os.path.getsize(dest) < max_size_in_bytes:
                break

            # Reduce dimensions
            # NOTE: the same step is subtracted from both sides, so the aspect ratio drifts slightly.
            width -= step
            height -= step
            if width <= 0 or height <= 0:
                # Stop with a clear error rather than asking for a non-positive size.
                raise ValueError(
                    f"Cannot shrink {src} below {max_size_in_bytes} bytes: image dimensions reached zero"
                )
            current = current.resize((width, height), Image.Resampling.LANCZOS)
77
+
78
+
53
79
  def is_blacked_out_image(image_location: str) -> bool:
54
80
  """Returns True if the image is all black. False otherwise."""
55
81
  try: