crfm-helm 0.5.1__py3-none-any.whl → 0.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crfm-helm might be problematic. Click here for more details.

Files changed (98) hide show
  1. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.2.dist-info}/METADATA +13 -3
  2. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.2.dist-info}/RECORD +96 -63
  3. helm/benchmark/adaptation/adapter_spec.py +32 -31
  4. helm/benchmark/annotation/air_bench_annotator.py +64 -0
  5. helm/benchmark/annotation/annotator_factory.py +6 -0
  6. helm/benchmark/annotation/live_qa_annotator.py +84 -0
  7. helm/benchmark/annotation/medication_qa_annotator.py +81 -0
  8. helm/benchmark/augmentations/translate_perturbation.py +1 -0
  9. helm/benchmark/huggingface_registration.py +16 -6
  10. helm/benchmark/metrics/air_bench_metrics.py +56 -0
  11. helm/benchmark/metrics/fin_qa_metrics.py +60 -0
  12. helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
  13. helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
  14. helm/benchmark/metrics/instruction_following_critique_metrics.py +1 -0
  15. helm/benchmark/metrics/live_qa_metrics.py +23 -0
  16. helm/benchmark/metrics/medication_qa_metrics.py +23 -0
  17. helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
  18. helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
  19. helm/benchmark/metrics/unitxt_metrics.py +20 -10
  20. helm/benchmark/metrics/vision_language/emd_utils.py +4 -0
  21. helm/benchmark/metrics/vision_language/image_metrics.py +29 -71
  22. helm/benchmark/presentation/schema.py +54 -4
  23. helm/benchmark/presentation/test_schema.py +11 -0
  24. helm/benchmark/run.py +16 -2
  25. helm/benchmark/run_expander.py +77 -0
  26. helm/benchmark/run_spec_factory.py +4 -0
  27. helm/benchmark/run_specs/air_bench_run_specs.py +40 -0
  28. helm/benchmark/run_specs/classic_run_specs.py +15 -11
  29. helm/benchmark/run_specs/decodingtrust_run_specs.py +3 -1
  30. helm/benchmark/run_specs/experimental_run_specs.py +33 -0
  31. helm/benchmark/run_specs/finance_run_specs.py +33 -0
  32. helm/benchmark/run_specs/vlm_run_specs.py +168 -45
  33. helm/benchmark/scenarios/air_bench_scenario.py +50 -0
  34. helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
  35. helm/benchmark/scenarios/entity_data_imputation_scenario.py +8 -2
  36. helm/benchmark/scenarios/fin_qa_scenario.py +117 -0
  37. helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
  38. helm/benchmark/scenarios/vision_language/bingo_scenario.py +3 -3
  39. helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +13 -2
  40. helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +1 -5
  41. helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +0 -4
  42. helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +4 -2
  43. helm/benchmark/scenarios/vision_language/pairs_scenario.py +6 -5
  44. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +3 -3
  45. helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +95 -0
  46. helm/benchmark/static/schema_air_bench.yaml +3149 -0
  47. helm/benchmark/static/schema_classic.yaml +3 -59
  48. helm/benchmark/static/schema_finance.yaml +143 -0
  49. helm/benchmark/static/schema_image2structure.yaml +254 -111
  50. helm/benchmark/static/schema_instruction_following.yaml +3 -52
  51. helm/benchmark/static/schema_lite.yaml +3 -61
  52. helm/benchmark/static/schema_medical.yaml +255 -0
  53. helm/benchmark/static/schema_mmlu.yaml +3 -61
  54. helm/benchmark/static/schema_tables.yaml +200 -0
  55. helm/benchmark/static/schema_thai.yaml +223 -0
  56. helm/benchmark/static/schema_unitxt.yaml +3 -61
  57. helm/benchmark/static/{schema_vlm.yaml → schema_vhelm.yaml} +294 -293
  58. helm/benchmark/static/schema_vhelm_lite.yaml +4 -59
  59. helm/benchmark/static_build/assets/air-overview-d2e6c49f.png +0 -0
  60. helm/benchmark/static_build/assets/index-30dbceba.js +10 -0
  61. helm/benchmark/static_build/assets/index-66b02d40.css +1 -0
  62. helm/benchmark/static_build/assets/overview-74aea3d8.png +0 -0
  63. helm/benchmark/static_build/assets/process-flow-bd2eba96.png +0 -0
  64. helm/benchmark/static_build/index.html +2 -2
  65. helm/clients/anthropic_client.py +43 -9
  66. helm/clients/auto_client.py +11 -0
  67. helm/clients/client.py +24 -7
  68. helm/clients/cohere_client.py +98 -3
  69. helm/clients/huggingface_client.py +71 -12
  70. helm/clients/openai_client.py +9 -2
  71. helm/clients/reka_client.py +189 -0
  72. helm/clients/test_client.py +3 -3
  73. helm/clients/test_huggingface_client.py +19 -3
  74. helm/clients/test_together_client.py +72 -2
  75. helm/clients/together_client.py +129 -23
  76. helm/clients/vertexai_client.py +62 -18
  77. helm/clients/vision_language/huggingface_vlm_client.py +1 -0
  78. helm/clients/vision_language/paligemma_client.py +146 -0
  79. helm/clients/vision_language/palmyra_vision_client.py +84 -0
  80. helm/clients/yi_client.py +31 -0
  81. helm/common/critique_request.py +10 -1
  82. helm/common/images_utils.py +19 -0
  83. helm/config/model_deployments.yaml +412 -18
  84. helm/config/model_metadata.yaml +447 -25
  85. helm/config/tokenizer_configs.yaml +93 -1
  86. helm/proxy/critique/model_critique_client.py +32 -4
  87. helm/proxy/services/server_service.py +1 -1
  88. helm/tokenizers/auto_tokenizer.py +1 -1
  89. helm/tokenizers/cohere_tokenizer.py +44 -2
  90. helm/tokenizers/huggingface_tokenizer.py +36 -13
  91. helm/tokenizers/test_cohere_tokenizer.py +39 -0
  92. helm/tokenizers/test_huggingface_tokenizer.py +5 -1
  93. helm/benchmark/static_build/assets/index-737eef9e.js +0 -10
  94. helm/benchmark/static_build/assets/index-878a1094.css +0 -1
  95. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.2.dist-info}/LICENSE +0 -0
  96. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.2.dist-info}/WHEEL +0 -0
  97. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.2.dist-info}/entry_points.txt +0 -0
  98. {crfm_helm-0.5.1.dist-info → crfm_helm-0.5.2.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,84 @@
1
+ from typing import Dict, List
2
+ import json
3
+
4
+ import requests
5
+
6
+ from helm.common.cache import CacheConfig
7
+ from helm.common.images_utils import encode_base64
8
+ from helm.common.media_object import TEXT_TYPE
9
+ from helm.common.request import Request, RequestResult, GeneratedOutput
10
+ from helm.common.request import wrap_request_time
11
+ from helm.clients.client import CachingClient, generate_uid_for_multimodal_prompt, truncate_and_tokenize_response_text
12
+ from helm.tokenizers.tokenizer import Tokenizer
13
+
14
+
15
class PalmyraVisionClient(CachingClient):
    """Client that sends multimodal (image + text) prompts to a Palmyra Vision HTTP endpoint.

    Responses are looked up in / stored to ``self.cache``; the cache key is built from the
    request together with a UID derived from the multimodal prompt.
    """

    def __init__(self, tokenizer: Tokenizer, tokenizer_name: str, endpoint: str, cache_config: CacheConfig):
        """Store the tokenizer (used to truncate responses) and the endpoint URL to POST to."""
        super().__init__(cache_config)
        self.tokenizer: Tokenizer = tokenizer
        self.tokenizer_name: str = tokenizer_name

        # Currently, the Palmyra Vision model does not have a public API, so we need to use a secret endpoint
        self.endpoint: str = endpoint

    def make_request(self, request: Request) -> RequestResult:
        """POST the request's multimodal prompt to the endpoint and wrap the reply.

        Returns:
            A successful ``RequestResult`` with one completion per entry of the response's
            ``"choices"`` list, or a failed ``RequestResult`` if a ``RuntimeError`` is raised
            while fetching the response.

        Raises:
            AssertionError: if the request has no multimodal prompt, the endpoint answers with a
                non-200 status code, or the response JSON lacks ``"choices"`` / contains ``"errors"``.
            ValueError: if a media object is a text object without text, or is neither a text
                object nor an image object with a location.
        """
        # Explicit raises instead of `assert` statements: asserts are stripped under `python -O`,
        # which would silently disable these checks. AssertionError is kept so callers observe
        # the same exception type as before.
        if request.multimodal_prompt is None:
            raise AssertionError("Multimodal prompt is required")

        # Build the prompt
        prompt: List[Dict[str, str]] = []
        for media_object in request.multimodal_prompt.media_objects:
            if media_object.is_type("image") and media_object.location:
                prompt.append(
                    {
                        "type": "InlineData",
                        "value": encode_base64(media_object.location, format="JPEG"),
                        "contentType": "image/jpeg",
                    }
                )
            elif media_object.is_type(TEXT_TYPE):
                if media_object.text is None:
                    raise ValueError("MediaObject of text type has missing text field value")
                prompt.append({"type": "Text", "value": media_object.text})
            else:
                raise ValueError(f"Unrecognized MediaObject type {media_object.type}")

        # Generate
        try:

            def do_it():
                # Serialize once; the same payload is sent and, on failure, echoed in the repro command.
                payload: str = json.dumps({"parts": prompt})
                response = requests.post(self.endpoint, headers={"Content-Type": "application/json"}, data=payload)
                if response.status_code != 200:
                    curl_command: str = (
                        f"curl --location '{self.endpoint}' --header 'Content-Type: application/json' "
                        f"--data '{payload}'"
                    )
                    raise AssertionError(f"Got status code {response.status_code}. Try {curl_command}")

                json_response = json.loads(response.text)
                if "choices" not in json_response or "errors" in json_response:
                    raise AssertionError(f"Invalid response: {response.text}")
                return json_response

            cache_key = CachingClient.make_cache_key(
                raw_request={"prompt": generate_uid_for_multimodal_prompt(request.multimodal_prompt)},
                request=request,
            )
            result, cached = self.cache.get(cache_key, wrap_request_time(do_it))
        except RuntimeError as ex:
            return RequestResult(success=False, cached=False, error=str(ex), completions=[], embedding=[])

        # The internal endpoint doesn't support any other parameters, so we have to truncate ourselves
        completions: List[GeneratedOutput] = [
            truncate_and_tokenize_response_text(choice["text"], request, self.tokenizer, self.tokenizer_name)
            for choice in result["choices"]
        ]
        return RequestResult(
            success=True,
            cached=cached,
            # NOTE(review): "request_time" is presumably added by wrap_request_time — confirm.
            request_time=result["request_time"],
            completions=completions,
            embedding=[],
        )
@@ -0,0 +1,31 @@
1
+ from typing import Optional
2
+
3
+ from helm.clients.openai_client import OpenAIClient
4
+ from helm.common.cache import CacheConfig
5
+ from helm.tokenizers.tokenizer import Tokenizer
6
+
7
+
8
class YiChatClient(OpenAIClient):
    """``OpenAIClient`` subclass that talks to the endpoint at ``BASE_URL``.

    Every model engine is treated as a chat model (see ``_is_chat_model_engine``).
    """

    # Passed to ``OpenAIClient.__init__`` as ``base_url``.
    BASE_URL = "http://api.01ww.xyz/v1"

    def __init__(
        self,
        tokenizer: Tokenizer,
        tokenizer_name: str,
        cache_config: CacheConfig,
        api_key: Optional[str] = None,
    ):
        """Record the tokenizer settings, then initialise the parent client against ``BASE_URL``.

        No organisation id is supplied (``org_id=None``).
        """
        # Set on the instance before delegating to the parent initialiser.
        self.tokenizer = tokenizer
        self.tokenizer_name = tokenizer_name
        super().__init__(
            base_url=YiChatClient.BASE_URL,
            org_id=None,
            api_key=api_key,
            cache_config=cache_config,
            tokenizer_name=tokenizer_name,
            tokenizer=tokenizer,
        )

    def _is_chat_model_engine(self, model_engine: str) -> bool:
        """Report every engine as a chat model, regardless of ``model_engine``."""
        return True
@@ -1,5 +1,6 @@
1
1
  from dataclasses import dataclass
2
- from typing import Dict, List, Union
2
+ from typing import Dict, List, Union, Optional
3
+ from helm.common.media_object import MediaObject
3
4
 
4
5
 
5
6
  class QuestionType:
@@ -34,6 +35,11 @@ class CritiqueQuestionTemplate:
34
35
 
35
36
  Can contain placeholders like {{placeholder}} that will be interpolated using the fields in CritiqueRequest."""
36
37
 
38
+ media_object: Optional[MediaObject] = None
39
+ """Path of image for multimodal input.
40
+
41
+ Image path or URL of the question."""
42
+
37
43
 
38
44
  @dataclass(frozen=True)
39
45
  class CritiqueTaskTemplate:
@@ -53,6 +59,9 @@ class CritiqueTaskTemplate:
53
59
  questions: List[CritiqueQuestionTemplate]
54
60
  """List of templates for the questions."""
55
61
 
62
+ max_tokens: Optional[int] = None
63
+ """Max token to be generated for the free-end generation."""
64
+
56
65
 
57
66
  @dataclass(frozen=True)
58
67
  class CritiqueRequest:
@@ -1,5 +1,6 @@
1
1
  import base64
2
2
  import io
3
+ import os
3
4
 
4
5
  import requests
5
6
  import shutil
@@ -57,6 +58,24 @@ def copy_image(src: str, dest: str, width: Optional[int] = None, height: Optiona
57
58
  shutil.copy(src, dest)
58
59
 
59
60
 
61
def resize_image_to_max_file_size(src: str, dest: str, max_size_in_bytes: int, step: int = 10) -> None:
    """Save ``src`` to ``dest``, shrinking it until the saved file is smaller than ``max_size_in_bytes``.

    The image is saved at quality 95. While the file on disk is not strictly smaller than the
    limit, both width and height are reduced by ``step`` pixels, the image is resampled with
    LANCZOS, and it is saved again.

    Args:
        src: Path of the image to read.
        dest: Path the (possibly resized) image is written to; overwritten on every attempt.
        max_size_in_bytes: Exclusive upper bound for the size of ``dest``.
        step: Number of pixels removed from each dimension per attempt; must be positive.

    Raises:
        ValueError: if ``step`` is not positive, or if the image cannot be shrunk below the limit
            before one of its dimensions reaches zero. In the latter case ``dest`` still holds the
            last, oversized attempt.
    """
    # A non-positive step would never shrink the image, so the loop below could not terminate.
    if step <= 0:
        raise ValueError(f"step must be a positive number of pixels, got {step}")

    with Image.open(src) as img:
        width, height = img.size
        current = img

        # Reduce dimensions iteratively until the file size is under the limit
        while True:
            # Save to the destination so the resulting file size can be checked (quality is fixed at 95)
            current.save(dest, quality=95)
            if os.path.getsize(dest) < max_size_in_bytes:
                break

            # Reduce dimensions
            width -= step
            height -= step
            if width <= 0 or height <= 0:
                # Fail with a clear message instead of handing non-positive dimensions to resize().
                raise ValueError(
                    f"Cannot shrink {src} below {max_size_in_bytes} bytes: image dimensions reached zero"
                )
            current = current.resize((width, height), Image.Resampling.LANCZOS)
77
+
78
+
60
79
  def is_blacked_out_image(image_location: str) -> bool:
61
80
  """Returns True if the image is all black. False otherwise."""
62
81
  try: