crfm-helm 0.5.6__py3-none-any.whl → 0.5.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crfm-helm might be problematic.

Files changed (103)
  1. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/METADATA +56 -49
  2. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/RECORD +99 -66
  3. helm/benchmark/annotation/air_bench_annotator.py +1 -1
  4. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  5. helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
  6. helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
  7. helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
  8. helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
  9. helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
  10. helm/benchmark/metrics/comet_metric.py +1 -1
  11. helm/benchmark/metrics/copyright_metrics.py +1 -1
  12. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
  13. helm/benchmark/metrics/evaluate_reference_metrics.py +1 -1
  14. helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
  15. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
  16. helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
  17. helm/benchmark/metrics/lmkt_metrics.py +47 -0
  18. helm/benchmark/metrics/melt_toxicity_metric.py +1 -1
  19. helm/benchmark/metrics/summac/model_summac.py +1 -1
  20. helm/benchmark/model_deployment_registry.py +11 -19
  21. helm/benchmark/presentation/create_plots.py +11 -2
  22. helm/benchmark/presentation/schema.py +5 -0
  23. helm/benchmark/presentation/summarize.py +9 -3
  24. helm/benchmark/presentation/test_create_plots.py +4 -1
  25. helm/benchmark/run.py +7 -1
  26. helm/benchmark/run_specs/arabic_run_specs.py +73 -0
  27. helm/benchmark/run_specs/bluex_run_specs.py +40 -0
  28. helm/benchmark/run_specs/classic_run_specs.py +0 -53
  29. helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
  30. helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
  31. helm/benchmark/run_specs/heim_run_specs.py +3 -1
  32. helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
  33. helm/benchmark/run_specs/long_context_run_specs.py +48 -1
  34. helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
  35. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +5 -11
  36. helm/benchmark/scenarios/alghafa_scenario.py +126 -0
  37. helm/benchmark/scenarios/arabic_mmlu_scenario.py +78 -0
  38. helm/benchmark/scenarios/aratrust_scenario.py +76 -0
  39. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
  40. helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
  41. helm/benchmark/scenarios/audio_language/{ultra_suite_asr_classification.py → ultra_suite_asr_classification_scenario.py} +9 -8
  42. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +99 -0
  43. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +13 -5
  44. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +13 -5
  45. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +13 -5
  46. helm/benchmark/scenarios/bluex_scenario.py +66 -0
  47. helm/benchmark/scenarios/cleva_scenario.py +1 -1
  48. helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
  49. helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
  50. helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
  51. helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
  52. helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
  53. helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
  54. helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
  55. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +90 -0
  56. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +1 -1
  57. helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
  58. helm/benchmark/scenarios/math_scenario.py +21 -20
  59. helm/benchmark/scenarios/medalign_scenario_helper.py +19 -125
  60. helm/benchmark/scenarios/melt_scenarios.py +2 -2
  61. helm/benchmark/scenarios/mimic_bhc_scenario.py +1 -1
  62. helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
  63. helm/benchmark/scenarios/seahelm_scenario.py +2 -2
  64. helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
  65. helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
  66. helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
  67. helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
  68. helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
  69. helm/benchmark/slurm_jobs.py +1 -2
  70. helm/benchmark/slurm_runner.py +8 -1
  71. helm/benchmark/static/schema_arabic.yaml +228 -0
  72. helm/benchmark/static/schema_classic.yaml +0 -17
  73. helm/benchmark/static/schema_long_context.yaml +19 -1
  74. helm/benchmark/static_build/assets/index-e439d5e1.js +10 -0
  75. helm/benchmark/static_build/index.html +1 -1
  76. helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
  77. helm/clients/audio_language/qwen2_5_omni_client.py +19 -7
  78. helm/clients/huggingface_client.py +2 -2
  79. helm/clients/openai_client.py +2 -1
  80. helm/clients/openai_responses_client.py +6 -4
  81. helm/clients/test_huggingface_client.py +3 -3
  82. helm/clients/together_client.py +0 -2
  83. helm/clients/vertexai_client.py +11 -9
  84. helm/clients/vllm_client.py +43 -7
  85. helm/clients/vllm_granite_thinking_client.py +56 -0
  86. helm/common/critique_request.py +0 -1
  87. helm/common/hierarchical_logger.py +83 -34
  88. helm/common/object_spec.py +23 -8
  89. helm/common/test_logging.py +94 -0
  90. helm/config/model_deployments.yaml +454 -175
  91. helm/config/model_metadata.yaml +117 -10
  92. helm/config/tokenizer_configs.yaml +81 -1
  93. helm/proxy/cli.py +1 -1
  94. helm/proxy/retry.py +5 -0
  95. helm/tokenizers/grok_tokenizer.py +2 -0
  96. helm/benchmark/metrics/numeracy_metrics.py +0 -72
  97. helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
  98. helm/benchmark/scenarios/numeracy_scenario.py +0 -794
  99. helm/benchmark/static_build/assets/index-94295e78.js +0 -10
  100. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/WHEEL +0 -0
  101. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/entry_points.txt +0 -0
  102. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/licenses/LICENSE +0 -0
  103. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.7.dist-info}/top_level.txt +0 -0
@@ -1,794 +0,0 @@
- # flake8: noqa
- from collections import defaultdict
- from dataclasses import dataclass, field
- from itertools import combinations_with_replacement, product
- import math
- from math import comb
- import numpy as np
- import numpy.typing as npt
- import random
- from typing import List, Optional, Tuple, Dict
-
- from helm.benchmark.adaptation.adapters.adapter_factory import ADAPT_GENERATION
- from helm.benchmark.adaptation.adapter_spec import AdapterSpec
- from helm.common.local_context import LocalContext
- from helm.benchmark.window_services.tokenizer_service import TokenizerService
- from helm.common.authentication import Authentication
- from helm.common.optional_dependencies import handle_module_not_found_error
- from helm.proxy.services.server_service import ServerService
- from helm.benchmark.scenarios.scenario import (
-     Scenario,
-     Instance,
-     Reference,
-     TRAIN_SPLIT,
-     TEST_SPLIT,
-     CORRECT_TAG,
-     Input,
-     Output,
- )
-
- try:
-     import sympy
-     from sympy import Symbol, Poly, diff
-     from sympy.parsing.sympy_parser import standard_transformations, implicit_multiplication_application
- except ModuleNotFoundError as e:
-     handle_module_not_found_error(e, ["scenarios"])
-
-
- # TODO: we shouldn't create an Adapter and TokenizerService in a scenario
- # The Adapter and Scenarios should be completely decoupled.
- # https://github.com/stanford-crfm/benchmarking/issues/569
- def get_test_tokenizer_service() -> TokenizerService:
-     # Pointed to the default local path set in run.py (--local-path)
-     return TokenizerService(LocalContext(base_path="prod_env"))
-
-
- SOLUTION_TAG: str = "solution"
- CLASS_TAG: str = "class"
- Range = List[Tuple[int, int]]
-
- SYMPY_TRANSFORMATIONS = standard_transformations + (implicit_multiplication_application,)
-
-
- def generate_terms(degree: int, num_variables: int) -> List[List[int]]:
-     """Lists out multisets corresponding to all possible terms up to degree `degree` and `num_variables` variables."""
-     return sum(
-         [
-             list(map(lambda _: list(_), combinations_with_replacement(range(num_variables), d)))
-             for d in reversed(range(degree + 1))
-         ],
-         [],
-     )
-
-
- def get_powers(terms: List[List[int]]) -> List[List[Tuple[int, int]]]:
-     return list(map(lambda _: list(zip(*np.unique(_, return_counts=True))), terms))
-
-
- def sympy_power_to_power(power: Tuple[int, ...]) -> List[Tuple[int, int]]:
-     return [(idx, exp) for idx, exp in enumerate(power) if exp]
-
-
- def stringify_terms(terms: List[List[int]], variable_names: List[str] = list("xyz")) -> List[str]:
-     """Formatting utility for multisets."""
-
-     def stringify_power(index: int, degree: int) -> str:
-         """Helper formatting utility for powers."""
-         var = variable_names[index]
-         if degree == 0:
-             return ""
-         if degree == 1:
-             return var
-         return f"{var}^{degree}"
-
-     powers = get_powers(terms)
-     return list(map(lambda _: "".join([stringify_power(*el) for el in _]), powers))
-
-
- @dataclass
- class Polynomial:
-     """A simple polynomial class over the integers that supports evaluation and pretty-printing."""
-
-     degree: int
-     num_variables: int
-     coeffs: npt.NDArray[np.int64]
-     terms: List[List[int]] = field(init=False)
-
-     def __post_init__(self):
-         self.terms = generate_terms(self.degree, self.num_variables)
-
-     def eval(self, vals: List[int]):
-         return np.dot(self.coeffs, np.array(list(map(lambda _: np.prod(np.array(vals).__getitem__(_)), self.terms))))
-
-     def __str__(self):
-         def stringify_monomial(coeff: int, term: str) -> Optional[str]:
-             if coeff == 0:
-                 return None
-             if coeff == 1:
-                 return term or str(coeff)
-             if coeff == -1:
-                 return f"-{term}" if term else "-1"
-             return f"{coeff}{term}"
-
-         monomials = [stringify_monomial(c, x) for c, x in zip(self.coeffs, stringify_terms(self.terms))]
-         present_monomials: List[str] = [m for m in monomials if m]
-         return " + ".join(present_monomials).replace(" + -", " - ")
-
-     @classmethod
-     def from_string(cls, expr_str: str, degree: int, num_variables: int):
-         expr = sympy.parse_expr(expr_str.replace("^", "**"), transformations=SYMPY_TRANSFORMATIONS)
-         poly = Poly(expr, list(sorted(expr.free_symbols, key=lambda _: _.name)))
-         return sympy_poly_to_poly(poly, degree, num_variables)
-
-
- def sympy_poly_to_poly(poly: Poly, degree: int, num_variables: int) -> Polynomial:
-     terms = poly.terms()
-     all_terms = generate_terms(degree, num_variables)
-     all_powers = get_powers(all_terms)
-     coeffs_dict = defaultdict(int, {tuple(sympy_power_to_power(power)): coeff for power, coeff in terms})
-     coeffs = [coeffs_dict[tuple(_)] for _ in all_powers]
-     return Polynomial(degree=degree, num_variables=num_variables, coeffs=np.array(coeffs))
-
-
- def generate_polynomial(
-     degree: int,
-     num_variables: int,
-     range_coeffs: Range,  # inclusive
-     seed: Optional[int] = None,
-     strict_degree=True,
-     strict_variables=True,
-     strict_constant=True,
- ) -> Polynomial:
-     """Sample the coefficients (A, B, ...) of the polynomial equation y = ... + A x + B.
-     A generic method used by the function class-specific methods below.
-
-     Args:
-         strict_degree (bool): if True, require `rel` to have degree strictly equal to `degree`
-         strict_variables (bool): if True, require `rel` to use exactly `num_variables`
-         strict_constant (bool): if True, require the constant (ie. term of degree 0) to be non-zero
-     Returns:
-         `rel` (Polynomial)
-     """
-     MAX_ATTEMPTS = 100
-     if seed is not None:
-         random.seed(seed)
-         np.random.seed(seed)
-     count = 0
-     terms = generate_terms(degree, num_variables)
-     while count < MAX_ATTEMPTS:
-         done = True
-         coeffs = [random.randint(r[0], r[1]) for r in range_coeffs]
-         if strict_constant and coeffs[-1] == 0:
-             done = False
-         if strict_degree and not sum(coeffs[: comb(degree + num_variables - 1, num_variables - 1)]):
-             done = False
-         if strict_variables:
-             for idx in range(num_variables):
-                 vals = np.zeros(num_variables)
-                 vals[idx] = 1
-                 res = np.dot(coeffs[:-1], np.array(list(map(lambda _: np.prod(vals.__getitem__(_)), terms[:-1]))))
-                 if not res:
-                     done = False
-                     break
-         if done:
-             break
-         count += 1
-     if count >= MAX_ATTEMPTS:
-         raise ValueError(
-             "Failed to sample valid polynomial equation within "
-             + f"{MAX_ATTEMPTS} attempts from ranges {str(range_coeffs)}."
-         )
-     return Polynomial(degree=degree, num_variables=num_variables, coeffs=np.array(coeffs))
-
-
- def generate_linear(range_coeffs: Range) -> Polynomial:
-     return generate_polynomial(
-         degree=1,
-         num_variables=1,
-         range_coeffs=range_coeffs,
-         strict_degree=True,
-         strict_variables=True,
-         strict_constant=True,
-     )
-
-
- def generate_parabola(range_coeffs: Range) -> Polynomial:
-     return generate_polynomial(
-         degree=2,
-         num_variables=1,
-         range_coeffs=range_coeffs,
-         strict_degree=True,
-         strict_variables=True,
-         strict_constant=True,
-     )
-
-
- def generate_plane(range_coeffs: Range) -> Polynomial:
-     return generate_polynomial(
-         degree=1,
-         num_variables=2,
-         range_coeffs=range_coeffs,
-         strict_degree=True,
-         strict_variables=True,
-         strict_constant=True,
-     )
-
-
- def generate_paraboloid(range_coeffs: Range) -> Polynomial:
-     return generate_polynomial(
-         degree=2,
-         num_variables=2,
-         range_coeffs=range_coeffs,
-         strict_degree=True,
-         strict_variables=True,
-         strict_constant=True,
-     )
-
-
- def generate_rotated_translated_paraboloid(range_coeffs: Range) -> Polynomial:
-     """Unused."""
-     do_sample = True
-     while do_sample:
-         coeffs_0 = generate_plane(range_coeffs).coeffs
-         coeffs_1 = generate_plane(range_coeffs).coeffs
-         mat = np.array(
-             [
-                 coeffs_0,
-                 coeffs_1,
-             ]
-         )
-         if np.linalg.matrix_rank(mat) == 2:
-             do_sample = False
-     x = Symbol("x")
-     y = Symbol("y")
-     xprime = coeffs_0[0] * x + coeffs_0[1] * y + coeffs_0[2]
-     yprime = coeffs_1[0] * x + coeffs_1[1] * y + coeffs_1[2]
-     expr = xprime**2 + yprime**2
-     poly = Poly(expr, [x, y])
-     return sympy_poly_to_poly(poly, 2, 2)
-
-
- def distance_linear(point: List[int], rel_str: str):
-     """
-     Returns the minimum distance from the given point to the relation given by `rel_str` which has the form:
-         A x - y + B = 0
-     """
-     relation_type = "linear"
-     degree: int = RELTYPE_INFO[relation_type].degree
-     num_variables: int = RELTYPE_INFO[relation_type].num_variables
-     rel = Polynomial.from_string(rel_str.split(" = ")[-1], degree, num_variables)
-     A = rel.coeffs[0]
-     B = -1
-     C = rel.coeffs[1]
-     x, y = point
-     return float(abs((A * x + B * y + C)) / (math.sqrt(A**2 + B**2)))
-
-
- def distance_parabola(point: List[int], rel_str: str, TOL: float = 1e-10):
-     """
-     Returns the minimum distance from the given point to the relation given by `rel_str` which has the form:
-         y = A x^2 + B x + C
-     """
-     rel_str = rel_str.split(" = ")[-1]
-     expr = sympy.parse_expr(rel_str.replace("^", "**"), transformations=SYMPY_TRANSFORMATIONS)
-     poly = sympy.Poly(expr, list(expr.free_symbols))
-     x = list(expr.free_symbols)[0]
-     x0, y0 = point
-     dist = (x - x0) ** 2 + (poly - y0) ** 2
-     deriv = sympy.diff(dist, x)
-     try:
-         sols = sympy.solve(deriv, x)
-     except ZeroDivisionError:
-         # This shouldn't happen, but has happened for a prior implementation of
-         # `distance_paraboloid`, so catch it conservatively:
-         print("Failed to compute minimum distance.")
-         # pdb.set_trace()
-         return float(0.0)
-     dist_vals = list(map(lambda _: sympy.N(dist.eval(_)), sols))
-     try:
-         dist_val = min([sympy.re(_) for _ in dist_vals if abs(sympy.im(_)) < TOL and sympy.re(_) >= 0])
-     except ValueError:
-         # A real solution should exist, but if not (eg. numerical error exceeds TOL):
-         print("Failed to compute minimum distance.")
-         # pdb.set_trace()
-         return float(0.0)
-     return np.sqrt(float(dist_val))
-
-
- def distance_plane(point: List[int], rel_str: str):
-     """
-     Returns the minimum distance from the given point to the relation given by `rel_str` which has the form:
-         A x + B y - z + C = 0
-     """
-     relation_type = "plane"
-     degree: int = RELTYPE_INFO[relation_type].degree
-     num_variables: int = RELTYPE_INFO[relation_type].num_variables
-     rel = Polynomial.from_string(rel_str.split(" = ")[-1], degree, num_variables)
-     A = rel.coeffs[0]
-     B = rel.coeffs[1]
-     C = -1
-     D = rel.coeffs[2]
-     x, y, z = point
-     d = abs((A * x + B * y + C * z + D))
-     e = math.sqrt(A**2 + B**2 + C**2)
-     return float(d / e)
-
-
- def distance_paraboloid(point: List[int], rel_str: str, TOL: float = 1e-10):
-     """
-     Returns the minimum distance from the given point to the relation given by `rel_str` which has the form:
-         z = A x^2 + B x y + C y^2 + D x + E y + F
-     Uses method of Lagrange multipliers.
-     """
-     rel_str = rel_str.split(" = ")[-1]
-     expr = sympy.parse_expr(rel_str.replace("^", "**"), transformations=SYMPY_TRANSFORMATIONS)
-     x, y = list(expr.free_symbols)
-     if x.name == "y":
-         x, y = y, x
-     z = Symbol("z")
-     x0, y0, z0 = point
-     f = (x - x0) ** 2 + (y - y0) ** 2 + (z - z0) ** 2
-     g = z - expr
-     if abs(g.subs([(x, x0), (y, y0), (z, z0)])) < TOL:
-         return float(0.0)
-     λ = Symbol("λ")
-     # The code below is meant to be equivalent to
-     # `sols = sympy.solve([eq_x, eq_y, eq_z, g], [x, y, z, λ])`
-     # but sympy.solve was failing to find any solution on many inputs
-     # as well as not finding some solutions
-     # so this breaks it down for the special case of `f - λ g` which is at most quadratic.
-
-     # Set up the equations from method of Lagrange multipliers
-     eq_x = diff(f, x) - λ * diff(g, x)
-     eq_y = diff(f, y) - λ * diff(g, y)
-     eq_z = diff(f, z) - λ * diff(g, z)
-     # Solve for each variable individually
-     has_xy = y in eq_x.free_symbols  # has xy term
-     if has_xy:
-         sols_x = sympy.solve(eq_x, [x, y, λ])
-         sols_y = sympy.solve(eq_y, [x, y, λ])
-         sols_z = sympy.solve(eq_z, [z, λ])
-     else:
-         sols_x = sympy.solve(eq_x, [x, λ])
-         sols_y = sympy.solve(eq_y, [y, λ])
-         sols_z = sympy.solve(eq_z, [z, λ])
-     try:
-         # Put the solutions together
-
-         # Extract x,y,z resp. from tuples
-         sols_lst_xyz = [[_[0] for _ in lst] for lst in [sols_x, sols_y, sols_z]]
-
-         # Extract solutions for λ from tuples
-         sols_lst_λλλ = [[_[-1] for _ in lst] for lst in [sols_x, sols_y, sols_z]]
-
-         # Get list of possible solution tuples and corresponding solutions for λ
-         sols_xyz = list(product(*sols_lst_xyz))
-         vals_λ = list(product(*sols_lst_λλλ))
-
-         sols = []
-         # Try each possible combined solution for x, y, z, λ
-         for sol_xyz, val_λs in zip(sols_xyz, vals_λ):
-             val_λs = tuple(set(filter(lambda _: not _.is_symbol, val_λs)))  # get distinct values for λ if there are any
-             if len(val_λs) > 1:  # there can be at most one distinct value for λ
-                 continue
-             val_λ = val_λs[0] if val_λs else λ
-             sol_x, sol_y, sol_z = sol_xyz
-             if not val_λ.is_symbol:
-                 # Substitute in values of λ
-                 sol_x = sol_x.subs(λ, val_λ)
-                 sol_y = sol_y.subs(λ, val_λ)
-                 sol_z = sol_z.subs(λ, val_λ)
-                 g_λ = g.subs(λ, val_λ)
-             else:
-                 g_λ = g
-
-             # Substitute in solutions for x, y, z
-             if has_xy:
-                 g_λ = g_λ.subs([(x, sol_x), (z, sol_z)])
-                 sol_ys = sympy.solve(sol_x - sol_y, y)
-                 for sol_y in sol_ys:
-                     g_λy = g_λ.subs(y, sol_y)
-                     sol_xy = sol_x.subs(y, sol_y)
-                     syms = list(g_λy.free_symbols)
-                     if len(syms) > 1:  # underdetermined system
-                         continue
-                     sym = syms[0]
-                     vals = [sympy.N(_) for _ in sympy.solveset(g_λy, sym)]
-                     sols.extend([(sol_xy.subs(sym, _), sol_y.subs(sym, _), sol_z.subs(sym, _)) for _ in vals])
-             else:
-                 g_λ = g_λ.subs([(x, sol_x), (y, sol_y), (z, sol_z)])
-                 syms = list(g_λ.free_symbols)
-                 if len(syms) > 1:  # underdetermined system
-                     continue
-                 # Solve for remaining variable
-                 sym = syms[0]
-                 vals = [sympy.N(_) for _ in sympy.solveset(g_λ, sym)]
-                 sols.extend([(sol_x.subs(sym, _), sol_y.subs(sym, _), sol_z.subs(sym, _)) for _ in vals])
-     except ZeroDivisionError:
-         # This shouldn't happen, but has happened for a prior implementation of
-         # `distance_paraboloid`, so catch it conservatively:
-         print("Failed to compute minimum distance.")
-         # pdb.set_trace()
-         return float(0.0)
-     poly_f = sympy.Poly(f, [x, y, z])
-     # Evaluate f on found solutions
-     try:
-         dist_vals = list(map(lambda _: sympy.N(poly_f.eval(_)), sols))
-     except sympy.polys.polyerrors.UnificationFailed:
-         # Forgot to substitute all variables in some expression.
-         # This shouldn't happen, but has happened for a prior implementation of
-         # `distance_paraboloid`, so catch it conservatively:
-         print("sympy error: Unification failed.")
-         # pdb.set_trace()
-         return float(0.0)
-     # Get the minimum nonnegative real value
-     try:
-         dist_val = min([sympy.re(_) for _ in dist_vals if abs(sympy.im(_)) < TOL and sympy.re(_) >= 0])
-     except ValueError:
-         # A real solution should exist, but if not (eg. numerical error exceeds TOL):
-         print("Failed to compute minimum distance.")
-         print([eq_x, eq_y, eq_z, g])
-         print(sols)
-         # pdb.set_trace()
-         return float(0.0)
-     return np.sqrt(float(dist_val))
-
-
- def select_ranges(
-     num_train: int, num_test: int, dim: int, overlap: bool = True, nonnegative_only: bool = False
- ) -> Tuple[Range, Range]:
-     """
-     Choose disjoint intervals from which to sample points, where
-     the test points lie within a region bounded by the region
-     that the train points are sampled from.
-     """
-     choices: npt.NDArray[np.int64] = np.array([0, 1, 2, 5, 10, 20, 50, 100, 200])
-
-     def select_index(lst: npt.NDArray[np.int64], val: int) -> int:
-         return list((lst - val) >= 0).index(True)
-
-     def construct_range(index: int, dim: int) -> List[Tuple[int, int]]:
-         if nonnegative_only:
-             return [(0, choices[index]) for _ in range(dim)]
-         return [(-choices[index], choices[index]) for _ in range(dim)]
-
-     if nonnegative_only:
-         num_points = (choices + 1) ** dim  # list of ints
-     else:
-         num_points = (2 * choices + 1) ** dim  # list of ints
-
-     if overlap:
-         train_index = test_index = select_index(num_points, num_train + num_test)
-     else:
-         test_index = select_index(num_points, num_test)
-         train_index = select_index(num_points - num_points[test_index], num_train)
-
-     test_range = construct_range(test_index, dim)
-     train_range = construct_range(train_index, dim)
-     return (train_range, test_range)
-
-
- @dataclass(frozen=True)
- class RelationTypeInfo:
-     name: str
-     degree: int
-     num_variables: int
-     range: Range
-     example_coeffs: npt.NDArray[np.int64]
-
-
- RELTYPE_INFO: Dict[str, RelationTypeInfo] = {
-     "linear": RelationTypeInfo(
-         name="linear", degree=1, num_variables=1, range=[(1, 5), (1, 5)], example_coeffs=np.array([2, 5])
-     ),  # 2x + 5
-     "parabola": RelationTypeInfo(
-         # parabolas with axis of symmetry to the left of the origin
-         name="parabola",
-         degree=2,
-         num_variables=1,
-         range=[(1, 2), (0, 2), (1, 5)],
-         example_coeffs=np.array([1, 0, 2]),
-     ),  # x^2 + 2
-     "plane": RelationTypeInfo(
-         name="plane", degree=1, num_variables=2, range=[(1, 5), (1, 5), (1, 5)], example_coeffs=np.array([2, 1, 5])
-     ),  # 2x + y + 5
-     "paraboloid": RelationTypeInfo(
-         # axis-aligned elliptic paraboloids only, ie. of the form z = A x^2 + B y^2 + C
-         name="paraboloid",
-         degree=2,
-         num_variables=2,
-         range=[(1, 2), (0, 1), (1, 2), (0, 0), (0, 0), (1, 5)],
-         example_coeffs=np.array([2, 0, 1, 0, 0, 2]),
-     ),  # 2x^2 + y^2 + 2
- }
-
-
- # MODE_INFO = { # Testing purposes
- # "example": {"num_function_train": 1, "num_function_test": 1, "num_train": 10, "num_test": 1,},
- # "standard": {"num_function_train": 1, "num_function_test": 1, "num_train": 10, "num_test": 1,},
- # "function": {"num_function_train": 2, "num_function_test": 2, "num_train": 2, "num_test": 1,},
- # }
-
-
- MODE_INFO = {
-     "example": {
-         "num_function_train": 1,
-         "num_function_test": 1,
-         "num_train": 100,
-         "num_test": 100,
-     },
-     "standard": {
-         "num_function_train": 1,
-         "num_function_test": 1,
-         "num_train": 100,
-         "num_test": 100,
-     },
-     "function": {
-         "num_function_train": 1000,
-         "num_function_test": 1000,  # don't bother excluding from train set
-         "num_train": 100,
-         "num_test": 1,
-     },
- }
-
-
- def get_var(dim: int, variable_names=list("xyz")):
-     return variable_names[dim - 1]
-
-
- def get_dataset_header(
-     dim: int, variable_names: List[str] = list("xyz"), delimiter: str = ", ", output_prefix: str = ", "
- ):
-     return delimiter.join(variable_names[: dim - 1]) + output_prefix + variable_names[dim - 1]
-
-
- def get_numeracy_adapter_spec(
-     max_train_instances: int, max_eval_instances: int, dim: int, delimiter: str = ", ", **kwargs
- ) -> AdapterSpec:
-     return AdapterSpec(
-         **{
-             **{
-                 "method": ADAPT_GENERATION,
-                 "instructions": get_dataset_header(dim, delimiter=delimiter, output_prefix=", "),
-                 "max_train_instances": max_train_instances,
-                 "max_eval_instances": max_eval_instances,
-                 "num_outputs": 1,
-                 "num_train_trials": 1,
-                 "model_deployment": "openai/davinci",
-                 "temperature": 0,
-                 "stop_sequences": ["\n"],
-                 "max_tokens": 20,
-                 "input_prefix": "",
-                 "output_prefix": ", ",
-                 "instance_prefix": "\n",
-             },
-             **kwargs,
-         }
-     )  # enable override
-
-
- class NumeracyScenario(Scenario):
-     """
-     A task that asks the model to induce an unknown polynomial at a point given a set of function evaluations.
-     Unlike pre-existing tasks testing arithmetic, this task attempts to test a deeper notion of numeracy
-     which the model cannot rely purely on rote memorization of standard tables of arithmetic operations
-     in order to succeed on and which intuitively occurs as a implicit subroutine in broader contexts.
-
-     Decomposes into 4 function classes:
-     - linear (1 degree, 1 variable)
-     - parabola (2 degrees, 2 variables)
-     - plane (1 degree, 2 variables)
-     - (elliptic) paraboloid (2 degrees, 2 variables)
-
-     with coefficients drawn from restricted ranges
-     (see dict `RELTYPE_INFO`), and
-     where {parabola, paraboloid} have nonnegative domains,
-     ie. the right ray of the x-axis or upper-right
-     quadrant of the plane resp. so that the model cannot
-     rely on symmetry.
-
-     and independently 2 + 1 modes:
-     - standard
-         - A single dataset corresponding to the same polynomial.
-           Evaluate on different points.
-     - function
-         - Multiple datasets, where each dataset instance corresponds to
-           an independently sampled polynomial belonging to the same class.
-           Evaluate on different (dataset, point) pairs.
-     and
-     - example
-         - A single dataset corresponding to the same fixed representative for each class.
-
-     If `overlap` is `True`:
-         Train and test datapoints are drawn from the same rectilinear region
-         centered at the origin (see function `select_ranges`),
-         making sure to exclude the training set from the test set.
-     Otherwise:
-         Train datapoints are drawn from a rectilinear border region while
-         test datapoints are drawn from a disjoint rectilinear interior region,
-         centered at the origin (see function `select_ranges`).
-
-     Example prompt for `relation_type=parabola,mode=function` with `num_function_train=num_function_test=num_train=2`:
-         x,y
-         1,4
-         -1,2
-         0,2
-
-         x,y
-         -1,0
-         1,20
-         0,8
-
-         x,y
-         -1,7
-         1,11
-         0,
-     """
-
-     name = "numeracy"
-     description = "polynomial induction"
-     tags: List[str] = []
-     RELTYPES: List[str] = ["linear", "parabola", "plane", "paraboloid"]
-     MODES: List[str] = ["example", "standard", "function"]
-     delimiter: str = ", "
-
-     def __init__(
-         self,
-         relation_type: str = "linear",
-         mode: str = "function",
-         seed: Optional[int] = None,
-         overlap: bool = True,  # whether the in-context and eval points are drawn from the same region
-         sort_vals: bool = False,  # whether to sort the in-context examples
-     ):
-         super().__init__()
-         assert relation_type in NumeracyScenario.RELTYPES
-         assert mode in NumeracyScenario.MODES
-         self.random_seed = seed
-
-         self.relation_type = relation_type
-         self.mode = mode
-         self.delimiter = NumeracyScenario.delimiter
-         self.seed = seed
-         self.overlap = overlap
-         self.sort_vals = sort_vals
-
-         self.degree: int = RELTYPE_INFO[relation_type].degree
-         self.num_variables: int = RELTYPE_INFO[relation_type].num_variables
-         self.range_coeffs = RELTYPE_INFO[relation_type].range
-         self.dim = self.num_variables + 1
-
-         self.num_function_train = MODE_INFO[mode]["num_function_train"]
-         self.num_function_test = MODE_INFO[mode]["num_function_test"]
-         self.num_train = MODE_INFO[mode]["num_train"]
-         self.num_test = MODE_INFO[mode]["num_test"]
-
-     def get_instances(self, output_path: str) -> List[Instance]:
-         assert self.random_seed is not None
-         random.seed(self.random_seed)
-         np.random.seed(self.random_seed)
-
-         train_range, test_range = select_ranges(
-             num_train=100,
-             num_test=100,
-             dim=self.num_variables,  # not a typo
-             overlap=self.overlap,
-             nonnegative_only=self.relation_type in ["parabola", "paraboloid"],
-         )
-         # train_range = test_range:
-         # -------------------------
-         # linear: [(-100, 100)]
-         # parabola: [(0, 200)]
-         # plane: [(-10, 10), (-10, 10)]
-         # paraboloid: [(0, 20), (0, 20)]
-
-         test_vals = list(product(*[range(r[0], r[1] + 1) for r in test_range]))
-         if self.overlap:
-             train_vals = test_vals
-         else:
-             train_vals = list(set(product(*[range(r[0], r[1] + 1) for r in train_range])) - set(test_vals))
-         if self.sort_vals:
-             train_vals = list(sorted(train_vals))
-         if self.num_variables == 2:
-             test_vals = list(filter(lambda _: _[0] <= _[1], test_vals))
-             train_vals = list(filter(lambda _: _[0] <= _[1], train_vals))
-
-         def generate_datapoint(rel: Polynomial, vals: List[int]) -> Tuple[List[str], str]:
-             y = rel.eval(vals)
-             return list(map(str, vals)), str(y)
-
-         def generate_datapoint_instances_for_split(rel, idxs, eval_vals, split):
-             instances = []
-             for idx in idxs:
-                 vals = eval_vals[idx]
-                 str_vals, y = generate_datapoint(rel, vals)
-                 input = self.delimiter.join(str_vals)
-                 output = y
-                 var = get_var(self.dim)
-                 solution = f"{var} = {rel}"
-                 references = [
-                     Reference(Output(text=output), tags=[CORRECT_TAG]),
-                     Reference(Output(text=solution), tags=[SOLUTION_TAG]),
-                     Reference(Output(text=self.relation_type), tags=[CLASS_TAG]),
-                 ]
-                 instance = Instance(Input(text=input), references=references, split=split)
-                 instances.append(instance)
-             return instances
-
-         def generate_datapoint_instances(rel: Polynomial):
-             train_idxs = list(np.random.choice(range(len(train_vals)), self.num_train, replace=False))
-             if self.sort_vals:
-                 train_idxs = list(sorted(train_idxs))
-             if self.overlap:
-                 all_test_idxs = list(set(range(len(test_vals))) - set(train_idxs))
-             else:
-                 all_test_idxs = list(range(len(test_vals)))
-             test_idxs = np.random.choice(all_test_idxs, self.num_test, replace=False)
-
-             train_instances = generate_datapoint_instances_for_split(rel, train_idxs, train_vals, TRAIN_SPLIT)
-             test_instances = generate_datapoint_instances_for_split(rel, test_idxs, test_vals, TEST_SPLIT)
-             instances = train_instances + test_instances
-             return instances
-
-         def generate_dataset():
-             generate_func = globals()[f"generate_{self.relation_type}"]
-             rel = generate_func(self.range_coeffs)
-             instances = generate_datapoint_instances(rel)
-             return instances
-
-         def generate_datasets(num_instances: int, split: str):
-             # TODO: construct_prompt is no longer part of adapter, and this function needs to be rewritten
-             # https://github.com/stanford-crfm/benchmarking/issues/569
-             return []
-             # spec = get_numeracy_adapter_spec(self.num_train, self.num_test, self.dim, self.delimiter)
-             # service = get_test_tokenizer_service()
-             # adapter = Adapter(spec, service)
-             # outer_spec = get_numeracy_adapter_spec(
-             #     self.num_train,
-             #     self.num_test,
-             #     self.dim,
-             #     instructions="",
-             #     instance_prefix="\n\n",
-             #     delimiter=self.delimiter,
-             # )
-             # outer_adapter = Adapter(outer_spec, service)
-             # instances = []
-             # for idx in range(num_instances):
-             #     datapoint_instances = generate_dataset()
-             #     train_instances = datapoint_instances[: self.num_train]
-             #     eval_instances = datapoint_instances[self.num_train :]
-             #     dataset_instances = []
-             #     for idx in range(self.num_test):
-             #         eval_instance = eval_instances[idx]
-             #         input = adapter.construct_prompt(
-             #             train_instances, eval_instance, include_output=False, reference_index=None
-             #         ).text
-             #         input = input[: -len(spec.output_prefix.rstrip())]  # strip output_prefix
-             #         references = eval_instance.references
-             #         dataset_instance = Instance(input=input, references=references, split=split)  # split doesn't matter
-             #         dataset_instances.append(dataset_instance)
-
-             #     input = outer_adapter.construct_prompt(
-             #         dataset_instances[:-1], dataset_instances[-1], include_output=False, reference_index=None
-             #     ).text
-             #     input = input[: -len(spec.output_prefix.rstrip())]  # strip output_prefix
-             #     references = dataset_instances[-1].references
-             #     instance = Instance(input=input, references=references, split=split)
-             #     instances.append(instance)
-
-             # return instances
-
-         def generate_instances():
-             generate_func = globals()[f"generate_{self.relation_type}"]
-             if self.mode == "example":
-                 coeffs = RELTYPE_INFO[self.relation_type].example_coeffs
-                 rel = Polynomial(self.degree, self.num_variables, coeffs)
-                 return generate_datapoint_instances(rel)
-             if self.mode == "standard":
-                 rel = generate_func(self.range_coeffs)
-                 return generate_datapoint_instances(rel)
-             if self.mode == "function":
-                 return generate_datasets(self.num_function_train, TRAIN_SPLIT) + generate_datasets(
-                     self.num_function_test, TEST_SPLIT
-                 )
-
-         return generate_instances()