PyPI - crfm-helm - Versions diffs - 0.5.5__py3-none-any.whl → 0.5.7__py3-none-any.whl - Mend

crfm-helm 0.5.5 (py3-none-any.whl) → 0.5.7 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of crfm-helm might be problematic. Click here for more details.

Files changed (268)

{crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/METADATA +74 -53
{crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/RECORD +262 -182
{crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/WHEEL +1 -1
helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +3 -3
helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
helm/benchmark/annotation/air_bench_annotator.py +2 -2
helm/benchmark/annotation/bigcodebench_annotator.py +3 -3
helm/benchmark/annotation/bird_sql_annotator.py +2 -2
helm/benchmark/annotation/chw_care_plan_annotator.py +7 -12
helm/benchmark/annotation/ehr_sql_annotator.py +2 -2
helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +7 -7
helm/benchmark/annotation/live_qa_annotator.py +1 -1
helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
helm/benchmark/annotation/model_as_judge.py +12 -16
helm/benchmark/annotation/omni_math_annotator.py +13 -14
helm/benchmark/annotation/wildbench_annotator.py +9 -9
helm/benchmark/executor.py +11 -12
helm/benchmark/metrics/aci_bench_metrics.py +9 -29
helm/benchmark/metrics/bias_word_lists.py +1 -1
helm/benchmark/metrics/chw_care_plan_metrics.py +10 -30
helm/benchmark/metrics/classification_metrics.py +3 -3
helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
helm/benchmark/metrics/comet_metric.py +1 -1
helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +2 -2
helm/benchmark/metrics/copyright_metrics.py +1 -1
helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
helm/benchmark/metrics/dischargeme_metrics.py +9 -29
helm/benchmark/metrics/efficiency_metrics.py +3 -3
helm/benchmark/metrics/evaluate_reference_metrics.py +1 -1
helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
helm/benchmark/metrics/ifeval_metrics.py +2 -2
helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
helm/benchmark/metrics/llm_jury_metrics.py +46 -0
helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
helm/benchmark/metrics/lmkt_metrics.py +47 -0
helm/benchmark/metrics/med_dialog_metrics.py +9 -29
helm/benchmark/metrics/medalign_metrics.py +9 -29
helm/benchmark/metrics/medi_qa_metrics.py +9 -29
helm/benchmark/metrics/medication_qa_metrics.py +10 -30
helm/benchmark/metrics/melt_bias_metric.py +234 -0
helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
helm/benchmark/metrics/melt_metric_specs.py +43 -0
helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
helm/benchmark/metrics/mental_health_metrics.py +9 -29
helm/benchmark/metrics/metric_service.py +11 -11
helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
helm/benchmark/metrics/mimic_rrs_metrics.py +9 -29
helm/benchmark/metrics/mtsamples_procedures_metrics.py +9 -29
helm/benchmark/metrics/mtsamples_replicate_metrics.py +9 -29
helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
helm/benchmark/metrics/starr_patient_instructions_metrics.py +9 -29
helm/benchmark/metrics/summac/model_summac.py +2 -3
helm/benchmark/metrics/summarization_metrics.py +2 -1
helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +2 -2
helm/benchmark/metrics/toxicity_metrics.py +2 -2
helm/benchmark/metrics/unitxt_metrics.py +3 -4
helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
helm/benchmark/metrics/vision_language/image_utils.py +2 -2
helm/benchmark/model_deployment_registry.py +16 -26
helm/benchmark/presentation/contamination.py +3 -3
helm/benchmark/presentation/create_plots.py +43 -13
helm/benchmark/presentation/run_display.py +13 -0
helm/benchmark/presentation/schema.py +7 -1
helm/benchmark/presentation/summarize.py +84 -61
helm/benchmark/presentation/test_create_plots.py +4 -1
helm/benchmark/reeval_run.py +3 -4
helm/benchmark/reeval_runner.py +3 -3
helm/benchmark/run.py +84 -73
helm/benchmark/run_expander.py +12 -1
helm/benchmark/run_spec_factory.py +7 -6
helm/benchmark/run_specs/arabic_run_specs.py +73 -0
helm/benchmark/run_specs/audio_run_specs.py +52 -8
helm/benchmark/run_specs/bluex_run_specs.py +40 -0
helm/benchmark/run_specs/classic_run_specs.py +0 -53
helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
helm/benchmark/run_specs/enterprise_run_specs.py +20 -0
helm/benchmark/run_specs/experimental_run_specs.py +31 -1
helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
helm/benchmark/run_specs/heim_run_specs.py +3 -1
helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
helm/benchmark/run_specs/long_context_run_specs.py +114 -15
helm/benchmark/run_specs/medhelm_run_specs.py +146 -41
helm/benchmark/run_specs/melt_run_specs.py +783 -0
helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +163 -0
helm/benchmark/run_specs/vlm_run_specs.py +28 -0
helm/benchmark/runner.py +5 -5
helm/benchmark/scenarios/aci_bench_scenario.py +7 -1
helm/benchmark/scenarios/alghafa_scenario.py +126 -0
helm/benchmark/scenarios/arabic_mmlu_scenario.py +78 -0
helm/benchmark/scenarios/aratrust_scenario.py +76 -0
helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +3 -1
helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +5 -5
helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +104 -0
helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +99 -0
helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +118 -0
helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +86 -0
helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +117 -0
helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +15 -1
helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +1 -2
helm/benchmark/scenarios/autobencher_capabilities_scenario.py +2 -2
helm/benchmark/scenarios/bluex_scenario.py +66 -0
helm/benchmark/scenarios/chw_care_plan_scenario.py +14 -13
helm/benchmark/scenarios/clear_scenario.py +11 -7
helm/benchmark/scenarios/cleva_scenario.py +1 -1
helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
helm/benchmark/scenarios/dischargeme_scenario.py +36 -21
helm/benchmark/scenarios/ehr_sql_scenario.py +7 -1
helm/benchmark/scenarios/ehrshot_scenario.py +28 -55
helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
helm/benchmark/scenarios/grammar.py +2 -2
helm/benchmark/scenarios/headqa_scenario.py +6 -1
helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +90 -0
helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
helm/benchmark/scenarios/{infinite_bench_sum_scenario.py → infinite_bench_en_sum_scenario.py} +10 -13
helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
helm/benchmark/scenarios/math_scenario.py +21 -20
helm/benchmark/scenarios/med_dialog_scenario.py +6 -1
helm/benchmark/scenarios/medalign_scenario.py +9 -3
helm/benchmark/scenarios/medalign_scenario_helper.py +27 -130
helm/benchmark/scenarios/medbullets_scenario.py +7 -2
helm/benchmark/scenarios/medcalc_bench_scenario.py +4 -2
helm/benchmark/scenarios/medec_scenario.py +6 -1
helm/benchmark/scenarios/medhallu_scenario.py +7 -1
helm/benchmark/scenarios/medi_qa_scenario.py +10 -4
helm/benchmark/scenarios/medication_qa_scenario.py +7 -1
helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
helm/benchmark/scenarios/melt_scenarios.py +793 -0
helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
helm/benchmark/scenarios/mental_health_scenario.py +16 -5
helm/benchmark/scenarios/mimic_bhc_scenario.py +13 -8
helm/benchmark/scenarios/mimic_rrs_scenario.py +17 -8
helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +14 -8
helm/benchmark/scenarios/mmlu_pro_scenario.py +1 -1
helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
helm/benchmark/scenarios/mtsamples_procedures_scenario.py +5 -2
helm/benchmark/scenarios/mtsamples_replicate_scenario.py +3 -2
helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +11 -5
helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
helm/benchmark/scenarios/pubmed_qa_scenario.py +6 -1
helm/benchmark/scenarios/race_based_med_scenario.py +18 -8
helm/benchmark/scenarios/ruler_qa_scenario_helper.py +2 -2
helm/benchmark/scenarios/ruler_qa_scenarios.py +2 -2
helm/benchmark/scenarios/seahelm_scenario.py +2 -2
helm/benchmark/scenarios/shc_bmt_scenario.py +12 -6
helm/benchmark/scenarios/shc_cdi_scenario.py +11 -6
helm/benchmark/scenarios/shc_conf_scenario.py +12 -6
helm/benchmark/scenarios/shc_ent_scenario.py +11 -6
helm/benchmark/scenarios/shc_gip_scenario.py +13 -5
helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
helm/benchmark/scenarios/shc_ptbm_scenario.py +12 -7
helm/benchmark/scenarios/shc_sei_scenario.py +12 -7
helm/benchmark/scenarios/shc_sequoia_scenario.py +13 -5
helm/benchmark/scenarios/starr_patient_instructions_scenario.py +15 -8
helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
helm/benchmark/scenarios/truthful_qa_scenario.py +2 -1
helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
helm/benchmark/server.py +2 -1
helm/benchmark/slurm_jobs.py +1 -2
helm/benchmark/slurm_runner.py +8 -1
helm/benchmark/static/schema_arabic.yaml +228 -0
helm/benchmark/static/schema_audio.yaml +60 -49
helm/benchmark/static/schema_classic.yaml +0 -17
helm/benchmark/static/schema_enterprise.yaml +21 -0
helm/benchmark/static/schema_long_context.yaml +81 -20
helm/benchmark/static/schema_medhelm.yaml +272 -213
helm/benchmark/static/schema_melt.yaml +1257 -0
helm/benchmark/static/schema_slphelm.yaml +162 -0
helm/benchmark/static/schema_vhelm.yaml +26 -26
helm/benchmark/static/schema_video.yaml +219 -0
helm/benchmark/static_build/assets/index-b9779128.css +1 -0
helm/benchmark/static_build/assets/index-e439d5e1.js +10 -0
helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
helm/benchmark/static_build/assets/{tremor-9cefc3c5.js → tremor-38a10867.js} +1 -1
helm/benchmark/static_build/index.html +4 -4
helm/benchmark/window_services/encoder_decoder_window_service.py +3 -3
helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
helm/benchmark/window_services/test_utils.py +3 -4
helm/benchmark/window_services/tokenizer_service.py +7 -8
helm/clients/anthropic_client.py +69 -29
helm/clients/audio_language/diva_llama_client.py +4 -2
helm/clients/audio_language/qwen2_5_omni_client.py +209 -0
helm/clients/audio_language/qwen2_audiolm_client.py +8 -6
helm/clients/audio_language/qwen_audiolm_client.py +4 -2
helm/clients/audio_language/test.py +62 -0
helm/clients/bedrock_client.py +3 -1
helm/clients/client.py +7 -7
helm/clients/grok_client.py +36 -0
helm/clients/huggingface_client.py +42 -3
helm/clients/huggingface_pipeline_client.py +138 -0
helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
helm/clients/image_generation/dalle_mini/model/modeling.py +1 -1
helm/clients/image_generation/dalle_mini/model/processor.py +1 -1
helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
helm/clients/openai_client.py +102 -55
helm/clients/openai_responses_client.py +176 -0
helm/clients/palmyra_client.py +2 -5
helm/clients/reka_client.py +2 -2
helm/clients/test_huggingface_client.py +3 -3
helm/clients/together_client.py +31 -6
helm/clients/vertexai_client.py +17 -9
helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
helm/clients/vision_language/huggingface_vlm_client.py +2 -2
helm/clients/vision_language/idefics_client.py +6 -2
helm/clients/vision_language/paligemma_client.py +2 -2
helm/clients/vision_language/qwen2_vlm_client.py +66 -53
helm/clients/vision_language/qwen_vlm_client.py +7 -5
helm/clients/vllm_client.py +43 -7
helm/clients/vllm_granite_thinking_client.py +56 -0
helm/clients/writer_client.py +102 -0
helm/common/context.py +80 -0
helm/common/credentials_utils.py +5 -5
helm/common/critique_request.py +0 -1
helm/common/general.py +9 -2
helm/common/hierarchical_logger.py +104 -12
helm/common/local_context.py +140 -0
helm/common/object_spec.py +23 -8
helm/common/remote_context.py +61 -0
helm/common/request.py +8 -0
helm/common/test_logging.py +94 -0
helm/config/model_deployments.yaml +995 -45
helm/config/model_metadata.yaml +780 -59
helm/config/tokenizer_configs.yaml +224 -3
helm/proxy/cli.py +4 -2
helm/proxy/critique/mechanical_turk_utils.py +1 -1
helm/proxy/retry.py +5 -0
helm/proxy/services/server_service.py +21 -85
helm/tokenizers/grok_tokenizer.py +55 -0
helm/tokenizers/huggingface_tokenizer.py +1 -1
helm/tokenizers/test_grok_tokenizer.py +33 -0
helm/benchmark/metrics/numeracy_metrics.py +0 -72
helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
helm/benchmark/scenarios/numeracy_scenario.py +0 -793
helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +0 -46
helm/benchmark/static_build/assets/index-262903c1.js +0 -10
helm/benchmark/static_build/assets/index-42060d71.css +0 -1
{crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/entry_points.txt +0 -0
{crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/licenses/LICENSE +0 -0
{crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/top_level.txt +0 -0
helm/benchmark/static_build/assets/{medhelm-overview-3ddfcd65.png → medhelm-v1-overview-3ddfcd65.png} +0 -0

helm/benchmark/scenarios/numeracy_scenario.py (deleted)

@@ -1,793 +0,0 @@
-# flake8: noqa
-from collections import defaultdict
-from dataclasses import dataclass, field
-from itertools import combinations_with_replacement, product
-import math
-from math import comb
-import numpy as np
-import numpy.typing as npt
-import random
-from typing import List, Optional, Tuple, Dict
-from helm.benchmark.adaptation.adapters.adapter_factory import ADAPT_GENERATION
-from helm.benchmark.adaptation.adapter_spec import AdapterSpec
-from helm.benchmark.window_services.tokenizer_service import TokenizerService
-from helm.common.authentication import Authentication
-from helm.common.optional_dependencies import handle_module_not_found_error
-from helm.proxy.services.server_service import ServerService
-from helm.benchmark.scenarios.scenario import (
-    Scenario,
-    Instance,
-    Reference,
-    TRAIN_SPLIT,
-    TEST_SPLIT,
-    CORRECT_TAG,
-    Input,
-    Output,
-)
-try:
-    import sympy
-    from sympy import Symbol, Poly, diff
-    from sympy.parsing.sympy_parser import standard_transformations, implicit_multiplication_application
-except ModuleNotFoundError as e:
-    handle_module_not_found_error(e, ["scenarios"])
-# TODO: we shouldn't create an Adapter and TokenizerService in a scenario
-#       The Adapter and Scenarios should be completely decoupled.
-#       https://github.com/stanford-crfm/benchmarking/issues/569
-def get_test_tokenizer_service() -> TokenizerService:
-    # Pointed to the default local path set in run.py (--local-path)
-    return TokenizerService(ServerService(base_path="prod_env", root_mode=True), Authentication("test"))
-SOLUTION_TAG: str = "solution"
-CLASS_TAG: str = "class"
-Range = List[Tuple[int, int]]
-SYMPY_TRANSFORMATIONS = standard_transformations + (implicit_multiplication_application,)
-def generate_terms(degree: int, num_variables: int) -> List[List[int]]:
-    """Lists out multisets corresponding to all possible terms up to degree `degree` and `num_variables` variables."""
-    return sum(
-        [
-            list(map(lambda _: list(_), combinations_with_replacement(range(num_variables), d)))
-            for d in reversed(range(degree + 1))
-        ],
-        [],
-    )
-def get_powers(terms: List[List[int]]) -> List[List[Tuple[int, int]]]:
-    return list(map(lambda _: list(zip(*np.unique(_, return_counts=True))), terms))
-def sympy_power_to_power(power: Tuple[int, ...]) -> List[Tuple[int, int]]:
-    return [(idx, exp) for idx, exp in enumerate(power) if exp]
-def stringify_terms(terms: List[List[int]], variable_names: List[str] = list("xyz")) -> List[str]:
-    """Formatting utility for multisets."""
-    def stringify_power(index: int, degree: int) -> str:
-        """Helper formatting utility for powers."""
-        var = variable_names[index]
-        if degree == 0:
-            return ""
-        if degree == 1:
-            return var
-        return f"{var}^{degree}"
-    powers = get_powers(terms)
-    return list(map(lambda _: "".join([stringify_power(*el) for el in _]), powers))
-@dataclass
-class Polynomial:
-    """A simple polynomial class over the integers that supports evaluation and pretty-printing."""
-    degree: int
-    num_variables: int
-    coeffs: npt.NDArray[np.int64]
-    terms: List[List[int]] = field(init=False)
-    def __post_init__(self):
-        self.terms = generate_terms(self.degree, self.num_variables)
-    def eval(self, vals: List[int]):
-        return np.dot(self.coeffs, np.array(list(map(lambda _: np.prod(np.array(vals).__getitem__(_)), self.terms))))
-    def __str__(self):
-        def stringify_monomial(coeff: int, term: str) -> Optional[str]:
-            if coeff == 0:
-                return None
-            if coeff == 1:
-                return term or str(coeff)
-            if coeff == -1:
-                return f"-{term}" if term else "-1"
-            return f"{coeff}{term}"
-        monomials = [stringify_monomial(c, x) for c, x in zip(self.coeffs, stringify_terms(self.terms))]
-        present_monomials: List[str] = [m for m in monomials if m]
-        return " + ".join(present_monomials).replace(" + -", " - ")
-    @classmethod
-    def from_string(cls, expr_str: str, degree: int, num_variables: int):
-        expr = sympy.parse_expr(expr_str.replace("^", "**"), transformations=SYMPY_TRANSFORMATIONS)
-        poly = Poly(expr, list(sorted(expr.free_symbols, key=lambda _: _.name)))
-        return sympy_poly_to_poly(poly, degree, num_variables)
-def sympy_poly_to_poly(poly: Poly, degree: int, num_variables: int) -> Polynomial:
-    terms = poly.terms()
-    all_terms = generate_terms(degree, num_variables)
-    all_powers = get_powers(all_terms)
-    coeffs_dict = defaultdict(int, {tuple(sympy_power_to_power(power)): coeff for power, coeff in terms})
-    coeffs = [coeffs_dict[tuple(_)] for _ in all_powers]
-    return Polynomial(degree=degree, num_variables=num_variables, coeffs=np.array(coeffs))
-def generate_polynomial(
-    degree: int,
-    num_variables: int,
-    range_coeffs: Range,  # inclusive
-    seed: Optional[int] = None,
-    strict_degree=True,
-    strict_variables=True,
-    strict_constant=True,
-) -> Polynomial:
-    """Sample the coefficients (A, B, ...) of the polynomial equation y = ... + A x + B.
-    A generic method used by the function class-specific methods below.
-    Args:
-        strict_degree (bool): if True, require `rel` to have degree strictly equal to `degree`
-        strict_variables (bool): if True, require `rel` to use exactly `num_variables`
-        strict_constant (bool): if True, require the constant (ie. term of degree 0) to be non-zero
-    Returns:
-        `rel` (Polynomial)
-    """
-    MAX_ATTEMPTS = 100
-    if seed is not None:
-        random.seed(seed)
-        np.random.seed(seed)
-    count = 0
-    terms = generate_terms(degree, num_variables)
-    while count < MAX_ATTEMPTS:
-        done = True
-        coeffs = [random.randint(r[0], r[1]) for r in range_coeffs]
-        if strict_constant and coeffs[-1] == 0:
-            done = False
-        if strict_degree and not sum(coeffs[: comb(degree + num_variables - 1, num_variables - 1)]):
-            done = False
-        if strict_variables:
-            for idx in range(num_variables):
-                vals = np.zeros(num_variables)
-                vals[idx] = 1
-                res = np.dot(coeffs[:-1], np.array(list(map(lambda _: np.prod(vals.__getitem__(_)), terms[:-1]))))
-                if not res:
-                    done = False
-                    break
-        if done:
-            break
-        count += 1
-        if count >= MAX_ATTEMPTS:
-            raise ValueError(
-                "Failed to sample valid polynomial equation within "
-                + f"{MAX_ATTEMPTS} attempts from ranges {str(range_coeffs)}."
-            )
-    return Polynomial(degree=degree, num_variables=num_variables, coeffs=np.array(coeffs))
-def generate_linear(range_coeffs: Range) -> Polynomial:
-    return generate_polynomial(
-        degree=1,
-        num_variables=1,
-        range_coeffs=range_coeffs,
-        strict_degree=True,
-        strict_variables=True,
-        strict_constant=True,
-    )
-def generate_parabola(range_coeffs: Range) -> Polynomial:
-    return generate_polynomial(
-        degree=2,
-        num_variables=1,
-        range_coeffs=range_coeffs,
-        strict_degree=True,
-        strict_variables=True,
-        strict_constant=True,
-    )
-def generate_plane(range_coeffs: Range) -> Polynomial:
-    return generate_polynomial(
-        degree=1,
-        num_variables=2,
-        range_coeffs=range_coeffs,
-        strict_degree=True,
-        strict_variables=True,
-        strict_constant=True,
-    )
-def generate_paraboloid(range_coeffs: Range) -> Polynomial:
-    return generate_polynomial(
-        degree=2,
-        num_variables=2,
-        range_coeffs=range_coeffs,
-        strict_degree=True,
-        strict_variables=True,
-        strict_constant=True,
-    )
-def generate_rotated_translated_paraboloid(range_coeffs: Range) -> Polynomial:
-    """Unused."""
-    do_sample = True
-    while do_sample:
-        coeffs_0 = generate_plane(range_coeffs).coeffs
-        coeffs_1 = generate_plane(range_coeffs).coeffs
-        mat = np.array(
-            [
-                coeffs_0,
-                coeffs_1,
-            ]
-        )
-        if np.linalg.matrix_rank(mat) == 2:
-            do_sample = False
-    x = Symbol("x")
-    y = Symbol("y")
-    xprime = coeffs_0[0] * x + coeffs_0[1] * y + coeffs_0[2]
-    yprime = coeffs_1[0] * x + coeffs_1[1] * y + coeffs_1[2]
-    expr = xprime**2 + yprime**2
-    poly = Poly(expr, [x, y])
-    return sympy_poly_to_poly(poly, 2, 2)
-def distance_linear(point: List[int], rel_str: str):
-    """
-    Returns the minimum distance from the given point to the relation given by `rel_str` which has the form:
-    A x - y + B = 0
-    """
-    relation_type = "linear"
-    degree: int = RELTYPE_INFO[relation_type].degree
-    num_variables: int = RELTYPE_INFO[relation_type].num_variables
-    rel = Polynomial.from_string(rel_str.split(" = ")[-1], degree, num_variables)
-    A = rel.coeffs[0]
-    B = -1
-    C = rel.coeffs[1]
-    x, y = point
-    return float(abs((A * x + B * y + C)) / (math.sqrt(A**2 + B**2)))
-def distance_parabola(point: List[int], rel_str: str, TOL: float = 1e-10):
-    """
-    Returns the minimum distance from the given point to the relation given by `rel_str` which has the form:
-    y = A x^2 + B x + C
-    """
-    rel_str = rel_str.split(" = ")[-1]
-    expr = sympy.parse_expr(rel_str.replace("^", "**"), transformations=SYMPY_TRANSFORMATIONS)
-    poly = sympy.Poly(expr, list(expr.free_symbols))
-    x = list(expr.free_symbols)[0]
-    x0, y0 = point
-    dist = (x - x0) ** 2 + (poly - y0) ** 2
-    deriv = sympy.diff(dist, x)
-    try:
-        sols = sympy.solve(deriv, x)
-    except ZeroDivisionError:
-        # This shouldn't happen, but has happened for a prior implementation of
-        # `distance_paraboloid`, so catch it conservatively:
-        print("Failed to compute minimum distance.")
-        # pdb.set_trace()
-        return float(0.0)
-    dist_vals = list(map(lambda _: sympy.N(dist.eval(_)), sols))
-    try:
-        dist_val = min([sympy.re(_) for _ in dist_vals if abs(sympy.im(_)) < TOL and sympy.re(_) >= 0])
-    except ValueError:
-        # A real solution should exist, but if not (eg. numerical error exceeds TOL):
-        print("Failed to compute minimum distance.")
-        # pdb.set_trace()
-        return float(0.0)
-    return np.sqrt(float(dist_val))
-def distance_plane(point: List[int], rel_str: str):
-    """
-    Returns the minimum distance from the given point to the relation given by `rel_str` which has the form:
-    A x + B y - z + C = 0
-    """
-    relation_type = "plane"
-    degree: int = RELTYPE_INFO[relation_type].degree
-    num_variables: int = RELTYPE_INFO[relation_type].num_variables
-    rel = Polynomial.from_string(rel_str.split(" = ")[-1], degree, num_variables)
-    A = rel.coeffs[0]
-    B = rel.coeffs[1]
-    C = -1
-    D = rel.coeffs[2]
-    x, y, z = point
-    d = abs((A * x + B * y + C * z + D))
-    e = math.sqrt(A**2 + B**2 + C**2)
-    return float(d / e)
-def distance_paraboloid(point: List[int], rel_str: str, TOL: float = 1e-10):
-    """
-    Returns the minimum distance from the given point to the relation given by `rel_str` which has the form:
-    z = A x^2 + B x y + C y^2 + D x + E y + F
-    Uses method of Lagrange multipliers.
-    """
-    rel_str = rel_str.split(" = ")[-1]
-    expr = sympy.parse_expr(rel_str.replace("^", "**"), transformations=SYMPY_TRANSFORMATIONS)
-    x, y = list(expr.free_symbols)
-    if x.name == "y":
-        x, y = y, x
-    z = Symbol("z")
-    x0, y0, z0 = point
-    f = (x - x0) ** 2 + (y - y0) ** 2 + (z - z0) ** 2
-    g = z - expr
-    if abs(g.subs([(x, x0), (y, y0), (z, z0)])) < TOL:
-        return float(0.0)
-    λ = Symbol("λ")
-    # The code below is meant to be equivalent to
-    # `sols = sympy.solve([eq_x, eq_y, eq_z, g], [x, y, z, λ])`
-    # but sympy.solve was failing to find any solution on many inputs
-    # as well as not finding some solutions
-    # so this breaks it down for the special case of `f - λ g` which is at most quadratic.
-    # Set up the equations from method of Lagrange multipliers
-    eq_x = diff(f, x) - λ * diff(g, x)
-    eq_y = diff(f, y) - λ * diff(g, y)
-    eq_z = diff(f, z) - λ * diff(g, z)
-    # Solve for each variable individually
-    has_xy = y in eq_x.free_symbols  # has xy term
-    if has_xy:
-        sols_x = sympy.solve(eq_x, [x, y, λ])
-        sols_y = sympy.solve(eq_y, [x, y, λ])
-        sols_z = sympy.solve(eq_z, [z, λ])
-    else:
-        sols_x = sympy.solve(eq_x, [x, λ])
-        sols_y = sympy.solve(eq_y, [y, λ])
-        sols_z = sympy.solve(eq_z, [z, λ])
-    try:
-        # Put the solutions together
-        # Extract x,y,z resp. from tuples
-        sols_lst_xyz = [[_[0] for _ in lst] for lst in [sols_x, sols_y, sols_z]]
-        # Extract solutions for λ from tuples
-        sols_lst_λλλ = [[_[-1] for _ in lst] for lst in [sols_x, sols_y, sols_z]]
-        # Get list of possible solution tuples and corresponding solutions for λ
-        sols_xyz = list(product(*sols_lst_xyz))
-        vals_λ = list(product(*sols_lst_λλλ))
-        sols = []
-        # Try each possible combined solution for x, y, z, λ
-        for sol_xyz, val_λs in zip(sols_xyz, vals_λ):
-            val_λs = tuple(set(filter(lambda _: not _.is_symbol, val_λs)))  # get distinct values for λ if there are any
-            if len(val_λs) > 1:  # there can be at most one distinct value for λ
-                continue
-            val_λ = val_λs[0] if val_λs else λ
-            sol_x, sol_y, sol_z = sol_xyz
-            if not val_λ.is_symbol:
-                # Substitute in values of λ
-                sol_x = sol_x.subs(λ, val_λ)
-                sol_y = sol_y.subs(λ, val_λ)
-                sol_z = sol_z.subs(λ, val_λ)
-                g_λ = g.subs(λ, val_λ)
-            else:
-                g_λ = g
-            # Substitute in solutions for x, y, z
-            if has_xy:
-                g_λ = g_λ.subs([(x, sol_x), (z, sol_z)])
-                sol_ys = sympy.solve(sol_x - sol_y, y)
-                for sol_y in sol_ys:
-                    g_λy = g_λ.subs(y, sol_y)
-                    sol_xy = sol_x.subs(y, sol_y)
-                    syms = list(g_λy.free_symbols)
-                    if len(syms) > 1:  # underdetermined system
-                        continue
-                    sym = syms[0]
-                    vals = [sympy.N(_) for _ in sympy.solveset(g_λy, sym)]
-                    sols.extend([(sol_xy.subs(sym, _), sol_y.subs(sym, _), sol_z.subs(sym, _)) for _ in vals])
-            else:
-                g_λ = g_λ.subs([(x, sol_x), (y, sol_y), (z, sol_z)])
-                syms = list(g_λ.free_symbols)
-                if len(syms) > 1:  # underdetermined system
-                    continue
-                # Solve for remaining variable
-                sym = syms[0]
-                vals = [sympy.N(_) for _ in sympy.solveset(g_λ, sym)]
-                sols.extend([(sol_x.subs(sym, _), sol_y.subs(sym, _), sol_z.subs(sym, _)) for _ in vals])
-    except ZeroDivisionError:
-        # This shouldn't happen, but has happened for a prior implementation of
-        # `distance_paraboloid`, so catch it conservatively:
-        print("Failed to compute minimum distance.")
-        # pdb.set_trace()
-        return float(0.0)
-    poly_f = sympy.Poly(f, [x, y, z])
-    # Evaluate f on found solutions
-    try:
-        dist_vals = list(map(lambda _: sympy.N(poly_f.eval(_)), sols))
-    except sympy.polys.polyerrors.UnificationFailed:
-        # Forgot to substitute all variables in some expression.
-        # This shouldn't happen, but has happened for a prior implementation of
-        # `distance_paraboloid`, so catch it conservatively:
-        print("sympy error: Unification failed.")
-        # pdb.set_trace()
-        return float(0.0)
-    # Get the minimum nonnegative real value
-    try:
-        dist_val = min([sympy.re(_) for _ in dist_vals if abs(sympy.im(_)) < TOL and sympy.re(_) >= 0])
-    except ValueError:
-        # A real solution should exist, but if not (eg. numerical error exceeds TOL):
-        print("Failed to compute minimum distance.")
-        print([eq_x, eq_y, eq_z, g])
-        print(sols)
-        # pdb.set_trace()
-        return float(0.0)
-    return np.sqrt(float(dist_val))
def select_ranges(
    num_train: int, num_test: int, dim: int, overlap: bool = True, nonnegative_only: bool = False
) -> Tuple[Range, Range]:
    """
    Choose the integer boxes from which train and test points are sampled.

    Each box has one inclusive `(low, high)` interval per dimension. The bound
    is the smallest entry of a fixed ladder (0, 1, 2, 5, 10, 20, 50, 100, 200)
    whose box contains enough lattice points.

    If `overlap` is True, a single box large enough for `num_train + num_test`
    points is returned for both splits. Otherwise the test box is the smallest
    one holding `num_test` points, and the train box is the smallest one that
    still holds `num_train` points once the test box's points are discounted,
    so the test region is enclosed by the train region.

    Args:
        num_train: Number of train points that must fit.
        num_test: Number of test points that must fit.
        dim: Number of input dimensions of each box.
        overlap: Whether train and test share the same box.
        nonnegative_only: Use `(0, bound)` intervals instead of `(-bound, bound)`.

    Returns:
        `(train_range, test_range)`, each a list of `dim` `(low, high)` tuples
        of built-in ints.

    Raises:
        ValueError: If even the largest box cannot hold the requested points.
    """
    choices: npt.NDArray[np.int64] = np.array([0, 1, 2, 5, 10, 20, 50, 100, 200])

    def select_index(lst: npt.NDArray[np.int64], val: int) -> int:
        # First ladder position whose capacity reaches `val`.
        for position, capacity in enumerate(lst):
            if capacity >= val:
                return position
        raise ValueError(
            f"Cannot fit {val} points: the largest available range only holds {int(lst.max())}."
        )

    def construct_range(index: int, dim: int) -> List[Tuple[int, int]]:
        # Cast to a built-in int so callers get the types the annotation promises.
        bound = int(choices[index])
        low = 0 if nonnegative_only else -bound
        return [(low, bound) for _ in range(dim)]

    # Lattice points per box: (bound + 1)^dim on the nonnegative orthant,
    # (2 * bound + 1)^dim for a box centered at the origin.
    points_per_axis = choices + 1 if nonnegative_only else 2 * choices + 1
    num_points = points_per_axis**dim

    if overlap:
        train_index = test_index = select_index(num_points, num_train + num_test)
    else:
        test_index = select_index(num_points, num_test)
        # Points inside the test box are not available for training.
        train_index = select_index(num_points - num_points[test_index], num_train)

    test_range = construct_range(test_index, dim)
    train_range = construct_range(train_index, dim)
    return (train_range, test_range)
@dataclass(frozen=True)
class RelationTypeInfo:
    """Immutable description of one family of polynomial relations."""

    # Identifier of the relation family; also its key in `RELTYPE_INFO`.
    name: str
    # Degree of the polynomial.
    degree: int
    # Number of input variables (the output variable is not counted).
    num_variables: int
    # One (low, high) interval per coefficient, in the same order as
    # `example_coeffs`. NOTE(review): inclusiveness of the bounds is decided
    # by the coefficient generators, which are not visible here -- confirm.
    range: Range
    # Coefficients of the fixed representative polynomial of this family.
    example_coeffs: npt.NDArray[np.int64]
# Registry of the supported relation families, keyed by each record's own name.
RELTYPE_INFO: Dict[str, RelationTypeInfo] = {
    info.name: info
    for info in (
        # Example: 2x + 5
        RelationTypeInfo(
            name="linear",
            degree=1,
            num_variables=1,
            range=[(1, 5), (1, 5)],
            example_coeffs=np.array([2, 5]),
        ),
        # Parabolas with axis of symmetry to the left of the origin.
        # Example: x^2 + 2
        RelationTypeInfo(
            name="parabola",
            degree=2,
            num_variables=1,
            range=[(1, 2), (0, 2), (1, 5)],
            example_coeffs=np.array([1, 0, 2]),
        ),
        # Example: 2x + y + 5
        RelationTypeInfo(
            name="plane",
            degree=1,
            num_variables=2,
            range=[(1, 5), (1, 5), (1, 5)],
            example_coeffs=np.array([2, 1, 5]),
        ),
        # Axis-aligned elliptic paraboloids only, ie. of the form z = A x^2 + B y^2 + C.
        # Example: 2x^2 + y^2 + 2
        RelationTypeInfo(
            name="paraboloid",
            degree=2,
            num_variables=2,
            range=[(1, 2), (0, 1), (1, 2), (0, 0), (0, 0), (1, 5)],
            example_coeffs=np.array([2, 0, 1, 0, 0, 2]),
        ),
    )
}

# MODE_INFO = {  # Testing purposes
#     "example": {"num_function_train": 1, "num_function_test": 1, "num_train": 10, "num_test": 1,},
#     "standard": {"num_function_train": 1, "num_function_test": 1, "num_train": 10, "num_test": 1,},
#     "function": {"num_function_train": 2, "num_function_test": 2, "num_train": 2, "num_test": 1,},
# }

# Dataset sizes per mode: how many functions are sampled for each split and
# how many train / test points are drawn per function.
MODE_INFO = {
    "example": dict(num_function_train=1, num_function_test=1, num_train=100, num_test=100),
    "standard": dict(num_function_train=1, num_function_test=1, num_train=100, num_test=100),
    "function": dict(
        num_function_train=1000,
        num_function_test=1000,  # don't bother excluding from train set
        num_train=100,
        num_test=1,
    ),
}
def get_var(dim: int, variable_names=None):
    """
    Return the name of the `dim`-th variable, counting from 1.

    Args:
        dim: 1-based position of the variable.
        variable_names: Indexable collection of names; defaults to x, y, z.

    Raises:
        IndexError: If `dim` is smaller than 1 or exceeds the number of names.
    """
    if variable_names is None:
        # Immutable default instead of a list shared across calls.
        variable_names = "xyz"
    if dim < 1:
        # Without this guard, negative indexing would silently wrap around to the last name.
        raise IndexError(f"dim must be at least 1, got {dim}")
    return variable_names[dim - 1]
def get_dataset_header(
    dim: int, variable_names: Optional[List[str]] = None, delimiter: str = ", ", output_prefix: str = ", "
):
    """
    Build the header line naming the input variables and the output variable.

    The first `dim - 1` names are the inputs, joined by `delimiter`; the
    `dim`-th name is the output, attached with `output_prefix`.
    For example, `dim=3` with the defaults gives "x, y, z".

    Args:
        dim: Total number of variables (inputs plus the output).
        variable_names: Names to draw from; defaults to ["x", "y", "z"].
        delimiter: Separator placed between input variable names.
        output_prefix: Separator placed before the output variable name.

    Raises:
        IndexError: If `dim` is smaller than 1 or exceeds the number of names.
    """
    if variable_names is None:
        # Fresh list per call instead of a mutable default shared across calls.
        variable_names = list("xyz")
    if dim < 1:
        # Without this guard, negative indexing would silently produce a wrong header.
        raise IndexError(f"dim must be at least 1, got {dim}")
    input_names = variable_names[: dim - 1]
    output_name = variable_names[dim - 1]
    return delimiter.join(input_names) + output_prefix + output_name
def get_numeracy_adapter_spec(
    max_train_instances: int, max_eval_instances: int, dim: int, delimiter: str = ", ", **kwargs
) -> AdapterSpec:
    """
    Build the generation-style `AdapterSpec` used for numeracy prompts.

    The instructions are the dataset header for `dim` variables. Any keyword
    argument in `kwargs` overrides the corresponding default below or adds a
    new `AdapterSpec` field.
    """
    header = get_dataset_header(dim, delimiter=delimiter, output_prefix=", ")
    spec_args = {
        "method": ADAPT_GENERATION,
        "instructions": header,
        "max_train_instances": max_train_instances,
        "max_eval_instances": max_eval_instances,
        "num_outputs": 1,
        "num_train_trials": 1,
        "model_deployment": "openai/davinci",
        "temperature": 0,
        "stop_sequences": ["\n"],
        "max_tokens": 20,
        "input_prefix": "",
        "output_prefix": ", ",
        "instance_prefix": "\n",
    }
    # Caller-supplied settings take precedence over the defaults.
    spec_args.update(kwargs)
    return AdapterSpec(**spec_args)
class NumeracyScenario(Scenario):
    """
    A task that asks the model to induce an unknown polynomial at a point given a set of function evaluations.
    Unlike pre-existing tasks testing arithmetic, this task attempts to test a deeper notion of numeracy
    which the model cannot rely purely on rote memorization of standard tables of arithmetic operations
    in order to succeed on and which intuitively occurs as an implicit subroutine in broader contexts.

    Decomposes into 4 function classes:
    - linear                    (1 degree,  1 variable)
    - parabola                  (2 degrees, 1 variable)
    - plane                     (1 degree,  2 variables)
    - (elliptic) paraboloid     (2 degrees, 2 variables)

        with coefficients drawn from restricted ranges
        (see dict `RELTYPE_INFO`), and
        where {parabola, paraboloid} have nonnegative domains,
        ie. the right ray of the x-axis or upper-right
        quadrant of the plane resp. so that the model cannot
        rely on symmetry.

    and independently 2 + 1 modes:
    - standard
        - A single dataset corresponding to the same polynomial.
        Evaluate on different points.
    - function
        - Multiple datasets, where each dataset instance corresponds to
        an independently sampled polynomial belonging to the same class.
        Evaluate on different (dataset, point) pairs.
        NOTE: dataset construction for this mode is currently disabled
        (see the TODO in `get_instances`), so it yields no instances.
    and
    - example
        - A single dataset corresponding to the same fixed representative for each class.

    If `overlap` is `True`:
        Train and test datapoints are drawn from the same rectilinear region
        centered at the origin (see function `select_ranges`),
        making sure to exclude the training set from the test set.
    Otherwise:
        Train datapoints are drawn from a rectilinear border region while
        test datapoints are drawn from a disjoint rectilinear interior region,
        centered at the origin (see function `select_ranges`).

    Example prompt for `relation_type=parabola,mode=function` with `num_function_train=num_function_test=num_train=2`:
        x,y

        1,4
        -1,2
        0,2

        x,y

        -1,0
        1,20
        0,8

        x,y

        -1,7
        1,11
        0,
    """

    name = "numeracy"
    description = "polynomial induction"
    tags: List[str] = []
    RELTYPES: List[str] = ["linear", "parabola", "plane", "paraboloid"]
    MODES: List[str] = ["example", "standard", "function"]
    delimiter: str = ", "

    def __init__(
        self,
        relation_type: str = "linear",
        mode: str = "function",
        seed: Optional[int] = None,
        overlap: bool = True,  # whether the in-context and eval points are drawn from the same region
        sort_vals: bool = False,  # whether to sort the in-context examples
    ):
        """
        Configure the scenario from the relation family and the sampling mode.

        Args:
            relation_type: One of `RELTYPES`.
            mode: One of `MODES`.
            seed: Seed for `random` and `numpy.random`; must be set before `get_instances` is called.
            overlap: Whether the in-context and eval points are drawn from the same region.
            sort_vals: Whether to sort the in-context examples.
        """
        super().__init__()
        assert relation_type in NumeracyScenario.RELTYPES, f"Unknown relation_type: {relation_type}"
        assert mode in NumeracyScenario.MODES, f"Unknown mode: {mode}"
        # `random_seed` and `seed` hold the same value; both attributes are kept
        # because code outside this class may read either one.
        self.random_seed = seed
        self.relation_type = relation_type
        self.mode = mode
        self.delimiter = NumeracyScenario.delimiter
        self.seed = seed
        self.overlap = overlap
        self.sort_vals = sort_vals
        self.degree: int = RELTYPE_INFO[relation_type].degree
        self.num_variables: int = RELTYPE_INFO[relation_type].num_variables
        self.range_coeffs = RELTYPE_INFO[relation_type].range
        # Inputs plus the single output variable.
        self.dim = self.num_variables + 1
        self.num_function_train = MODE_INFO[mode]["num_function_train"]
        self.num_function_test = MODE_INFO[mode]["num_function_test"]
        self.num_train = MODE_INFO[mode]["num_train"]
        self.num_test = MODE_INFO[mode]["num_test"]

    def get_instances(self, output_path: str) -> List[Instance]:
        """
        Generate the train and test instances for the configured relation type and mode.

        Seeds both `random` and `numpy.random` with `self.random_seed`, so the
        output is reproducible for a given seed. `output_path` is not used.
        """
        assert self.random_seed is not None
        random.seed(self.random_seed)
        np.random.seed(self.random_seed)

        # The ranges are sized for 100 train + 100 test points regardless of mode.
        train_range, test_range = select_ranges(
            num_train=100,
            num_test=100,
            dim=self.num_variables,  # not a typo
            overlap=self.overlap,
            nonnegative_only=self.relation_type in ["parabola", "paraboloid"],
        )
        #               train_range = test_range:
        #               -------------------------
        # linear:       [(-100, 100)]
        # parabola:     [(0, 200)]
        # plane:        [(-10, 10), (-10, 10)]
        # paraboloid:   [(0, 20), (0, 20)]

        # Every lattice point of the test box, in lexicographic order.
        test_vals = list(product(*[range(r[0], r[1] + 1) for r in test_range]))
        if self.overlap:
            train_vals = test_vals
        else:
            # Border region: the train box minus the interior test box.
            train_vals = list(set(product(*[range(r[0], r[1] + 1) for r in train_range])) - set(test_vals))
        if self.sort_vals:
            train_vals = sorted(train_vals)
        if self.num_variables == 2:
            # Keep only points whose first coordinate does not exceed the second.
            # NOTE(review): presumably to avoid mirrored (x, y) / (y, x) pairs -- confirm.
            test_vals = [point for point in test_vals if point[0] <= point[1]]
            train_vals = [point for point in train_vals if point[0] <= point[1]]

        def generate_datapoint(rel: Polynomial, vals: List[int]) -> Tuple[List[str], str]:
            """Evaluate `rel` at `vals`; return the stringified inputs and output."""
            y = rel.eval(vals)
            return list(map(str, vals)), str(y)

        def generate_datapoint_instances_for_split(rel, idxs, eval_vals, split):
            """Build one instance per index in `idxs`, evaluating `rel` at `eval_vals[idx]`."""
            # The symbolic solution is the same for every point of a relation.
            solution = f"{get_var(self.dim)} = {rel}"
            instances = []
            for idx in idxs:
                str_vals, y = generate_datapoint(rel, eval_vals[idx])
                input_text = self.delimiter.join(str_vals)
                references = [
                    Reference(Output(text=y), tags=[CORRECT_TAG]),
                    Reference(Output(text=solution), tags=[SOLUTION_TAG]),
                    Reference(Output(text=self.relation_type), tags=[CLASS_TAG]),
                ]
                instances.append(Instance(Input(text=input_text), references=references, split=split))
            return instances

        def generate_datapoint_instances(rel: Polynomial):
            """Sample train and test points for `rel` and turn them into instances."""
            train_idxs = list(np.random.choice(range(len(train_vals)), self.num_train, replace=False))
            if self.sort_vals:
                train_idxs = sorted(train_idxs)
            if self.overlap:
                # Train and test share one point list, so exclude the train picks.
                all_test_idxs = list(set(range(len(test_vals))) - set(train_idxs))
            else:
                all_test_idxs = list(range(len(test_vals)))
            test_idxs = np.random.choice(all_test_idxs, self.num_test, replace=False)

            train_instances = generate_datapoint_instances_for_split(rel, train_idxs, train_vals, TRAIN_SPLIT)
            test_instances = generate_datapoint_instances_for_split(rel, test_idxs, test_vals, TEST_SPLIT)
            return train_instances + test_instances

        def generate_dataset():
            """Sample a fresh polynomial of the configured type and build its instances.

            Currently only referenced by the disabled implementation in `generate_datasets`.
            """
            generate_func = globals()[f"generate_{self.relation_type}"]
            rel = generate_func(self.range_coeffs)
            return generate_datapoint_instances(rel)

        def generate_datasets(num_instances: int, split: str):
            # TODO: construct_prompt is no longer part of adapter, and this function needs to be rewritten
            #       https://github.com/stanford-crfm/benchmarking/issues/569
            return []
            # spec = get_numeracy_adapter_spec(self.num_train, self.num_test, self.dim, self.delimiter)
            # service = get_test_tokenizer_service()
            # adapter = Adapter(spec, service)
            # outer_spec = get_numeracy_adapter_spec(
            #    self.num_train,
            #    self.num_test,
            #    self.dim,
            #    instructions="",
            #    instance_prefix="\n\n",
            #    delimiter=self.delimiter,
            # )
            # outer_adapter = Adapter(outer_spec, service)
            # instances = []
            # for idx in range(num_instances):
            #    datapoint_instances = generate_dataset()
            #    train_instances = datapoint_instances[: self.num_train]
            #    eval_instances = datapoint_instances[self.num_train :]
            #    dataset_instances = []
            #    for idx in range(self.num_test):
            #        eval_instance = eval_instances[idx]
            #        input = adapter.construct_prompt(
            #            train_instances, eval_instance, include_output=False, reference_index=None
            #        ).text
            #        input = input[: -len(spec.output_prefix.rstrip())]  # strip output_prefix
            #        references = eval_instance.references
            #        dataset_instance = Instance(input=input, references=references, split=split)  # split doesn't matter
            #        dataset_instances.append(dataset_instance)
            #    input = outer_adapter.construct_prompt(
            #        dataset_instances[:-1], dataset_instances[-1], include_output=False, reference_index=None
            #    ).text
            #    input = input[: -len(spec.output_prefix.rstrip())]  # strip output_prefix
            #    references = dataset_instances[-1].references
            #    instance = Instance(input=input, references=references, split=split)
            #    instances.append(instance)
            # return instances

        def generate_instances() -> List[Instance]:
            """Dispatch on `self.mode` to build the final instance list."""
            if self.mode == "example":
                # Fixed representative polynomial for this relation type.
                coeffs = RELTYPE_INFO[self.relation_type].example_coeffs
                rel = Polynomial(self.degree, self.num_variables, coeffs)
                return generate_datapoint_instances(rel)
            if self.mode == "standard":
                # Only this mode needs the module-level `generate_<relation_type>` sampler.
                generate_func = globals()[f"generate_{self.relation_type}"]
                rel = generate_func(self.range_coeffs)
                return generate_datapoint_instances(rel)
            if self.mode == "function":
                return generate_datasets(self.num_function_train, TRAIN_SPLIT) + generate_datasets(
                    self.num_function_test, TEST_SPLIT
                )
            # Unreachable for modes validated in __init__; fail loudly instead of returning None.
            raise ValueError(f"Unknown mode: {self.mode}")

        return generate_instances()

crfm-helm 0.5.5__py3-none-any.whl → 0.5.7__py3-none-any.whl

Potentially problematic release.

crfm-helm 0.5.5__py3-none-any.whl → 0.5.7__py3-none-any.whl