wisent-0.7.379-py3-none-any.whl → wisent-0.7.901-py3-none-any.whl
This diff summarizes the changes between two publicly released versions of the package, as they appear in their public registry, and is provided for informational purposes only.
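A file-level comparison like the listing below can be reproduced locally with nothing but the standard library, since a wheel is a zip archive. The sketch below is not part of the wisent package; it assumes both wheels have already been downloaded (for example with `pip download wisent==0.7.379 --no-deps`) to the hypothetical local paths shown.

```python
"""Minimal sketch: compare the member lists of two locally downloaded wheels."""
from zipfile import ZipFile

OLD = "wisent-0.7.379-py3-none-any.whl"  # assumed local path to the old wheel
NEW = "wisent-0.7.901-py3-none-any.whl"  # assumed local path to the new wheel

# A wheel is a zip archive, so namelist() gives its full file manifest.
old_files = set(ZipFile(OLD).namelist())
new_files = set(ZipFile(NEW).namelist())

print("Added files:")
for name in sorted(new_files - old_files):
    print(f"  + {name}")

print("Removed files:")
for name in sorted(old_files - new_files):
    print(f"  - {name}")

print("Present in both (possibly modified):")
for name in sorted(old_files & new_files):
    print(f"    {name}")
```

This only shows which files were added, removed, or retained; the per-file `+added -removed` line counts in the listing below come from diffing the contents of the common files as well.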
- wisent/__init__.py +1 -1
- wisent/core/activations/__init__.py +22 -6
- wisent/core/activations/activation_cache.py +393 -0
- wisent/core/activations/activations.py +22 -40
- wisent/core/activations/activations_collector.py +145 -373
- wisent/core/activations/classifier_inference_strategy.py +195 -0
- wisent/core/activations/core/atoms.py +8 -92
- wisent/core/activations/extraction_strategy.py +480 -0
- wisent/core/agent/diagnose/response_diagnostics.py +3 -3
- wisent/core/agent/diagnose.py +3 -3
- wisent/core/autonomous_agent.py +2 -2
- wisent/core/classifiers/classifiers/core/atoms.py +3 -2
- wisent/core/cli/__init__.py +2 -1
- wisent/core/cli/agent/apply_steering.py +25 -31
- wisent/core/cli/agent/evaluate_response.py +18 -20
- wisent/core/cli/agent/train_classifier.py +36 -26
- wisent/core/cli/check_linearity.py +35 -3
- wisent/core/cli/cluster_benchmarks.py +470 -0
- wisent/core/cli/create_steering_vector.py +19 -9
- wisent/core/cli/diagnose_vectors.py +7 -4
- wisent/core/cli/estimate_unified_goodness_time.py +6 -4
- wisent/core/cli/generate_pairs_from_task.py +9 -56
- wisent/core/cli/generate_vector_from_task.py +4 -0
- wisent/core/cli/geometry_search.py +137 -0
- wisent/core/cli/get_activations.py +13 -37
- wisent/core/cli/method_optimizer.py +860 -0
- wisent/core/cli/modify_weights.py +3 -2
- wisent/core/cli/optimize.py +44 -5
- wisent/core/cli/optimize_classification.py +5 -6
- wisent/core/cli/optimize_sample_size.py +9 -23
- wisent/core/cli/optimize_steering.py +433 -159
- wisent/core/cli/optimize_weights.py +67 -7
- wisent/core/cli/preview_pairs.py +203 -0
- wisent/core/cli/steering_method_trainer.py +8 -7
- wisent/core/cli/steering_search_space.py +20 -15
- wisent/core/cli/tasks.py +31 -117
- wisent/core/cli/train_unified_goodness.py +18 -19
- wisent/core/contrastive_pairs/diagnostics/control_vectors.py +1582 -177
- wisent/core/contrastive_pairs/diagnostics/linearity.py +70 -80
- wisent/core/contrastive_pairs/diagnostics/vector_quality.py +6 -5
- wisent/core/contrastive_pairs/huggingface_pairs/hf_extractor_manifest.py +5 -19
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/__init__.py +11 -5
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/agentic_search.py +37 -347
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/aider_polyglot.py +113 -136
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/apps.py +146 -32
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codeforces.py +2 -12
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue.py +2 -2
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/coding_benchmarks.py +124 -504
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/faithbench.py +40 -63
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/flames.py +46 -89
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/flores.py +15 -4
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/frames.py +36 -20
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/hallucinations_leaderboard.py +3 -45
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/humaneval.py +98 -57
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/livemathbench.py +42 -4
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/longform_writing.py +2 -112
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/math500.py +39 -4
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/medium_priority_benchmarks.py +475 -525
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/mercury.py +65 -42
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/olympiadbench.py +2 -12
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/planbench.py +78 -219
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/polymath.py +37 -4
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/recode.py +84 -69
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/refusalbench.py +168 -160
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/simpleqa.py +44 -25
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/tau_bench.py +3 -103
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/toolbench.py +3 -97
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/toolemu.py +48 -182
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/code_x_glue.py +8 -8
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/freebase.py +1 -1
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +11 -5
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_registry.py +19 -1
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aclue.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/acp_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/acp_bench_hard.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/advanced.py +2 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aexams.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrimmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrixnli.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/agieval_aqua_rat.py +129 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabculture.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_exams.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_leaderboard_complete.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_leaderboard_light.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabicmmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aradice.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_challenge.py +1 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_easy.py +1 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arithmetic.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/asdiv.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/babi.py +36 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/basque_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bbq.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/belebele.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/benchmarks.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bertaqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bhs.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bhtc.py +3 -5
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/blimp.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/blimp_nl.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/boolq.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/c4.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cabbq.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/careqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalan_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalanqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catcola.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cb.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ceval.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ceval_valid.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/chain.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/chartqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/claim.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/click.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cmmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cnn.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cocoteros.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/code_x_glue.py +11 -6
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coedit.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/commonsense.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/commonsense_qa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copal_id.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coqa.py +3 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/csatqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cycle.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darija_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darijahellaswag.py +2 -6
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darijammlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/dbpedia.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/discrim_eval.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/doc.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/drop.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/epec.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench_ca.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench_es.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/esbbq.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ethics.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_exams.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_proficiency.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_reading.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_trivia.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/evalita_llm.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/financial.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/flan.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/french_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/galician_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gaokao.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/glianorex.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/global_mmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/global_piqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gpt3.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/groundcocoa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gsm8k.py +1 -1
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/haerae.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/headqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hellaswag.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hendrycks_ethics.py +5 -9
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hendrycks_math.py +63 -16
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/histoires_morales.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hrm8k.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/humaneval_infilling.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/icelandic_winogrande.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/inverse.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/inverse_scaling.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ja.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/japanese_leaderboard.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/japanese_leaderboard_mc.py +1 -1
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kmmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kobest.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kormedmcqa.py +5 -17
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada_cloze.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada_multilingual.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/law.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/leaderboard.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lingoly.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/llama3.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lm_syneval.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa2.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/longbench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/longbenchv2.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mastermind.py +2 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mbpp.py +47 -6
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mc-taco.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/med_concepts_qa.py +2 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/meddialog.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medical.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medmcqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mela.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/metabench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/minerva_math.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlusr.py +3 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mrpc.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multiblimp.py +2 -5
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multirc.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mutual.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/non.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_exact.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_gen_exact.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_mc.py +4 -8
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_mc_log_likelihoods.py +4 -8
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/nq_open.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_arc_multilingual.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_hellaswag_multilingual.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_mmlu_multilingual.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_truthfulqa_multilingual.py +2 -5
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/olaph.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/openbookqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/option.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/parafraseja.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/parafrases.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/paws.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/paws_x.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pawsx.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/persona.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/phrases.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pile.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/piqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/portuguese_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prompt.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prost.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pubmedqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qa4mre.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper_bool.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnli.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnlieu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qqp.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/race.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/random.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/record.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/reversed.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/rte.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ruler.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sciq.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/score.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/scrolls.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/scrolls_mc.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/self.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sglue.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sglue_rte.py +2 -1
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/siqa.py +4 -7
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/social_iqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/spanish_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/storycloze.py +2 -6
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/summarization.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/super.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/super_glue.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swag.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swde.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sycophancy.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/t0.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/teca.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinyarc.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinybenchmarks.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinygsm8k.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinyhellaswag.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinymmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinytruthfulqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinywinogrande.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tmmluplus.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/triviaqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc1.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc2.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turblimp_core.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turkishmmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turkishmmlu_mc.py +0 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/unscramble.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/vaxx.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/webqs.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wic.py +3 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogrande.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wmdp.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wnli.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc273.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xcopa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xlsum.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xnli.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xquad.py +2 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xstorycloze.py +2 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xwinograd.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/zhoblimp.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_pairs_generation.py +173 -6
- wisent/core/data_loaders/loaders/lm_loader.py +12 -1
- wisent/core/evaluators/benchmark_specific/apps_evaluator.py +133 -0
- wisent/core/evaluators/benchmark_specific/coding/metrics/evaluator.py +6 -1
- wisent/core/evaluators/benchmark_specific/conala_evaluator.py +31 -168
- wisent/core/evaluators/custom/examples/humanization_coherent.py +89 -35
- wisent/core/evaluators/oracles/truthfulqa_gen_evaluator.py +2 -20
- wisent/core/evaluators/personalization/coherence.py +46 -0
- wisent/core/geometry_runner.py +995 -0
- wisent/core/geometry_search_space.py +237 -0
- wisent/core/hyperparameter_optimizer.py +14 -14
- wisent/core/lm_eval_harness_ground_truth.py +7 -11
- wisent/core/main.py +6 -0
- wisent/core/models/core/atoms.py +5 -3
- wisent/core/models/wisent_model.py +9 -8
- wisent/core/opti/methods/opti_weights.py +29 -2
- wisent/core/optuna/classifier/activation_generator.py +14 -12
- wisent/core/optuna/classifier/optuna_classifier_optimizer.py +2 -2
- wisent/core/optuna/steering/steering_optimization.py +14 -9
- wisent/core/parser_arguments/check_linearity_parser.py +12 -2
- wisent/core/parser_arguments/cluster_benchmarks_parser.py +31 -0
- wisent/core/parser_arguments/generate_vector_from_synthetic_parser.py +2 -2
- wisent/core/parser_arguments/generate_vector_from_task_parser.py +22 -2
- wisent/core/parser_arguments/geometry_search_parser.py +61 -0
- wisent/core/parser_arguments/main_parser.py +16 -0
- wisent/core/parser_arguments/optimize_steering_parser.py +117 -10
- wisent/core/parser_arguments/optimize_weights_parser.py +6 -0
- wisent/core/parser_arguments/tasks_parser.py +7 -19
- wisent/core/parser_arguments/train_unified_goodness_parser.py +2 -2
- wisent/core/steering.py +5 -3
- wisent/core/steering_methods/core/atoms.py +1 -2
- wisent/core/steering_methods/methods/caa.py +1 -1
- wisent/core/steering_methods/methods/hyperplane.py +75 -0
- wisent/core/steering_methods/methods/prism.py +1 -2
- wisent/core/steering_methods/methods/pulse.py +39 -8
- wisent/core/steering_methods/methods/titan.py +59 -14
- wisent/core/steering_methods/registry.py +52 -12
- wisent/core/steering_optimizer.py +15 -15
- wisent/core/synthetic/generators/nonsense_generator.py +30 -18
- wisent/core/trainers/steering_trainer.py +11 -20
- wisent/core/utils/device.py +27 -27
- wisent/core/utils/layer_combinations.py +70 -0
- wisent/examples/__init__.py +1 -0
- wisent/examples/scripts/__init__.py +1 -0
- wisent/examples/scripts/count_all_benchmarks.py +121 -0
- wisent/examples/scripts/discover_directions.py +469 -0
- wisent/examples/scripts/extract_benchmark_info.py +71 -0
- wisent/examples/scripts/generate_paper_data.py +384 -0
- wisent/examples/scripts/intervention_validation.py +626 -0
- wisent/examples/scripts/results/test_AraDiCE_ArabicMMLU_lev_evaluation.json +324 -0
- wisent/examples/scripts/results/test_AraDiCE_ArabicMMLU_lev_pairs.json +92 -0
- wisent/examples/scripts/results/test_aexams_IslamicStudies_evaluation.json +324 -0
- wisent/examples/scripts/results/test_aexams_IslamicStudies_pairs.json +92 -0
- wisent/examples/scripts/results/test_afrimgsm_pairs.json +92 -0
- wisent/examples/scripts/results/test_afrimmlu_evaluation.json +324 -0
- wisent/examples/scripts/results/test_afrimmlu_pairs.json +92 -0
- wisent/examples/scripts/search_all_short_names.py +31 -0
- wisent/examples/scripts/test_all_benchmarks.py +138 -0
- wisent/examples/scripts/test_all_benchmarks_new.py +28 -0
- wisent/examples/scripts/test_contrastive_pairs_all_supported.py +230 -0
- wisent/examples/scripts/test_nonsense_baseline.py +261 -0
- wisent/examples/scripts/test_one_benchmark.py +324 -0
- wisent/examples/scripts/test_one_coding_benchmark.py +293 -0
- wisent/examples/scripts/threshold_analysis.py +434 -0
- wisent/examples/scripts/visualization_gallery.py +582 -0
- wisent/parameters/lm_eval/broken_in_lm_eval.json +179 -2
- wisent/parameters/lm_eval/category_directions.json +137 -0
- wisent/parameters/lm_eval/repair_plan.json +282 -0
- wisent/parameters/lm_eval/track_progress_not_lm_eval_tasks.json +19 -70
- wisent/parameters/lm_eval/weak_contrastive_pairs.json +38 -0
- wisent/parameters/lm_eval/working_benchmarks.json +206 -0
- wisent/parameters/lm_eval/working_benchmarks_categorized.json +236 -0
- wisent/scripts/run_quality_metrics_sweep.sh +22 -27
- wisent/tests/test_aggregation_geometry.py +236 -0
- wisent/tests/test_detector_accuracy.py +163 -0
- wisent/tests/test_geometry_exhaustive.py +1202 -0
- wisent/tests/visualize_geometry.py +255 -61
- {wisent-0.7.379.dist-info → wisent-0.7.901.dist-info}/METADATA +1 -1
- {wisent-0.7.379.dist-info → wisent-0.7.901.dist-info}/RECORD +376 -974
- wisent/core/activations/prompt_construction_strategy.py +0 -47
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/browsecomp.py +0 -245
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text.py +0 -15
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_go.py +0 -64
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_java.py +0 -65
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_javascript.py +0 -65
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_php.py +0 -65
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_python.py +0 -65
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_ruby.py +0 -65
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/freebase.py +0 -99
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/instruct_humaneval.py +0 -180
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/instructhumaneval.py +0 -129
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/mbpp.py +0 -142
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/agieval.py +0 -155
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/code2text.py +0 -161
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/codexglue.py +0 -107
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/livemathbench.py +0 -155
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/polymath.py +0 -155
- wisent/examples/scripts/results/benchmark_descriptions.json +0 -1244
- wisent/examples/scripts/results/benchmark_evaluation_methods.json +0 -66
- wisent/examples/scripts/results/benchmark_evaluator_mapping.json +0 -2781
- wisent/examples/scripts/results/benchmark_evaluator_mapping_updated.json +0 -30536
- wisent/examples/scripts/results/benchmark_evaluators_clean.json +0 -469
- wisent/examples/scripts/results/benchmark_methods_summary.json +0 -260
- wisent/examples/scripts/results/benchmark_pair_creation_methods.json +0 -66
- wisent/examples/scripts/results/benchmark_pair_totals.json +0 -269
- wisent/examples/scripts/results/benchmark_tags.json +0 -917
- wisent/examples/scripts/results/benchmark_test_summary_nov4.json +0 -71
- wisent/examples/scripts/results/coding_benchmarks_test_code_status.json +0 -150
- wisent/examples/scripts/results/failing_benchmarks.json +0 -946
- wisent/examples/scripts/results/failing_benchmarks_list.json +0 -41
- wisent/examples/scripts/results/failing_benchmarks_test_results.json +0 -945
- wisent/examples/scripts/results/missing_benchmark_tags.json +0 -341
- wisent/examples/scripts/results/test_20_newsgroups_evaluation.json +0 -30
- wisent/examples/scripts/results/test_20_newsgroups_pairs.json +0 -8
- wisent/examples/scripts/results/test_AraDICE_evaluation.json +0 -51
- wisent/examples/scripts/results/test_AraDICE_pairs.json +0 -14
- wisent/examples/scripts/results/test_AraDiCE_boolq_egy/test_AraDiCE_boolq_egy_evaluation.json +0 -30
- wisent/examples/scripts/results/test_AraDiCE_boolq_egy/test_AraDiCE_boolq_egy_pairs.json +0 -8
- wisent/examples/scripts/results/test_ArabCulture_evaluation.json +0 -51
- wisent/examples/scripts/results/test_ArabCulture_pairs.json +0 -14
- wisent/examples/scripts/results/test_Tag_evaluation.json +0 -30
- wisent/examples/scripts/results/test_Tag_pairs.json +0 -8
- wisent/examples/scripts/results/test_aclue_evaluation.json +0 -51
- wisent/examples/scripts/results/test_aclue_pairs.json +0 -14
- wisent/examples/scripts/results/test_acp_bench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_acp_bench_hard_evaluation.json +0 -51
- wisent/examples/scripts/results/test_acp_bench_hard_pairs.json +0 -14
- wisent/examples/scripts/results/test_acp_bench_pairs.json +0 -14
- wisent/examples/scripts/results/test_advanced_ai_risk_evaluation.json +0 -51
- wisent/examples/scripts/results/test_advanced_ai_risk_pairs.json +0 -14
- wisent/examples/scripts/results/test_aexams_evaluation.json +0 -51
- wisent/examples/scripts/results/test_aexams_pairs.json +0 -14
- wisent/examples/scripts/results/test_afrimgsm_direct_amh_evaluation.json +0 -30
- wisent/examples/scripts/results/test_afrimgsm_direct_amh_pairs.json +0 -8
- wisent/examples/scripts/results/test_afrimmlu_direct_amh_evaluation.json +0 -30
- wisent/examples/scripts/results/test_afrimmlu_direct_amh_pairs.json +0 -8
- wisent/examples/scripts/results/test_afrixnli_en_direct_amh_evaluation.json +0 -30
- wisent/examples/scripts/results/test_afrixnli_en_direct_amh_pairs.json +0 -8
- wisent/examples/scripts/results/test_ag_news_evaluation.json +0 -30
- wisent/examples/scripts/results/test_ag_news_pairs.json +0 -8
- wisent/examples/scripts/results/test_agieval_evaluation.json +0 -51
- wisent/examples/scripts/results/test_agieval_pairs.json +0 -14
- wisent/examples/scripts/results/test_aime2024_evaluation.json +0 -30
- wisent/examples/scripts/results/test_aime2024_pairs.json +0 -8
- wisent/examples/scripts/results/test_aime2025_evaluation.json +0 -30
- wisent/examples/scripts/results/test_aime2025_pairs.json +0 -8
- wisent/examples/scripts/results/test_aime_evaluation.json +0 -30
- wisent/examples/scripts/results/test_aime_pairs.json +0 -8
- wisent/examples/scripts/results/test_anagrams1_evaluation.json +0 -30
- wisent/examples/scripts/results/test_anagrams1_pairs.json +0 -8
- wisent/examples/scripts/results/test_anagrams2_evaluation.json +0 -30
- wisent/examples/scripts/results/test_anagrams2_pairs.json +0 -8
- wisent/examples/scripts/results/test_anli_evaluation.json +0 -30
- wisent/examples/scripts/results/test_anli_pairs.json +0 -8
- wisent/examples/scripts/results/test_apps_evaluation.json +0 -30
- wisent/examples/scripts/results/test_apps_pairs.json +0 -8
- wisent/examples/scripts/results/test_arabic_exams_evaluation.json +0 -30
- wisent/examples/scripts/results/test_arabic_exams_pairs.json +0 -8
- wisent/examples/scripts/results/test_arabic_leaderboard_complete_evaluation.json +0 -51
- wisent/examples/scripts/results/test_arabic_leaderboard_complete_pairs.json +0 -14
- wisent/examples/scripts/results/test_arabic_leaderboard_light_evaluation.json +0 -51
- wisent/examples/scripts/results/test_arabic_leaderboard_light_pairs.json +0 -14
- wisent/examples/scripts/results/test_arabicmmlu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_arabicmmlu_pairs.json +0 -14
- wisent/examples/scripts/results/test_aradice/test_aradice_evaluation.json +0 -51
- wisent/examples/scripts/results/test_aradice/test_aradice_pairs.json +0 -14
- wisent/examples/scripts/results/test_aradice3/test_aradice_evaluation.json +0 -51
- wisent/examples/scripts/results/test_aradice3/test_aradice_pairs.json +0 -14
- wisent/examples/scripts/results/test_arc_ar_evaluation.json +0 -30
- wisent/examples/scripts/results/test_arc_ar_pairs.json +0 -8
- wisent/examples/scripts/results/test_arc_challenge_evaluation.json +0 -30
- wisent/examples/scripts/results/test_arc_challenge_pairs.json +0 -8
- wisent/examples/scripts/results/test_arc_easy_evaluation.json +0 -30
- wisent/examples/scripts/results/test_arc_easy_pairs.json +0 -8
- wisent/examples/scripts/results/test_argument_topic_evaluation.json +0 -30
- wisent/examples/scripts/results/test_argument_topic_pairs.json +0 -8
- wisent/examples/scripts/results/test_arithmetic_evaluation.json +0 -51
- wisent/examples/scripts/results/test_arithmetic_pairs.json +0 -14
- wisent/examples/scripts/results/test_asdiv_evaluation.json +0 -30
- wisent/examples/scripts/results/test_asdiv_pairs.json +0 -8
- wisent/examples/scripts/results/test_assin_entailment_evaluation.json +0 -30
- wisent/examples/scripts/results/test_assin_entailment_pairs.json +0 -8
- wisent/examples/scripts/results/test_atis_evaluation.json +0 -30
- wisent/examples/scripts/results/test_atis_pairs.json +0 -8
- wisent/examples/scripts/results/test_babi_evaluation.json +0 -30
- wisent/examples/scripts/results/test_babi_pairs.json +0 -8
- wisent/examples/scripts/results/test_babilong_evaluation.json +0 -30
- wisent/examples/scripts/results/test_babilong_pairs.json +0 -8
- wisent/examples/scripts/results/test_bangla_mmlu_evaluation.json +0 -30
- wisent/examples/scripts/results/test_bangla_mmlu_pairs.json +0 -8
- wisent/examples/scripts/results/test_banking77_evaluation.json +0 -30
- wisent/examples/scripts/results/test_banking77_pairs.json +0 -8
- wisent/examples/scripts/results/test_basque/test_basque-glue_pairs.json +0 -14
- wisent/examples/scripts/results/test_basque-glue_evaluation.json +0 -51
- wisent/examples/scripts/results/test_basque-glue_pairs.json +0 -14
- wisent/examples/scripts/results/test_basque2/test_basque-glue_evaluation.json +0 -51
- wisent/examples/scripts/results/test_basque2/test_basque-glue_pairs.json +0 -14
- wisent/examples/scripts/results/test_basque_bench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_basque_bench_pairs.json +0 -14
- wisent/examples/scripts/results/test_basque_glue/test_basque-glue_evaluation.json +0 -51
- wisent/examples/scripts/results/test_basque_glue/test_basque-glue_pairs.json +0 -14
- wisent/examples/scripts/results/test_basqueglue_evaluation.json +0 -51
- wisent/examples/scripts/results/test_basqueglue_pairs.json +0 -14
- wisent/examples/scripts/results/test_bbh_evaluation.json +0 -51
- wisent/examples/scripts/results/test_bbh_pairs.json +0 -14
- wisent/examples/scripts/results/test_bbq_evaluation.json +0 -30
- wisent/examples/scripts/results/test_bbq_pairs.json +0 -8
- wisent/examples/scripts/results/test_bec2016eu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_bec2016eu_pairs.json +0 -14
- wisent/examples/scripts/results/test_belebele_evaluation.json +0 -51
- wisent/examples/scripts/results/test_belebele_pairs.json +0 -14
- wisent/examples/scripts/results/test_benchmarks_evaluation.json +0 -51
- wisent/examples/scripts/results/test_benchmarks_pairs.json +0 -14
- wisent/examples/scripts/results/test_bertaqa_evaluation.json +0 -51
- wisent/examples/scripts/results/test_bertaqa_pairs.json +0 -14
- wisent/examples/scripts/results/test_bhtc_v2_evaluation.json +0 -30
- wisent/examples/scripts/results/test_bhtc_v2_pairs.json +0 -8
- wisent/examples/scripts/results/test_bigbench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_bigbench_pairs.json +0 -14
- wisent/examples/scripts/results/test_blimp_evaluation.json +0 -51
- wisent/examples/scripts/results/test_blimp_pairs.json +0 -14
- wisent/examples/scripts/results/test_boolq/test_boolq_evaluation.json +0 -30
- wisent/examples/scripts/results/test_boolq/test_boolq_pairs.json +0 -8
- wisent/examples/scripts/results/test_boolq-seq2seq_evaluation.json +0 -30
- wisent/examples/scripts/results/test_boolq-seq2seq_pairs.json +0 -8
- wisent/examples/scripts/results/test_boolq_evaluation.json +0 -30
- wisent/examples/scripts/results/test_boolq_pairs.json +0 -8
- wisent/examples/scripts/results/test_c4_evaluation.json +0 -30
- wisent/examples/scripts/results/test_c4_pairs.json +0 -8
- wisent/examples/scripts/results/test_cabreu_evaluation.json +0 -30
- wisent/examples/scripts/results/test_cabreu_pairs.json +0 -8
- wisent/examples/scripts/results/test_careqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_careqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_catalan_bench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_catalan_bench_pairs.json +0 -14
- wisent/examples/scripts/results/test_catalanqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_catalanqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_catcola_evaluation.json +0 -30
- wisent/examples/scripts/results/test_catcola_pairs.json +0 -8
- wisent/examples/scripts/results/test_cb_evaluation.json +0 -30
- wisent/examples/scripts/results/test_cb_pairs.json +0 -8
- wisent/examples/scripts/results/test_ceval/test_ceval_evaluation.json +0 -51
- wisent/examples/scripts/results/test_ceval/test_ceval_pairs.json +0 -14
- wisent/examples/scripts/results/test_ceval_accountant/test_ceval-valid_accountant_evaluation.json +0 -30
- wisent/examples/scripts/results/test_ceval_accountant/test_ceval-valid_accountant_pairs.json +0 -8
- wisent/examples/scripts/results/test_ceval_evaluation.json +0 -51
- wisent/examples/scripts/results/test_ceval_pairs.json +0 -14
- wisent/examples/scripts/results/test_ceval_valid/test_ceval_valid_evaluation.json +0 -51
- wisent/examples/scripts/results/test_ceval_valid/test_ceval_valid_pairs.json +0 -14
- wisent/examples/scripts/results/test_chain_of_thought_evaluation.json +0 -51
- wisent/examples/scripts/results/test_chain_of_thought_pairs.json +0 -14
- wisent/examples/scripts/results/test_chartqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_chartqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_claim_stance_topic_evaluation.json +0 -30
- wisent/examples/scripts/results/test_claim_stance_topic_pairs.json +0 -8
- wisent/examples/scripts/results/test_cmmlu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_cmmlu_pairs.json +0 -14
- wisent/examples/scripts/results/test_cnn_dailymail_evaluation.json +0 -30
- wisent/examples/scripts/results/test_cnn_dailymail_pairs.json +0 -8
- wisent/examples/scripts/results/test_cocoteros_es_evaluation.json +0 -30
- wisent/examples/scripts/results/test_cocoteros_es_pairs.json +0 -8
- wisent/examples/scripts/results/test_codexglue_code_to_text_go_evaluation.json +0 -30
- wisent/examples/scripts/results/test_codexglue_code_to_text_go_pairs.json +0 -8
- wisent/examples/scripts/results/test_codexglue_code_to_text_java_evaluation.json +0 -30
- wisent/examples/scripts/results/test_codexglue_code_to_text_java_pairs.json +0 -8
- wisent/examples/scripts/results/test_codexglue_code_to_text_javascript_evaluation.json +0 -30
- wisent/examples/scripts/results/test_codexglue_code_to_text_javascript_pairs.json +0 -8
- wisent/examples/scripts/results/test_codexglue_code_to_text_php_evaluation.json +0 -30
- wisent/examples/scripts/results/test_codexglue_code_to_text_php_pairs.json +0 -8
- wisent/examples/scripts/results/test_codexglue_code_to_text_python_evaluation.json +0 -30
- wisent/examples/scripts/results/test_codexglue_code_to_text_python_pairs.json +0 -8
- wisent/examples/scripts/results/test_codexglue_code_to_text_ruby_evaluation.json +0 -30
- wisent/examples/scripts/results/test_codexglue_code_to_text_ruby_pairs.json +0 -8
- wisent/examples/scripts/results/test_coedit_gec_evaluation.json +0 -30
- wisent/examples/scripts/results/test_coedit_gec_pairs.json +0 -8
- wisent/examples/scripts/results/test_cola_evaluation.json +0 -30
- wisent/examples/scripts/results/test_cola_pairs.json +0 -8
- wisent/examples/scripts/results/test_commonsense_qa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_commonsense_qa_pairs.json +0 -8
- wisent/examples/scripts/results/test_conala_evaluation.json +0 -30
- wisent/examples/scripts/results/test_conala_pairs.json +0 -8
- wisent/examples/scripts/results/test_concode_evaluation.json +0 -30
- wisent/examples/scripts/results/test_concode_pairs.json +0 -8
- wisent/examples/scripts/results/test_copa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_copa_pairs.json +0 -8
- wisent/examples/scripts/results/test_copal_id_evaluation.json +0 -30
- wisent/examples/scripts/results/test_copal_id_pairs.json +0 -8
- wisent/examples/scripts/results/test_coqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_coqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_coqcat_evaluation.json +0 -30
- wisent/examples/scripts/results/test_coqcat_pairs.json +0 -8
- wisent/examples/scripts/results/test_crows_pairs_evaluation.json +0 -51
- wisent/examples/scripts/results/test_crows_pairs_pairs.json +0 -14
- wisent/examples/scripts/results/test_csatqa_evaluation.json +0 -51
- wisent/examples/scripts/results/test_csatqa_pairs.json +0 -14
- wisent/examples/scripts/results/test_cycle_letters_evaluation.json +0 -30
- wisent/examples/scripts/results/test_cycle_letters_pairs.json +0 -8
- wisent/examples/scripts/results/test_darija_bench/test_darija_bench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_darija_bench/test_darija_bench_pairs.json +0 -14
- wisent/examples/scripts/results/test_darija_bench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_darija_bench_pairs.json +0 -14
- wisent/examples/scripts/results/test_darijahellaswag_evaluation.json +0 -30
- wisent/examples/scripts/results/test_darijahellaswag_pairs.json +0 -8
- wisent/examples/scripts/results/test_darijammlu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_darijammlu_pairs.json +0 -14
- wisent/examples/scripts/results/test_dbpedia_14_evaluation.json +0 -30
- wisent/examples/scripts/results/test_dbpedia_14_pairs.json +0 -8
- wisent/examples/scripts/results/test_drop_evaluation.json +0 -30
- wisent/examples/scripts/results/test_drop_pairs.json +0 -8
- wisent/examples/scripts/results/test_ds1000_evaluation.json +0 -30
- wisent/examples/scripts/results/test_ds1000_pairs.json +0 -8
- wisent/examples/scripts/results/test_egyhellaswag_evaluation.json +0 -30
- wisent/examples/scripts/results/test_egyhellaswag_pairs.json +0 -8
- wisent/examples/scripts/results/test_egymmlu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_egymmlu_pairs.json +0 -14
- wisent/examples/scripts/results/test_epec_koref_bin_evaluation.json +0 -30
- wisent/examples/scripts/results/test_epec_koref_bin_pairs.json +0 -8
- wisent/examples/scripts/results/test_eq_bench_evaluation.json +0 -30
- wisent/examples/scripts/results/test_eq_bench_pairs.json +0 -8
- wisent/examples/scripts/results/test_escola_evaluation.json +0 -30
- wisent/examples/scripts/results/test_escola_pairs.json +0 -8
- wisent/examples/scripts/results/test_ethics_cm_evaluation.json +0 -30
- wisent/examples/scripts/results/test_ethics_cm_pairs.json +0 -8
- wisent/examples/scripts/results/test_ethos_binary_evaluation.json +0 -30
- wisent/examples/scripts/results/test_ethos_binary_pairs.json +0 -8
- wisent/examples/scripts/results/test_eus_exams/test_eus_exams_evaluation.json +0 -51
- wisent/examples/scripts/results/test_eus_exams/test_eus_exams_pairs.json +0 -14
- wisent/examples/scripts/results/test_eus_exams_es_evaluation.json +0 -51
- wisent/examples/scripts/results/test_eus_exams_es_pairs.json +0 -14
- wisent/examples/scripts/results/test_eus_exams_evaluation.json +0 -51
- wisent/examples/scripts/results/test_eus_exams_pairs.json +0 -14
- wisent/examples/scripts/results/test_eus_proficiency_evaluation.json +0 -30
- wisent/examples/scripts/results/test_eus_proficiency_pairs.json +0 -8
- wisent/examples/scripts/results/test_eus_reading_evaluation.json +0 -30
- wisent/examples/scripts/results/test_eus_reading_pairs.json +0 -8
- wisent/examples/scripts/results/test_eus_trivia_evaluation.json +0 -30
- wisent/examples/scripts/results/test_eus_trivia_pairs.json +0 -8
- wisent/examples/scripts/results/test_evalita-mp_evaluation.json +0 -51
- wisent/examples/scripts/results/test_evalita-mp_pairs.json +0 -14
- wisent/examples/scripts/results/test_evalita-sp_sum_task_fp-small_p1_evaluation.json +0 -30
- wisent/examples/scripts/results/test_evalita-sp_sum_task_fp-small_p1_pairs.json +0 -8
- wisent/examples/scripts/results/test_evalita_LLM_evaluation.json +0 -51
- wisent/examples/scripts/results/test_evalita_LLM_pairs.json +0 -14
- wisent/examples/scripts/results/test_evalita_llm/test_evalita_llm_evaluation.json +0 -51
- wisent/examples/scripts/results/test_evalita_llm/test_evalita_llm_pairs.json +0 -14
- wisent/examples/scripts/results/test_evalita_mp/test_evalita-mp_te_prompt-1_evaluation.json +0 -30
- wisent/examples/scripts/results/test_evalita_mp/test_evalita-mp_te_prompt-1_pairs.json +0 -8
- wisent/examples/scripts/results/test_evalita_mp2/test_evalita_mp_evaluation.json +0 -51
- wisent/examples/scripts/results/test_evalita_mp2/test_evalita_mp_pairs.json +0 -14
- wisent/examples/scripts/results/test_evalita_sp2/test_evalita-sp_sum_task_fp-small_p1_evaluation.json +0 -30
- wisent/examples/scripts/results/test_evalita_sp2/test_evalita-sp_sum_task_fp-small_p1_pairs.json +0 -8
- wisent/examples/scripts/results/test_fda_evaluation.json +0 -30
- wisent/examples/scripts/results/test_fda_pairs.json +0 -8
- wisent/examples/scripts/results/test_financial_tweets_evaluation.json +0 -30
- wisent/examples/scripts/results/test_financial_tweets_pairs.json +0 -8
- wisent/examples/scripts/results/test_fld/test_fld_evaluation.json +0 -30
- wisent/examples/scripts/results/test_fld/test_fld_pairs.json +0 -8
- wisent/examples/scripts/results/test_fld_evaluation.json +0 -30
- wisent/examples/scripts/results/test_fld_fixed/test_fld_evaluation.json +0 -30
- wisent/examples/scripts/results/test_fld_fixed/test_fld_pairs.json +0 -8
- wisent/examples/scripts/results/test_fld_pairs.json +0 -8
- wisent/examples/scripts/results/test_flores_evaluation.json +0 -51
- wisent/examples/scripts/results/test_flores_pairs.json +0 -14
- wisent/examples/scripts/results/test_freebase_evaluation.json +0 -30
- wisent/examples/scripts/results/test_freebase_pairs.json +0 -8
- wisent/examples/scripts/results/test_french_bench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_french_bench_pairs.json +0 -14
- wisent/examples/scripts/results/test_galcola_evaluation.json +0 -30
- wisent/examples/scripts/results/test_galcola_pairs.json +0 -8
- wisent/examples/scripts/results/test_galician_bench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_galician_bench_pairs.json +0 -14
- wisent/examples/scripts/results/test_glianorex_evaluation.json +0 -30
- wisent/examples/scripts/results/test_glianorex_pairs.json +0 -8
- wisent/examples/scripts/results/test_global_mmlu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_global_mmlu_pairs.json +0 -14
- wisent/examples/scripts/results/test_glue_evaluation.json +0 -51
- wisent/examples/scripts/results/test_glue_pairs.json +0 -14
- wisent/examples/scripts/results/test_gpqa_evaluation.json +0 -51
- wisent/examples/scripts/results/test_gpqa_pairs.json +0 -14
- wisent/examples/scripts/results/test_gpt3_translation_benchmarks_evaluation.json +0 -51
- wisent/examples/scripts/results/test_gpt3_translation_benchmarks_pairs.json +0 -14
- wisent/examples/scripts/results/test_groundcocoa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_groundcocoa_pairs.json +0 -8
- wisent/examples/scripts/results/test_gsm8k_evaluation.json +0 -30
- wisent/examples/scripts/results/test_gsm8k_pairs.json +0 -8
- wisent/examples/scripts/results/test_haerae_evaluation.json +0 -51
- wisent/examples/scripts/results/test_haerae_pairs.json +0 -14
- wisent/examples/scripts/results/test_headqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_headqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_hellaswag_evaluation.json +0 -30
- wisent/examples/scripts/results/test_hellaswag_pairs.json +0 -8
- wisent/examples/scripts/results/test_hendrycks_ethics_evaluation.json +0 -51
- wisent/examples/scripts/results/test_hendrycks_ethics_pairs.json +0 -14
- wisent/examples/scripts/results/test_hendrycks_math_evaluation.json +0 -51
- wisent/examples/scripts/results/test_hendrycks_math_pairs.json +0 -14
- wisent/examples/scripts/results/test_histoires_morales_evaluation.json +0 -30
- wisent/examples/scripts/results/test_histoires_morales_pairs.json +0 -8
- wisent/examples/scripts/results/test_hmmt_evaluation.json +0 -30
- wisent/examples/scripts/results/test_hmmt_feb_2025_evaluation.json +0 -30
- wisent/examples/scripts/results/test_hmmt_feb_2025_pairs.json +0 -8
- wisent/examples/scripts/results/test_hmmt_pairs.json +0 -8
- wisent/examples/scripts/results/test_hrm8k_evaluation.json +0 -51
- wisent/examples/scripts/results/test_hrm8k_pairs.json +0 -14
- wisent/examples/scripts/results/test_humaneval_evaluation.json +0 -30
- wisent/examples/scripts/results/test_humaneval_pairs.json +0 -8
- wisent/examples/scripts/results/test_humaneval_plus_evaluation.json +0 -30
- wisent/examples/scripts/results/test_humaneval_plus_pairs.json +0 -8
- wisent/examples/scripts/results/test_ifeval_evaluation.json +0 -30
- wisent/examples/scripts/results/test_ifeval_pairs.json +0 -8
- wisent/examples/scripts/results/test_instruct_humaneval/test_instruct_humaneval_evaluation.json +0 -30
- wisent/examples/scripts/results/test_instruct_humaneval/test_instruct_humaneval_pairs.json +0 -8
- wisent/examples/scripts/results/test_instruct_humaneval_evaluation.json +0 -30
- wisent/examples/scripts/results/test_instruct_humaneval_pairs.json +0 -8
- wisent/examples/scripts/results/test_inverse_scaling_evaluation.json +0 -51
- wisent/examples/scripts/results/test_inverse_scaling_hindsight_neglect_10shot_evaluation.json +0 -30
- wisent/examples/scripts/results/test_inverse_scaling_hindsight_neglect_10shot_pairs.json +0 -8
- wisent/examples/scripts/results/test_inverse_scaling_mc/test_inverse_scaling_mc_evaluation.json +0 -51
- wisent/examples/scripts/results/test_inverse_scaling_mc/test_inverse_scaling_mc_pairs.json +0 -14
- wisent/examples/scripts/results/test_inverse_scaling_pairs.json +0 -14
- wisent/examples/scripts/results/test_iwslt2017-ar-en_evaluation.json +0 -30
- wisent/examples/scripts/results/test_iwslt2017-ar-en_pairs.json +0 -8
- wisent/examples/scripts/results/test_iwslt2017-en-ar_evaluation.json +0 -30
- wisent/examples/scripts/results/test_iwslt2017-en-ar_pairs.json +0 -8
- wisent/examples/scripts/results/test_iwslt2017_ar_en/test_iwslt2017-ar-en_evaluation.json +0 -30
- wisent/examples/scripts/results/test_iwslt2017_ar_en/test_iwslt2017-ar-en_pairs.json +0 -8
- wisent/examples/scripts/results/test_iwslt2017_en_ar/test_iwslt2017-en-ar_evaluation.json +0 -30
- wisent/examples/scripts/results/test_iwslt2017_en_ar/test_iwslt2017-en-ar_pairs.json +0 -8
- wisent/examples/scripts/results/test_iwslt2017_group/test_iwslt2017_evaluation.json +0 -30
- wisent/examples/scripts/results/test_iwslt2017_group/test_iwslt2017_pairs.json +0 -8
- wisent/examples/scripts/results/test_japanese_leaderboard_evaluation.json +0 -51
- wisent/examples/scripts/results/test_japanese_leaderboard_pairs.json +0 -14
- wisent/examples/scripts/results/test_jsonschema_bench/test_jsonschema_bench_evaluation.json +0 -30
- wisent/examples/scripts/results/test_jsonschema_bench/test_jsonschema_bench_pairs.json +0 -8
- wisent/examples/scripts/results/test_jsonschema_bench_evaluation.json +0 -30
- wisent/examples/scripts/results/test_jsonschema_bench_final/test_jsonschema_bench_evaluation.json +0 -30
- wisent/examples/scripts/results/test_jsonschema_bench_final/test_jsonschema_bench_pairs.json +0 -8
- wisent/examples/scripts/results/test_jsonschema_bench_pairs.json +0 -8
- wisent/examples/scripts/results/test_kbl_evaluation.json +0 -51
- wisent/examples/scripts/results/test_kbl_fixed/test_kbl_evaluation.json +0 -51
- wisent/examples/scripts/results/test_kbl_fixed/test_kbl_pairs.json +0 -14
- wisent/examples/scripts/results/test_kbl_pairs.json +0 -14
- wisent/examples/scripts/results/test_kmmlu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_kmmlu_pairs.json +0 -14
- wisent/examples/scripts/results/test_kobest_evaluation.json +0 -51
- wisent/examples/scripts/results/test_kobest_pairs.json +0 -14
- wisent/examples/scripts/results/test_kormedmcqa/test_kormedmcqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_kormedmcqa/test_kormedmcqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_kormedmcqa_dentist/test_kormedmcqa_dentist_evaluation.json +0 -30
- wisent/examples/scripts/results/test_kormedmcqa_dentist/test_kormedmcqa_dentist_pairs.json +0 -8
- wisent/examples/scripts/results/test_kormedmcqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_kormedmcqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_lambada_cloze_evaluation.json +0 -30
- wisent/examples/scripts/results/test_lambada_cloze_pairs.json +0 -8
- wisent/examples/scripts/results/test_lambada_evaluation.json +0 -30
- wisent/examples/scripts/results/test_lambada_final/test_lambada_openai_mt_stablelm_en_evaluation.json +0 -30
- wisent/examples/scripts/results/test_lambada_final/test_lambada_openai_mt_stablelm_en_pairs.json +0 -8
- wisent/examples/scripts/results/test_lambada_multilingual/test_lambada_multilingual_evaluation.json +0 -51
- wisent/examples/scripts/results/test_lambada_multilingual/test_lambada_multilingual_pairs.json +0 -14
- wisent/examples/scripts/results/test_lambada_multilingual_evaluation.json +0 -51
- wisent/examples/scripts/results/test_lambada_multilingual_pairs.json +0 -14
- wisent/examples/scripts/results/test_lambada_multilingual_stablelm_evaluation.json +0 -51
- wisent/examples/scripts/results/test_lambada_multilingual_stablelm_pairs.json +0 -14
- wisent/examples/scripts/results/test_lambada_openai_evaluation.json +0 -30
- wisent/examples/scripts/results/test_lambada_openai_pairs.json +0 -8
- wisent/examples/scripts/results/test_lambada_pairs.json +0 -8
- wisent/examples/scripts/results/test_lambada_stablelm_en_fixed/test_lambada_openai_mt_stablelm_en_evaluation.json +0 -30
- wisent/examples/scripts/results/test_lambada_stablelm_en_fixed/test_lambada_openai_mt_stablelm_en_pairs.json +0 -8
- wisent/examples/scripts/results/test_lambada_stablelm_fixed/test_lambada_openai_mt_stablelm_en_evaluation.json +0 -30
- wisent/examples/scripts/results/test_lambada_stablelm_fixed/test_lambada_openai_mt_stablelm_en_pairs.json +0 -8
- wisent/examples/scripts/results/test_lambada_standard_evaluation.json +0 -30
- wisent/examples/scripts/results/test_lambada_standard_pairs.json +0 -8
- wisent/examples/scripts/results/test_leaderboard_evaluation.json +0 -51
- wisent/examples/scripts/results/test_leaderboard_pairs.json +0 -14
- wisent/examples/scripts/results/test_libra/test_libra_evaluation.json +0 -51
- wisent/examples/scripts/results/test_libra/test_libra_pairs.json +0 -14
- wisent/examples/scripts/results/test_libra_evaluation.json +0 -51
- wisent/examples/scripts/results/test_libra_pairs.json +0 -14
- wisent/examples/scripts/results/test_lingoly_evaluation.json +0 -30
- wisent/examples/scripts/results/test_lingoly_pairs.json +0 -8
- wisent/examples/scripts/results/test_livecodebench_evaluation.json +0 -30
- wisent/examples/scripts/results/test_livecodebench_pairs.json +0 -8
- wisent/examples/scripts/results/test_livemathbench_cnmo_en_evaluation.json +0 -30
- wisent/examples/scripts/results/test_livemathbench_cnmo_en_pairs.json +0 -8
- wisent/examples/scripts/results/test_livemathbench_cnmo_zh_evaluation.json +0 -30
- wisent/examples/scripts/results/test_livemathbench_cnmo_zh_pairs.json +0 -8
- wisent/examples/scripts/results/test_llama_evaluation.json +0 -30
- wisent/examples/scripts/results/test_llama_pairs.json +0 -8
- wisent/examples/scripts/results/test_logiqa2_evaluation.json +0 -30
- wisent/examples/scripts/results/test_logiqa2_pairs.json +0 -8
- wisent/examples/scripts/results/test_logiqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_logiqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_m_mmlu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_m_mmlu_pairs.json +0 -14
- wisent/examples/scripts/results/test_mastermind/test_mastermind_evaluation.json +0 -51
- wisent/examples/scripts/results/test_mastermind/test_mastermind_pairs.json +0 -14
- wisent/examples/scripts/results/test_mastermind_24_easy/test_mastermind_24_easy_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mastermind_24_easy/test_mastermind_24_easy_pairs.json +0 -8
- wisent/examples/scripts/results/test_mastermind_evaluation.json +0 -51
- wisent/examples/scripts/results/test_mastermind_pairs.json +0 -14
- wisent/examples/scripts/results/test_math500_evaluation.json +0 -30
- wisent/examples/scripts/results/test_math500_pairs.json +0 -8
- wisent/examples/scripts/results/test_math_evaluation.json +0 -30
- wisent/examples/scripts/results/test_math_pairs.json +0 -8
- wisent/examples/scripts/results/test_mathqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mathqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_mbpp_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mbpp_pairs.json +0 -8
- wisent/examples/scripts/results/test_mbpp_plus_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mbpp_plus_pairs.json +0 -8
- wisent/examples/scripts/results/test_mc_taco_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mc_taco_pairs.json +0 -8
- wisent/examples/scripts/results/test_med_concepts_qa/test_med_concepts_qa_evaluation.json +0 -51
- wisent/examples/scripts/results/test_med_concepts_qa/test_med_concepts_qa_pairs.json +0 -14
- wisent/examples/scripts/results/test_med_concepts_qa_atc_easy/test_med_concepts_qa_atc_easy_evaluation.json +0 -30
- wisent/examples/scripts/results/test_med_concepts_qa_atc_easy/test_med_concepts_qa_atc_easy_pairs.json +0 -8
- wisent/examples/scripts/results/test_med_concepts_qa_evaluation.json +0 -51
- wisent/examples/scripts/results/test_med_concepts_qa_pairs.json +0 -14
- wisent/examples/scripts/results/test_meddialog_evaluation.json +0 -30
- wisent/examples/scripts/results/test_meddialog_pairs.json +0 -8
- wisent/examples/scripts/results/test_meddialog_raw_perplexity/test_meddialog_raw_perplexity_evaluation.json +0 -30
- wisent/examples/scripts/results/test_meddialog_raw_perplexity/test_meddialog_raw_perplexity_pairs.json +0 -8
- wisent/examples/scripts/results/test_mediqa_qa2019_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mediqa_qa2019_pairs.json +0 -8
- wisent/examples/scripts/results/test_medmcqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_medmcqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_medqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_medqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_medtext_evaluation.json +0 -30
- wisent/examples/scripts/results/test_medtext_pairs.json +0 -8
- wisent/examples/scripts/results/test_mela_evaluation.json +0 -51
- wisent/examples/scripts/results/test_mela_pairs.json +0 -14
- wisent/examples/scripts/results/test_meqsum_evaluation.json +0 -30
- wisent/examples/scripts/results/test_meqsum_pairs.json +0 -8
- wisent/examples/scripts/results/test_mercury_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mercury_pairs.json +0 -8
- wisent/examples/scripts/results/test_metabench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_metabench_pairs.json +0 -14
- wisent/examples/scripts/results/test_mgsm_evaluation.json +0 -51
- wisent/examples/scripts/results/test_mgsm_pairs.json +0 -14
- wisent/examples/scripts/results/test_mimic_repsum_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mimic_repsum_pairs.json +0 -8
- wisent/examples/scripts/results/test_minerva_math_evaluation.json +0 -51
- wisent/examples/scripts/results/test_minerva_math_pairs.json +0 -14
- wisent/examples/scripts/results/test_mlqa_evaluation.json +0 -51
- wisent/examples/scripts/results/test_mlqa_pairs.json +0 -14
- wisent/examples/scripts/results/test_mmlu-pro-plus_evaluation.json +0 -51
- wisent/examples/scripts/results/test_mmlu-pro-plus_pairs.json +0 -14
- wisent/examples/scripts/results/test_mmlu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_mmlu_pairs.json +0 -14
- wisent/examples/scripts/results/test_mmlu_pro_evaluation.json +0 -51
- wisent/examples/scripts/results/test_mmlu_pro_pairs.json +0 -14
- wisent/examples/scripts/results/test_mmlu_prox_evaluation.json +0 -51
- wisent/examples/scripts/results/test_mmlu_prox_pairs.json +0 -14
- wisent/examples/scripts/results/test_mmlusr_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mmlusr_pairs.json +0 -8
- wisent/examples/scripts/results/test_mmmu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_mmmu_pairs.json +0 -14
- wisent/examples/scripts/results/test_mnli_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mnli_pairs.json +0 -8
- wisent/examples/scripts/results/test_model_written_evals_evaluation.json +0 -51
- wisent/examples/scripts/results/test_model_written_evals_pairs.json +0 -14
- wisent/examples/scripts/results/test_moral_stories_evaluation.json +0 -30
- wisent/examples/scripts/results/test_moral_stories_pairs.json +0 -8
- wisent/examples/scripts/results/test_mts_dialog_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mts_dialog_pairs.json +0 -8
- wisent/examples/scripts/results/test_multiblimp_evaluation.json +0 -51
- wisent/examples/scripts/results/test_multiblimp_pairs.json +0 -14
- wisent/examples/scripts/results/test_multimedqa_evaluation.json +0 -51
- wisent/examples/scripts/results/test_multimedqa_pairs.json +0 -14
- wisent/examples/scripts/results/test_multipl_e_evaluation.json +0 -30
- wisent/examples/scripts/results/test_multipl_e_pairs.json +0 -8
- wisent/examples/scripts/results/test_mutual_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mutual_pairs.json +0 -8
- wisent/examples/scripts/results/test_non_greedy_robustness_agieval_aqua_rat_evaluation.json +0 -30
- wisent/examples/scripts/results/test_non_greedy_robustness_agieval_aqua_rat_pairs.json +0 -8
- wisent/examples/scripts/results/test_noreval_evaluation.json +0 -51
- wisent/examples/scripts/results/test_noreval_pairs.json +0 -14
- wisent/examples/scripts/results/test_noticia_evaluation.json +0 -30
- wisent/examples/scripts/results/test_noticia_pairs.json +0 -8
- wisent/examples/scripts/results/test_nq_open_evaluation.json +0 -30
- wisent/examples/scripts/results/test_nq_open_pairs.json +0 -8
- wisent/examples/scripts/results/test_olaph_evaluation.json +0 -30
- wisent/examples/scripts/results/test_olaph_pairs.json +0 -8
- wisent/examples/scripts/results/test_openbookqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_openbookqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_openllm_evaluation.json +0 -51
- wisent/examples/scripts/results/test_openllm_pairs.json +0 -14
- wisent/examples/scripts/results/test_option_order_robustness_agieval_aqua_rat_evaluation.json +0 -30
- wisent/examples/scripts/results/test_option_order_robustness_agieval_aqua_rat_pairs.json +0 -8
- wisent/examples/scripts/results/test_paloma_evaluation.json +0 -51
- wisent/examples/scripts/results/test_paloma_pairs.json +0 -14
- wisent/examples/scripts/results/test_passkey/test_passkey_evaluation.json +0 -30
- wisent/examples/scripts/results/test_passkey/test_passkey_pairs.json +0 -8
- wisent/examples/scripts/results/test_paws-x_evaluation.json +0 -51
- wisent/examples/scripts/results/test_paws-x_pairs.json +0 -14
- wisent/examples/scripts/results/test_paws_en/test_paws_en_evaluation.json +0 -30
- wisent/examples/scripts/results/test_paws_en/test_paws_en_pairs.json +0 -8
- wisent/examples/scripts/results/test_penn_treebank_evaluation.json +0 -30
- wisent/examples/scripts/results/test_penn_treebank_pairs.json +0 -8
- wisent/examples/scripts/results/test_pile_10k/test_pile_10k_evaluation.json +0 -30
- wisent/examples/scripts/results/test_pile_10k/test_pile_10k_pairs.json +0 -8
- wisent/examples/scripts/results/test_piqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_piqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_polemo2_evaluation.json +0 -30
- wisent/examples/scripts/results/test_polemo2_pairs.json +0 -8
- wisent/examples/scripts/results/test_polymath_en_high_evaluation.json +0 -30
- wisent/examples/scripts/results/test_polymath_en_high_pairs.json +0 -8
- wisent/examples/scripts/results/test_polymath_en_medium_evaluation.json +0 -30
- wisent/examples/scripts/results/test_polymath_en_medium_pairs.json +0 -8
- wisent/examples/scripts/results/test_polymath_zh_high_evaluation.json +0 -30
- wisent/examples/scripts/results/test_polymath_zh_high_pairs.json +0 -8
- wisent/examples/scripts/results/test_polymath_zh_medium_evaluation.json +0 -30
- wisent/examples/scripts/results/test_polymath_zh_medium_pairs.json +0 -8
- wisent/examples/scripts/results/test_portuguese_bench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_portuguese_bench_pairs.json +0 -14
- wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat/test_prompt_robustness_agieval_aqua_rat_evaluation.json +0 -30
- wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat/test_prompt_robustness_agieval_aqua_rat_pairs.json +0 -8
- wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat_evaluation.json +0 -30
- wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat_pairs.json +0 -8
- wisent/examples/scripts/results/test_prost_evaluation.json +0 -30
- wisent/examples/scripts/results/test_prost_pairs.json +0 -8
- wisent/examples/scripts/results/test_ptb_evaluation.json +0 -30
- wisent/examples/scripts/results/test_ptb_pairs.json +0 -8
- wisent/examples/scripts/results/test_pubmedqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_pubmedqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_pythia_evaluation.json +0 -51
- wisent/examples/scripts/results/test_pythia_pairs.json +0 -14
- wisent/examples/scripts/results/test_qa4mre_evaluation.json +0 -30
- wisent/examples/scripts/results/test_qa4mre_pairs.json +0 -8
- wisent/examples/scripts/results/test_qasper_evaluation.json +0 -30
- wisent/examples/scripts/results/test_qasper_pairs.json +0 -8
- wisent/examples/scripts/results/test_race_evaluation.json +0 -30
- wisent/examples/scripts/results/test_race_pairs.json +0 -8
- wisent/examples/scripts/results/test_realtoxicityprompts_evaluation.json +0 -30
- wisent/examples/scripts/results/test_realtoxicityprompts_pairs.json +0 -8
- wisent/examples/scripts/results/test_recode_evaluation.json +0 -30
- wisent/examples/scripts/results/test_recode_pairs.json +0 -8
- wisent/examples/scripts/results/test_record_evaluation.json +0 -30
- wisent/examples/scripts/results/test_record_pairs.json +0 -8
- wisent/examples/scripts/results/test_ruler_evaluation.json +0 -51
- wisent/examples/scripts/results/test_ruler_pairs.json +0 -14
- wisent/examples/scripts/results/test_sciq_evaluation.json +0 -30
- wisent/examples/scripts/results/test_sciq_pairs.json +0 -8
- wisent/examples/scripts/results/test_score_evaluation.json +0 -51
- wisent/examples/scripts/results/test_score_pairs.json +0 -14
- wisent/examples/scripts/results/test_self_consistency_evaluation.json +0 -30
- wisent/examples/scripts/results/test_self_consistency_pairs.json +0 -8
- wisent/examples/scripts/results/test_siqa/test_siqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_siqa/test_siqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_siqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_siqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_spanish_bench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_spanish_bench_pairs.json +0 -14
- wisent/examples/scripts/results/test_squad2_evaluation.json +0 -30
- wisent/examples/scripts/results/test_squad2_pairs.json +0 -8
- wisent/examples/scripts/results/test_squadv2_evaluation.json +0 -30
- wisent/examples/scripts/results/test_squadv2_pairs.json +0 -8
- wisent/examples/scripts/results/test_super-glue-lm-eval-v1-seq2seq_evaluation.json +0 -30
- wisent/examples/scripts/results/test_super-glue-lm-eval-v1-seq2seq_pairs.json +0 -8
- wisent/examples/scripts/results/test_super-glue-lm-eval-v1_evaluation.json +0 -51
- wisent/examples/scripts/results/test_super-glue-lm-eval-v1_pairs.json +0 -14
- wisent/examples/scripts/results/test_swag_evaluation.json +0 -30
- wisent/examples/scripts/results/test_swag_pairs.json +0 -8
- wisent/examples/scripts/results/test_tinyBenchmarks_evaluation.json +0 -51
- wisent/examples/scripts/results/test_tinyBenchmarks_pairs.json +0 -14
- wisent/examples/scripts/results/test_tmmluplus_evaluation.json +0 -51
- wisent/examples/scripts/results/test_tmmluplus_pairs.json +0 -14
- wisent/examples/scripts/results/test_translation_evaluation.json +0 -51
- wisent/examples/scripts/results/test_translation_pairs.json +0 -14
- wisent/examples/scripts/results/test_triviaqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_triviaqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_truthfulqa-multi_evaluation.json +0 -51
- wisent/examples/scripts/results/test_truthfulqa-multi_pairs.json +0 -14
- wisent/examples/scripts/results/test_truthfulqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_truthfulqa_mc1_evaluation.json +0 -30
- wisent/examples/scripts/results/test_truthfulqa_mc1_pairs.json +0 -8
- wisent/examples/scripts/results/test_truthfulqa_mc2_evaluation.json +0 -30
- wisent/examples/scripts/results/test_truthfulqa_mc2_pairs.json +0 -8
- wisent/examples/scripts/results/test_truthfulqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_turkishmmlu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_turkishmmlu_pairs.json +0 -14
- wisent/examples/scripts/results/test_unfair_tos_evaluation.json +0 -30
- wisent/examples/scripts/results/test_unfair_tos_pairs.json +0 -8
- wisent/examples/scripts/results/test_unscramble_evaluation.json +0 -51
- wisent/examples/scripts/results/test_unscramble_pairs.json +0 -14
- wisent/examples/scripts/results/test_webqs_evaluation.json +0 -30
- wisent/examples/scripts/results/test_webqs_pairs.json +0 -8
- wisent/examples/scripts/results/test_wikitext103_evaluation.json +0 -30
- wisent/examples/scripts/results/test_wikitext103_pairs.json +0 -8
- wisent/examples/scripts/results/test_wikitext_evaluation.json +0 -30
- wisent/examples/scripts/results/test_wikitext_pairs.json +0 -8
- wisent/examples/scripts/results/test_winogender_evaluation.json +0 -51
- wisent/examples/scripts/results/test_winogender_pairs.json +0 -14
- wisent/examples/scripts/results/test_winogrande_evaluation.json +0 -30
- wisent/examples/scripts/results/test_winogrande_pairs.json +0 -8
- wisent/examples/scripts/results/test_wmdp_evaluation.json +0 -30
- wisent/examples/scripts/results/test_wmdp_pairs.json +0 -8
- wisent/examples/scripts/results/test_wmt-ro-en-t5-prompt_evaluation.json +0 -30
- wisent/examples/scripts/results/test_wmt-ro-en-t5-prompt_pairs.json +0 -8
- wisent/examples/scripts/results/test_wmt14_en_fr_evaluation.json +0 -30
- wisent/examples/scripts/results/test_wmt14_en_fr_pairs.json +0 -8
- wisent/examples/scripts/results/test_wmt16_en_de_evaluation.json +0 -30
- wisent/examples/scripts/results/test_wmt16_en_de_pairs.json +0 -8
- wisent/examples/scripts/results/test_wmt16_ro_en_evaluation.json +0 -30
- wisent/examples/scripts/results/test_wmt16_ro_en_pairs.json +0 -8
- wisent/examples/scripts/results/test_wsc273_evaluation.json +0 -30
- wisent/examples/scripts/results/test_wsc273_pairs.json +0 -8
- wisent/examples/scripts/results/test_xcopa_evaluation.json +0 -51
- wisent/examples/scripts/results/test_xcopa_pairs.json +0 -14
- wisent/examples/scripts/results/test_xnli_eu_evaluation.json +0 -30
- wisent/examples/scripts/results/test_xnli_eu_pairs.json +0 -8
- wisent/examples/scripts/results/test_xnli_evaluation.json +0 -51
- wisent/examples/scripts/results/test_xnli_pairs.json +0 -14
- wisent/examples/scripts/results/test_xquad_evaluation.json +0 -51
- wisent/examples/scripts/results/test_xquad_pairs.json +0 -14
- wisent/examples/scripts/results/test_xstorycloze_evaluation.json +0 -51
- wisent/examples/scripts/results/test_xstorycloze_pairs.json +0 -14
- wisent/examples/scripts/results/test_xsum_evaluation.json +0 -30
- wisent/examples/scripts/results/test_xsum_pairs.json +0 -8
- wisent/examples/scripts/results/test_xwinograd_evaluation.json +0 -51
- wisent/examples/scripts/results/test_xwinograd_pairs.json +0 -14
- wisent/examples/scripts/results/test_yahoo_answers_topics_evaluation.json +0 -30
- wisent/examples/scripts/results/test_yahoo_answers_topics_pairs.json +0 -8
- {wisent-0.7.379.dist-info → wisent-0.7.901.dist-info}/WHEEL +0 -0
- {wisent-0.7.379.dist-info → wisent-0.7.901.dist-info}/entry_points.txt +0 -0
- {wisent-0.7.379.dist-info → wisent-0.7.901.dist-info}/licenses/LICENSE +0 -0
- {wisent-0.7.379.dist-info → wisent-0.7.901.dist-info}/top_level.txt +0 -0
@@ -1,469 +0,0 @@
- {
-   "wmt14_en_fr": {
-     "wisent_evaluator": "generation",
-     "actual_lm_eval_harness_metrics": ["BLEU", "TER", "CHRF"],
-     "notes": "Wisent uses generic 'generation' evaluator with string comparison. Actual WMT14 benchmark uses BLEU (n-gram overlap), TER (Translation Edit Rate), and CHRF (character-level F-score) - translation-specific metrics."
-   },
-   "mmlu": {
-     "wisent_evaluator": "log_likelihoods",
-     "actual_lm_eval_harness_metrics": ["acc"],
-     "actual_lm_eval_harness_output_type": "multiple_choice",
-     "notes": "Both use multiple-choice format. LM-eval measures accuracy via mean. Wisent uses log_likelihoods to compare probabilities of answer choices A/B/C/D."
-   },
-   "gsm8k": {
-     "wisent_evaluator": "exact_match",
-     "actual_lm_eval_harness_metrics": ["exact_match"],
-     "actual_lm_eval_harness_output_type": "generate_until",
-     "notes": "Both use exact_match for numerical answers. LM-eval uses regex to extract answers from generated text with patterns like '#### 42' or 'answer is 42'. Wisent also extracts numerical answers for comparison."
-   },
-   "hellaswag": {
-     "wisent_evaluator": "log_likelihoods",
-     "actual_lm_eval_harness_metrics": ["acc", "acc_norm"],
-     "actual_lm_eval_harness_output_type": "multiple_choice",
-     "notes": "Both use multiple-choice format for sentence completion. LM-eval measures accuracy (acc) and normalized accuracy (acc_norm). Wisent uses log_likelihoods to compare probabilities of completion choices."
-   },
-   "arc_easy": {
-     "wisent_evaluator": "log_likelihoods",
-     "actual_lm_eval_harness_metrics": ["acc", "acc_norm"],
-     "actual_lm_eval_harness_output_type": "multiple_choice",
-     "notes": "Both use multiple-choice format for science questions. LM-eval measures accuracy (acc) and normalized accuracy (acc_norm). Wisent uses log_likelihoods to compare probabilities of answer choices."
-   },
-   "humaneval": {
-     "wisent_evaluator": "MISSING",
-     "actual_lm_eval_harness_metrics": ["pass_at_k"],
-     "actual_lm_eval_harness_output_type": "generate_until",
-     "notes": "LM-eval uses pass@k metric by executing generated code against test cases (allows unsafe code execution). Wisent has NO evaluator defined for HumanEval - benchmark is broken. Should use docker_code evaluator."
-   },
-   "truthfulqa_mc1": {
-     "wisent_evaluator": "log_likelihoods",
-     "actual_lm_eval_harness_metrics": ["acc"],
-     "actual_lm_eval_harness_output_type": "multiple_choice",
-     "notes": "Both use multiple-choice format. LM-eval measures accuracy where first choice is correct answer. Wisent uses log_likelihoods to compare probabilities."
-   },
-   "winogrande": {
-     "wisent_evaluator": "log_likelihoods",
-     "actual_lm_eval_harness_metrics": ["acc"],
-     "actual_lm_eval_harness_output_type": "multiple_choice",
-     "notes": "Both use multiple-choice format for commonsense reasoning (partial evaluation method). LM-eval measures accuracy. Wisent uses log_likelihoods to compare probabilities of fill-in-blank options."
-   },
-   "drop": {
-     "wisent_evaluator": "exact_match",
-     "actual_lm_eval_harness_metrics": ["exact_match", "f1"],
-     "actual_lm_eval_harness_output_type": "generate_until",
-     "notes": "LM-eval uses both exact_match and F1 score for reading comprehension with numerical reasoning. Wisent only uses exact_match. Both generate text until period."
-   },
-   "boolq": {
-     "wisent_evaluator": "log_likelihoods",
-     "actual_lm_eval_harness_metrics": ["acc"],
-     "actual_lm_eval_harness_output_type": "multiple_choice",
-     "notes": "Both use multiple-choice format for yes/no questions. LM-eval measures accuracy with choices ['no', 'yes']. Wisent uses log_likelihoods to compare probabilities."
-   },
-   "piqa": {
-     "wisent_evaluator": "log_likelihoods",
-     "actual_lm_eval_harness_metrics": ["acc", "acc_norm"],
-     "actual_lm_eval_harness_output_type": "multiple_choice",
-     "notes": "Both use multiple-choice format for physical commonsense QA with 2 solutions. LM-eval measures accuracy and normalized accuracy. Wisent uses log_likelihoods to compare probabilities."
-   },
-   "openbookqa": {
-     "wisent_evaluator": "log_likelihoods",
-     "actual_lm_eval_harness_metrics": ["acc", "acc_norm"],
-     "actual_lm_eval_harness_output_type": "multiple_choice",
-     "notes": "Both use multiple-choice format for open-domain reading comprehension. LM-eval measures accuracy and normalized accuracy. Wisent uses log_likelihoods to compare probabilities."
-   },
-   "copa": {
-     "wisent_evaluator": "log_likelihoods",
-     "actual_lm_eval_harness_metrics": ["acc"],
-     "actual_lm_eval_harness_output_type": "multiple_choice",
-     "notes": "Both use multiple-choice format for Choice of Plausible Alternatives (SuperGLUE). LM-eval measures accuracy. Wisent uses log_likelihoods to compare probabilities."
-   },
-   "mathqa": {
-     "wisent_evaluator": "log_likelihoods",
-     "actual_lm_eval_harness_metrics": ["acc", "acc_norm"],
-     "actual_lm_eval_harness_output_type": "multiple_choice",
-     "notes": "Both use multiple-choice format for math word problems with 5 options (a-e). LM-eval measures accuracy and normalized accuracy. Wisent uses log_likelihoods to compare probabilities."
-   },
-   "lambada_openai": {
-     "wisent_evaluator": "log_likelihoods",
-     "actual_lm_eval_harness_metrics": ["perplexity", "acc"],
-     "actual_lm_eval_harness_output_type": "loglikelihood",
-     "notes": "Both use loglikelihood for predicting final word in context. LM-eval measures both perplexity (lower is better) and accuracy. Wisent uses log_likelihoods similarly."
-   },
-   "coqa": {
-     "wisent_evaluator": "exact_match",
-     "actual_lm_eval_harness_metrics": ["exact_match", "f1"],
-     "actual_lm_eval_harness_output_type": "generate_until",
-     "notes": "LM-eval uses both exact_match and F1 score for conversational question answering. Wisent only uses exact_match. Both generate text until '\\nQ:'."
-   },
-   "record": {
-     "wisent_evaluator": "log_likelihoods",
-     "actual_lm_eval_harness_metrics": ["f1", "em"],
-     "actual_lm_eval_harness_output_type": "multiple_choice",
-     "notes": "LM-eval uses F1 and exact_match for SuperGLUE reading comprehension. Wisent uses log_likelihoods for multiple-choice format. Different approaches - LM-eval treats as generation task, Wisent as probability comparison."
-   },
-   "race": {
-     "wisent_evaluator": "log_likelihoods",
-     "actual_lm_eval_harness_metrics": ["acc"],
-     "actual_lm_eval_harness_output_type": "multiple_choice",
-     "notes": "Both use multiple-choice format for reading comprehension. LM-eval measures accuracy. Wisent uses log_likelihoods to compare probabilities."
-   },
-   "headqa": {
-     "wisent_evaluator": "log_likelihoods",
-     "actual_lm_eval_harness_metrics": ["acc", "acc_norm"],
-     "actual_lm_eval_harness_output_type": "multiple_choice",
-     "notes": "Both use multiple-choice format for medical knowledge questions. LM-eval measures accuracy and normalized accuracy. Wisent uses log_likelihoods to compare probabilities."
-   },
-   "logiqa": {
-     "wisent_evaluator": "log_likelihoods",
-     "actual_lm_eval_harness_metrics": ["acc", "acc_norm"],
-     "actual_lm_eval_harness_output_type": "multiple_choice",
-     "notes": "Both use multiple-choice format for logical reasoning. LM-eval measures accuracy and normalized accuracy. Wisent uses log_likelihoods to compare probabilities."
-   },
-   "storycloze": {
-     "wisent_evaluator": "log_likelihoods",
-     "actual_lm_eval_harness_metrics": ["acc"],
-     "actual_lm_eval_harness_output_type": "multiple_choice",
-     "notes": "Both use multiple-choice format for story ending selection (2 choices). LM-eval measures accuracy. Wisent uses log_likelihoods to compare probabilities."
-   },
-   "swag": {
-     "wisent_evaluator": "log_likelihoods",
-     "actual_lm_eval_harness_metrics": ["acc", "acc_norm"],
-     "actual_lm_eval_harness_output_type": "multiple_choice",
-     "notes": "Both use multiple-choice format for situation completion with 4 endings. LM-eval measures accuracy and normalized accuracy. Wisent uses log_likelihoods to compare probabilities."
-   },
-   "arithmetic": {
-     "wisent_evaluator": "log_likelihoods",
-     "actual_lm_eval_harness_metrics": ["acc"],
-     "actual_lm_eval_harness_output_type": "loglikelihood",
-     "notes": "Both use loglikelihood for arithmetic problems. LM-eval measures accuracy. Wisent uses log_likelihoods similarly."
-   },
-   "asdiv": {
-     "wisent_evaluator": "log_likelihoods",
-     "actual_lm_eval_harness_metrics": ["acc"],
-     "actual_lm_eval_harness_output_type": "loglikelihood",
-     "notes": "Both use loglikelihood for arithmetic word problems. LM-eval measures accuracy by extracting answers before parentheses. Wisent uses log_likelihoods similarly."
-   },
-   "medqa": {
-     "wisent_evaluator": "log_likelihoods",
-     "actual_lm_eval_harness_metrics": ["acc", "acc_norm"],
-     "actual_lm_eval_harness_output_type": "multiple_choice",
-     "notes": "Both use multiple-choice format for USMLE medical questions with 4 options (A/B/C/D). LM-eval measures accuracy and normalized accuracy. Wisent uses log_likelihoods to compare probabilities."
-   },
-   "cb": {
-     "wisent_evaluator": "log_likelihoods",
-     "actual_lm_eval_harness_metrics": ["acc", "f1"],
-     "actual_lm_eval_harness_output_type": "multiple_choice",
-     "notes": "Both use multiple-choice format for SuperGLUE CommitmentBank with 3 options (True/False/Neither). LM-eval measures accuracy and F1 score with multi-class aggregation. Wisent uses log_likelihoods to compare probabilities."
-   },
-   "rte": {
-     "wisent_evaluator": "log_likelihoods",
-     "actual_lm_eval_harness_metrics": ["acc"],
-     "actual_lm_eval_harness_output_type": "multiple_choice",
-     "notes": "Both use multiple-choice format for SuperGLUE Recognizing Textual Entailment with binary choices (True/False). LM-eval measures accuracy. Wisent uses log_likelihoods to compare probabilities."
-   },
-   "wic": {
-     "wisent_evaluator": "log_likelihoods",
-     "actual_lm_eval_harness_metrics": ["acc"],
-     "actual_lm_eval_harness_output_type": "multiple_choice",
-     "notes": "Both use multiple-choice format for SuperGLUE Word-in-Context with binary choices (no/yes). LM-eval measures accuracy. Wisent uses log_likelihoods to compare probabilities."
-   },
-   "wsc": {
-     "wisent_evaluator": "log_likelihoods",
-     "actual_lm_eval_harness_metrics": ["acc"],
-     "actual_lm_eval_harness_output_type": "multiple_choice",
-     "notes": "Both use multiple-choice format for SuperGLUE Winograd Schema Challenge with binary choices (no/yes). LM-eval measures accuracy. Wisent uses log_likelihoods to compare probabilities."
-   },
-   "multirc": {
-     "wisent_evaluator": "log_likelihoods",
-     "actual_lm_eval_harness_metrics": ["acc"],
-     "actual_lm_eval_harness_output_type": "multiple_choice",
-     "notes": "Both use multiple-choice format for SuperGLUE MultiRC reading comprehension. LM-eval measures accuracy by asking 'Is the answer correct? yes/no' for each option. Wisent uses log_likelihoods to compare probabilities."
-   },
-   "mrpc": {
-     "wisent_evaluator": "log_likelihoods",
-     "actual_lm_eval_harness_metrics": ["acc", "f1"],
-     "actual_lm_eval_harness_output_type": "multiple_choice",
-     "notes": "Both use multiple-choice format for GLUE Microsoft Research Paraphrase Corpus with binary choices (no/yes). LM-eval measures accuracy and F1 score. Wisent uses log_likelihoods to compare probabilities."
-   },
-   "qqp": {
-     "wisent_evaluator": "log_likelihoods",
-     "actual_lm_eval_harness_metrics": ["acc", "f1"],
-     "actual_lm_eval_harness_output_type": "multiple_choice",
-     "notes": "Both use multiple-choice format for GLUE Quora Question Pairs with binary choices (no/yes). LM-eval measures accuracy and F1 score. Wisent uses log_likelihoods to compare probabilities."
-   },
-   "sst2": {
-     "wisent_evaluator": "log_likelihoods",
-     "actual_lm_eval_harness_metrics": ["acc"],
-     "actual_lm_eval_harness_output_type": "multiple_choice",
-     "notes": "Both use multiple-choice format for GLUE sentiment classification with binary choices (negative/positive). LM-eval measures accuracy. Wisent uses log_likelihoods to compare probabilities."
-   },
-   "qnli": {
-     "wisent_evaluator": "log_likelihoods",
-     "actual_lm_eval_harness_metrics": ["acc"],
-     "actual_lm_eval_harness_output_type": "multiple_choice",
-     "notes": "Both use multiple-choice format for GLUE Question NLI with binary choices (yes/no). LM-eval measures accuracy. Wisent uses log_likelihoods to compare probabilities."
-   },
-   "wnli": {
-     "wisent_evaluator": "log_likelihoods",
-     "actual_lm_eval_harness_metrics": ["acc"],
-     "actual_lm_eval_harness_output_type": "multiple_choice",
-     "notes": "Both use multiple-choice format for GLUE Winograd NLI with binary choices (False/True). LM-eval measures accuracy. Wisent uses log_likelihoods to compare probabilities."
-   },
-   "sciq": {
-     "wisent_evaluator": "log_likelihoods",
-     "actual_lm_eval_harness_metrics": ["acc", "acc_norm"],
-     "actual_lm_eval_harness_output_type": "multiple_choice",
-     "notes": "Both use multiple-choice format for science questions with 4 options (3 distractors + correct). LM-eval measures accuracy and normalized accuracy. Wisent uses log_likelihoods to compare probabilities."
-   },
-   "commonsense_qa": {
-     "wisent_evaluator": "log_likelihoods",
-     "actual_lm_eval_harness_metrics": ["acc"],
-     "actual_lm_eval_harness_output_type": "multiple_choice",
-     "notes": "Both use multiple-choice format for commonsense reasoning with 5 options (A-E). LM-eval measures accuracy. Wisent uses log_likelihoods to compare probabilities."
-   },
-   "triviaqa": {
-     "wisent_evaluator": "exact_match",
-     "actual_lm_eval_harness_metrics": ["exact_match"],
-     "actual_lm_eval_harness_output_type": "generate_until",
-     "notes": "Both use exact_match for trivia QA. LM-eval generates text until newline/period/comma with case-insensitive and punctuation-normalized matching. Wisent uses exact_match similarly."
-   },
-   "nq_open": {
-     "wisent_evaluator": "exact_match",
-     "actual_lm_eval_harness_metrics": ["exact_match"],
-     "actual_lm_eval_harness_output_type": "generate_until",
-     "notes": "Both use exact_match for Natural Questions Open. LM-eval generates text until newline/period/comma with case-insensitive matching and article stripping. Wisent uses exact_match similarly."
-   },
-   "webqs": {
-     "wisent_evaluator": "exact_match",
-     "actual_lm_eval_harness_metrics": ["exact_match"],
-     "actual_lm_eval_harness_output_type": "multiple_choice",
-     "notes": "Both use exact_match for web questions. LM-eval uses multiple-choice format with Freebase knowledge. Wisent uses exact_match similarly."
-   },
-   "arc_challenge": {
-     "wisent_evaluator": "log_likelihoods",
-     "actual_lm_eval_harness_metrics": ["acc", "acc_norm"],
-     "actual_lm_eval_harness_output_type": "multiple_choice",
-     "notes": "Both use multiple-choice format for AI2 ARC Challenge science questions. LM-eval measures accuracy and normalized accuracy. Wisent uses log_likelihoods to compare probabilities."
-   },
-   "toxigen": {
-     "wisent_evaluator": "log_likelihoods",
-     "actual_lm_eval_harness_metrics": ["acc", "acc_norm"],
-     "actual_lm_eval_harness_output_type": "multiple_choice",
-     "notes": "Both use multiple-choice format for hateful content detection with binary choices (No/Yes). LM-eval measures accuracy and normalized accuracy. Wisent uses log_likelihoods to compare probabilities."
-   },
-   "pubmedqa": {
-     "wisent_evaluator": "log_likelihoods",
-     "actual_lm_eval_harness_metrics": ["acc"],
-     "actual_lm_eval_harness_output_type": "multiple_choice",
-     "notes": "Both use multiple-choice format for PubMed abstracts with 3 options (yes/no/maybe). LM-eval measures accuracy. Wisent uses log_likelihoods to compare probabilities."
-   },
-   "anli_r1": {
-     "wisent_evaluator": "log_likelihoods",
-     "actual_lm_eval_harness_metrics": ["acc"],
-     "actual_lm_eval_harness_output_type": "multiple_choice",
-     "notes": "Both use multiple-choice format for Adversarial NLI Round 1 with 3 options (True/False/Neither). LM-eval measures accuracy. Wisent uses log_likelihoods to compare probabilities."
-   },
-   "mnli": {
-     "wisent_evaluator": "log_likelihoods",
-     "actual_lm_eval_harness_metrics": ["acc"],
-     "actual_lm_eval_harness_output_type": "multiple_choice",
-     "notes": "Both use multiple-choice format for GLUE Multi-Genre NLI with 3 options (True/Neither/False). LM-eval measures accuracy. Wisent uses log_likelihoods to compare probabilities."
-   },
-   "cola": {
-     "wisent_evaluator": "log_likelihoods",
-     "actual_lm_eval_harness_metrics": ["mcc"],
-     "actual_lm_eval_harness_output_type": "multiple_choice",
-     "notes": "LM-eval uses Matthews Correlation Coefficient for GLUE Corpus of Linguistic Acceptability with binary choices (no/yes). Wisent uses log_likelihoods. Different metrics - MCC vs log probability comparison."
-   },
-   "logiqa2": {
-     "wisent_evaluator": "log_likelihoods",
-     "actual_lm_eval_harness_metrics": ["acc", "acc_norm"],
-     "actual_lm_eval_harness_output_type": "multiple_choice",
-     "notes": "Both use multiple-choice format for logical reasoning. LM-eval measures accuracy and normalized accuracy. Wisent uses log_likelihoods to compare probabilities."
-   },
-   "mc_taco": {
-     "wisent_evaluator": "log_likelihoods",
-     "actual_lm_eval_harness_metrics": ["acc", "f1"],
-     "actual_lm_eval_harness_output_type": "multiple_choice",
-     "notes": "Both use multiple-choice format for temporal reasoning with binary plausibility judgments (no/yes). LM-eval measures accuracy and F1 score. Wisent uses log_likelihoods to compare probabilities."
-   },
-   "glue_rte": {
-     "wisent_evaluator": "log_likelihoods",
-     "actual_lm_eval_harness_metrics": ["acc"],
-     "actual_lm_eval_harness_output_type": "multiple_choice",
-     "notes": "Both use multiple-choice format for GLUE RTE with binary choices (True/False). LM-eval measures accuracy. Wisent uses log_likelihoods to compare probabilities."
-   },
-   "polemo2_in": {
-     "wisent_evaluator": "log_likelihoods",
-     "actual_lm_eval_harness_metrics": ["f1", "acc"],
-     "actual_lm_eval_harness_output_type": "generate_until",
-     "notes": "LM-eval uses micro-averaged F1 and accuracy for Polish sentiment with 4 classes (A-D). Generates text until period/comma, extracts letter. Wisent uses log_likelihoods for probability comparison."
-   },
-   "mutual": {
-     "wisent_evaluator": "log_likelihoods",
-     "actual_lm_eval_harness_metrics": ["r@1", "r@2", "mrr"],
-     "actual_lm_eval_harness_output_type": "multiple_choice",
-     "notes": "LM-eval uses ranking metrics (recall@1, recall@2, MRR) for multiple-choice dialogue with 4 options. Wisent uses log_likelihoods. Different evaluation approach - ranking vs probability comparison."
-   },
-   "siqa": {
-     "wisent_evaluator": "log_likelihoods",
-     "actual_lm_eval_harness_metrics": ["acc"],
-     "actual_lm_eval_harness_output_type": "multiple_choice",
-     "notes": "Both use multiple-choice format for Social IQA with 3 options (answerA/B/C). LM-eval measures accuracy. Wisent uses log_likelihoods to compare probabilities."
-   },
-   "ifeval": {
-     "wisent_evaluator": "generation",
-     "actual_lm_eval_harness_metrics": ["prompt_level_strict_acc", "inst_level_strict_acc", "prompt_level_loose_acc", "inst_level_loose_acc"],
-     "actual_lm_eval_harness_output_type": "generate_until",
-     "notes": "LM-eval uses 4 instruction-following accuracy metrics (strict/loose at prompt/instruction level) for IFEval. Generates up to 1280 tokens. Wisent uses generic generation evaluator. Different evaluation - instruction compliance vs string comparison."
-   },
-   "babi": {
-     "wisent_evaluator": "exact_match",
-     "actual_lm_eval_harness_metrics": ["exact_match"],
-     "actual_lm_eval_harness_output_type": "generate_until",
-     "notes": "Both use exact_match for bAbI reading comprehension. LM-eval generates text until newline or 'Passage:'. Wisent uses exact_match similarly."
-   },
-   "crows_pairs_english": {
-     "wisent_evaluator": "log_likelihoods",
-     "actual_lm_eval_harness_metrics": ["likelihood_difference", "pct_stereotype"],
-     "actual_lm_eval_harness_output_type": "multiple_choice",
-     "notes": "LM-eval uses bias metrics (likelihood difference, stereotype percentage) for CROWS-Pairs bias detection. Lower scores are better. Wisent uses log_likelihoods. Different purpose - bias measurement vs task performance."
-   },
-   "mbpp": {
-     "wisent_evaluator": "MISSING",
-     "actual_lm_eval_harness_metrics": ["pass@1"],
-     "actual_lm_eval_harness_output_type": "generate_until",
-     "notes": "LM-eval uses pass@1 metric by executing generated Python code against test cases (allows unsafe code execution). Wisent has NO evaluator defined for MBPP - benchmark is broken. Should use docker_code evaluator."
-   },
-   "realtoxicityprompts": {
-     "wisent_evaluator": "generation",
-     "actual_lm_eval_harness_metrics": ["toxicity_perspective_api", "perspective_api_toxicity_score"],
-     "actual_lm_eval_harness_output_type": "generate_until",
-     "notes": "LM-eval uses Perspective API to measure toxicity in generated completions. Lower scores are better. Wisent uses generic generation evaluator. Different purpose - toxicity measurement vs string comparison."
-   },
-   "leaderboard_mmlu_pro": {
-     "wisent_evaluator": "log_likelihoods",
-     "actual_lm_eval_harness_metrics": ["acc"],
-     "actual_lm_eval_harness_output_type": "multiple_choice",
-     "notes": "Both use multiple-choice format for MMLU-Pro leaderboard task. LM-eval uses 5-shot with validation examples, measures accuracy. Wisent uses log_likelihoods to compare probabilities."
-   },
-   "leaderboard_ifeval": {
-     "wisent_evaluator": "generation",
-     "actual_lm_eval_harness_metrics": ["prompt_level_strict_acc", "inst_level_strict_acc", "prompt_level_loose_acc", "inst_level_loose_acc"],
-     "actual_lm_eval_harness_output_type": "generate_until",
-     "notes": "LM-eval uses 4 instruction-following accuracy metrics for leaderboard IFEval. Generates up to 1280 tokens. Wisent uses generic generation evaluator. Different evaluation - instruction compliance vs string comparison."
-   },
-   "medmcqa": {
-     "wisent_evaluator": "log_likelihoods",
-     "actual_lm_eval_harness_metrics": ["acc", "acc_norm"],
-     "actual_lm_eval_harness_output_type": "multiple_choice",
-     "notes": "Both use multiple-choice format for medical MCQ with 4 options (A/B/C/D). LM-eval measures accuracy and normalized accuracy. Wisent uses log_likelihoods to compare probabilities."
-   },
-   "tinyHellaswag": {
-     "wisent_evaluator": "log_likelihoods",
-     "actual_lm_eval_harness_metrics": ["acc_norm"],
-     "actual_lm_eval_harness_output_type": "multiple_choice",
-     "notes": "Both use multiple-choice format for tiny HellaSwag subset. LM-eval uses normalized accuracy with custom GPIRT aggregation. Wisent uses log_likelihoods to compare probabilities."
-   },
-   "tinyMMLU": {
-     "wisent_evaluator": "log_likelihoods",
-     "actual_lm_eval_harness_metrics": ["acc_norm"],
-     "actual_lm_eval_harness_output_type": "multiple_choice",
-     "notes": "Both use multiple-choice format for tiny MMLU subset with 4 options (A/B/C/D). LM-eval uses normalized accuracy with custom GPIRT aggregation. Wisent uses log_likelihoods to compare probabilities."
-   },
-   "tinyGSM8k": {
-     "wisent_evaluator": "exact_match",
-     "actual_lm_eval_harness_metrics": ["exact_match"],
-     "actual_lm_eval_harness_output_type": "generate_until",
-     "notes": "Both use exact_match for tiny GSM8k subset. LM-eval uses custom GPIRT aggregation with regex answer extraction. Wisent uses exact_match similarly."
-   },
-   "tinyArc": {
-     "wisent_evaluator": "log_likelihoods",
-     "actual_lm_eval_harness_metrics": ["acc_norm"],
-     "actual_lm_eval_harness_output_type": "multiple_choice",
-     "notes": "Both use multiple-choice format for tiny ARC-Challenge subset with 25-shot. LM-eval uses normalized accuracy with custom GPIRT aggregation. Wisent uses log_likelihoods to compare probabilities."
-   },
-   "tinyWinogrande": {
-     "wisent_evaluator": "log_likelihoods",
-     "actual_lm_eval_harness_metrics": ["acc_norm"],
-     "actual_lm_eval_harness_output_type": "multiple_choice",
-     "notes": "Both use multiple-choice format for tiny Winogrande subset with 5-shot. LM-eval uses normalized accuracy with custom GPIRT aggregation. Wisent uses log_likelihoods to compare probabilities."
-   },
-   "wikitext": {
-     "wisent_evaluator": "log_likelihoods",
-     "actual_lm_eval_harness_metrics": ["word_perplexity", "byte_perplexity", "bits_per_byte"],
-     "actual_lm_eval_harness_output_type": "loglikelihood_rolling",
-     "notes": "LM-eval uses perplexity metrics for language modeling with loglikelihood_rolling evaluation. Wisent uses log_likelihoods. Different purpose - language modeling evaluation vs task performance."
-   },
-   "qa4mre_2011": {
-     "wisent_evaluator": "log_likelihoods",
-     "actual_lm_eval_harness_metrics": ["acc", "acc_norm"],
-     "actual_lm_eval_harness_output_type": "multiple_choice",
-     "notes": "Both use multiple-choice format for QA4MRE 2011 reading comprehension. LM-eval measures accuracy and normalized accuracy. Wisent uses log_likelihoods to compare probabilities."
-   },
-   "agieval_en": {
-     "wisent_evaluator": "log_likelihoods",
-     "actual_lm_eval_harness_metrics": ["acc"],
-     "actual_lm_eval_harness_output_type": "unknown",
-     "notes": "LM-eval uses accuracy metric with mean aggregation and weight_by_size: true. Output type not explicitly specified in config. Wisent uses log_likelihoods to compare probabilities."
-   },
-   "squadv2": {
-     "wisent_evaluator": "exact_match",
-     "actual_lm_eval_harness_metrics": ["exact_match", "f1"],
-     "actual_lm_eval_harness_output_type": "function-based (SQuAD2)",
-     "notes": "LM-eval uses function-based task class for SQuAD v2 with exact_match and F1 metrics. Wisent uses exact_match. Both handle questions where no answer is possible."
-   },
-   "xwinograd_en": {
-     "wisent_evaluator": "log_likelihoods",
-     "actual_lm_eval_harness_metrics": ["acc"],
-     "actual_lm_eval_harness_output_type": "multiple_choice",
-     "notes": "Both use multiple-choice format for cross-lingual Winograd Schema Challenge. LM-eval measures accuracy with mean aggregation. Wisent uses log_likelihoods to compare probabilities."
-   },
-   "scrolls_qasper": {
-     "wisent_evaluator": "exact_match",
-     "actual_lm_eval_harness_metrics": ["exact_match", "f1"],
-     "actual_lm_eval_harness_output_type": "function-based (Qasper)",
-     "notes": "LM-eval uses function-based task class for SCROLLS Qasper with exact_match and F1 metrics for scientific paper QA. Wisent uses exact_match. Both handle long document reasoning."
-   },
-   "truthfulqa_mc2": {
-     "wisent_evaluator": "log_likelihoods",
-     "actual_lm_eval_harness_metrics": ["acc"],
-     "actual_lm_eval_harness_output_type": "mc2",
-     "notes": "LM-eval uses mc2 (multiple choice with 2 options) format for TruthfulQA. Measures accuracy. Wisent uses log_likelihoods to compare probabilities."
-   },
-   "truthfulqa_gen": {
-     "wisent_evaluator": "generation",
-     "actual_lm_eval_harness_metrics": ["bleu_max", "bleu_acc", "bleu_diff", "rouge1_max", "rouge1_acc", "rouge1_diff", "rouge2_max", "rouge2_acc", "rouge2_diff", "rougeL_max", "rougeL_acc", "rougeL_diff"],
-     "actual_lm_eval_harness_output_type": "generate_until",
-     "notes": "LM-eval uses 12 BLEU/ROUGE metrics (max/acc/diff variants) for generation-based TruthfulQA. Wisent uses generic generation evaluator. Different evaluation - semantic similarity vs string comparison."
-   },
-   "xnli_en": {
-     "wisent_evaluator": "log_likelihoods",
-     "actual_lm_eval_harness_metrics": ["acc"],
-     "actual_lm_eval_harness_output_type": "multiple_choice",
-     "notes": "Both use multiple-choice format for cross-lingual NLI with 3 options (True/Neither/False). LM-eval measures accuracy. Wisent uses log_likelihoods to compare probabilities."
-   },
-   "winogender": {
-     "wisent_evaluator": "log_likelihoods",
-     "actual_lm_eval_harness_metrics": ["acc"],
-     "actual_lm_eval_harness_output_type": "multiple_choice",
-     "notes": "Both use multiple-choice format for gender bias detection in Winograd schemas. LM-eval measures accuracy. Wisent uses log_likelihoods to compare probabilities."
-   },
-   "hendrycks_math": {
-     "wisent_evaluator": "exact_match",
-     "actual_lm_eval_harness_metrics": ["exact_match"],
-     "actual_lm_eval_harness_output_type": "generate_until",
-     "notes": "Both use exact_match for Hendrycks MATH dataset with LaTeX math problems. LM-eval generates text with weighted aggregation by dataset size. Wisent uses exact_match similarly."
-   },
-   "aime": {
-     "wisent_evaluator": "exact_match",
-     "actual_lm_eval_harness_metrics": ["exact_match"],
-     "actual_lm_eval_harness_output_type": "generate_until",
-     "notes": "Both use exact_match for AIME (American Invitational Mathematics Examination) competition problems. LM-eval generates text with mean aggregation. Wisent uses exact_match similarly."
-   },
-   "xquad_en": {
-     "wisent_evaluator": "exact_match",
-     "actual_lm_eval_harness_metrics": ["exact_match", "f1"],
-     "actual_lm_eval_harness_output_type": "generate_until",
-     "notes": "LM-eval uses exact_match and F1 for cross-lingual QA (English subset of XQuAD). Generates text for reading comprehension. Wisent uses exact_match. Both evaluate extractive QA."
-   }
- }
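The deleted mapping above repeatedly contrasts two evaluation styles: a "log_likelihoods" evaluator that picks the answer option the model assigns the highest probability, and an "exact_match" evaluator that normalizes generated text and compares it to the gold answer. The sketch below is illustrative only; the function names and the toy scorer are assumptions for this note, not the wisent or lm-eval-harness APIs.

```python
# Minimal sketch of the two evaluator styles contrasted in the deleted mapping.
# Hypothetical helpers for illustration; not part of the wisent package.
from typing import Callable, Sequence
import re


def pick_by_log_likelihood(
    score: Callable[[str, str], float],  # assumed scorer: log P(choice | prompt)
    prompt: str,
    choices: Sequence[str],
) -> int:
    """Return the index of the choice with the highest log-likelihood."""
    scores = [score(prompt, choice) for choice in choices]
    return max(range(len(choices)), key=scores.__getitem__)


def exact_match(prediction: str, gold: str) -> bool:
    """Case-insensitive, punctuation-stripped string comparison."""
    def normalize(s: str) -> str:
        return re.sub(r"[^\w\s]", "", s).strip().lower()
    return normalize(prediction) == normalize(gold)


if __name__ == "__main__":
    # Toy scores standing in for a real model's summed token log-probabilities.
    toy_scores = {"Paris": -1.2, "Lyon": -4.7, "Nice": -5.3}
    best = pick_by_log_likelihood(
        lambda _prompt, choice: toy_scores[choice],
        "Q: What is the capital of France? A:",
        list(toy_scores),
    )
    print(list(toy_scores)[best])            # Paris
    print(exact_match(" Paris. ", "paris"))  # True
```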