wisent 0.7.379__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (1720)
  1. wisent/__init__.py +64 -0
  2. wisent/cli.py +114 -0
  3. wisent/core/__init__.py +40 -0
  4. wisent/core/activations/__init__.py +26 -0
  5. wisent/core/activations/activations.py +97 -0
  6. wisent/core/activations/activations_collector.py +506 -0
  7. wisent/core/activations/core/__init__.py +0 -0
  8. wisent/core/activations/core/atoms.py +219 -0
  9. wisent/core/activations/prompt_construction_strategy.py +47 -0
  10. wisent/core/adapters/__init__.py +22 -0
  11. wisent/core/adapters/audio.py +616 -0
  12. wisent/core/adapters/base.py +420 -0
  13. wisent/core/adapters/multimodal.py +738 -0
  14. wisent/core/adapters/robotics.py +643 -0
  15. wisent/core/adapters/text.py +441 -0
  16. wisent/core/adapters/video.py +555 -0
  17. wisent/core/agent/__init__.py +1 -0
  18. wisent/core/agent/budget.py +644 -0
  19. wisent/core/agent/device_benchmarks.py +691 -0
  20. wisent/core/agent/diagnose/__init__.py +1 -0
  21. wisent/core/agent/diagnose/agent_classifier_decision.py +641 -0
  22. wisent/core/agent/diagnose/classifier_marketplace.py +554 -0
  23. wisent/core/agent/diagnose/create_classifier.py +1155 -0
  24. wisent/core/agent/diagnose/response_diagnostics.py +273 -0
  25. wisent/core/agent/diagnose/select_classifiers.py +507 -0
  26. wisent/core/agent/diagnose/synthetic_classifier_option.py +755 -0
  27. wisent/core/agent/diagnose/tasks/__init__.py +33 -0
  28. wisent/core/agent/diagnose/tasks/task_manager.py +1453 -0
  29. wisent/core/agent/diagnose/tasks/task_relevance.py +94 -0
  30. wisent/core/agent/diagnose/tasks/task_selector.py +151 -0
  31. wisent/core/agent/diagnose.py +249 -0
  32. wisent/core/agent/steer.py +215 -0
  33. wisent/core/agent/timeout.py +134 -0
  34. wisent/core/autonomous_agent.py +1158 -0
  35. wisent/core/benchmark_extractors.py +372 -0
  36. wisent/core/benchmark_registry.py +151 -0
  37. wisent/core/bigcode_extractors.py +26 -0
  38. wisent/core/bigcode_integration.py +886 -0
  39. wisent/core/branding.py +108 -0
  40. wisent/core/classifier/__init__.py +1 -0
  41. wisent/core/classifier/models/__init__.py +1 -0
  42. wisent/core/classifiers/__init__.py +1 -0
  43. wisent/core/classifiers/classifiers/__init__.py +0 -0
  44. wisent/core/classifiers/classifiers/core/__init__.py +0 -0
  45. wisent/core/classifiers/classifiers/core/atoms.py +748 -0
  46. wisent/core/classifiers/classifiers/models/__init__.py +0 -0
  47. wisent/core/classifiers/classifiers/models/logistic.py +29 -0
  48. wisent/core/classifiers/classifiers/models/mlp.py +47 -0
  49. wisent/core/classifiers/classifiers/rotator.py +137 -0
  50. wisent/core/classifiers/core/__init__.py +1 -0
  51. wisent/core/classifiers/models/__init__.py +1 -0
  52. wisent/core/classifiers/pipeline_steps/__init__.py +1 -0
  53. wisent/core/cli/__init__.py +26 -0
  54. wisent/core/cli/agent/__init__.py +15 -0
  55. wisent/core/cli/agent/apply_steering.py +192 -0
  56. wisent/core/cli/agent/evaluate_response.py +128 -0
  57. wisent/core/cli/agent/generate_synthetic_pairs.py +123 -0
  58. wisent/core/cli/agent/main.py +139 -0
  59. wisent/core/cli/agent/train_classifier.py +173 -0
  60. wisent/core/cli/check_linearity.py +126 -0
  61. wisent/core/cli/create_steering_vector.py +304 -0
  62. wisent/core/cli/diagnose_pairs.py +153 -0
  63. wisent/core/cli/diagnose_vectors.py +404 -0
  64. wisent/core/cli/estimate_unified_goodness_time.py +428 -0
  65. wisent/core/cli/evaluate_refusal.py +241 -0
  66. wisent/core/cli/evaluate_responses.py +926 -0
  67. wisent/core/cli/generate_humanization_pairs.py +128 -0
  68. wisent/core/cli/generate_pairs.py +175 -0
  69. wisent/core/cli/generate_pairs_from_task.py +108 -0
  70. wisent/core/cli/generate_responses.py +160 -0
  71. wisent/core/cli/generate_vector_from_synthetic.py +217 -0
  72. wisent/core/cli/generate_vector_from_task.py +248 -0
  73. wisent/core/cli/get_activations.py +192 -0
  74. wisent/core/cli/inference_config.py +84 -0
  75. wisent/core/cli/inference_config_cli.py +54 -0
  76. wisent/core/cli/modify_weights.py +660 -0
  77. wisent/core/cli/multi_steer.py +112 -0
  78. wisent/core/cli/optimization_cache.py +298 -0
  79. wisent/core/cli/optimize.py +621 -0
  80. wisent/core/cli/optimize_classification.py +473 -0
  81. wisent/core/cli/optimize_sample_size.py +390 -0
  82. wisent/core/cli/optimize_steering.py +3421 -0
  83. wisent/core/cli/optimize_weights.py +1287 -0
  84. wisent/core/cli/steering_method_trainer.py +641 -0
  85. wisent/core/cli/steering_search_space.py +508 -0
  86. wisent/core/cli/tasks.py +940 -0
  87. wisent/core/cli/train_unified_goodness.py +681 -0
  88. wisent/core/cli_logger.py +22 -0
  89. wisent/core/config_manager.py +1731 -0
  90. wisent/core/contrastive_pairs/__init__.py +15 -0
  91. wisent/core/contrastive_pairs/core/__init__.py +0 -0
  92. wisent/core/contrastive_pairs/core/atoms.py +45 -0
  93. wisent/core/contrastive_pairs/core/buliders.py +59 -0
  94. wisent/core/contrastive_pairs/core/pair.py +183 -0
  95. wisent/core/contrastive_pairs/core/response.py +153 -0
  96. wisent/core/contrastive_pairs/core/serialization.py +306 -0
  97. wisent/core/contrastive_pairs/core/set.py +192 -0
  98. wisent/core/contrastive_pairs/diagnostics/__init__.py +79 -0
  99. wisent/core/contrastive_pairs/diagnostics/activations.py +53 -0
  100. wisent/core/contrastive_pairs/diagnostics/base.py +73 -0
  101. wisent/core/contrastive_pairs/diagnostics/control_vectors.py +1655 -0
  102. wisent/core/contrastive_pairs/diagnostics/coverage.py +79 -0
  103. wisent/core/contrastive_pairs/diagnostics/divergence.py +98 -0
  104. wisent/core/contrastive_pairs/diagnostics/duplicates.py +118 -0
  105. wisent/core/contrastive_pairs/diagnostics/linearity.py +325 -0
  106. wisent/core/contrastive_pairs/diagnostics/vector_quality.py +620 -0
  107. wisent/core/contrastive_pairs/huggingface_pairs/__init__.py +1 -0
  108. wisent/core/contrastive_pairs/huggingface_pairs/atoms.py +255 -0
  109. wisent/core/contrastive_pairs/huggingface_pairs/hf_extractor_manifest.py +470 -0
  110. wisent/core/contrastive_pairs/huggingface_pairs/hf_extractor_registry.py +136 -0
  111. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/__init__.py +44 -0
  112. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/agentbench.py +225 -0
  113. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/agentharm.py +267 -0
  114. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/agentic_search.py +444 -0
  115. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/aider_polyglot.py +225 -0
  116. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/aime.py +118 -0
  117. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/aime2024.py +74 -0
  118. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/aime2025.py +73 -0
  119. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/alpaca_eval.py +153 -0
  120. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/apps.py +182 -0
  121. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/arena_hard.py +179 -0
  122. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/atis.py +89 -0
  123. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/babilong.py +96 -0
  124. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/bangla_mmlu.py +108 -0
  125. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/basqueglue.py +217 -0
  126. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/bec2016eu.py +99 -0
  127. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/bfcl.py +283 -0
  128. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/bhtc_v2.py +87 -0
  129. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/browsecomp.py +245 -0
  130. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/chain_of_thought.py +89 -0
  131. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/chinese_simpleqa.py +209 -0
  132. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/cluewsc.py +177 -0
  133. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/cnn_dailymail.py +92 -0
  134. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codeforces.py +378 -0
  135. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue.py +109 -0
  136. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text.py +15 -0
  137. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_go.py +64 -0
  138. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_java.py +65 -0
  139. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_javascript.py +65 -0
  140. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_php.py +65 -0
  141. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_python.py +65 -0
  142. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_ruby.py +65 -0
  143. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/coding_benchmarks.py +844 -0
  144. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/coedit_gec.py +79 -0
  145. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/conala.py +133 -0
  146. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/concode.py +111 -0
  147. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/dbpedia_14.py +91 -0
  148. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/doc_vqa.py +102 -0
  149. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/donotanswer.py +236 -0
  150. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/ds1000.py +129 -0
  151. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/ds_1000.py +155 -0
  152. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/epec_koref_bin.py +85 -0
  153. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/ethos_binary.py +82 -0
  154. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/evalita_mp.py +165 -0
  155. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/evalita_sp_sum_task_fp_small_p1.py +89 -0
  156. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/facts_grounding.py +181 -0
  157. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/faithbench.py +295 -0
  158. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/financial_tweets.py +100 -0
  159. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/flames.py +270 -0
  160. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/flan_held_in.py +98 -0
  161. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/flores.py +572 -0
  162. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/frames.py +143 -0
  163. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/freebase.py +99 -0
  164. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/get_negative_example_livecodebench.py +146 -0
  165. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/get_positive_example_livecodebench.py +140 -0
  166. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/gpt3_translation_benchmarks.py +98 -0
  167. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/hallucinations_leaderboard.py +389 -0
  168. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/halueval.py +246 -0
  169. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/harmbench.py +250 -0
  170. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/healthbench.py +181 -0
  171. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/hle.py +106 -0
  172. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/hmmt.py +117 -0
  173. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/humaneval.py +119 -0
  174. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/humanevalpack.py +102 -0
  175. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/instruct_humaneval.py +180 -0
  176. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/instructhumaneval.py +129 -0
  177. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/iwslt2017_ar_en.py +98 -0
  178. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/iwslt2017_en_ar.py +98 -0
  179. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/jailbreakbench.py +258 -0
  180. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/law_stack_exchange.py +101 -0
  181. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/ledgar.py +118 -0
  182. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/livecodebench.py +61 -0
  183. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/livecodebench_contrastive_pair_generator.py +491 -0
  184. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/livecodebench_v6.py +263 -0
  185. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/livemathbench.py +230 -0
  186. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/llama.py +96 -0
  187. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/longform_writing.py +285 -0
  188. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/m_mmlu.py +96 -0
  189. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/math.py +186 -0
  190. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/math500.py +146 -0
  191. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/mbpp.py +142 -0
  192. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/meddialog.py +79 -0
  193. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/medical_abstracts.py +101 -0
  194. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/medium_priority_benchmarks.py +787 -0
  195. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/mercury.py +111 -0
  196. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/mmlu_redux.py +194 -0
  197. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/mmlusr.py +108 -0
  198. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/multimedqa.py +99 -0
  199. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/multipl_e.py +109 -0
  200. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/multiple.py +96 -0
  201. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/multiple_choice.py +87 -0
  202. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/multiple_cpp.py +128 -0
  203. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/multiple_go.py +128 -0
  204. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/multiple_java.py +128 -0
  205. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/multiple_js.py +128 -0
  206. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/multiple_py.py +15 -0
  207. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/multiple_rs.py +128 -0
  208. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/non_greedy_robustness_agieval_aqua_rat.py +92 -0
  209. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/olympiadbench.py +287 -0
  210. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/openllm.py +99 -0
  211. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/option_order_robustness_agieval_aqua_rat.py +92 -0
  212. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/or_bench.py +300 -0
  213. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/penn_treebank.py +80 -0
  214. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/planbench.py +317 -0
  215. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/polymath.py +467 -0
  216. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/prompt_robustness_agieval_aqua_rat.py +92 -0
  217. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/pythia.py +99 -0
  218. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/recode.py +131 -0
  219. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/refusalbench.py +280 -0
  220. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/scicode.py +275 -0
  221. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/self_consistency.py +90 -0
  222. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/simpleqa.py +145 -0
  223. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/sorry_bench.py +211 -0
  224. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/stsb.py +79 -0
  225. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/super_glue_lm_eval_v1.py +99 -0
  226. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/super_glue_lm_eval_v1_seq2seq.py +98 -0
  227. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/super_glue_t5_prompt.py +123 -0
  228. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/super_gpqa.py +106 -0
  229. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/swe_bench.py +428 -0
  230. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/swe_bench_verified.py +158 -0
  231. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/sycophancy_eval.py +205 -0
  232. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/t0_eval.py +79 -0
  233. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/tag.py +98 -0
  234. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/tau_bench.py +305 -0
  235. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/tmlu.py +109 -0
  236. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/toolbench.py +360 -0
  237. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/toolemu.py +386 -0
  238. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/travelplanner.py +286 -0
  239. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/truthfulqa_generation.py +128 -0
  240. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/unfair_tos.py +83 -0
  241. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/vaxx_stance.py +86 -0
  242. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/wiceu.py +85 -0
  243. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/wikitext103.py +97 -0
  244. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/wildguard.py +280 -0
  245. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/wmt14_en_fr.py +97 -0
  246. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/wmt14_fr_en.py +97 -0
  247. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/wmt16_de_en.py +90 -0
  248. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/wmt16_en_de.py +90 -0
  249. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/wmt16_en_ro.py +90 -0
  250. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/wmt16_ro_en.py +90 -0
  251. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/wmt_ro_en_t5_prompt.py +90 -0
  252. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/xsum.py +81 -0
  253. wisent/core/contrastive_pairs/lm_eval_pairs/__init__.py +0 -0
  254. wisent/core/contrastive_pairs/lm_eval_pairs/atoms.py +265 -0
  255. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/__init__.py +472 -0
  256. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/aclue.py +24 -0
  257. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/acp.py +33 -0
  258. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/acpbench.py +39 -0
  259. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/advanced_ai_risk.py +59 -0
  260. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/aexams.py +14 -0
  261. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrimgsm.py +10 -0
  262. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrimmlu.py +10 -0
  263. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrixnli.py +9 -0
  264. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench.py +14 -0
  265. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_adr.py +9 -0
  266. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_afriqa.py +9 -0
  267. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_afrisenti.py +9 -0
  268. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_belebele.py +9 -0
  269. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_flores.py +9 -0
  270. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_injongointent.py +9 -0
  271. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_mafand.py +9 -0
  272. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_masakhaner.py +9 -0
  273. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_masakhanews.py +9 -0
  274. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_masakhapos.py +9 -0
  275. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_naijarc.py +9 -0
  276. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_nollysenti.py +9 -0
  277. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_ntrex.py +9 -0
  278. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_openai_mmlu.py +9 -0
  279. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_salt.py +9 -0
  280. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_sib.py +9 -0
  281. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_uhura_arc_easy.py +9 -0
  282. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_xlsum.py +9 -0
  283. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/agieval.py +33 -0
  284. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/anli.py +9 -0
  285. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/arab_culture.py +24 -0
  286. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/arabic_leaderboard_acva.py +67 -0
  287. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/arabic_leaderboard_acva_light.py +67 -0
  288. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/arabic_leaderboard_complete.py +24 -0
  289. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/arabic_leaderboard_light.py +81 -0
  290. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/arabicmmlu.py +59 -0
  291. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/aradice.py +36 -0
  292. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/arc.py +61 -0
  293. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/arithmetic.py +19 -0
  294. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/basque_bench.py +37 -0
  295. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/bbh.py +121 -0
  296. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/bbq.py +9 -0
  297. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/belebele.py +293 -0
  298. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/bertaqa.py +25 -0
  299. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/bigbench.py +300 -0
  300. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/blimp.py +76 -0
  301. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/careqa.py +9 -0
  302. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/catalan_bench.py +43 -0
  303. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/ceval_valid.py +61 -0
  304. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/cmmlu.py +76 -0
  305. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/code_x_glue.py +16 -0
  306. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/copal_id.py +11 -0
  307. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/crows_pairs.py +31 -0
  308. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/csatqa.py +15 -0
  309. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/darija.py +29 -0
  310. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/darijammlu.py +57 -0
  311. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/egymmlu.py +62 -0
  312. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/eus.py +76 -0
  313. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/evalita_mp.py +93 -0
  314. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/fld.py +9 -0
  315. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/flores.py +466 -0
  316. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/freebase.py +9 -0
  317. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/french_bench.py +23 -0
  318. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/galician_bench.py +41 -0
  319. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/glianorex.py +11 -0
  320. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/global_mmlu.py +115 -0
  321. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/gpqa.py +27 -0
  322. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/gsm8k.py +9 -0
  323. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/gsm8k_platinum.py +9 -0
  324. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/haerae.py +14 -0
  325. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/headqa.py +11 -0
  326. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/hellaswag.py +39 -0
  327. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/hendrycks_ethics.py +14 -0
  328. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/hendrycks_math.py +9 -0
  329. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/hrm8k.py +20 -0
  330. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/inverse.py +22 -0
  331. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/japanese_leaderboard.py +20 -0
  332. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/jsonschema_bench.py +9 -0
  333. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/kbl.py +85 -0
  334. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/kmmlu.py +281 -0
  335. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/kobest.py +14 -0
  336. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/kormedmcqa.py +9 -0
  337. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/lambada.py +28 -0
  338. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/leaderboard.py +52 -0
  339. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/libra.py +9 -0
  340. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/lingoly.py +11 -0
  341. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/longbench.py +9 -0
  342. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/m.py +43 -0
  343. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/mastermind.py +9 -0
  344. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/mathqa.py +9 -0
  345. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/med.py +24 -0
  346. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/meddialog.py +12 -0
  347. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/medqa.py +9 -0
  348. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/mela.py +18 -0
  349. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/metabench.py +36 -0
  350. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/mgsm.py +44 -0
  351. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/minerva_math.py +16 -0
  352. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/mlqa.py +58 -0
  353. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/mmlu.py +70 -0
  354. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/mmlu_pro.py +23 -0
  355. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/mmlu_pro_plus.py +23 -0
  356. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/mmlu_prox.py +191 -0
  357. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/mmlusr.py +9 -0
  358. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/mmmu.py +46 -0
  359. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/model_written_evals.py +9 -0
  360. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/multiblimp.py +111 -0
  361. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/non.py +23 -0
  362. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/noreval.py +143 -0
  363. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/noridiom.py +20 -0
  364. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/nortruthfulqa.py +32 -0
  365. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/nrk.py +20 -0
  366. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/okapi.py +9 -0
  367. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/okapi_arc_multilingual.py +10 -0
  368. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/okapi_hellaswag_multilingual.py +24 -0
  369. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/okapi_mmlu_multilingual.py +24 -0
  370. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/okapi_truthfulqa_multilingual.py +34 -0
  371. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/paloma.py +25 -0
  372. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/pawsx.py +9 -0
  373. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/persona.py +144 -0
  374. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/pile.py +31 -0
  375. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/polemo2.py +9 -0
  376. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/portuguese_bench.py +31 -0
  377. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/prompt.py +23 -0
  378. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/qa4mre.py +12 -0
  379. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/qasper.py +11 -0
  380. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/ru.py +19 -0
  381. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/ruler.py +9 -0
  382. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/score.py +20 -0
  383. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/scrolls.py +9 -0
  384. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/self_consistency.py +11 -0
  385. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/spanish_bench.py +38 -0
  386. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/storycloze.py +9 -0
  387. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/super_glue_t5_prompt.py +17 -0
  388. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/tinyBenchmarks.py +9 -0
  389. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/tmlu.py +9 -0
  390. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/tmmluplus.py +80 -0
  391. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/translation.py +9 -0
  392. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/truthfulqa.py +76 -0
  393. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/truthfulqa_multi.py +24 -0
  394. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/turkishmmlu.py +30 -0
  395. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/unitxt.py +23 -0
  396. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/unscramble.py +9 -0
  397. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/winogender.py +16 -0
  398. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/wmdp.py +12 -0
  399. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/wmt14.py +16 -0
  400. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/wmt16.py +22 -0
  401. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/wsc273.py +9 -0
  402. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/xcopa.py +21 -0
  403. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/xnli.py +28 -0
  404. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/xnli_eu.py +12 -0
  405. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/xquad.py +22 -0
  406. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/xstorycloze.py +22 -0
  407. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/xwinograd.py +15 -0
  408. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +478 -0
  409. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_registry.py +140 -0
  410. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/__init__.py +125 -0
  411. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aclue.py +171 -0
  412. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/acp_bench.py +207 -0
  413. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/acp_bench_hard.py +185 -0
  414. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/advanced.py +130 -0
  415. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aexams.py +184 -0
  416. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrimgsm.py +98 -0
  417. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrimmlu.py +113 -0
  418. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrixnli.py +129 -0
  419. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrobench_cot.py +88 -0
  420. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrobench_mc.py +107 -0
  421. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ag.py +134 -0
  422. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/agieval.py +155 -0
  423. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ai2_arc.py +114 -0
  424. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/anagrams1.py +81 -0
  425. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/anagrams2.py +81 -0
  426. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/anli.py +140 -0
  427. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabculture.py +180 -0
  428. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic.py +98 -0
  429. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_exams.py +104 -0
  430. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_leaderboard_complete.py +168 -0
  431. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_leaderboard_light.py +168 -0
  432. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabicmmlu.py +167 -0
  433. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aradice.py +268 -0
  434. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc.py +133 -0
  435. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_challenge.py +118 -0
  436. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_easy.py +118 -0
  437. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_gen.py +101 -0
  438. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_mc.py +106 -0
  439. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/argument.py +134 -0
  440. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arithmetic.py +114 -0
  441. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/asdiv.py +122 -0
  442. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/assin.py +103 -0
  443. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/babi.py +113 -0
  444. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/basque_bench.py +155 -0
  445. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/basque_bench_gen.py +168 -0
  446. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/basque_bench_mc.py +139 -0
  447. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bbh.py +133 -0
  448. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bbq.py +169 -0
  449. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/belebele.py +181 -0
  450. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/benchmarks.py +155 -0
  451. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bertaqa.py +165 -0
  452. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bhs.py +155 -0
  453. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bhtc.py +143 -0
  454. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bigbench.py +170 -0
  455. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/blimp.py +171 -0
  456. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/blimp_nl.py +152 -0
  457. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/boolq.py +117 -0
  458. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/boolq_seq2seq.py +117 -0
  459. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/c4.py +150 -0
  460. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cabbq.py +152 -0
  461. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cabreu.py +127 -0
  462. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/careqa.py +169 -0
  463. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalan_bench.py +155 -0
  464. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalan_bench_gen.py +119 -0
  465. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalan_bench_mc.py +113 -0
  466. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalanqa.py +171 -0
  467. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catcola.py +139 -0
  468. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cb.py +117 -0
  469. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ceval.py +223 -0
  470. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ceval_valid.py +163 -0
  471. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/chain.py +110 -0
  472. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/chartqa.py +238 -0
  473. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/claim.py +151 -0
  474. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/click.py +152 -0
  475. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cmmlu.py +166 -0
  476. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cnn.py +144 -0
  477. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cocoteros.py +148 -0
  478. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/code2text.py +161 -0
  479. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/code_x_glue.py +114 -0
  480. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/codexglue.py +107 -0
  481. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coedit.py +149 -0
  482. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cola.py +83 -0
  483. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/commonsense.py +107 -0
  484. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/commonsense_qa.py +127 -0
  485. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copa.py +124 -0
  486. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copal_id.py +169 -0
  487. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coqa.py +162 -0
  488. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coqcat.py +114 -0
  489. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/crows_pairs.py +158 -0
  490. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/csatqa.py +152 -0
  491. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cycle.py +107 -0
  492. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cycle_letters.py +81 -0
  493. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darija_bench.py +221 -0
  494. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darijahellaswag.py +174 -0
  495. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darijammlu.py +152 -0
  496. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/dbpedia.py +157 -0
  497. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/discrim_eval.py +152 -0
  498. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/doc.py +107 -0
  499. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/drop.py +129 -0
  500. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/egyhellaswag.py +125 -0
  501. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/egymmlu.py +180 -0
  502. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/epec.py +142 -0
  503. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq.py +107 -0
  504. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench.py +194 -0
  505. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench_ca.py +152 -0
  506. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench_es.py +152 -0
  507. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/esbbq.py +152 -0
  508. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/escola.py +85 -0
  509. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ethics.py +135 -0
  510. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ethos.py +99 -0
  511. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus.py +107 -0
  512. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_exams.py +225 -0
  513. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_proficiency.py +159 -0
  514. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_reading.py +159 -0
  515. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_trivia.py +159 -0
  516. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/evalita_llm.py +166 -0
  517. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/evalita_sp.py +109 -0
  518. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/fda.py +105 -0
  519. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/financial.py +107 -0
  520. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/flan.py +114 -0
  521. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/fld.py +143 -0
  522. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/french_bench.py +202 -0
  523. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/french_bench_mc.py +98 -0
  524. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/french_bench_perplexity.py +86 -0
  525. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/galcola.py +109 -0
  526. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/galician_bench.py +155 -0
  527. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/galician_bench_gen.py +118 -0
  528. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/galician_bench_mc.py +112 -0
  529. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gaokao.py +141 -0
  530. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/glianorex.py +118 -0
  531. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/global_mmlu.py +171 -0
  532. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/global_piqa.py +152 -0
  533. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/glue.py +109 -0
  534. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gpqa.py +161 -0
  535. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gpt3.py +110 -0
  536. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/groundcocoa.py +184 -0
  537. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gsm.py +108 -0
  538. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gsm8k.py +134 -0
  539. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/haerae.py +152 -0
  540. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/headqa.py +112 -0
  541. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hellaswag.py +125 -0
  542. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hendrycks_ethics.py +225 -0
  543. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hendrycks_math.py +191 -0
  544. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/histoires_morales.py +179 -0
  545. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hle.py +111 -0
  546. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hrm8k.py +203 -0
  547. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/humaneval.py +124 -0
  548. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/humaneval_infilling.py +152 -0
  549. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/icelandic_winogrande.py +152 -0
  550. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ifeval.py +118 -0
  551. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/inverse.py +107 -0
  552. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/inverse_scaling.py +192 -0
  553. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/iwslt2017.py +117 -0
  554. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ja.py +107 -0
  555. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/japanese_leaderboard.py +155 -0
  556. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/japanese_leaderboard_gen.py +224 -0
  557. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/japanese_leaderboard_mc.py +120 -0
  558. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/jsonschema_bench.py +123 -0
  559. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kbl.py +140 -0
  560. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kmmlu.py +168 -0
  561. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kmmlu_cot.py +88 -0
  562. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kmmlu_mc.py +107 -0
  563. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kobest.py +165 -0
  564. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kormedmcqa.py +160 -0
  565. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada.py +147 -0
  566. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada_cloze.py +185 -0
  567. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada_multilingual.py +185 -0
  568. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada_multilingual_stablelm.py +141 -0
  569. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/law.py +107 -0
  570. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/leaderboard.py +194 -0
  571. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/libra.py +165 -0
  572. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lingoly.py +203 -0
  573. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/livemathbench.py +155 -0
  574. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/llama3.py +152 -0
  575. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lm_syneval.py +152 -0
  576. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logieval.py +82 -0
  577. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa.py +115 -0
  578. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa2.py +114 -0
  579. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/longbench.py +152 -0
  580. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/longbenchv2.py +152 -0
  581. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mastermind.py +203 -0
  582. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mathqa.py +137 -0
  583. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mbpp.py +123 -0
  584. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mc-taco.py +115 -0
  585. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/med_concepts_qa.py +224 -0
  586. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/meddialog.py +180 -0
  587. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medical.py +107 -0
  588. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mediqa_qa2019.py +123 -0
  589. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medmcqa.py +169 -0
  590. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medqa.py +118 -0
  591. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medtext.py +108 -0
  592. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mela.py +96 -0
  593. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/meqsum.py +115 -0
  594. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/metabench.py +154 -0
  595. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mgsm.py +122 -0
  596. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mimic_repsum.py +140 -0
  597. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/minerva_math.py +172 -0
  598. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mlqa.py +143 -0
  599. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlu.py +144 -0
  600. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlu_cot.py +88 -0
  601. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlu_mc.py +107 -0
  602. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlu_pro.py +145 -0
  603. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlusr.py +189 -0
  604. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmmu.py +150 -0
  605. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mnli.py +113 -0
  606. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/model_written_evals.py +115 -0
  607. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/moral_stories.py +151 -0
  608. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mrpc.py +111 -0
  609. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mts_dialog.py +118 -0
  610. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mts_dialog_perplexity.py +97 -0
  611. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multiblimp.py +134 -0
  612. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multilingual.py +106 -0
  613. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multirc.py +114 -0
  614. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mutual.py +113 -0
  615. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/non.py +107 -0
  616. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval.py +173 -0
  617. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_exact.py +157 -0
  618. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_gen.py +277 -0
  619. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_gen_exact.py +165 -0
  620. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_mc.py +228 -0
  621. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_mc_log_likelihoods.py +223 -0
  622. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noticia.py +105 -0
  623. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/nq_open.py +135 -0
  624. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi.py +27 -0
  625. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_arc_multilingual.py +167 -0
  626. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_hellaswag_multilingual.py +174 -0
  627. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_mmlu_multilingual.py +162 -0
  628. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_truthfulqa_multilingual.py +209 -0
  629. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/olaph.py +186 -0
  630. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/olaph_perplexity.py +97 -0
  631. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/openbookqa.py +118 -0
  632. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/option.py +107 -0
  633. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/paloma.py +205 -0
  634. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/parafraseja.py +110 -0
  635. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/parafrases.py +110 -0
  636. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/paws.py +107 -0
  637. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/paws_x.py +154 -0
  638. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pawsx.py +115 -0
  639. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/persona.py +246 -0
  640. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/phrases.py +144 -0
  641. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/phrases_ca_va.py +82 -0
  642. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pile.py +161 -0
  643. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pile_10k.py +140 -0
  644. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/piqa.py +116 -0
  645. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/polemo2.py +135 -0
  646. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/polymath.py +155 -0
  647. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/portuguese_bench.py +155 -0
  648. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/portuguese_bench_gen.py +121 -0
  649. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/portuguese_bench_mc.py +103 -0
  650. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prompt.py +107 -0
  651. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prost.py +115 -0
  652. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pubmedqa.py +112 -0
  653. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qa4mre.py +119 -0
  654. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper.py +118 -0
  655. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper_bool.py +112 -0
  656. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnli.py +111 -0
  657. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnlieu.py +107 -0
  658. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qqp.py +111 -0
  659. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/quac.py +111 -0
  660. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/race.py +124 -0
  661. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/random.py +107 -0
  662. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/realtoxicityprompts.py +124 -0
  663. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/record.py +125 -0
  664. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/reversed.py +110 -0
  665. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/rte.py +111 -0
  666. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ruler.py +170 -0
  667. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sciq.py +113 -0
  668. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/score.py +177 -0
  669. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/scrolls.py +161 -0
  670. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/scrolls_mc.py +157 -0
  671. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/self.py +110 -0
  672. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sglue.py +131 -0
  673. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sglue_rte.py +119 -0
  674. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/simple_cooccurrence_bias.py +121 -0
  675. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/siqa.py +209 -0
  676. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/social_iqa.py +114 -0
  677. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/spanish_bench.py +155 -0
  678. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/spanish_bench_gen.py +117 -0
  679. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/spanish_bench_mc.py +110 -0
  680. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/squad2.py +129 -0
  681. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/squad_completion.py +121 -0
  682. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sst2.py +111 -0
  683. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/storycloze.py +250 -0
  684. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/summarization.py +107 -0
  685. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/super.py +107 -0
  686. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/super_glue.py +154 -0
  687. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/superglue.py +111 -0
  688. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/supergpqa.py +111 -0
  689. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swag.py +115 -0
  690. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swde.py +179 -0
  691. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sycophancy.py +117 -0
  692. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/t0.py +110 -0
  693. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/teca.py +110 -0
  694. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinyarc.py +110 -0
  695. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinybenchmarks.py +155 -0
  696. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinygsm8k.py +110 -0
  697. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinyhellaswag.py +110 -0
  698. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinymmlu.py +110 -0
  699. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinytruthfulqa.py +113 -0
  700. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinywinogrande.py +110 -0
  701. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tmmluplus.py +181 -0
  702. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/toxigen.py +91 -0
  703. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/translation.py +149 -0
  704. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/triviaqa.py +130 -0
  705. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa.py +112 -0
  706. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc1.py +120 -0
  707. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc2.py +140 -0
  708. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_multi.py +142 -0
  709. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turblimp_core.py +152 -0
  710. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turkishmmlu.py +161 -0
  711. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turkishmmlu_cot.py +104 -0
  712. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turkishmmlu_mc.py +102 -0
  713. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/twenty_newsgroups.py +111 -0
  714. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/unitxt.py +131 -0
  715. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/unscramble.py +155 -0
  716. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/vaxx.py +95 -0
  717. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/webqs.py +130 -0
  718. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wic.py +122 -0
  719. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wikitext.py +146 -0
  720. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogender.py +139 -0
  721. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogrande.py +118 -0
  722. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wmdp.py +155 -0
  723. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wmt14.py +110 -0
  724. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wmt16.py +118 -0
  725. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wnli.py +114 -0
  726. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc.py +117 -0
  727. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc273.py +180 -0
  728. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xcopa.py +197 -0
  729. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xlsum.py +147 -0
  730. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xnli.py +131 -0
  731. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xquad.py +203 -0
  732. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xstorycloze.py +129 -0
  733. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xwinograd.py +124 -0
  734. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/yahoo.py +108 -0
  735. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/zhoblimp.py +155 -0
  736. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_pairs_generation.py +56 -0
  737. wisent/core/data_loaders/__init__.py +235 -0
  738. wisent/core/data_loaders/core/__init__.py +0 -0
  739. wisent/core/data_loaders/core/atoms.py +99 -0
  740. wisent/core/data_loaders/loaders/__init__.py +0 -0
  741. wisent/core/data_loaders/loaders/custom.py +120 -0
  742. wisent/core/data_loaders/loaders/huggingface_loader.py +153 -0
  743. wisent/core/data_loaders/loaders/lm_loader.py +494 -0
  744. wisent/core/data_loaders/loaders/lm_loader_special_cases.py +496 -0
  745. wisent/core/data_loaders/loaders/task_interface_loader.py +300 -0
  746. wisent/core/data_loaders/rotator.py +118 -0
  747. wisent/core/detection_handling.py +259 -0
  748. wisent/core/diversity_processors.py +193 -0
  749. wisent/core/download_full_benchmarks.py +1512 -0
  750. wisent/core/errors/__init__.py +203 -0
  751. wisent/core/errors/error_codes.py +763 -0
  752. wisent/core/errors/error_handler.py +134 -0
  753. wisent/core/evaluators/__init__.py +0 -0
  754. wisent/core/evaluators/benchmark_specific/__init__.py +42 -0
  755. wisent/core/evaluators/benchmark_specific/aime_evaluator.py +90 -0
  756. wisent/core/evaluators/benchmark_specific/coding/__init__.py +0 -0
  757. wisent/core/evaluators/benchmark_specific/coding/metrics/__init__.py +0 -0
  758. wisent/core/evaluators/benchmark_specific/coding/metrics/core/__init__.py +0 -0
  759. wisent/core/evaluators/benchmark_specific/coding/metrics/core/atoms.py +36 -0
  760. wisent/core/evaluators/benchmark_specific/coding/metrics/evaluator.py +363 -0
  761. wisent/core/evaluators/benchmark_specific/coding/metrics/passk.py +67 -0
  762. wisent/core/evaluators/benchmark_specific/coding/output_sanitizer/__init__.py +0 -0
  763. wisent/core/evaluators/benchmark_specific/coding/output_sanitizer/core/__init__.py +0 -0
  764. wisent/core/evaluators/benchmark_specific/coding/output_sanitizer/core/atoms.py +27 -0
  765. wisent/core/evaluators/benchmark_specific/coding/output_sanitizer/cpp_sanitizer.py +62 -0
  766. wisent/core/evaluators/benchmark_specific/coding/output_sanitizer/java_sanitizer.py +78 -0
  767. wisent/core/evaluators/benchmark_specific/coding/output_sanitizer/python_sanitizer.py +94 -0
  768. wisent/core/evaluators/benchmark_specific/coding/output_sanitizer/utils.py +126 -0
  769. wisent/core/evaluators/benchmark_specific/coding/providers/__init__.py +18 -0
  770. wisent/core/evaluators/benchmark_specific/coding/providers/core/__init__.py +0 -0
  771. wisent/core/evaluators/benchmark_specific/coding/providers/core/atoms.py +31 -0
  772. wisent/core/evaluators/benchmark_specific/coding/providers/livecodebench/__init__.py +3 -0
  773. wisent/core/evaluators/benchmark_specific/coding/providers/livecodebench/provider.py +305 -0
  774. wisent/core/evaluators/benchmark_specific/coding/safe_docker/Dockerfile +31 -0
  775. wisent/core/evaluators/benchmark_specific/coding/safe_docker/__init__.py +0 -0
  776. wisent/core/evaluators/benchmark_specific/coding/safe_docker/core/__init__.py +0 -0
  777. wisent/core/evaluators/benchmark_specific/coding/safe_docker/core/atoms.py +105 -0
  778. wisent/core/evaluators/benchmark_specific/coding/safe_docker/core/runtime.py +143 -0
  779. wisent/core/evaluators/benchmark_specific/coding/safe_docker/entrypoint.py +121 -0
  780. wisent/core/evaluators/benchmark_specific/coding/safe_docker/recipes.py +60 -0
  781. wisent/core/evaluators/benchmark_specific/coding/solution_generator.py +258 -0
  782. wisent/core/evaluators/benchmark_specific/conala_evaluator.py +332 -0
  783. wisent/core/evaluators/benchmark_specific/exact_match_evaluator.py +81 -0
  784. wisent/core/evaluators/benchmark_specific/f1_evaluator.py +173 -0
  785. wisent/core/evaluators/benchmark_specific/generation_evaluator.py +488 -0
  786. wisent/core/evaluators/benchmark_specific/livemathbench_evaluator.py +393 -0
  787. wisent/core/evaluators/benchmark_specific/log_likelihoods_evaluator.py +202 -0
  788. wisent/core/evaluators/benchmark_specific/math_evaluator.py +119 -0
  789. wisent/core/evaluators/benchmark_specific/math_parsing/__init__.py +1 -0
  790. wisent/core/evaluators/benchmark_specific/math_parsing/core.py +1640 -0
  791. wisent/core/evaluators/benchmark_specific/math_parsing/extract_boxed.py +48 -0
  792. wisent/core/evaluators/benchmark_specific/math_parsing/is_equiv.py +159 -0
  793. wisent/core/evaluators/benchmark_specific/math_parsing/scripts.py +919 -0
  794. wisent/core/evaluators/benchmark_specific/perplexity_evaluator.py +175 -0
  795. wisent/core/evaluators/benchmark_specific/polymath_evaluator.py +114 -0
  796. wisent/core/evaluators/core/__init__.py +5 -0
  797. wisent/core/evaluators/core/atoms.py +166 -0
  798. wisent/core/evaluators/custom/__init__.py +20 -0
  799. wisent/core/evaluators/custom/custom_evaluator.py +382 -0
  800. wisent/core/evaluators/custom/examples/__init__.py +37 -0
  801. wisent/core/evaluators/custom/examples/desklib_detector.py +166 -0
  802. wisent/core/evaluators/custom/examples/gptzero.py +185 -0
  803. wisent/core/evaluators/custom/examples/humanization.py +79 -0
  804. wisent/core/evaluators/custom/examples/humanization_coherent.py +127 -0
  805. wisent/core/evaluators/custom/examples/roberta_detector.py +173 -0
  806. wisent/core/evaluators/oracles/__init__.py +0 -0
  807. wisent/core/evaluators/oracles/interactive.py +73 -0
  808. wisent/core/evaluators/oracles/nlp_evaluator.py +440 -0
  809. wisent/core/evaluators/oracles/truthfulqa_gen_evaluator.py +168 -0
  810. wisent/core/evaluators/oracles/user_specified.py +67 -0
  811. wisent/core/evaluators/personalization/__init__.py +12 -0
  812. wisent/core/evaluators/personalization/alignment.py +166 -0
  813. wisent/core/evaluators/personalization/coherence.py +325 -0
  814. wisent/core/evaluators/personalization/difference.py +73 -0
  815. wisent/core/evaluators/rotator.py +217 -0
  816. wisent/core/evaluators/steering_evaluators.py +386 -0
  817. wisent/core/evaluators/synthetic_evaluator.py +377 -0
  818. wisent/core/hyperparameter_optimizer.py +547 -0
  819. wisent/core/layer.py +17 -0
  820. wisent/core/lm_eval_harness_ground_truth.py +1431 -0
  821. wisent/core/main.py +101 -0
  822. wisent/core/managed_cached_benchmarks.py +609 -0
  823. wisent/core/mixed_benchmark_sampler.py +366 -0
  824. wisent/core/modalities/__init__.py +545 -0
  825. wisent/core/model_persistence.py +302 -0
  826. wisent/core/models/__init__.py +23 -0
  827. wisent/core/models/core/__init__.py +0 -0
  828. wisent/core/models/core/atoms.py +465 -0
  829. wisent/core/models/inference_config.py +127 -0
  830. wisent/core/models/wisent_model.py +893 -0
  831. wisent/core/multi_steering.py +397 -0
  832. wisent/core/opti/__init__.py +0 -0
  833. wisent/core/opti/core/__init__.py +0 -0
  834. wisent/core/opti/core/atoms.py +177 -0
  835. wisent/core/opti/methods/__init__.py +10 -0
  836. wisent/core/opti/methods/opti_classificator.py +172 -0
  837. wisent/core/opti/methods/opti_steering.py +139 -0
  838. wisent/core/opti/methods/opti_weights.py +523 -0
  839. wisent/core/optuna/__init__.py +54 -0
  840. wisent/core/optuna/classifier/__init__.py +25 -0
  841. wisent/core/optuna/classifier/activation_generator.py +351 -0
  842. wisent/core/optuna/classifier/classifier_cache.py +509 -0
  843. wisent/core/optuna/classifier/optuna_classifier_optimizer.py +685 -0
  844. wisent/core/optuna/steering/__init__.py +20 -0
  845. wisent/core/optuna/steering/bigcode_evaluator_wrapper.py +200 -0
  846. wisent/core/optuna/steering/data_utils.py +342 -0
  847. wisent/core/optuna/steering/metrics.py +412 -0
  848. wisent/core/optuna/steering/steering_optimization.py +1096 -0
  849. wisent/core/parser.py +1662 -0
  850. wisent/core/parser_arguments/__init__.py +10 -0
  851. wisent/core/parser_arguments/agent_parser.py +122 -0
  852. wisent/core/parser_arguments/check_linearity_parser.py +82 -0
  853. wisent/core/parser_arguments/configure_model_parser.py +7 -0
  854. wisent/core/parser_arguments/create_steering_vector_parser.py +67 -0
  855. wisent/core/parser_arguments/diagnose_pairs_parser.py +25 -0
  856. wisent/core/parser_arguments/diagnose_vectors_parser.py +72 -0
  857. wisent/core/parser_arguments/evaluate_parser.py +40 -0
  858. wisent/core/parser_arguments/evaluate_refusal_parser.py +32 -0
  859. wisent/core/parser_arguments/evaluate_responses_parser.py +12 -0
  860. wisent/core/parser_arguments/full_optimize_parser.py +194 -0
  861. wisent/core/parser_arguments/generate_pairs_from_task_parser.py +33 -0
  862. wisent/core/parser_arguments/generate_pairs_parser.py +43 -0
  863. wisent/core/parser_arguments/generate_responses_parser.py +16 -0
  864. wisent/core/parser_arguments/generate_vector_from_synthetic_parser.py +148 -0
  865. wisent/core/parser_arguments/generate_vector_from_task_parser.py +149 -0
  866. wisent/core/parser_arguments/generate_vector_parser.py +89 -0
  867. wisent/core/parser_arguments/get_activations_parser.py +90 -0
  868. wisent/core/parser_arguments/inference_config_parser.py +65 -0
  869. wisent/core/parser_arguments/main_parser.py +220 -0
  870. wisent/core/parser_arguments/model_config_parser.py +59 -0
  871. wisent/core/parser_arguments/modify_weights_parser.py +309 -0
  872. wisent/core/parser_arguments/monitor_parser.py +17 -0
  873. wisent/core/parser_arguments/multi_steer_parser.py +48 -0
  874. wisent/core/parser_arguments/nonsense_parser.py +26 -0
  875. wisent/core/parser_arguments/optimization_cache_parser.py +64 -0
  876. wisent/core/parser_arguments/optimize_classification_parser.py +108 -0
  877. wisent/core/parser_arguments/optimize_parser.py +142 -0
  878. wisent/core/parser_arguments/optimize_sample_size_parser.py +58 -0
  879. wisent/core/parser_arguments/optimize_steering_parser.py +617 -0
  880. wisent/core/parser_arguments/optimize_weights_parser.py +403 -0
  881. wisent/core/parser_arguments/synthetic_parser.py +117 -0
  882. wisent/core/parser_arguments/tasks_parser.py +591 -0
  883. wisent/core/parser_arguments/train_unified_goodness_parser.py +172 -0
  884. wisent/core/parser_arguments/utils.py +107 -0
  885. wisent/core/prompts/__init__.py +0 -0
  886. wisent/core/prompts/core/__init__.py +0 -0
  887. wisent/core/prompts/core/atom.py +57 -0
  888. wisent/core/prompts/core/prompt_formater.py +148 -0
  889. wisent/core/prompts/prompt_stratiegies/__init__.py +0 -0
  890. wisent/core/prompts/prompt_stratiegies/direct_completion.py +26 -0
  891. wisent/core/prompts/prompt_stratiegies/instruction_following.py +26 -0
  892. wisent/core/prompts/prompt_stratiegies/multiple_choice.py +31 -0
  893. wisent/core/prompts/prompt_stratiegies/role_playing.py +33 -0
  894. wisent/core/representation.py +5 -0
  895. wisent/core/save_results.py +277 -0
  896. wisent/core/steering.py +660 -0
  897. wisent/core/steering_method.py +20 -0
  898. wisent/core/steering_methods/__init__.py +54 -0
  899. wisent/core/steering_methods/core/__init__.py +0 -0
  900. wisent/core/steering_methods/core/atoms.py +154 -0
  901. wisent/core/steering_methods/methods/__init__.py +0 -0
  902. wisent/core/steering_methods/methods/caa.py +45 -0
  903. wisent/core/steering_methods/methods/prism.py +588 -0
  904. wisent/core/steering_methods/methods/pulse.py +641 -0
  905. wisent/core/steering_methods/methods/titan.py +1005 -0
  906. wisent/core/steering_methods/preflight.py +322 -0
  907. wisent/core/steering_methods/registry.py +649 -0
  908. wisent/core/steering_methods/rotator.py +121 -0
  909. wisent/core/steering_optimizer.py +1503 -0
  910. wisent/core/synthetic/__init__.py +0 -0
  911. wisent/core/synthetic/cleaners/__init__.py +0 -0
  912. wisent/core/synthetic/cleaners/core/__init__.py +0 -0
  913. wisent/core/synthetic/cleaners/core/atoms.py +58 -0
  914. wisent/core/synthetic/cleaners/deduper_cleaner.py +53 -0
  915. wisent/core/synthetic/cleaners/methods/__init__.py +0 -0
  916. wisent/core/synthetic/cleaners/methods/base_dedupers.py +321 -0
  917. wisent/core/synthetic/cleaners/methods/base_refusalers.py +286 -0
  918. wisent/core/synthetic/cleaners/methods/core/__init__.py +0 -0
  919. wisent/core/synthetic/cleaners/methods/core/atoms.py +47 -0
  920. wisent/core/synthetic/cleaners/pairs_cleaner.py +90 -0
  921. wisent/core/synthetic/cleaners/refusaler_cleaner.py +133 -0
  922. wisent/core/synthetic/db_instructions/__init__.py +0 -0
  923. wisent/core/synthetic/db_instructions/core/__init__.py +0 -0
  924. wisent/core/synthetic/db_instructions/core/atoms.py +25 -0
  925. wisent/core/synthetic/db_instructions/mini_dp.py +115 -0
  926. wisent/core/synthetic/generators/__init__.py +0 -0
  927. wisent/core/synthetic/generators/core/__init__.py +0 -0
  928. wisent/core/synthetic/generators/core/atoms.py +73 -0
  929. wisent/core/synthetic/generators/diversities/__init__.py +0 -0
  930. wisent/core/synthetic/generators/diversities/core/__init__.py +0 -0
  931. wisent/core/synthetic/generators/diversities/core/core.py +68 -0
  932. wisent/core/synthetic/generators/diversities/methods/__init__.py +0 -0
  933. wisent/core/synthetic/generators/diversities/methods/fast_diversity.py +249 -0
  934. wisent/core/synthetic/generators/nonsense_generator.py +150 -0
  935. wisent/core/synthetic/generators/pairs_generator.py +313 -0
  936. wisent/core/task_interface.py +143 -0
  937. wisent/core/task_selector.py +232 -0
  938. wisent/core/tasks/__init__.py +218 -0
  939. wisent/core/tasks/aime_task.py +142 -0
  940. wisent/core/tasks/file_task.py +212 -0
  941. wisent/core/tasks/hle_task.py +180 -0
  942. wisent/core/tasks/hmmt_task.py +120 -0
  943. wisent/core/tasks/livecodebench_task.py +94 -0
  944. wisent/core/tasks/livemathbench_task.py +159 -0
  945. wisent/core/tasks/lm_eval_task.py +611 -0
  946. wisent/core/tasks/math500_task.py +84 -0
  947. wisent/core/tasks/polymath_task.py +147 -0
  948. wisent/core/tasks/supergpqa_task.py +220 -0
  949. wisent/core/time_estimator.py +155 -0
  950. wisent/core/timing_calibration.py +176 -0
  951. wisent/core/tracking/__init__.py +54 -0
  952. wisent/core/tracking/latency.py +620 -0
  953. wisent/core/tracking/memory.py +360 -0
  954. wisent/core/trainers/__init__.py +0 -0
  955. wisent/core/trainers/core/__init__.py +11 -0
  956. wisent/core/trainers/core/atoms.py +45 -0
  957. wisent/core/trainers/steering_trainer.py +365 -0
  958. wisent/core/universal_subspace.py +918 -0
  959. wisent/core/user_model_config.py +158 -0
  960. wisent/core/utils/__init__.py +64 -0
  961. wisent/core/utils/base_rotator.py +292 -0
  962. wisent/core/utils/dataset_splits.py +197 -0
  963. wisent/core/utils/device.py +279 -0
  964. wisent/core/weight_modification/__init__.py +134 -0
  965. wisent/core/weight_modification/additive.py +340 -0
  966. wisent/core/weight_modification/directional.py +1357 -0
  967. wisent/core/weight_modification/export.py +359 -0
  968. wisent/core/weight_modification/multi_direction.py +410 -0
  969. wisent/core/weight_modification/utils.py +236 -0
  970. wisent/core/wisent.py +660 -0
  971. wisent/examples/contrastive_pairs/humanization_human_vs_ai.json +2112 -0
  972. wisent/examples/scripts/1/test_basqueglue_evaluation.json +51 -0
  973. wisent/examples/scripts/1/test_basqueglue_pairs.json +14 -0
  974. wisent/examples/scripts/1/test_bec2016eu_evaluation.json +51 -0
  975. wisent/examples/scripts/1/test_bec2016eu_pairs.json +14 -0
  976. wisent/examples/scripts/1/test_belebele_evaluation.json +51 -0
  977. wisent/examples/scripts/1/test_belebele_pairs.json +14 -0
  978. wisent/examples/scripts/1/test_benchmarks_evaluation.json +51 -0
  979. wisent/examples/scripts/1/test_benchmarks_pairs.json +14 -0
  980. wisent/examples/scripts/1/test_bertaqa_evaluation.json +51 -0
  981. wisent/examples/scripts/1/test_bertaqa_pairs.json +14 -0
  982. wisent/examples/scripts/1/test_bhtc_v2_evaluation.json +30 -0
  983. wisent/examples/scripts/1/test_bhtc_v2_pairs.json +8 -0
  984. wisent/examples/scripts/1/test_boolq-seq2seq_evaluation.json +30 -0
  985. wisent/examples/scripts/1/test_boolq-seq2seq_pairs.json +8 -0
  986. wisent/examples/scripts/1/test_cabreu_evaluation.json +30 -0
  987. wisent/examples/scripts/1/test_cabreu_pairs.json +8 -0
  988. wisent/examples/scripts/1/test_careqa_en_evaluation.json +30 -0
  989. wisent/examples/scripts/1/test_careqa_en_pairs.json +8 -0
  990. wisent/examples/scripts/1/test_careqa_evaluation.json +30 -0
  991. wisent/examples/scripts/1/test_careqa_pairs.json +8 -0
  992. wisent/examples/scripts/1/test_catalanqa_evaluation.json +30 -0
  993. wisent/examples/scripts/1/test_catalanqa_pairs.json +8 -0
  994. wisent/examples/scripts/1/test_catcola_evaluation.json +30 -0
  995. wisent/examples/scripts/1/test_catcola_pairs.json +8 -0
  996. wisent/examples/scripts/1/test_chartqa_evaluation.json +30 -0
  997. wisent/examples/scripts/1/test_chartqa_pairs.json +8 -0
  998. wisent/examples/scripts/1/test_claim_stance_topic_evaluation.json +30 -0
  999. wisent/examples/scripts/1/test_claim_stance_topic_pairs.json +8 -0
  1000. wisent/examples/scripts/1/test_cnn_dailymail_evaluation.json +30 -0
  1001. wisent/examples/scripts/1/test_cnn_dailymail_pairs.json +8 -0
  1002. wisent/examples/scripts/1/test_cocoteros_es_evaluation.json +30 -0
  1003. wisent/examples/scripts/1/test_cocoteros_es_pairs.json +8 -0
  1004. wisent/examples/scripts/1/test_coedit_gec_evaluation.json +30 -0
  1005. wisent/examples/scripts/1/test_coedit_gec_pairs.json +8 -0
  1006. wisent/examples/scripts/1/test_cola_evaluation.json +30 -0
  1007. wisent/examples/scripts/1/test_cola_pairs.json +8 -0
  1008. wisent/examples/scripts/1/test_coqcat_evaluation.json +30 -0
  1009. wisent/examples/scripts/1/test_coqcat_pairs.json +8 -0
  1010. wisent/examples/scripts/1/test_dbpedia_14_evaluation.json +30 -0
  1011. wisent/examples/scripts/1/test_dbpedia_14_pairs.json +8 -0
  1012. wisent/examples/scripts/1/test_epec_koref_bin_evaluation.json +30 -0
  1013. wisent/examples/scripts/1/test_epec_koref_bin_pairs.json +8 -0
  1014. wisent/examples/scripts/1/test_ethos_binary_evaluation.json +30 -0
  1015. wisent/examples/scripts/1/test_ethos_binary_pairs.json +8 -0
  1016. wisent/examples/scripts/2/test_afrimgsm_direct_amh_evaluation.json +30 -0
  1017. wisent/examples/scripts/2/test_afrimgsm_direct_amh_pairs.json +8 -0
  1018. wisent/examples/scripts/2/test_afrimmlu_direct_amh_evaluation.json +30 -0
  1019. wisent/examples/scripts/2/test_afrimmlu_direct_amh_pairs.json +8 -0
  1020. wisent/examples/scripts/2/test_afrixnli_en_direct_amh_evaluation.json +30 -0
  1021. wisent/examples/scripts/2/test_afrixnli_en_direct_amh_pairs.json +8 -0
  1022. wisent/examples/scripts/2/test_arc_ar_evaluation.json +30 -0
  1023. wisent/examples/scripts/2/test_arc_ar_pairs.json +8 -0
  1024. wisent/examples/scripts/2/test_atis_evaluation.json +30 -0
  1025. wisent/examples/scripts/2/test_atis_pairs.json +8 -0
  1026. wisent/examples/scripts/2/test_babi_evaluation.json +30 -0
  1027. wisent/examples/scripts/2/test_babi_pairs.json +8 -0
  1028. wisent/examples/scripts/2/test_babilong_evaluation.json +30 -0
  1029. wisent/examples/scripts/2/test_babilong_pairs.json +8 -0
  1030. wisent/examples/scripts/2/test_bangla_mmlu_evaluation.json +30 -0
  1031. wisent/examples/scripts/2/test_bangla_mmlu_pairs.json +8 -0
  1032. wisent/examples/scripts/2/test_basque-glue_pairs.json +14 -0
  1033. wisent/examples/scripts/benchmark_tags.json +2140 -0
  1034. wisent/examples/scripts/lm_eval_readme.json +4 -0
  1035. wisent/examples/scripts/results/benchmark_descriptions.json +1244 -0
  1036. wisent/examples/scripts/results/benchmark_evaluation_methods.json +66 -0
  1037. wisent/examples/scripts/results/benchmark_evaluator_mapping.json +2781 -0
  1038. wisent/examples/scripts/results/benchmark_evaluator_mapping_updated.json +30536 -0
  1039. wisent/examples/scripts/results/benchmark_evaluators_clean.json +469 -0
  1040. wisent/examples/scripts/results/benchmark_methods_summary.json +260 -0
  1041. wisent/examples/scripts/results/benchmark_pair_creation_methods.json +66 -0
  1042. wisent/examples/scripts/results/benchmark_pair_totals.json +269 -0
  1043. wisent/examples/scripts/results/benchmark_tags.json +917 -0
  1044. wisent/examples/scripts/results/benchmark_test_summary_nov4.json +71 -0
  1045. wisent/examples/scripts/results/coding_benchmarks_test_code_status.json +150 -0
  1046. wisent/examples/scripts/results/failing_benchmarks.json +946 -0
  1047. wisent/examples/scripts/results/failing_benchmarks_list.json +41 -0
  1048. wisent/examples/scripts/results/failing_benchmarks_test_results.json +945 -0
  1049. wisent/examples/scripts/results/missing_benchmark_tags.json +341 -0
  1050. wisent/examples/scripts/results/test_20_newsgroups_evaluation.json +30 -0
  1051. wisent/examples/scripts/results/test_20_newsgroups_pairs.json +8 -0
  1052. wisent/examples/scripts/results/test_AraDICE_evaluation.json +51 -0
  1053. wisent/examples/scripts/results/test_AraDICE_pairs.json +14 -0
  1054. wisent/examples/scripts/results/test_AraDiCE_boolq_egy/test_AraDiCE_boolq_egy_evaluation.json +30 -0
  1055. wisent/examples/scripts/results/test_AraDiCE_boolq_egy/test_AraDiCE_boolq_egy_pairs.json +8 -0
  1056. wisent/examples/scripts/results/test_ArabCulture_evaluation.json +51 -0
  1057. wisent/examples/scripts/results/test_ArabCulture_pairs.json +14 -0
  1058. wisent/examples/scripts/results/test_Tag_evaluation.json +30 -0
  1059. wisent/examples/scripts/results/test_Tag_pairs.json +8 -0
  1060. wisent/examples/scripts/results/test_aclue_evaluation.json +51 -0
  1061. wisent/examples/scripts/results/test_aclue_pairs.json +14 -0
  1062. wisent/examples/scripts/results/test_acp_bench_evaluation.json +51 -0
  1063. wisent/examples/scripts/results/test_acp_bench_hard_evaluation.json +51 -0
  1064. wisent/examples/scripts/results/test_acp_bench_hard_pairs.json +14 -0
  1065. wisent/examples/scripts/results/test_acp_bench_pairs.json +14 -0
  1066. wisent/examples/scripts/results/test_advanced_ai_risk_evaluation.json +51 -0
  1067. wisent/examples/scripts/results/test_advanced_ai_risk_pairs.json +14 -0
  1068. wisent/examples/scripts/results/test_aexams_evaluation.json +51 -0
  1069. wisent/examples/scripts/results/test_aexams_pairs.json +14 -0
  1070. wisent/examples/scripts/results/test_afrimgsm_direct_amh_evaluation.json +30 -0
  1071. wisent/examples/scripts/results/test_afrimgsm_direct_amh_pairs.json +8 -0
  1072. wisent/examples/scripts/results/test_afrimmlu_direct_amh_evaluation.json +30 -0
  1073. wisent/examples/scripts/results/test_afrimmlu_direct_amh_pairs.json +8 -0
  1074. wisent/examples/scripts/results/test_afrixnli_en_direct_amh_evaluation.json +30 -0
  1075. wisent/examples/scripts/results/test_afrixnli_en_direct_amh_pairs.json +8 -0
  1076. wisent/examples/scripts/results/test_ag_news_evaluation.json +30 -0
  1077. wisent/examples/scripts/results/test_ag_news_pairs.json +8 -0
  1078. wisent/examples/scripts/results/test_agieval_evaluation.json +51 -0
  1079. wisent/examples/scripts/results/test_agieval_pairs.json +14 -0
  1080. wisent/examples/scripts/results/test_aime2024_evaluation.json +30 -0
  1081. wisent/examples/scripts/results/test_aime2024_pairs.json +8 -0
  1082. wisent/examples/scripts/results/test_aime2025_evaluation.json +30 -0
  1083. wisent/examples/scripts/results/test_aime2025_pairs.json +8 -0
  1084. wisent/examples/scripts/results/test_aime_evaluation.json +30 -0
  1085. wisent/examples/scripts/results/test_aime_pairs.json +8 -0
  1086. wisent/examples/scripts/results/test_anagrams1_evaluation.json +30 -0
  1087. wisent/examples/scripts/results/test_anagrams1_pairs.json +8 -0
  1088. wisent/examples/scripts/results/test_anagrams2_evaluation.json +30 -0
  1089. wisent/examples/scripts/results/test_anagrams2_pairs.json +8 -0
  1090. wisent/examples/scripts/results/test_anli_evaluation.json +30 -0
  1091. wisent/examples/scripts/results/test_anli_pairs.json +8 -0
  1092. wisent/examples/scripts/results/test_apps_evaluation.json +30 -0
  1093. wisent/examples/scripts/results/test_apps_pairs.json +8 -0
  1094. wisent/examples/scripts/results/test_arabic_exams_evaluation.json +30 -0
  1095. wisent/examples/scripts/results/test_arabic_exams_pairs.json +8 -0
  1096. wisent/examples/scripts/results/test_arabic_leaderboard_complete_evaluation.json +51 -0
  1097. wisent/examples/scripts/results/test_arabic_leaderboard_complete_pairs.json +14 -0
  1098. wisent/examples/scripts/results/test_arabic_leaderboard_light_evaluation.json +51 -0
  1099. wisent/examples/scripts/results/test_arabic_leaderboard_light_pairs.json +14 -0
  1100. wisent/examples/scripts/results/test_arabicmmlu_evaluation.json +51 -0
  1101. wisent/examples/scripts/results/test_arabicmmlu_pairs.json +14 -0
  1102. wisent/examples/scripts/results/test_aradice/test_aradice_evaluation.json +51 -0
  1103. wisent/examples/scripts/results/test_aradice/test_aradice_pairs.json +14 -0
  1104. wisent/examples/scripts/results/test_aradice3/test_aradice_evaluation.json +51 -0
  1105. wisent/examples/scripts/results/test_aradice3/test_aradice_pairs.json +14 -0
  1106. wisent/examples/scripts/results/test_arc_ar_evaluation.json +30 -0
  1107. wisent/examples/scripts/results/test_arc_ar_pairs.json +8 -0
  1108. wisent/examples/scripts/results/test_arc_challenge_evaluation.json +30 -0
  1109. wisent/examples/scripts/results/test_arc_challenge_pairs.json +8 -0
  1110. wisent/examples/scripts/results/test_arc_easy_evaluation.json +30 -0
  1111. wisent/examples/scripts/results/test_arc_easy_pairs.json +8 -0
  1112. wisent/examples/scripts/results/test_argument_topic_evaluation.json +30 -0
  1113. wisent/examples/scripts/results/test_argument_topic_pairs.json +8 -0
  1114. wisent/examples/scripts/results/test_arithmetic_evaluation.json +51 -0
  1115. wisent/examples/scripts/results/test_arithmetic_pairs.json +14 -0
  1116. wisent/examples/scripts/results/test_asdiv_evaluation.json +30 -0
  1117. wisent/examples/scripts/results/test_asdiv_pairs.json +8 -0
  1118. wisent/examples/scripts/results/test_assin_entailment_evaluation.json +30 -0
  1119. wisent/examples/scripts/results/test_assin_entailment_pairs.json +8 -0
  1120. wisent/examples/scripts/results/test_atis_evaluation.json +30 -0
  1121. wisent/examples/scripts/results/test_atis_pairs.json +8 -0
  1122. wisent/examples/scripts/results/test_babi_evaluation.json +30 -0
  1123. wisent/examples/scripts/results/test_babi_pairs.json +8 -0
  1124. wisent/examples/scripts/results/test_babilong_evaluation.json +30 -0
  1125. wisent/examples/scripts/results/test_babilong_pairs.json +8 -0
  1126. wisent/examples/scripts/results/test_bangla_mmlu_evaluation.json +30 -0
  1127. wisent/examples/scripts/results/test_bangla_mmlu_pairs.json +8 -0
  1128. wisent/examples/scripts/results/test_banking77_evaluation.json +30 -0
  1129. wisent/examples/scripts/results/test_banking77_pairs.json +8 -0
  1130. wisent/examples/scripts/results/test_basque/test_basque-glue_pairs.json +14 -0
  1131. wisent/examples/scripts/results/test_basque-glue_evaluation.json +51 -0
  1132. wisent/examples/scripts/results/test_basque-glue_pairs.json +14 -0
  1133. wisent/examples/scripts/results/test_basque2/test_basque-glue_evaluation.json +51 -0
  1134. wisent/examples/scripts/results/test_basque2/test_basque-glue_pairs.json +14 -0
  1135. wisent/examples/scripts/results/test_basque_bench_evaluation.json +51 -0
  1136. wisent/examples/scripts/results/test_basque_bench_pairs.json +14 -0
  1137. wisent/examples/scripts/results/test_basque_glue/test_basque-glue_evaluation.json +51 -0
  1138. wisent/examples/scripts/results/test_basque_glue/test_basque-glue_pairs.json +14 -0
  1139. wisent/examples/scripts/results/test_basqueglue_evaluation.json +51 -0
  1140. wisent/examples/scripts/results/test_basqueglue_pairs.json +14 -0
  1141. wisent/examples/scripts/results/test_bbh_evaluation.json +51 -0
  1142. wisent/examples/scripts/results/test_bbh_pairs.json +14 -0
  1143. wisent/examples/scripts/results/test_bbq_evaluation.json +30 -0
  1144. wisent/examples/scripts/results/test_bbq_pairs.json +8 -0
  1145. wisent/examples/scripts/results/test_bec2016eu_evaluation.json +51 -0
  1146. wisent/examples/scripts/results/test_bec2016eu_pairs.json +14 -0
  1147. wisent/examples/scripts/results/test_belebele_evaluation.json +51 -0
  1148. wisent/examples/scripts/results/test_belebele_pairs.json +14 -0
  1149. wisent/examples/scripts/results/test_benchmarks_evaluation.json +51 -0
  1150. wisent/examples/scripts/results/test_benchmarks_pairs.json +14 -0
  1151. wisent/examples/scripts/results/test_bertaqa_evaluation.json +51 -0
  1152. wisent/examples/scripts/results/test_bertaqa_pairs.json +14 -0
  1153. wisent/examples/scripts/results/test_bhtc_v2_evaluation.json +30 -0
  1154. wisent/examples/scripts/results/test_bhtc_v2_pairs.json +8 -0
  1155. wisent/examples/scripts/results/test_bigbench_evaluation.json +51 -0
  1156. wisent/examples/scripts/results/test_bigbench_pairs.json +14 -0
  1157. wisent/examples/scripts/results/test_blimp_evaluation.json +51 -0
  1158. wisent/examples/scripts/results/test_blimp_pairs.json +14 -0
  1159. wisent/examples/scripts/results/test_boolq/test_boolq_evaluation.json +30 -0
  1160. wisent/examples/scripts/results/test_boolq/test_boolq_pairs.json +8 -0
  1161. wisent/examples/scripts/results/test_boolq-seq2seq_evaluation.json +30 -0
  1162. wisent/examples/scripts/results/test_boolq-seq2seq_pairs.json +8 -0
  1163. wisent/examples/scripts/results/test_boolq_evaluation.json +30 -0
  1164. wisent/examples/scripts/results/test_boolq_pairs.json +8 -0
  1165. wisent/examples/scripts/results/test_c4_evaluation.json +30 -0
  1166. wisent/examples/scripts/results/test_c4_pairs.json +8 -0
  1167. wisent/examples/scripts/results/test_cabreu_evaluation.json +30 -0
  1168. wisent/examples/scripts/results/test_cabreu_pairs.json +8 -0
  1169. wisent/examples/scripts/results/test_careqa_evaluation.json +30 -0
  1170. wisent/examples/scripts/results/test_careqa_pairs.json +8 -0
  1171. wisent/examples/scripts/results/test_catalan_bench_evaluation.json +51 -0
  1172. wisent/examples/scripts/results/test_catalan_bench_pairs.json +14 -0
  1173. wisent/examples/scripts/results/test_catalanqa_evaluation.json +30 -0
  1174. wisent/examples/scripts/results/test_catalanqa_pairs.json +8 -0
  1175. wisent/examples/scripts/results/test_catcola_evaluation.json +30 -0
  1176. wisent/examples/scripts/results/test_catcola_pairs.json +8 -0
  1177. wisent/examples/scripts/results/test_cb_evaluation.json +30 -0
  1178. wisent/examples/scripts/results/test_cb_pairs.json +8 -0
  1179. wisent/examples/scripts/results/test_ceval/test_ceval_evaluation.json +51 -0
  1180. wisent/examples/scripts/results/test_ceval/test_ceval_pairs.json +14 -0
  1181. wisent/examples/scripts/results/test_ceval_accountant/test_ceval-valid_accountant_evaluation.json +30 -0
  1182. wisent/examples/scripts/results/test_ceval_accountant/test_ceval-valid_accountant_pairs.json +8 -0
  1183. wisent/examples/scripts/results/test_ceval_evaluation.json +51 -0
  1184. wisent/examples/scripts/results/test_ceval_pairs.json +14 -0
  1185. wisent/examples/scripts/results/test_ceval_valid/test_ceval_valid_evaluation.json +51 -0
  1186. wisent/examples/scripts/results/test_ceval_valid/test_ceval_valid_pairs.json +14 -0
  1187. wisent/examples/scripts/results/test_chain_of_thought_evaluation.json +51 -0
  1188. wisent/examples/scripts/results/test_chain_of_thought_pairs.json +14 -0
  1189. wisent/examples/scripts/results/test_chartqa_evaluation.json +30 -0
  1190. wisent/examples/scripts/results/test_chartqa_pairs.json +8 -0
  1191. wisent/examples/scripts/results/test_claim_stance_topic_evaluation.json +30 -0
  1192. wisent/examples/scripts/results/test_claim_stance_topic_pairs.json +8 -0
  1193. wisent/examples/scripts/results/test_cmmlu_evaluation.json +51 -0
  1194. wisent/examples/scripts/results/test_cmmlu_pairs.json +14 -0
  1195. wisent/examples/scripts/results/test_cnn_dailymail_evaluation.json +30 -0
  1196. wisent/examples/scripts/results/test_cnn_dailymail_pairs.json +8 -0
  1197. wisent/examples/scripts/results/test_cocoteros_es_evaluation.json +30 -0
  1198. wisent/examples/scripts/results/test_cocoteros_es_pairs.json +8 -0
  1199. wisent/examples/scripts/results/test_codexglue_code_to_text_go_evaluation.json +30 -0
  1200. wisent/examples/scripts/results/test_codexglue_code_to_text_go_pairs.json +8 -0
  1201. wisent/examples/scripts/results/test_codexglue_code_to_text_java_evaluation.json +30 -0
  1202. wisent/examples/scripts/results/test_codexglue_code_to_text_java_pairs.json +8 -0
  1203. wisent/examples/scripts/results/test_codexglue_code_to_text_javascript_evaluation.json +30 -0
  1204. wisent/examples/scripts/results/test_codexglue_code_to_text_javascript_pairs.json +8 -0
  1205. wisent/examples/scripts/results/test_codexglue_code_to_text_php_evaluation.json +30 -0
  1206. wisent/examples/scripts/results/test_codexglue_code_to_text_php_pairs.json +8 -0
  1207. wisent/examples/scripts/results/test_codexglue_code_to_text_python_evaluation.json +30 -0
  1208. wisent/examples/scripts/results/test_codexglue_code_to_text_python_pairs.json +8 -0
  1209. wisent/examples/scripts/results/test_codexglue_code_to_text_ruby_evaluation.json +30 -0
  1210. wisent/examples/scripts/results/test_codexglue_code_to_text_ruby_pairs.json +8 -0
  1211. wisent/examples/scripts/results/test_coedit_gec_evaluation.json +30 -0
  1212. wisent/examples/scripts/results/test_coedit_gec_pairs.json +8 -0
  1213. wisent/examples/scripts/results/test_cola_evaluation.json +30 -0
  1214. wisent/examples/scripts/results/test_cola_pairs.json +8 -0
  1215. wisent/examples/scripts/results/test_commonsense_qa_evaluation.json +30 -0
  1216. wisent/examples/scripts/results/test_commonsense_qa_pairs.json +8 -0
  1217. wisent/examples/scripts/results/test_conala_evaluation.json +30 -0
  1218. wisent/examples/scripts/results/test_conala_pairs.json +8 -0
  1219. wisent/examples/scripts/results/test_concode_evaluation.json +30 -0
  1220. wisent/examples/scripts/results/test_concode_pairs.json +8 -0
  1221. wisent/examples/scripts/results/test_copa_evaluation.json +30 -0
  1222. wisent/examples/scripts/results/test_copa_pairs.json +8 -0
  1223. wisent/examples/scripts/results/test_copal_id_evaluation.json +30 -0
  1224. wisent/examples/scripts/results/test_copal_id_pairs.json +8 -0
  1225. wisent/examples/scripts/results/test_coqa_evaluation.json +30 -0
  1226. wisent/examples/scripts/results/test_coqa_pairs.json +8 -0
  1227. wisent/examples/scripts/results/test_coqcat_evaluation.json +30 -0
  1228. wisent/examples/scripts/results/test_coqcat_pairs.json +8 -0
  1229. wisent/examples/scripts/results/test_crows_pairs_evaluation.json +51 -0
  1230. wisent/examples/scripts/results/test_crows_pairs_pairs.json +14 -0
  1231. wisent/examples/scripts/results/test_csatqa_evaluation.json +51 -0
  1232. wisent/examples/scripts/results/test_csatqa_pairs.json +14 -0
  1233. wisent/examples/scripts/results/test_cycle_letters_evaluation.json +30 -0
  1234. wisent/examples/scripts/results/test_cycle_letters_pairs.json +8 -0
  1235. wisent/examples/scripts/results/test_darija_bench/test_darija_bench_evaluation.json +51 -0
  1236. wisent/examples/scripts/results/test_darija_bench/test_darija_bench_pairs.json +14 -0
  1237. wisent/examples/scripts/results/test_darija_bench_evaluation.json +51 -0
  1238. wisent/examples/scripts/results/test_darija_bench_pairs.json +14 -0
  1239. wisent/examples/scripts/results/test_darijahellaswag_evaluation.json +30 -0
  1240. wisent/examples/scripts/results/test_darijahellaswag_pairs.json +8 -0
  1241. wisent/examples/scripts/results/test_darijammlu_evaluation.json +51 -0
  1242. wisent/examples/scripts/results/test_darijammlu_pairs.json +14 -0
  1243. wisent/examples/scripts/results/test_dbpedia_14_evaluation.json +30 -0
  1244. wisent/examples/scripts/results/test_dbpedia_14_pairs.json +8 -0
  1245. wisent/examples/scripts/results/test_drop_evaluation.json +30 -0
  1246. wisent/examples/scripts/results/test_drop_pairs.json +8 -0
  1247. wisent/examples/scripts/results/test_ds1000_evaluation.json +30 -0
  1248. wisent/examples/scripts/results/test_ds1000_pairs.json +8 -0
  1249. wisent/examples/scripts/results/test_egyhellaswag_evaluation.json +30 -0
  1250. wisent/examples/scripts/results/test_egyhellaswag_pairs.json +8 -0
  1251. wisent/examples/scripts/results/test_egymmlu_evaluation.json +51 -0
  1252. wisent/examples/scripts/results/test_egymmlu_pairs.json +14 -0
  1253. wisent/examples/scripts/results/test_epec_koref_bin_evaluation.json +30 -0
  1254. wisent/examples/scripts/results/test_epec_koref_bin_pairs.json +8 -0
  1255. wisent/examples/scripts/results/test_eq_bench_evaluation.json +30 -0
  1256. wisent/examples/scripts/results/test_eq_bench_pairs.json +8 -0
  1257. wisent/examples/scripts/results/test_escola_evaluation.json +30 -0
  1258. wisent/examples/scripts/results/test_escola_pairs.json +8 -0
  1259. wisent/examples/scripts/results/test_ethics_cm_evaluation.json +30 -0
  1260. wisent/examples/scripts/results/test_ethics_cm_pairs.json +8 -0
  1261. wisent/examples/scripts/results/test_ethos_binary_evaluation.json +30 -0
  1262. wisent/examples/scripts/results/test_ethos_binary_pairs.json +8 -0
  1263. wisent/examples/scripts/results/test_eus_exams/test_eus_exams_evaluation.json +51 -0
  1264. wisent/examples/scripts/results/test_eus_exams/test_eus_exams_pairs.json +14 -0
  1265. wisent/examples/scripts/results/test_eus_exams_es_evaluation.json +51 -0
  1266. wisent/examples/scripts/results/test_eus_exams_es_pairs.json +14 -0
  1267. wisent/examples/scripts/results/test_eus_exams_evaluation.json +51 -0
  1268. wisent/examples/scripts/results/test_eus_exams_pairs.json +14 -0
  1269. wisent/examples/scripts/results/test_eus_proficiency_evaluation.json +30 -0
  1270. wisent/examples/scripts/results/test_eus_proficiency_pairs.json +8 -0
  1271. wisent/examples/scripts/results/test_eus_reading_evaluation.json +30 -0
  1272. wisent/examples/scripts/results/test_eus_reading_pairs.json +8 -0
  1273. wisent/examples/scripts/results/test_eus_trivia_evaluation.json +30 -0
  1274. wisent/examples/scripts/results/test_eus_trivia_pairs.json +8 -0
  1275. wisent/examples/scripts/results/test_evalita-mp_evaluation.json +51 -0
  1276. wisent/examples/scripts/results/test_evalita-mp_pairs.json +14 -0
  1277. wisent/examples/scripts/results/test_evalita-sp_sum_task_fp-small_p1_evaluation.json +30 -0
  1278. wisent/examples/scripts/results/test_evalita-sp_sum_task_fp-small_p1_pairs.json +8 -0
  1279. wisent/examples/scripts/results/test_evalita_LLM_evaluation.json +51 -0
  1280. wisent/examples/scripts/results/test_evalita_LLM_pairs.json +14 -0
  1281. wisent/examples/scripts/results/test_evalita_llm/test_evalita_llm_evaluation.json +51 -0
  1282. wisent/examples/scripts/results/test_evalita_llm/test_evalita_llm_pairs.json +14 -0
  1283. wisent/examples/scripts/results/test_evalita_mp/test_evalita-mp_te_prompt-1_evaluation.json +30 -0
  1284. wisent/examples/scripts/results/test_evalita_mp/test_evalita-mp_te_prompt-1_pairs.json +8 -0
  1285. wisent/examples/scripts/results/test_evalita_mp2/test_evalita_mp_evaluation.json +51 -0
  1286. wisent/examples/scripts/results/test_evalita_mp2/test_evalita_mp_pairs.json +14 -0
  1287. wisent/examples/scripts/results/test_evalita_sp2/test_evalita-sp_sum_task_fp-small_p1_evaluation.json +30 -0
  1288. wisent/examples/scripts/results/test_evalita_sp2/test_evalita-sp_sum_task_fp-small_p1_pairs.json +8 -0
  1289. wisent/examples/scripts/results/test_fda_evaluation.json +30 -0
  1290. wisent/examples/scripts/results/test_fda_pairs.json +8 -0
  1291. wisent/examples/scripts/results/test_financial_tweets_evaluation.json +30 -0
  1292. wisent/examples/scripts/results/test_financial_tweets_pairs.json +8 -0
  1293. wisent/examples/scripts/results/test_fld/test_fld_evaluation.json +30 -0
  1294. wisent/examples/scripts/results/test_fld/test_fld_pairs.json +8 -0
  1295. wisent/examples/scripts/results/test_fld_evaluation.json +30 -0
  1296. wisent/examples/scripts/results/test_fld_fixed/test_fld_evaluation.json +30 -0
  1297. wisent/examples/scripts/results/test_fld_fixed/test_fld_pairs.json +8 -0
  1298. wisent/examples/scripts/results/test_fld_pairs.json +8 -0
  1299. wisent/examples/scripts/results/test_flores_evaluation.json +51 -0
  1300. wisent/examples/scripts/results/test_flores_pairs.json +14 -0
  1301. wisent/examples/scripts/results/test_freebase_evaluation.json +30 -0
  1302. wisent/examples/scripts/results/test_freebase_pairs.json +8 -0
  1303. wisent/examples/scripts/results/test_french_bench_evaluation.json +51 -0
  1304. wisent/examples/scripts/results/test_french_bench_pairs.json +14 -0
  1305. wisent/examples/scripts/results/test_galcola_evaluation.json +30 -0
  1306. wisent/examples/scripts/results/test_galcola_pairs.json +8 -0
  1307. wisent/examples/scripts/results/test_galician_bench_evaluation.json +51 -0
  1308. wisent/examples/scripts/results/test_galician_bench_pairs.json +14 -0
  1309. wisent/examples/scripts/results/test_glianorex_evaluation.json +30 -0
  1310. wisent/examples/scripts/results/test_glianorex_pairs.json +8 -0
  1311. wisent/examples/scripts/results/test_global_mmlu_evaluation.json +51 -0
  1312. wisent/examples/scripts/results/test_global_mmlu_pairs.json +14 -0
  1313. wisent/examples/scripts/results/test_glue_evaluation.json +51 -0
  1314. wisent/examples/scripts/results/test_glue_pairs.json +14 -0
  1315. wisent/examples/scripts/results/test_gpqa_evaluation.json +51 -0
  1316. wisent/examples/scripts/results/test_gpqa_pairs.json +14 -0
  1317. wisent/examples/scripts/results/test_gpt3_translation_benchmarks_evaluation.json +51 -0
  1318. wisent/examples/scripts/results/test_gpt3_translation_benchmarks_pairs.json +14 -0
  1319. wisent/examples/scripts/results/test_groundcocoa_evaluation.json +30 -0
  1320. wisent/examples/scripts/results/test_groundcocoa_pairs.json +8 -0
  1321. wisent/examples/scripts/results/test_gsm8k_evaluation.json +30 -0
  1322. wisent/examples/scripts/results/test_gsm8k_pairs.json +8 -0
  1323. wisent/examples/scripts/results/test_haerae_evaluation.json +51 -0
  1324. wisent/examples/scripts/results/test_haerae_pairs.json +14 -0
  1325. wisent/examples/scripts/results/test_headqa_evaluation.json +30 -0
  1326. wisent/examples/scripts/results/test_headqa_pairs.json +8 -0
  1327. wisent/examples/scripts/results/test_hellaswag_evaluation.json +30 -0
  1328. wisent/examples/scripts/results/test_hellaswag_pairs.json +8 -0
  1329. wisent/examples/scripts/results/test_hendrycks_ethics_evaluation.json +51 -0
  1330. wisent/examples/scripts/results/test_hendrycks_ethics_pairs.json +14 -0
  1331. wisent/examples/scripts/results/test_hendrycks_math_evaluation.json +51 -0
  1332. wisent/examples/scripts/results/test_hendrycks_math_pairs.json +14 -0
  1333. wisent/examples/scripts/results/test_histoires_morales_evaluation.json +30 -0
  1334. wisent/examples/scripts/results/test_histoires_morales_pairs.json +8 -0
  1335. wisent/examples/scripts/results/test_hmmt_evaluation.json +30 -0
  1336. wisent/examples/scripts/results/test_hmmt_feb_2025_evaluation.json +30 -0
  1337. wisent/examples/scripts/results/test_hmmt_feb_2025_pairs.json +8 -0
  1338. wisent/examples/scripts/results/test_hmmt_pairs.json +8 -0
  1339. wisent/examples/scripts/results/test_hrm8k_evaluation.json +51 -0
  1340. wisent/examples/scripts/results/test_hrm8k_pairs.json +14 -0
  1341. wisent/examples/scripts/results/test_humaneval_evaluation.json +30 -0
  1342. wisent/examples/scripts/results/test_humaneval_pairs.json +8 -0
  1343. wisent/examples/scripts/results/test_humaneval_plus_evaluation.json +30 -0
  1344. wisent/examples/scripts/results/test_humaneval_plus_pairs.json +8 -0
  1345. wisent/examples/scripts/results/test_ifeval_evaluation.json +30 -0
  1346. wisent/examples/scripts/results/test_ifeval_pairs.json +8 -0
  1347. wisent/examples/scripts/results/test_instruct_humaneval/test_instruct_humaneval_evaluation.json +30 -0
  1348. wisent/examples/scripts/results/test_instruct_humaneval/test_instruct_humaneval_pairs.json +8 -0
  1349. wisent/examples/scripts/results/test_instruct_humaneval_evaluation.json +30 -0
  1350. wisent/examples/scripts/results/test_instruct_humaneval_pairs.json +8 -0
  1351. wisent/examples/scripts/results/test_inverse_scaling_evaluation.json +51 -0
  1352. wisent/examples/scripts/results/test_inverse_scaling_hindsight_neglect_10shot_evaluation.json +30 -0
  1353. wisent/examples/scripts/results/test_inverse_scaling_hindsight_neglect_10shot_pairs.json +8 -0
  1354. wisent/examples/scripts/results/test_inverse_scaling_mc/test_inverse_scaling_mc_evaluation.json +51 -0
  1355. wisent/examples/scripts/results/test_inverse_scaling_mc/test_inverse_scaling_mc_pairs.json +14 -0
  1356. wisent/examples/scripts/results/test_inverse_scaling_pairs.json +14 -0
  1357. wisent/examples/scripts/results/test_iwslt2017-ar-en_evaluation.json +30 -0
  1358. wisent/examples/scripts/results/test_iwslt2017-ar-en_pairs.json +8 -0
  1359. wisent/examples/scripts/results/test_iwslt2017-en-ar_evaluation.json +30 -0
  1360. wisent/examples/scripts/results/test_iwslt2017-en-ar_pairs.json +8 -0
  1361. wisent/examples/scripts/results/test_iwslt2017_ar_en/test_iwslt2017-ar-en_evaluation.json +30 -0
  1362. wisent/examples/scripts/results/test_iwslt2017_ar_en/test_iwslt2017-ar-en_pairs.json +8 -0
  1363. wisent/examples/scripts/results/test_iwslt2017_en_ar/test_iwslt2017-en-ar_evaluation.json +30 -0
  1364. wisent/examples/scripts/results/test_iwslt2017_en_ar/test_iwslt2017-en-ar_pairs.json +8 -0
  1365. wisent/examples/scripts/results/test_iwslt2017_group/test_iwslt2017_evaluation.json +30 -0
  1366. wisent/examples/scripts/results/test_iwslt2017_group/test_iwslt2017_pairs.json +8 -0
  1367. wisent/examples/scripts/results/test_japanese_leaderboard_evaluation.json +51 -0
  1368. wisent/examples/scripts/results/test_japanese_leaderboard_pairs.json +14 -0
  1369. wisent/examples/scripts/results/test_jsonschema_bench/test_jsonschema_bench_evaluation.json +30 -0
  1370. wisent/examples/scripts/results/test_jsonschema_bench/test_jsonschema_bench_pairs.json +8 -0
  1371. wisent/examples/scripts/results/test_jsonschema_bench_evaluation.json +30 -0
  1372. wisent/examples/scripts/results/test_jsonschema_bench_final/test_jsonschema_bench_evaluation.json +30 -0
  1373. wisent/examples/scripts/results/test_jsonschema_bench_final/test_jsonschema_bench_pairs.json +8 -0
  1374. wisent/examples/scripts/results/test_jsonschema_bench_pairs.json +8 -0
  1375. wisent/examples/scripts/results/test_kbl_evaluation.json +51 -0
  1376. wisent/examples/scripts/results/test_kbl_fixed/test_kbl_evaluation.json +51 -0
  1377. wisent/examples/scripts/results/test_kbl_fixed/test_kbl_pairs.json +14 -0
  1378. wisent/examples/scripts/results/test_kbl_pairs.json +14 -0
  1379. wisent/examples/scripts/results/test_kmmlu_evaluation.json +51 -0
  1380. wisent/examples/scripts/results/test_kmmlu_pairs.json +14 -0
  1381. wisent/examples/scripts/results/test_kobest_evaluation.json +51 -0
  1382. wisent/examples/scripts/results/test_kobest_pairs.json +14 -0
  1383. wisent/examples/scripts/results/test_kormedmcqa/test_kormedmcqa_evaluation.json +30 -0
  1384. wisent/examples/scripts/results/test_kormedmcqa/test_kormedmcqa_pairs.json +8 -0
  1385. wisent/examples/scripts/results/test_kormedmcqa_dentist/test_kormedmcqa_dentist_evaluation.json +30 -0
  1386. wisent/examples/scripts/results/test_kormedmcqa_dentist/test_kormedmcqa_dentist_pairs.json +8 -0
  1387. wisent/examples/scripts/results/test_kormedmcqa_evaluation.json +30 -0
  1388. wisent/examples/scripts/results/test_kormedmcqa_pairs.json +8 -0
  1389. wisent/examples/scripts/results/test_lambada_cloze_evaluation.json +30 -0
  1390. wisent/examples/scripts/results/test_lambada_cloze_pairs.json +8 -0
  1391. wisent/examples/scripts/results/test_lambada_evaluation.json +30 -0
  1392. wisent/examples/scripts/results/test_lambada_final/test_lambada_openai_mt_stablelm_en_evaluation.json +30 -0
  1393. wisent/examples/scripts/results/test_lambada_final/test_lambada_openai_mt_stablelm_en_pairs.json +8 -0
  1394. wisent/examples/scripts/results/test_lambada_multilingual/test_lambada_multilingual_evaluation.json +51 -0
  1395. wisent/examples/scripts/results/test_lambada_multilingual/test_lambada_multilingual_pairs.json +14 -0
  1396. wisent/examples/scripts/results/test_lambada_multilingual_evaluation.json +51 -0
  1397. wisent/examples/scripts/results/test_lambada_multilingual_pairs.json +14 -0
  1398. wisent/examples/scripts/results/test_lambada_multilingual_stablelm_evaluation.json +51 -0
  1399. wisent/examples/scripts/results/test_lambada_multilingual_stablelm_pairs.json +14 -0
  1400. wisent/examples/scripts/results/test_lambada_openai_evaluation.json +30 -0
  1401. wisent/examples/scripts/results/test_lambada_openai_pairs.json +8 -0
  1402. wisent/examples/scripts/results/test_lambada_pairs.json +8 -0
  1403. wisent/examples/scripts/results/test_lambada_stablelm_en_fixed/test_lambada_openai_mt_stablelm_en_evaluation.json +30 -0
  1404. wisent/examples/scripts/results/test_lambada_stablelm_en_fixed/test_lambada_openai_mt_stablelm_en_pairs.json +8 -0
  1405. wisent/examples/scripts/results/test_lambada_stablelm_fixed/test_lambada_openai_mt_stablelm_en_evaluation.json +30 -0
  1406. wisent/examples/scripts/results/test_lambada_stablelm_fixed/test_lambada_openai_mt_stablelm_en_pairs.json +8 -0
  1407. wisent/examples/scripts/results/test_lambada_standard_evaluation.json +30 -0
  1408. wisent/examples/scripts/results/test_lambada_standard_pairs.json +8 -0
  1409. wisent/examples/scripts/results/test_leaderboard_evaluation.json +51 -0
  1410. wisent/examples/scripts/results/test_leaderboard_pairs.json +14 -0
  1411. wisent/examples/scripts/results/test_libra/test_libra_evaluation.json +51 -0
  1412. wisent/examples/scripts/results/test_libra/test_libra_pairs.json +14 -0
  1413. wisent/examples/scripts/results/test_libra_evaluation.json +51 -0
  1414. wisent/examples/scripts/results/test_libra_pairs.json +14 -0
  1415. wisent/examples/scripts/results/test_lingoly_evaluation.json +30 -0
  1416. wisent/examples/scripts/results/test_lingoly_pairs.json +8 -0
  1417. wisent/examples/scripts/results/test_livecodebench_evaluation.json +30 -0
  1418. wisent/examples/scripts/results/test_livecodebench_pairs.json +8 -0
  1419. wisent/examples/scripts/results/test_livemathbench_cnmo_en_evaluation.json +30 -0
  1420. wisent/examples/scripts/results/test_livemathbench_cnmo_en_pairs.json +8 -0
  1421. wisent/examples/scripts/results/test_livemathbench_cnmo_zh_evaluation.json +30 -0
  1422. wisent/examples/scripts/results/test_livemathbench_cnmo_zh_pairs.json +8 -0
  1423. wisent/examples/scripts/results/test_llama_evaluation.json +30 -0
  1424. wisent/examples/scripts/results/test_llama_pairs.json +8 -0
  1425. wisent/examples/scripts/results/test_logiqa2_evaluation.json +30 -0
  1426. wisent/examples/scripts/results/test_logiqa2_pairs.json +8 -0
  1427. wisent/examples/scripts/results/test_logiqa_evaluation.json +30 -0
  1428. wisent/examples/scripts/results/test_logiqa_pairs.json +8 -0
  1429. wisent/examples/scripts/results/test_m_mmlu_evaluation.json +51 -0
  1430. wisent/examples/scripts/results/test_m_mmlu_pairs.json +14 -0
  1431. wisent/examples/scripts/results/test_mastermind/test_mastermind_evaluation.json +51 -0
  1432. wisent/examples/scripts/results/test_mastermind/test_mastermind_pairs.json +14 -0
  1433. wisent/examples/scripts/results/test_mastermind_24_easy/test_mastermind_24_easy_evaluation.json +30 -0
  1434. wisent/examples/scripts/results/test_mastermind_24_easy/test_mastermind_24_easy_pairs.json +8 -0
  1435. wisent/examples/scripts/results/test_mastermind_evaluation.json +51 -0
  1436. wisent/examples/scripts/results/test_mastermind_pairs.json +14 -0
  1437. wisent/examples/scripts/results/test_math500_evaluation.json +30 -0
  1438. wisent/examples/scripts/results/test_math500_pairs.json +8 -0
  1439. wisent/examples/scripts/results/test_math_evaluation.json +30 -0
  1440. wisent/examples/scripts/results/test_math_pairs.json +8 -0
  1441. wisent/examples/scripts/results/test_mathqa_evaluation.json +30 -0
  1442. wisent/examples/scripts/results/test_mathqa_pairs.json +8 -0
  1443. wisent/examples/scripts/results/test_mbpp_evaluation.json +30 -0
  1444. wisent/examples/scripts/results/test_mbpp_pairs.json +8 -0
  1445. wisent/examples/scripts/results/test_mbpp_plus_evaluation.json +30 -0
  1446. wisent/examples/scripts/results/test_mbpp_plus_pairs.json +8 -0
  1447. wisent/examples/scripts/results/test_mc_taco_evaluation.json +30 -0
  1448. wisent/examples/scripts/results/test_mc_taco_pairs.json +8 -0
  1449. wisent/examples/scripts/results/test_med_concepts_qa/test_med_concepts_qa_evaluation.json +51 -0
  1450. wisent/examples/scripts/results/test_med_concepts_qa/test_med_concepts_qa_pairs.json +14 -0
  1451. wisent/examples/scripts/results/test_med_concepts_qa_atc_easy/test_med_concepts_qa_atc_easy_evaluation.json +30 -0
  1452. wisent/examples/scripts/results/test_med_concepts_qa_atc_easy/test_med_concepts_qa_atc_easy_pairs.json +8 -0
  1453. wisent/examples/scripts/results/test_med_concepts_qa_evaluation.json +51 -0
  1454. wisent/examples/scripts/results/test_med_concepts_qa_pairs.json +14 -0
  1455. wisent/examples/scripts/results/test_meddialog_evaluation.json +30 -0
  1456. wisent/examples/scripts/results/test_meddialog_pairs.json +8 -0
  1457. wisent/examples/scripts/results/test_meddialog_raw_perplexity/test_meddialog_raw_perplexity_evaluation.json +30 -0
  1458. wisent/examples/scripts/results/test_meddialog_raw_perplexity/test_meddialog_raw_perplexity_pairs.json +8 -0
  1459. wisent/examples/scripts/results/test_mediqa_qa2019_evaluation.json +30 -0
  1460. wisent/examples/scripts/results/test_mediqa_qa2019_pairs.json +8 -0
  1461. wisent/examples/scripts/results/test_medmcqa_evaluation.json +30 -0
  1462. wisent/examples/scripts/results/test_medmcqa_pairs.json +8 -0
  1463. wisent/examples/scripts/results/test_medqa_evaluation.json +30 -0
  1464. wisent/examples/scripts/results/test_medqa_pairs.json +8 -0
  1465. wisent/examples/scripts/results/test_medtext_evaluation.json +30 -0
  1466. wisent/examples/scripts/results/test_medtext_pairs.json +8 -0
  1467. wisent/examples/scripts/results/test_mela_evaluation.json +51 -0
  1468. wisent/examples/scripts/results/test_mela_pairs.json +14 -0
  1469. wisent/examples/scripts/results/test_meqsum_evaluation.json +30 -0
  1470. wisent/examples/scripts/results/test_meqsum_pairs.json +8 -0
  1471. wisent/examples/scripts/results/test_mercury_evaluation.json +30 -0
  1472. wisent/examples/scripts/results/test_mercury_pairs.json +8 -0
  1473. wisent/examples/scripts/results/test_metabench_evaluation.json +51 -0
  1474. wisent/examples/scripts/results/test_metabench_pairs.json +14 -0
  1475. wisent/examples/scripts/results/test_mgsm_evaluation.json +51 -0
  1476. wisent/examples/scripts/results/test_mgsm_pairs.json +14 -0
  1477. wisent/examples/scripts/results/test_mimic_repsum_evaluation.json +30 -0
  1478. wisent/examples/scripts/results/test_mimic_repsum_pairs.json +8 -0
  1479. wisent/examples/scripts/results/test_minerva_math_evaluation.json +51 -0
  1480. wisent/examples/scripts/results/test_minerva_math_pairs.json +14 -0
  1481. wisent/examples/scripts/results/test_mlqa_evaluation.json +51 -0
  1482. wisent/examples/scripts/results/test_mlqa_pairs.json +14 -0
  1483. wisent/examples/scripts/results/test_mmlu-pro-plus_evaluation.json +51 -0
  1484. wisent/examples/scripts/results/test_mmlu-pro-plus_pairs.json +14 -0
  1485. wisent/examples/scripts/results/test_mmlu_evaluation.json +51 -0
  1486. wisent/examples/scripts/results/test_mmlu_pairs.json +14 -0
  1487. wisent/examples/scripts/results/test_mmlu_pro_evaluation.json +51 -0
  1488. wisent/examples/scripts/results/test_mmlu_pro_pairs.json +14 -0
  1489. wisent/examples/scripts/results/test_mmlu_prox_evaluation.json +51 -0
  1490. wisent/examples/scripts/results/test_mmlu_prox_pairs.json +14 -0
  1491. wisent/examples/scripts/results/test_mmlusr_evaluation.json +30 -0
  1492. wisent/examples/scripts/results/test_mmlusr_pairs.json +8 -0
  1493. wisent/examples/scripts/results/test_mmmu_evaluation.json +51 -0
  1494. wisent/examples/scripts/results/test_mmmu_pairs.json +14 -0
  1495. wisent/examples/scripts/results/test_mnli_evaluation.json +30 -0
  1496. wisent/examples/scripts/results/test_mnli_pairs.json +8 -0
  1497. wisent/examples/scripts/results/test_model_written_evals_evaluation.json +51 -0
  1498. wisent/examples/scripts/results/test_model_written_evals_pairs.json +14 -0
  1499. wisent/examples/scripts/results/test_moral_stories_evaluation.json +30 -0
  1500. wisent/examples/scripts/results/test_moral_stories_pairs.json +8 -0
  1501. wisent/examples/scripts/results/test_mts_dialog_evaluation.json +30 -0
  1502. wisent/examples/scripts/results/test_mts_dialog_pairs.json +8 -0
  1503. wisent/examples/scripts/results/test_multiblimp_evaluation.json +51 -0
  1504. wisent/examples/scripts/results/test_multiblimp_pairs.json +14 -0
  1505. wisent/examples/scripts/results/test_multimedqa_evaluation.json +51 -0
  1506. wisent/examples/scripts/results/test_multimedqa_pairs.json +14 -0
  1507. wisent/examples/scripts/results/test_multipl_e_evaluation.json +30 -0
  1508. wisent/examples/scripts/results/test_multipl_e_pairs.json +8 -0
  1509. wisent/examples/scripts/results/test_mutual_evaluation.json +30 -0
  1510. wisent/examples/scripts/results/test_mutual_pairs.json +8 -0
  1511. wisent/examples/scripts/results/test_non_greedy_robustness_agieval_aqua_rat_evaluation.json +30 -0
  1512. wisent/examples/scripts/results/test_non_greedy_robustness_agieval_aqua_rat_pairs.json +8 -0
  1513. wisent/examples/scripts/results/test_noreval_evaluation.json +51 -0
  1514. wisent/examples/scripts/results/test_noreval_pairs.json +14 -0
  1515. wisent/examples/scripts/results/test_noticia_evaluation.json +30 -0
  1516. wisent/examples/scripts/results/test_noticia_pairs.json +8 -0
  1517. wisent/examples/scripts/results/test_nq_open_evaluation.json +30 -0
  1518. wisent/examples/scripts/results/test_nq_open_pairs.json +8 -0
  1519. wisent/examples/scripts/results/test_olaph_evaluation.json +30 -0
  1520. wisent/examples/scripts/results/test_olaph_pairs.json +8 -0
  1521. wisent/examples/scripts/results/test_openbookqa_evaluation.json +30 -0
  1522. wisent/examples/scripts/results/test_openbookqa_pairs.json +8 -0
  1523. wisent/examples/scripts/results/test_openllm_evaluation.json +51 -0
  1524. wisent/examples/scripts/results/test_openllm_pairs.json +14 -0
  1525. wisent/examples/scripts/results/test_option_order_robustness_agieval_aqua_rat_evaluation.json +30 -0
  1526. wisent/examples/scripts/results/test_option_order_robustness_agieval_aqua_rat_pairs.json +8 -0
  1527. wisent/examples/scripts/results/test_paloma_evaluation.json +51 -0
  1528. wisent/examples/scripts/results/test_paloma_pairs.json +14 -0
  1529. wisent/examples/scripts/results/test_passkey/test_passkey_evaluation.json +30 -0
  1530. wisent/examples/scripts/results/test_passkey/test_passkey_pairs.json +8 -0
  1531. wisent/examples/scripts/results/test_paws-x_evaluation.json +51 -0
  1532. wisent/examples/scripts/results/test_paws-x_pairs.json +14 -0
  1533. wisent/examples/scripts/results/test_paws_en/test_paws_en_evaluation.json +30 -0
  1534. wisent/examples/scripts/results/test_paws_en/test_paws_en_pairs.json +8 -0
  1535. wisent/examples/scripts/results/test_penn_treebank_evaluation.json +30 -0
  1536. wisent/examples/scripts/results/test_penn_treebank_pairs.json +8 -0
  1537. wisent/examples/scripts/results/test_pile_10k/test_pile_10k_evaluation.json +30 -0
  1538. wisent/examples/scripts/results/test_pile_10k/test_pile_10k_pairs.json +8 -0
  1539. wisent/examples/scripts/results/test_piqa_evaluation.json +30 -0
  1540. wisent/examples/scripts/results/test_piqa_pairs.json +8 -0
  1541. wisent/examples/scripts/results/test_polemo2_evaluation.json +30 -0
  1542. wisent/examples/scripts/results/test_polemo2_pairs.json +8 -0
  1543. wisent/examples/scripts/results/test_polymath_en_high_evaluation.json +30 -0
  1544. wisent/examples/scripts/results/test_polymath_en_high_pairs.json +8 -0
  1545. wisent/examples/scripts/results/test_polymath_en_medium_evaluation.json +30 -0
  1546. wisent/examples/scripts/results/test_polymath_en_medium_pairs.json +8 -0
  1547. wisent/examples/scripts/results/test_polymath_zh_high_evaluation.json +30 -0
  1548. wisent/examples/scripts/results/test_polymath_zh_high_pairs.json +8 -0
  1549. wisent/examples/scripts/results/test_polymath_zh_medium_evaluation.json +30 -0
  1550. wisent/examples/scripts/results/test_polymath_zh_medium_pairs.json +8 -0
  1551. wisent/examples/scripts/results/test_portuguese_bench_evaluation.json +51 -0
  1552. wisent/examples/scripts/results/test_portuguese_bench_pairs.json +14 -0
  1553. wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat/test_prompt_robustness_agieval_aqua_rat_evaluation.json +30 -0
  1554. wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat/test_prompt_robustness_agieval_aqua_rat_pairs.json +8 -0
  1555. wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat_evaluation.json +30 -0
  1556. wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat_pairs.json +8 -0
  1557. wisent/examples/scripts/results/test_prost_evaluation.json +30 -0
  1558. wisent/examples/scripts/results/test_prost_pairs.json +8 -0
  1559. wisent/examples/scripts/results/test_ptb_evaluation.json +30 -0
  1560. wisent/examples/scripts/results/test_ptb_pairs.json +8 -0
  1561. wisent/examples/scripts/results/test_pubmedqa_evaluation.json +30 -0
  1562. wisent/examples/scripts/results/test_pubmedqa_pairs.json +8 -0
  1563. wisent/examples/scripts/results/test_pythia_evaluation.json +51 -0
  1564. wisent/examples/scripts/results/test_pythia_pairs.json +14 -0
  1565. wisent/examples/scripts/results/test_qa4mre_evaluation.json +30 -0
  1566. wisent/examples/scripts/results/test_qa4mre_pairs.json +8 -0
  1567. wisent/examples/scripts/results/test_qasper_evaluation.json +30 -0
  1568. wisent/examples/scripts/results/test_qasper_pairs.json +8 -0
  1569. wisent/examples/scripts/results/test_race_evaluation.json +30 -0
  1570. wisent/examples/scripts/results/test_race_pairs.json +8 -0
  1571. wisent/examples/scripts/results/test_realtoxicityprompts_evaluation.json +30 -0
  1572. wisent/examples/scripts/results/test_realtoxicityprompts_pairs.json +8 -0
  1573. wisent/examples/scripts/results/test_recode_evaluation.json +30 -0
  1574. wisent/examples/scripts/results/test_recode_pairs.json +8 -0
  1575. wisent/examples/scripts/results/test_record_evaluation.json +30 -0
  1576. wisent/examples/scripts/results/test_record_pairs.json +8 -0
  1577. wisent/examples/scripts/results/test_ruler_evaluation.json +51 -0
  1578. wisent/examples/scripts/results/test_ruler_pairs.json +14 -0
  1579. wisent/examples/scripts/results/test_sciq_evaluation.json +30 -0
  1580. wisent/examples/scripts/results/test_sciq_pairs.json +8 -0
  1581. wisent/examples/scripts/results/test_score_evaluation.json +51 -0
  1582. wisent/examples/scripts/results/test_score_pairs.json +14 -0
  1583. wisent/examples/scripts/results/test_self_consistency_evaluation.json +30 -0
  1584. wisent/examples/scripts/results/test_self_consistency_pairs.json +8 -0
  1585. wisent/examples/scripts/results/test_siqa/test_siqa_evaluation.json +30 -0
  1586. wisent/examples/scripts/results/test_siqa/test_siqa_pairs.json +8 -0
  1587. wisent/examples/scripts/results/test_siqa_evaluation.json +30 -0
  1588. wisent/examples/scripts/results/test_siqa_pairs.json +8 -0
  1589. wisent/examples/scripts/results/test_spanish_bench_evaluation.json +51 -0
  1590. wisent/examples/scripts/results/test_spanish_bench_pairs.json +14 -0
  1591. wisent/examples/scripts/results/test_squad2_evaluation.json +30 -0
  1592. wisent/examples/scripts/results/test_squad2_pairs.json +8 -0
  1593. wisent/examples/scripts/results/test_squadv2_evaluation.json +30 -0
  1594. wisent/examples/scripts/results/test_squadv2_pairs.json +8 -0
  1595. wisent/examples/scripts/results/test_super-glue-lm-eval-v1-seq2seq_evaluation.json +30 -0
  1596. wisent/examples/scripts/results/test_super-glue-lm-eval-v1-seq2seq_pairs.json +8 -0
  1597. wisent/examples/scripts/results/test_super-glue-lm-eval-v1_evaluation.json +51 -0
  1598. wisent/examples/scripts/results/test_super-glue-lm-eval-v1_pairs.json +14 -0
  1599. wisent/examples/scripts/results/test_swag_evaluation.json +30 -0
  1600. wisent/examples/scripts/results/test_swag_pairs.json +8 -0
  1601. wisent/examples/scripts/results/test_tinyBenchmarks_evaluation.json +51 -0
  1602. wisent/examples/scripts/results/test_tinyBenchmarks_pairs.json +14 -0
  1603. wisent/examples/scripts/results/test_tmmluplus_evaluation.json +51 -0
  1604. wisent/examples/scripts/results/test_tmmluplus_pairs.json +14 -0
  1605. wisent/examples/scripts/results/test_translation_evaluation.json +51 -0
  1606. wisent/examples/scripts/results/test_translation_pairs.json +14 -0
  1607. wisent/examples/scripts/results/test_triviaqa_evaluation.json +30 -0
  1608. wisent/examples/scripts/results/test_triviaqa_pairs.json +8 -0
  1609. wisent/examples/scripts/results/test_truthfulqa-multi_evaluation.json +51 -0
  1610. wisent/examples/scripts/results/test_truthfulqa-multi_pairs.json +14 -0
  1611. wisent/examples/scripts/results/test_truthfulqa_evaluation.json +30 -0
  1612. wisent/examples/scripts/results/test_truthfulqa_mc1_evaluation.json +30 -0
  1613. wisent/examples/scripts/results/test_truthfulqa_mc1_pairs.json +8 -0
  1614. wisent/examples/scripts/results/test_truthfulqa_mc2_evaluation.json +30 -0
  1615. wisent/examples/scripts/results/test_truthfulqa_mc2_pairs.json +8 -0
  1616. wisent/examples/scripts/results/test_truthfulqa_pairs.json +8 -0
  1617. wisent/examples/scripts/results/test_turkishmmlu_evaluation.json +51 -0
  1618. wisent/examples/scripts/results/test_turkishmmlu_pairs.json +14 -0
  1619. wisent/examples/scripts/results/test_unfair_tos_evaluation.json +30 -0
  1620. wisent/examples/scripts/results/test_unfair_tos_pairs.json +8 -0
  1621. wisent/examples/scripts/results/test_unscramble_evaluation.json +51 -0
  1622. wisent/examples/scripts/results/test_unscramble_pairs.json +14 -0
  1623. wisent/examples/scripts/results/test_webqs_evaluation.json +30 -0
  1624. wisent/examples/scripts/results/test_webqs_pairs.json +8 -0
  1625. wisent/examples/scripts/results/test_wikitext103_evaluation.json +30 -0
  1626. wisent/examples/scripts/results/test_wikitext103_pairs.json +8 -0
  1627. wisent/examples/scripts/results/test_wikitext_evaluation.json +30 -0
  1628. wisent/examples/scripts/results/test_wikitext_pairs.json +8 -0
  1629. wisent/examples/scripts/results/test_winogender_evaluation.json +51 -0
  1630. wisent/examples/scripts/results/test_winogender_pairs.json +14 -0
  1631. wisent/examples/scripts/results/test_winogrande_evaluation.json +30 -0
  1632. wisent/examples/scripts/results/test_winogrande_pairs.json +8 -0
  1633. wisent/examples/scripts/results/test_wmdp_evaluation.json +30 -0
  1634. wisent/examples/scripts/results/test_wmdp_pairs.json +8 -0
  1635. wisent/examples/scripts/results/test_wmt-ro-en-t5-prompt_evaluation.json +30 -0
  1636. wisent/examples/scripts/results/test_wmt-ro-en-t5-prompt_pairs.json +8 -0
  1637. wisent/examples/scripts/results/test_wmt14_en_fr_evaluation.json +30 -0
  1638. wisent/examples/scripts/results/test_wmt14_en_fr_pairs.json +8 -0
  1639. wisent/examples/scripts/results/test_wmt16_en_de_evaluation.json +30 -0
  1640. wisent/examples/scripts/results/test_wmt16_en_de_pairs.json +8 -0
  1641. wisent/examples/scripts/results/test_wmt16_ro_en_evaluation.json +30 -0
  1642. wisent/examples/scripts/results/test_wmt16_ro_en_pairs.json +8 -0
  1643. wisent/examples/scripts/results/test_wsc273_evaluation.json +30 -0
  1644. wisent/examples/scripts/results/test_wsc273_pairs.json +8 -0
  1645. wisent/examples/scripts/results/test_xcopa_evaluation.json +51 -0
  1646. wisent/examples/scripts/results/test_xcopa_pairs.json +14 -0
  1647. wisent/examples/scripts/results/test_xnli_eu_evaluation.json +30 -0
  1648. wisent/examples/scripts/results/test_xnli_eu_pairs.json +8 -0
  1649. wisent/examples/scripts/results/test_xnli_evaluation.json +51 -0
  1650. wisent/examples/scripts/results/test_xnli_pairs.json +14 -0
  1651. wisent/examples/scripts/results/test_xquad_evaluation.json +51 -0
  1652. wisent/examples/scripts/results/test_xquad_pairs.json +14 -0
  1653. wisent/examples/scripts/results/test_xstorycloze_evaluation.json +51 -0
  1654. wisent/examples/scripts/results/test_xstorycloze_pairs.json +14 -0
  1655. wisent/examples/scripts/results/test_xsum_evaluation.json +30 -0
  1656. wisent/examples/scripts/results/test_xsum_pairs.json +8 -0
  1657. wisent/examples/scripts/results/test_xwinograd_evaluation.json +51 -0
  1658. wisent/examples/scripts/results/test_xwinograd_pairs.json +14 -0
  1659. wisent/examples/scripts/results/test_yahoo_answers_topics_evaluation.json +30 -0
  1660. wisent/examples/scripts/results/test_yahoo_answers_topics_pairs.json +8 -0
  1661. wisent/parameters/__init__.py +1 -0
  1662. wisent/parameters/lm_eval/all_lm_eval_task_families.json +169 -0
  1663. wisent/parameters/lm_eval/broken_in_lm_eval.json +10 -0
  1664. wisent/parameters/lm_eval/evaluations_not_lm_eval_tasks.json +0 -0
  1665. wisent/parameters/lm_eval/evaluator_check.json +3476 -0
  1666. wisent/parameters/lm_eval/final_verification.json +24782 -0
  1667. wisent/parameters/lm_eval/group_task_evaluators.json +1833 -0
  1668. wisent/parameters/lm_eval/group_tasks.json +150 -0
  1669. wisent/parameters/lm_eval/individual_tasks.json +402 -0
  1670. wisent/parameters/lm_eval/no_readmes.json +1 -0
  1671. wisent/parameters/lm_eval/not_lm_eval_tasks.json +110 -0
  1672. wisent/parameters/lm_eval/read_tasks.json +208 -0
  1673. wisent/parameters/lm_eval/readme_files.json +208 -0
  1674. wisent/parameters/lm_eval/track_progress_not_lm_eval_tasks.json +128 -0
  1675. wisent/parameters/tasks/missing_task_families.json +2963 -0
  1676. wisent/parameters/tasks/remaining_tasks_to_implement.json +199 -0
  1677. wisent/parameters/tasks/risks.json +10 -0
  1678. wisent/parameters/tasks/skills.json +14 -0
  1679. wisent/parameters/tasks/tasks.json +56031 -0
  1680. wisent/scripts/run_quality_metrics_sweep.sh +315 -0
  1681. wisent/tests/__init__.py +0 -0
  1682. wisent/tests/examples/__init__.py +0 -0
  1683. wisent/tests/examples/cli/__init__.py +0 -0
  1684. wisent/tests/examples/cli/activations/__init__.py +0 -0
  1685. wisent/tests/examples/cli/activations/test_get_activations.py +127 -0
  1686. wisent/tests/examples/cli/classifier/__init__.py +0 -0
  1687. wisent/tests/examples/cli/classifier/test_classifier_examples.py +141 -0
  1688. wisent/tests/examples/cli/contrastive_pairs/__init__.py +0 -0
  1689. wisent/tests/examples/cli/contrastive_pairs/test_generate_pairs.py +89 -0
  1690. wisent/tests/examples/cli/evaluation/__init__.py +0 -0
  1691. wisent/tests/examples/cli/evaluation/test_evaluation_examples.py +117 -0
  1692. wisent/tests/examples/cli/generate/__init__.py +0 -0
  1693. wisent/tests/examples/cli/generate/test_generate_with_classifier.py +146 -0
  1694. wisent/tests/examples/cli/generate/test_generate_with_steering.py +149 -0
  1695. wisent/tests/examples/cli/generate/test_only_generate.py +110 -0
  1696. wisent/tests/examples/cli/multi_steering/__init__.py +0 -0
  1697. wisent/tests/examples/cli/multi_steering/test_multi_steer_from_trained_vectors.py +210 -0
  1698. wisent/tests/examples/cli/multi_steering/test_multi_steer_with_different_parameters.py +205 -0
  1699. wisent/tests/examples/cli/multi_steering/test_train_and_multi_steer.py +174 -0
  1700. wisent/tests/examples/cli/optimizer/__init__.py +0 -0
  1701. wisent/tests/examples/cli/optimizer/test_optimize_sample_size.py +102 -0
  1702. wisent/tests/examples/cli/optimizer/test_optimizer_examples.py +59 -0
  1703. wisent/tests/examples/cli/steering/__init__.py +0 -0
  1704. wisent/tests/examples/cli/steering/test_create_steering_vectors.py +135 -0
  1705. wisent/tests/examples/cli/synthetic/__init__.py +0 -0
  1706. wisent/tests/examples/cli/synthetic/test_synthetic_pairs.py +45 -0
  1707. wisent/tests/nosense/__init__.py +6 -0
  1708. wisent/tests/nosense/base_nosense.py +81 -0
  1709. wisent/tests/nosense/math500_nosense.py +72 -0
  1710. wisent/tests/nosense/test_robustness.py +336 -0
  1711. wisent/tests/test_all_cli_commands.py +674 -0
  1712. wisent/tests/test_geometry_comprehensive.py +327 -0
  1713. wisent/tests/test_titan_geometry.py +257 -0
  1714. wisent/tests/visualize_geometry.py +148 -0
  1715. wisent-0.7.379.dist-info/METADATA +64 -0
  1716. wisent-0.7.379.dist-info/RECORD +1720 -0
  1717. wisent-0.7.379.dist-info/WHEEL +5 -0
  1718. wisent-0.7.379.dist-info/entry_points.txt +2 -0
  1719. wisent-0.7.379.dist-info/licenses/LICENSE +21 -0
  1720. wisent-0.7.379.dist-info/top_level.txt +1 -0
wisent/parameters/lm_eval/evaluator_check.json
@@ -0,0 +1,3476 @@
+ {
+ "acp_bench": {
+ "lm_eval": {
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/acpbench/boolq_cot_2shot/_boolq_cot_2shot_yaml",
+ "output_type": "generate_until",
+ "metric": "exact_match",
+ "line_references": {
+ "output_type": 4,
+ "metric": 26
+ }
+ },
+ "wisent": {
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/acp_bench.py",
+ "evaluator": "log_likelihoods",
+ "line_references": {
+ "evaluator": 32
+ }
+ },
+ "match": false,
+ "notes": "lm-eval uses generate_until with exact_match. Wisent uses log_likelihoods. WRONG - generation task being evaluated with probability scoring."
+ },
+ "arithmetic": {
+ "lm_eval": {
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/arithmetic/arithmetic_1dc.yaml",
+ "output_type": "loglikelihood",
+ "metric": "acc",
+ "line_references": {
+ "output_type": 6,
+ "metric": 13
+ }
+ },
+ "wisent": {
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arithmetic.py",
+ "evaluator": "exact_match",
+ "line_references": {
+ "evaluator": 20
+ }
+ },
+ "match": false,
+ "notes": "lm-eval uses loglikelihood with acc metric. Wisent uses exact_match. WRONG - loglikelihood task being evaluated with text matching."
+ },
+ "arabculture": {
+ "lm_eval": {
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/arab_culture/_default_arab_culture_mcq_template_yaml",
+ "output_type": "multiple_choice",
+ "metric": "acc",
+ "line_references": {
+ "output_type": 6,
+ "metric": 12
+ }
+ },
+ "wisent": {
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabculture.py",
+ "evaluator": "log_likelihoods",
+ "line_references": {
+ "evaluator": 26
+ }
+ },
+ "match": true,
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods. CORRECT - both use loglikelihood-based selection for multiple choice."
+ },
+ "aradice": {
+ "lm_eval": {
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/aradice/boolq/EGY/boolq_egy.yaml",
+ "output_type": "multiple_choice",
+ "metric": "acc",
+ "line_references": {
+ "output_type": 4,
+ "metric": 15
+ }
+ },
+ "wisent": {
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aradice.py",
+ "evaluator": "log_likelihoods",
+ "line_references": {
+ "evaluator": 141
+ }
+ },
+ "match": true,
+ "notes": "lm-eval uses multiple_choice with acc and f1 metrics. Wisent uses log_likelihoods. CORRECT - both use loglikelihood-based selection for multiple choice."
+ },
+ "ai2_arc": {
+ "lm_eval": {
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/arc/arc_easy.yaml",
+ "output_type": "multiple_choice",
+ "metric": "acc",
+ "line_references": {
+ "output_type": 6,
+ "metric": 16
+ }
+ },
+ "wisent": {
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ai2_arc.py",
+ "evaluator": "log_likelihoods",
+ "line_references": {
+ "evaluator": 19
+ }
+ },
+ "match": true,
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods. CORRECT - both use loglikelihood-based selection for multiple choice."
+ },
+ "bbh": {
+ "lm_eval": {
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/bbh/fewshot/_fewshot_template_yaml",
+ "output_type": "generate_until",
+ "metric": "exact_match",
+ "line_references": {
+ "output_type": 2,
+ "metric": 6
+ }
+ },
+ "wisent": {
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bbh.py",
+ "evaluator": "exact_match",
+ "line_references": {
+ "evaluator": 19
+ }
+ },
+ "match": true,
+ "notes": "lm-eval uses generate_until with exact_match metric. Wisent uses exact_match. CORRECT - both use text generation with exact matching."
+ },
+ "belebele": {
+ "lm_eval": {
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/belebele/_default_template_yaml",
+ "output_type": "multiple_choice",
+ "metric": "acc",
+ "line_references": {
+ "output_type": 4,
+ "metric": 11
+ }
+ },
+ "wisent": {
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/belebele.py",
+ "evaluator": "log_likelihoods",
+ "line_references": {
+ "evaluator": 19
+ }
+ },
+ "match": true,
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods. CORRECT - both use loglikelihood-based selection for multiple choice."
+ },
+ "commonsense_qa": {
+ "lm_eval": {
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/commonsense_qa/default.yaml",
+ "output_type": "multiple_choice",
+ "metric": "acc",
+ "line_references": {
+ "output_type": 5,
+ "metric": 10
+ }
+ },
+ "wisent": {
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/commonsense_qa.py",
+ "evaluator": "log_likelihoods",
+ "line_references": {
+ "evaluator": 19
+ }
+ },
+ "match": true,
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods. CORRECT - both use loglikelihood-based selection for multiple choice."
+ },
+ "gsm8k": {
+ "lm_eval": {
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/gsm8k/gsm8k.yaml",
+ "output_type": "generate_until",
+ "metric": "exact_match",
+ "line_references": {
+ "output_type": 6,
+ "metric": 13
+ }
+ },
+ "wisent": {
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gsm8k.py",
+ "evaluator": "exact_match",
+ "line_references": {
+ "evaluator": 32
+ }
+ },
+ "match": true,
+ "notes": "lm-eval uses generate_until with exact_match metric. Wisent uses exact_match. CORRECT - both use text generation with exact matching."
+ },
+ "hellaswag": {
+ "lm_eval": {
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/hellaswag/hellaswag.yaml",
+ "output_type": "multiple_choice",
+ "metric": "acc",
+ "line_references": {
+ "output_type": 6,
+ "metric": 15
+ }
+ },
+ "wisent": {
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hellaswag.py",
+ "evaluator": "log_likelihoods",
+ "line_references": {
+ "evaluator": 19
+ }
+ },
+ "match": true,
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods. CORRECT - both use loglikelihood-based selection for multiple choice."
+ },
+ "humaneval": {
+ "lm_eval": {
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/humaneval/humaneval.yaml",
+ "output_type": "generate_until",
+ "metric": "pass_at_k",
+ "line_references": {
+ "output_type": 4,
+ "metric": 9
+ }
+ },
+ "wisent": {
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/humaneval.py",
+ "evaluator": "exact_match",
+ "line_references": {
+ "evaluator": 23
+ }
+ },
+ "match": false,
+ "notes": "lm-eval uses generate_until with pass_at_k metric (code execution). Wisent uses exact_match. WRONG - code execution task being evaluated with text matching instead of execution."
+ },
+ "ifeval": {
+ "lm_eval": {
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/ifeval/ifeval.yaml",
+ "output_type": "generate_until",
+ "metric": "prompt_level_strict_acc",
+ "line_references": {
+ "output_type": 4,
+ "metric": 16
+ }
+ },
+ "wisent": {
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ifeval.py",
+ "evaluator": "exact_match",
+ "line_references": {
+ "evaluator": 19
+ }
+ },
+ "match": true,
+ "notes": "lm-eval uses generate_until with prompt_level_strict_acc metric. Wisent uses exact_match. CORRECT - both use text generation with matching evaluation."
+ },
+ "lambada": {
+ "lm_eval": {
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/lambada/lambada_openai.yaml",
+ "output_type": "loglikelihood",
+ "metric": "acc",
+ "line_references": {
+ "output_type": 6,
+ "metric": 16
+ }
+ },
+ "wisent": {
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada.py",
+ "evaluator": "exact_match",
+ "line_references": {
+ "evaluator": 24
+ }
+ },
+ "match": false,
+ "notes": "lm-eval uses loglikelihood with acc metric. Wisent uses exact_match. WRONG - loglikelihood task being evaluated with text matching."
+ },
+ "mmlu": {
+ "lm_eval": {
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/mmlu/generative/_default_template_yaml",
+ "output_type": "generate_until",
+ "metric": "exact_match",
+ "line_references": {
+ "output_type": 6,
+ "metric": 14
+ }
+ },
+ "wisent": {
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlu.py",
+ "evaluator": "log_likelihoods",
+ "line_references": {
+ "evaluator": 18
+ }
+ },
+ "match": false,
+ "notes": "lm-eval uses generate_until with exact_match metric. Wisent uses log_likelihoods. WRONG - generation task being evaluated with probability scoring."
+ },
+ "piqa": {
+ "lm_eval": {
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/piqa/piqa.yaml",
+ "output_type": "multiple_choice",
+ "metric": "acc",
+ "line_references": {
+ "output_type": 4,
+ "metric": 14
+ }
+ },
+ "wisent": {
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/piqa.py",
+ "evaluator": "log_likelihoods",
+ "line_references": {
+ "evaluator": 19
+ }
+ },
+ "match": true,
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods. CORRECT - both use loglikelihood-based selection for multiple choice."
+ },
+ "siqa": {
+ "lm_eval": {
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/siqa/siqa.yaml",
+ "output_type": "multiple_choice",
+ "metric": "acc",
+ "line_references": {
+ "output_type": 4,
+ "metric": 12
+ }
+ },
+ "wisent": {
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/siqa.py",
+ "evaluator": "log_likelihoods",
+ "line_references": {
+ "evaluator": 19
+ }
+ },
+ "match": true,
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods. CORRECT - both use loglikelihood-based selection for multiple choice."
+ },
+ "truthfulqa_mc1": {
+ "lm_eval": {
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/truthfulqa/truthfulqa_mc1.yaml",
+ "output_type": "multiple_choice",
+ "metric": "acc",
+ "line_references": {
+ "output_type": 6,
+ "metric": 32
+ }
+ },
+ "wisent": {
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc1.py",
+ "evaluator": "log_likelihoods",
+ "line_references": {
+ "evaluator": 20
+ }
+ },
+ "match": true,
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods. CORRECT - both use loglikelihood-based selection for multiple choice."
+ },
+ "winogrande": {
+ "lm_eval": {
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/winogrande/default.yaml",
+ "output_type": "multiple_choice",
+ "metric": "acc",
+ "line_references": {
+ "output_type": 4,
+ "metric": 13
+ }
+ },
+ "wisent": {
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogrande.py",
+ "evaluator": "log_likelihoods",
+ "line_references": {
+ "evaluator": 19
+ }
+ },
+ "match": true,
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods. CORRECT - both use loglikelihood-based selection for multiple choice."
+ },
+ "openbookqa": {
+ "lm_eval": {
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/openbookqa/openbookqa.yaml",
+ "output_type": "multiple_choice",
+ "metric": "acc",
+ "line_references": {
+ "output_type": 4,
+ "metric": 14
+ }
+ },
+ "wisent": {
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/openbookqa.py",
+ "evaluator": "log_likelihoods",
+ "line_references": {
+ "evaluator": 19
+ }
+ },
+ "match": true,
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods. CORRECT - both use loglikelihood-based selection for multiple choice."
+ },
+ "sciq": {
+ "lm_eval": {
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/sciq/sciq.yaml",
+ "output_type": "multiple_choice",
+ "metric": "acc",
+ "line_references": {
+ "output_type": 4,
+ "metric": 14
+ }
+ },
+ "wisent": {
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sciq.py",
+ "evaluator": "log_likelihoods",
+ "line_references": {
+ "evaluator": 19
+ }
+ },
+ "match": true,
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods. CORRECT - both use loglikelihood-based selection for multiple choice."
+ },
+ "anli": {
+ "lm_eval": {
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/anli/anli_r1.yaml",
+ "output_type": "multiple_choice",
+ "metric": "acc",
+ "line_references": {
+ "output_type": 6,
+ "metric": 22
+ }
+ },
+ "wisent": {
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/anli.py",
+ "evaluator": "log_likelihoods",
+ "line_references": {
+ "evaluator": 19
+ }
+ },
+ "match": true,
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods. CORRECT - both use loglikelihood-based selection for multiple choice."
+ },
+ "wikitext": {
+ "lm_eval": {
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/wikitext/wikitext.yaml",
+ "output_type": "loglikelihood_rolling",
+ "metric": "word_perplexity",
+ "line_references": {
+ "output_type": 4,
+ "metric": 14
+ }
+ },
+ "wisent": {
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wikitext.py",
+ "evaluator": "generation",
+ "line_references": {
+ "evaluator": 20
+ }
+ },
+ "match": false,
+ "notes": "lm-eval uses loglikelihood_rolling with perplexity metrics. Wisent uses generation. WRONG - perplexity task being evaluated with text generation."
+ },
+ "triviaqa": {
+ "lm_eval": {
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/triviaqa/default.yaml",
+ "output_type": "generate_until",
+ "metric": "exact_match",
+ "line_references": {
+ "output_type": 4,
+ "metric": 25
+ }
+ },
+ "wisent": {
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/triviaqa.py",
+ "evaluator": "generation",
+ "line_references": {
+ "evaluator": 20
+ }
+ },
+ "match": true,
+ "notes": "lm-eval uses generate_until with exact_match metric. Wisent uses generation. CORRECT - both use text generation with matching evaluation."
+ },
+ "race": {
+ "lm_eval": {
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/race/race.yaml",
+ "output_type": "multiple_choice",
+ "metric": "acc",
+ "line_references": {
+ "output_type": 4,
+ "metric": 10
+ }
+ },
+ "wisent": {
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/race.py",
+ "evaluator": "log_likelihoods",
+ "line_references": {
+ "evaluator": 21
+ }
+ },
+ "match": true,
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods. CORRECT - both use loglikelihood-based selection for multiple choice."
+ },
+ "agieval": {
+ "lm_eval": {
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/agieval/aqua-rat.yaml",
+ "output_type": "multiple_choice",
+ "metric": "acc",
+ "line_references": {
+ "output_type": 4,
+ "metric": 13
+ }
+ },
+ "wisent": {
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/agieval.py",
+ "evaluator": "exact_match",
+ "line_references": {
+ "evaluator": 19
+ }
+ },
+ "match": false,
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses exact_match. WRONG - multiple choice task being evaluated with text matching instead of loglikelihoods."
+ },
+ "mbpp": {
+ "lm_eval": {
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/mbpp/mbpp.yaml",
+ "output_type": "generate_until",
+ "metric": "pass_at_1",
+ "line_references": {
+ "output_type": 5,
+ "metric": 11
+ }
+ },
+ "wisent": {
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mbpp.py",
+ "evaluator": "exact_match",
+ "line_references": {
+ "evaluator": 22
+ }
+ },
+ "match": true,
+ "notes": "lm-eval uses generate_until with pass_at_1 metric (code execution). Wisent uses exact_match evaluator. MATCH - generation tasks can use exact_match for text comparison."
+ },
+ "cola": {
+ "lm_eval": {
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/glue/cola/default.yaml",
+ "output_type": "multiple_choice",
+ "metric": "mcc",
+ "line_references": {
+ "output_type": 5,
+ "metric": 14
+ }
+ },
+ "wisent": {
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cola.py",
+ "evaluator": "log_likelihoods",
+ "line_references": {
+ "evaluator": 19
+ }
+ },
+ "match": true,
+ "notes": "lm-eval uses multiple_choice with mcc metric. Wisent uses log_likelihoods. CORRECT - both use loglikelihood-based selection for multiple choice."
+ },
+ "mnli": {
+ "lm_eval": {
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/glue/mnli/default.yaml",
+ "output_type": "multiple_choice",
+ "metric": "acc",
+ "line_references": {
+ "output_type": 5,
+ "metric": 12
+ }
+ },
+ "wisent": {
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mnli.py",
+ "evaluator": "log_likelihoods",
+ "line_references": {
+ "evaluator": 22
+ }
+ },
+ "match": true,
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods. CORRECT - both use loglikelihood-based selection for multiple choice."
+ },
+ "gpqa": {
+ "lm_eval": {
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/gpqa/n_shot/_gpqa_n_shot_yaml",
+ "output_type": "multiple_choice",
+ "metric": "acc",
+ "line_references": {
+ "output_type": 3,
+ "metric": 14
+ }
+ },
+ "wisent": {
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gpqa.py",
+ "evaluator": "log_likelihoods",
+ "line_references": {
+ "evaluator": 37
+ }
+ },
+ "match": true,
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods. CORRECT - both use loglikelihood-based selection for multiple choice."
+ },
+ "ceval": {
+ "lm_eval": {
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/ceval/_default_ceval_yaml",
+ "output_type": "multiple_choice",
+ "metric": "acc",
+ "line_references": {
+ "output_type": 6,
+ "metric": 11
+ }
+ },
+ "wisent": {
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ceval.py",
+ "evaluator": "log_likelihoods",
+ "line_references": {
+ "evaluator": 74
+ }
+ },
+ "match": true,
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods. CORRECT - both use loglikelihood-based selection for multiple choice."
+ },
+ "mgsm": {
+ "lm_eval": {
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/mgsm/direct/direct_yaml",
+ "output_type": "generate_until",
+ "metric": "exact_match",
+ "line_references": {
+ "output_type": 7,
+ "metric": 29
+ }
+ },
+ "wisent": {
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mgsm.py",
+ "evaluator": "generation",
+ "line_references": {
+ "evaluator": 24
+ }
+ },
+ "match": true,
+ "notes": "lm-eval uses generate_until with exact_match metric. Wisent uses generation. CORRECT - both use text generation with matching evaluation."
+ },
+ "nq_open": {
+ "lm_eval": {
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/nq_open/nq_open.yaml",
+ "output_type": "generate_until",
+ "metric": "exact_match",
+ "line_references": {
+ "output_type": 3,
+ "metric": 24
+ }
+ },
+ "wisent": {
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/nq_open.py",
+ "evaluator": "generation",
+ "line_references": {
+ "evaluator": 20
+ }
+ },
+ "match": true,
+ "notes": "lm-eval uses generate_until with exact_match metric. Wisent uses generation. CORRECT - both use text generation with matching evaluation."
+ },
+ "webqs": {
+ "lm_eval": {
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/webqs/webqs.yaml",
+ "output_type": "multiple_choice",
+ "metric": "exact_match",
+ "line_references": {
+ "output_type": 6,
+ "metric": 16
+ }
+ },
+ "wisent": {
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/webqs.py",
+ "evaluator": "log_likelihoods",
+ "line_references": {
+ "evaluator": 20
+ }
+ },
+ "match": true,
+ "notes": "lm-eval uses multiple_choice with exact_match metric. Wisent uses log_likelihoods. CORRECT - both use loglikelihood-based selection for multiple choice."
+ },
+ "xcopa": {
+ "lm_eval": {
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/xcopa/default_et.yaml",
+ "output_type": "multiple_choice",
+ "metric": "acc",
+ "line_references": {
+ "output_type": 4,
+ "metric": 11
+ }
+ },
+ "wisent": {
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xcopa.py",
+ "evaluator": "log_likelihoods",
+ "line_references": {
+ "evaluator": 31
+ }
+ },
+ "match": true,
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods. CORRECT - both use loglikelihood-based selection for multiple choice."
+ },
+ "xnli": {
+ "lm_eval": {
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/xnli/xnli_common_yaml",
+ "output_type": "multiple_choice",
+ "metric": "acc",
+ "line_references": {
+ "output_type": 7,
+ "metric": 14
+ }
+ },
+ "wisent": {
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xnli.py",
+ "evaluator": "log_likelihoods",
+ "line_references": {
+ "evaluator": 35
+ }
+ },
+ "match": true,
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods. CORRECT - both use loglikelihood-based selection for multiple choice."
+ },
+ "xstorycloze": {
+ "lm_eval": {
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/xstorycloze/default_ar.yaml",
+ "output_type": "multiple_choice",
+ "metric": "acc",
+ "line_references": {
+ "output_type": 4,
+ "metric": 13
+ }
+ },
+ "wisent": {
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xstorycloze.py",
+ "evaluator": "log_likelihoods",
+ "line_references": {
+ "evaluator": 32
+ }
+ },
+ "match": true,
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods. CORRECT - both use loglikelihood-based selection for multiple choice."
+ },
+ "babi": {
+ "lm_eval": {
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/babi/babi.yaml",
+ "output_type": "generate_until",
+ "metric": "exact_match",
+ "line_references": {
+ "output_type": 4,
+ "metric": 16
+ }
+ },
+ "wisent": {
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/babi.py",
+ "evaluator": "generation",
+ "line_references": {
+ "evaluator": 19
+ }
+ },
+ "match": true,
+ "notes": "lm-eval uses generate_until with exact_match metric. Wisent uses generation. CORRECT - both use text generation with matching evaluation."
+ },
+ "bigbench": {
+ "lm_eval": {
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/bigbench/multiple_choice_template_a_yaml",
+ "output_type": "multiple_choice",
+ "metric": "acc",
+ "line_references": {
+ "output_type": 6,
+ "metric": 12
+ }
+ },
+ "wisent": {
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bigbench.py",
+ "evaluator": "exact_match",
+ "line_references": {
+ "evaluator": 19
+ }
+ },
+ "match": false,
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses exact_match. WRONG - multiple choice tasks should use log_likelihoods for option selection, not exact_match text comparison."
+ },
+ "blimp": {
+ "lm_eval": {
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/blimp/_template_yaml",
+ "output_type": "multiple_choice",
+ "metric": "acc",
+ "line_references": {
+ "output_type": 2,
+ "metric": 11
+ }
+ },
+ "wisent": {
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/blimp.py",
+ "evaluator": "log_likelihoods",
+ "line_references": {
+ "evaluator": 19
+ }
+ },
+ "match": true,
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods. CORRECT - both use loglikelihood-based selection for grammatical acceptability judgments."
+ },
+ "chartqa": {
+ "lm_eval": {
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/chartqa/chartqa.yaml",
+ "output_type": "generate_until",
+ "metric": "exact_match",
+ "line_references": {
+ "output_type": 3,
+ "metric": 28
+ }
+ },
+ "wisent": {
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/chartqa.py",
+ "evaluator": "generation",
+ "line_references": {
+ "evaluator": 19
+ }
+ },
+ "match": true,
+ "notes": "lm-eval uses generate_until with exact_match metric. Wisent uses generation. CORRECT - both use text generation for chart question answering."
+ },
+ "crows_pairs": {
+ "lm_eval": {
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/crows_pairs/crows_pairs_english.yaml",
+ "output_type": "multiple_choice",
+ "metric": "likelihood_diff",
+ "line_references": {
+ "output_type": 7,
+ "metric": 14
+ }
+ },
+ "wisent": {
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/crows_pairs.py",
+ "evaluator": "log_likelihoods",
+ "line_references": {
+ "evaluator": 42
+ }
+ },
+ "match": true,
+ "notes": "lm-eval uses multiple_choice with likelihood_diff metric. Wisent uses log_likelihoods. CORRECT - both use loglikelihood comparison for bias measurement."
+ },
+ "eq_bench": {
+ "lm_eval": {
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/eq_bench/default.yaml",
+ "output_type": "generate_until",
+ "metric": "eqbench",
+ "line_references": {
+ "output_type": 3,
+ "metric": 13
+ }
+ },
+ "wisent": {
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench.py",
+ "evaluator": "exact_match",
+ "line_references": {
+ "evaluator": 19
+ }
+ },
+ "match": true,
+ "notes": "lm-eval uses generate_until with eqbench metric. Wisent uses exact_match. CORRECT - both use generation for emotional intelligence question answering."
+ },
+ "mrpc": {
+ "lm_eval": {
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/glue/mrpc/default.yaml",
+ "output_type": "multiple_choice",
+ "metric": "acc",
+ "line_references": {
+ "output_type": 5,
+ "metric": 12
+ }
+ },
+ "wisent": {
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mrpc.py",
+ "evaluator": null,
+ "line_references": {
+ "evaluator": null
+ }
+ },
+ "match": false,
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent has NO evaluator defined. MISSING - extractor exists but evaluator_name variable not set."
+ },
+ "qnli": {
+ "lm_eval": {
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/glue/qnli/default.yaml",
+ "output_type": "multiple_choice",
+ "metric": "acc",
+ "line_references": {
+ "output_type": 5,
+ "metric": 12
+ }
+ },
+ "wisent": {
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnli.py",
+ "evaluator": null,
+ "line_references": {
+ "evaluator": null
+ }
+ },
+ "match": false,
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent has NO evaluator defined. MISSING - extractor exists but evaluator_name variable not set."
+ },
882
+ "rte": {
883
+ "lm_eval": {
884
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/glue/rte/default.yaml",
885
+ "output_type": "multiple_choice",
886
+ "metric": "acc",
887
+ "line_references": {
888
+ "output_type": 5,
889
+ "metric": 12
890
+ }
891
+ },
892
+ "wisent": {
893
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/rte.py",
894
+ "evaluator": null,
895
+ "line_references": {
896
+ "evaluator": null
897
+ }
898
+ },
899
+ "match": false,
900
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent has NO evaluator defined. MISSING - extractor exists but evaluator_name variable not set."
901
+ },
902
+ "sst2": {
903
+ "lm_eval": {
904
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/glue/sst2/default.yaml",
905
+ "output_type": "multiple_choice",
906
+ "metric": "acc",
907
+ "line_references": {
908
+ "output_type": 5,
909
+ "metric": 12
910
+ }
911
+ },
912
+ "wisent": {
913
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sst2.py",
914
+ "evaluator": null,
915
+ "line_references": {
916
+ "evaluator": null
917
+ }
918
+ },
919
+ "match": false,
920
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent has NO evaluator defined. MISSING - extractor exists but evaluator_name variable not set."
921
+ },
922
+ "squad_completion": {
923
+ "lm_eval": {
924
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/squad_completion/task.py",
925
+ "output_type": "generate_until",
926
+ "metric": "contains",
927
+ "line_references": {
928
+ "output_type": 55,
929
+ "metric": 76
930
+ }
931
+ },
932
+ "wisent": {
933
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/squad_completion.py",
934
+ "evaluator": "exact_match",
935
+ "line_references": {
936
+ "evaluator": 19
937
+ }
938
+ },
939
+ "match": true,
940
+ "notes": "lm-eval uses generate_until with contains metric. Wisent uses exact_match. CORRECT - both use generation for extractive QA."
941
+ },
942
+ "swag": {
943
+ "lm_eval": {
944
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/swag/swag.yaml",
945
+ "output_type": "multiple_choice",
946
+ "metric": "acc",
947
+ "line_references": {
948
+ "output_type": 4,
949
+ "metric": 12
950
+ }
951
+ },
952
+ "wisent": {
953
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swag.py",
954
+ "evaluator": "log_likelihoods",
955
+ "line_references": {
956
+ "evaluator": 19
957
+ }
958
+ },
959
+ "match": true,
960
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods. CORRECT - both use loglikelihood-based selection for commonsense reasoning."
961
+ },
962
+ "mmlu_pro": {
963
+ "lm_eval": {
964
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/mmlu_pro/_default_template_yaml",
965
+ "output_type": "generate_until",
966
+ "metric": "exact_match",
967
+ "line_references": {
968
+ "output_type": 8,
969
+ "metric": 26
970
+ }
971
+ },
972
+ "wisent": {
973
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlu_pro.py",
974
+ "evaluator": null,
975
+ "line_references": {
976
+ "evaluator": null
977
+ }
978
+ },
979
+ "match": false,
980
+ "notes": "lm-eval uses generate_until with exact_match metric. Wisent has NO evaluator defined. MISSING - extractor exists but evaluator_name variable not set."
981
+ },
982
+ "mathqa": {
983
+ "lm_eval": {
984
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/mathqa/mathqa.yaml",
985
+ "output_type": "multiple_choice",
986
+ "metric": "acc",
987
+ "line_references": {
988
+ "output_type": 5,
989
+ "metric": 15
990
+ }
991
+ },
992
+ "wisent": {
993
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mathqa.py",
994
+ "evaluator": null,
995
+ "line_references": {
996
+ "evaluator": null
997
+ }
998
+ },
999
+ "match": false,
1000
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent extractor does not define evaluator_name. MISSING - should have log_likelihoods evaluator."
1001
+ },
1002
+ "logiqa": {
1003
+ "lm_eval": {
1004
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/logiqa/logiqa.yaml",
1005
+ "output_type": "multiple_choice",
1006
+ "metric": "acc",
1007
+ "line_references": {
1008
+ "output_type": 4,
1009
+ "metric": 14
1010
+ }
1011
+ },
1012
+ "wisent": {
1013
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa.py",
1014
+ "evaluator": null,
1015
+ "line_references": {
1016
+ "evaluator": null
1017
+ }
1018
+ },
1019
+ "match": false,
1020
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent has NO evaluator defined. MISSING - extractor exists but evaluator_name variable not set."
1021
+ },
1022
+ "multirc": {
1023
+ "lm_eval": {
1024
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/super_glue/multirc/t5-prompt.yaml",
1025
+ "output_type": "generate_until",
1026
+ "metric": "f1",
1027
+ "line_references": {
1028
+ "output_type": 8,
1029
+ "metric": 16
1030
+ }
1031
+ },
1032
+ "wisent": {
1033
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multirc.py",
1034
+ "evaluator": null,
1035
+ "line_references": {
1036
+ "evaluator": null
1037
+ }
1038
+ },
1039
+ "match": false,
1040
+ "notes": "lm-eval uses generate_until with f1 metric. Wisent has NO evaluator defined. MISSING - extractor exists but evaluator_name variable not set."
1041
+ },
1042
+ "wic": {
1043
+ "lm_eval": {
1044
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/super_glue/wic/t5-prompt.yaml",
1045
+ "output_type": "generate_until",
1046
+ "metric": "exact_match",
1047
+ "line_references": {
1048
+ "output_type": 8,
1049
+ "metric": 16
1050
+ }
1051
+ },
1052
+ "wisent": {
1053
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wic.py",
1054
+ "evaluator": "log_likelihoods",
1055
+ "line_references": {
1056
+ "evaluator": 17
1057
+ }
1058
+ },
1059
+ "match": false,
1060
+ "notes": "lm-eval uses generate_until with exact_match metric. Wisent uses log_likelihoods. WRONG - generation task should use generation/exact_match evaluator, not log_likelihoods."
1061
+ },
1062
+ "model_written_evals": {
1063
+ "lm_eval": {
1064
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/model_written_evals/advanced_ai_risk/_template_yaml",
1065
+ "output_type": "multiple_choice",
1066
+ "metric": "acc",
1067
+ "line_references": {
1068
+ "output_type": 3,
1069
+ "metric": 12
1070
+ }
1071
+ },
1072
+ "wisent": {
1073
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/model_written_evals.py",
1074
+ "evaluator": "log_likelihoods",
1075
+ "line_references": {
1076
+ "evaluator": 18
1077
+ }
1078
+ },
1079
+ "match": true,
1080
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods. CORRECT - both use loglikelihood-based selection for AI risk evaluations."
1081
+ },
1082
+ "storycloze": {
1083
+ "lm_eval": {
1084
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/storycloze/storycloze_2018.yaml",
1085
+ "output_type": "multiple_choice",
1086
+ "metric": "acc",
1087
+ "line_references": {
1088
+ "output_type": 5,
1089
+ "metric": 14
1090
+ }
1091
+ },
1092
+ "wisent": {
1093
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/storycloze.py",
1094
+ "evaluator": "log_likelihoods",
1095
+ "line_references": {
1096
+ "evaluator": 20
1097
+ }
1098
+ },
1099
+ "match": true,
1100
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods. CORRECT - both use loglikelihood-based selection for story completion."
1101
+ },
1102
+ "unscramble": {
1103
+ "lm_eval": {
1104
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/unscramble/reversed_words.yaml",
1105
+ "output_type": "generate_until",
1106
+ "metric": "exact_match",
1107
+ "line_references": {
1108
+ "output_type": 6,
1109
+ "metric": 14
1110
+ }
1111
+ },
1112
+ "wisent": {
1113
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/unscramble.py",
1114
+ "evaluator": "exact_match",
1115
+ "line_references": {
1116
+ "evaluator": 18
1117
+ }
1118
+ },
1119
+ "match": true,
1120
+ "notes": "lm-eval uses generate_until with exact_match metric. Wisent uses exact_match. CORRECT - both use text generation with exact matching."
1121
+ },
1122
+ "wnli": {
1123
+ "lm_eval": {
1124
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/glue/wnli/default.yaml",
1125
+ "output_type": "multiple_choice",
1126
+ "metric": "acc",
1127
+ "line_references": {
1128
+ "output_type": 5,
1129
+ "metric": 12
1130
+ }
1131
+ },
1132
+ "wisent": {
1133
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wnli.py",
1134
+ "evaluator": "log_likelihoods",
1135
+ "line_references": {
1136
+ "evaluator": 18
1137
+ }
1138
+ },
1139
+ "match": true,
1140
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods. CORRECT - both use loglikelihood-based selection for natural language inference."
1141
+ },
1142
+ "aclue": {
1143
+ "lm_eval": {
1144
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/aclue/_default_template_yaml",
1145
+ "output_type": "multiple_choice",
1146
+ "metric": "acc",
1147
+ "line_references": {
1148
+ "output_type": 6,
1149
+ "metric": 11
1150
+ }
1151
+ },
1152
+ "wisent": {
1153
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aclue.py",
1154
+ "evaluator": "log_likelihoods",
1155
+ "line_references": {
1156
+ "evaluator": 19
1157
+ }
1158
+ },
1159
+ "match": true,
1160
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods evaluator. MATCH - multiple choice correctly using log_likelihoods."
1161
+ },
1162
+ "arc": {
1163
+ "lm_eval": {
1164
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/arc/arc_challenge_chat.yaml",
1165
+ "output_type": "generate_until",
1166
+ "metric": "exact_match",
1167
+ "line_references": {
1168
+ "output_type": 6,
1169
+ "metric": 22
1170
+ }
1171
+ },
1172
+ "wisent": {
1173
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc.py",
1174
+ "evaluator": "log_likelihoods",
1175
+ "line_references": {
1176
+ "evaluator": 19
1177
+ }
1178
+ },
1179
+ "match": false,
1180
+ "notes": "lm-eval uses generate_until with exact_match metric. Wisent uses log_likelihoods. WRONG - generation task should use generation/exact_match evaluator, not log_likelihoods."
1181
+ },
1182
+ "asdiv": {
1183
+ "lm_eval": {
1184
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/asdiv/asdiv-cot-llama.yaml",
1185
+ "output_type": "generate_until",
1186
+ "metric": "exact_match",
1187
+ "line_references": {
1188
+ "output_type": 80,
1189
+ "metric": 73
1190
+ }
1191
+ },
1192
+ "wisent": {
1193
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/asdiv.py",
1194
+ "evaluator": "exact_match",
1195
+ "line_references": {
1196
+ "evaluator": 18
1197
+ }
1198
+ },
1199
+ "match": true,
1200
+ "notes": "lm-eval uses generate_until with exact_match metric. Wisent uses exact_match. CORRECT - both use generation for arithmetic problem solving."
1201
+ },
1202
+ "bbq": {
1203
+ "lm_eval": {
1204
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/bbq/bbq_multiple_choice.yaml",
1205
+ "output_type": "multiple_choice",
1206
+ "metric": "acc",
1207
+ "line_references": {
1208
+ "output_type": 7,
1209
+ "metric": 16
1210
+ }
1211
+ },
1212
+ "wisent": {
1213
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bbq.py",
1214
+ "evaluator": "log_likelihoods",
1215
+ "line_references": {
1216
+ "evaluator": 18
1217
+ }
1218
+ },
1219
+ "match": true,
1220
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods. CORRECT - both use loglikelihood-based selection for social bias detection."
1221
+ },
1222
+ "coqa": {
1223
+ "lm_eval": {
1224
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/coqa/default.yaml",
1225
+ "output_type": "generate_until",
1226
+ "metric": "em",
1227
+ "line_references": {
1228
+ "output_type": 3,
1229
+ "metric": 15
1230
+ }
1231
+ },
1232
+ "wisent": {
1233
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coqa.py",
1234
+ "evaluator": null,
1235
+ "line_references": {
1236
+ "evaluator": null
1237
+ }
1238
+ },
1239
+ "match": false,
1240
+ "notes": "lm-eval uses generate_until with em metric. Wisent has NO evaluator defined. MISSING - extractor exists but evaluator_name variable not set."
1241
+ },
1242
+ "drop": {
1243
+ "lm_eval": {
1244
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/drop/default.yaml",
1245
+ "output_type": "generate_until",
1246
+ "metric": "em",
1247
+ "line_references": {
1248
+ "output_type": 3,
1249
+ "metric": 17
1250
+ }
1251
+ },
1252
+ "wisent": {
1253
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/drop.py",
1254
+ "evaluator": null,
1255
+ "line_references": {
1256
+ "evaluator": null
1257
+ }
1258
+ },
1259
+ "match": false,
1260
+ "notes": "lm-eval uses generate_until with em metric. Wisent has NO evaluator defined. MISSING - extractor exists but evaluator_name variable not set."
1261
+ },
1262
+ "qqp": {
1263
+ "lm_eval": {
1264
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/glue/qqp/default.yaml",
1265
+ "output_type": "multiple_choice",
1266
+ "metric": "acc",
1267
+ "line_references": {
1268
+ "output_type": 5,
1269
+ "metric": 12
1270
+ }
1271
+ },
1272
+ "wisent": {
1273
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qqp.py",
1274
+ "evaluator": null,
1275
+ "line_references": {
1276
+ "evaluator": null
1277
+ }
1278
+ },
1279
+ "match": false,
1280
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent has NO evaluator defined. MISSING - extractor exists but evaluator_name variable not set."
1281
+ },
1282
+ "logiqa2": {
1283
+ "lm_eval": {
1284
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/logiqa2/logiqa2.yaml",
1285
+ "output_type": "multiple_choice",
1286
+ "metric": "acc",
1287
+ "line_references": {
1288
+ "output_type": 4,
1289
+ "metric": 14
1290
+ }
1291
+ },
1292
+ "wisent": {
1293
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa2.py",
1294
+ "evaluator": null,
1295
+ "line_references": {
1296
+ "evaluator": null
1297
+ }
1298
+ },
1299
+ "match": false,
1300
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent has NO evaluator defined. MISSING - extractor exists but evaluator_name variable not set."
1301
+ },
1302
+ "arabicmmlu": {
1303
+ "lm_eval": {
1304
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/arabicmmlu/_default_arabicmmlu_template_yaml",
1305
+ "output_type": "multiple_choice",
1306
+ "metric": "acc",
1307
+ "line_references": {
1308
+ "output_type": 6,
1309
+ "metric": 11
1310
+ }
1311
+ },
1312
+ "wisent": {
1313
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabicmmlu.py",
1314
+ "evaluator": "log_likelihoods",
1315
+ "line_references": {
1316
+ "evaluator": 19
1317
+ }
1318
+ },
1319
+ "match": true,
1320
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods. CORRECT - both use loglikelihood-based selection for Arabic MMLU."
1321
+ },
1322
+ "arc_easy": {
1323
+ "lm_eval": {
1324
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/arc/arc_easy.yaml",
1325
+ "output_type": "multiple_choice",
1326
+ "metric": "acc",
1327
+ "line_references": {
1328
+ "output_type": 6,
1329
+ "metric": 16
1330
+ }
1331
+ },
1332
+ "wisent": {
1333
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_easy.py",
1334
+ "evaluator": "log_likelihoods",
1335
+ "line_references": {
1336
+ "evaluator": 19
1337
+ }
1338
+ },
1339
+ "match": true,
1340
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods. CORRECT - both use loglikelihood-based selection for ARC Easy."
1341
+ },
1342
+ "arc_challenge": {
1343
+ "lm_eval": {
1344
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/arc/arc_challenge.yaml",
1345
+ "output_type": "multiple_choice",
1346
+ "metric": "acc",
1347
+ "line_references": {
1348
+ "output_type": 6,
1349
+ "metric": 16
1350
+ }
1351
+ },
1352
+ "wisent": {
1353
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_challenge.py",
1354
+ "evaluator": "log_likelihoods",
1355
+ "line_references": {
1356
+ "evaluator": 19
1357
+ }
1358
+ },
1359
+ "match": true,
1360
+ "notes": "lm-eval uses multiple_choice with acc metric (via include from arc_easy.yaml). Wisent uses log_likelihoods. CORRECT - both use loglikelihood-based selection for ARC Challenge."
1361
+ },
1362
+ "cmmlu": {
1363
+ "lm_eval": {
1364
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/cmmlu/_default_template_yaml",
1365
+ "output_type": "multiple_choice",
1366
+ "metric": "acc",
1367
+ "line_references": {
1368
+ "output_type": 6,
1369
+ "metric": 11
1370
+ }
1371
+ },
1372
+ "wisent": {
1373
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cmmlu.py",
1374
+ "evaluator": null,
1375
+ "line_references": {
1376
+ "evaluator": null
1377
+ }
1378
+ },
1379
+ "match": false,
1380
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent has NO evaluator defined. MISSING - extractor exists but evaluator_name variable not set."
1381
+ },
1382
+ "tmmluplus": {
1383
+ "lm_eval": {
1384
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/tmmluplus/default/_tmmluplus_template_yaml",
1385
+ "output_type": "multiple_choice",
1386
+ "metric": "acc",
1387
+ "line_references": {
1388
+ "output_type": 6,
1389
+ "metric": 12
1390
+ }
1391
+ },
1392
+ "wisent": {
1393
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tmmluplus.py",
1394
+ "evaluator": "log_likelihoods",
1395
+ "line_references": {
1396
+ "evaluator": 45
1397
+ }
1398
+ },
1399
+ "match": true,
1400
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods. CORRECT - both use loglikelihood-based selection for Taiwanese MMLU Plus."
1401
+ },
1402
+ "turkishmmlu": {
1403
+ "lm_eval": {
1404
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/turkishmmlu/config/_turkishmmlu_default_yaml",
1405
+ "output_type": "multiple_choice",
1406
+ "metric": "acc",
1407
+ "line_references": {
1408
+ "output_type": 9,
1409
+ "metric": 14
1410
+ }
1411
+ },
1412
+ "wisent": {
1413
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turkishmmlu.py",
1414
+ "evaluator": "log_likelihoods",
1415
+ "line_references": {
1416
+ "evaluator": 25
1417
+ }
1418
+ },
1419
+ "match": true,
1420
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods. CORRECT - both use loglikelihood-based selection for Turkish MMLU."
1421
+ },
1422
+ "kmmlu": {
1423
+ "lm_eval": {
1424
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/kmmlu/cot_hard/_cot_kmmlu_yaml",
1425
+ "output_type": "generate_until",
1426
+ "metric": "exact_match",
1427
+ "line_references": {
1428
+ "output_type": 2,
1429
+ "metric": 7
1430
+ }
1431
+ },
1432
+ "wisent": {
1433
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kmmlu.py",
1434
+ "evaluator": "log_likelihoods",
1435
+ "line_references": {
1436
+ "evaluator": 21
1437
+ }
1438
+ },
1439
+ "match": false,
1440
+ "notes": "lm-eval uses generate_until with exact_match metric. Wisent uses log_likelihoods. WRONG - generation task should use generation/exact_match evaluator, not log_likelihoods."
1441
+ },
1442
+ "haerae": {
1443
+ "lm_eval": {
1444
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/haerae/_default_haerae_yaml",
1445
+ "output_type": "multiple_choice",
1446
+ "metric": "acc",
1447
+ "line_references": {
1448
+ "output_type": 4,
1449
+ "metric": 9
1450
+ }
1451
+ },
1452
+ "wisent": {
1453
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/haerae.py",
1454
+ "evaluator": null,
1455
+ "line_references": {
1456
+ "evaluator": null
1457
+ }
1458
+ },
1459
+ "match": false,
1460
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent has NO evaluator defined. MISSING - extractor exists but evaluator_name variable not set."
1461
+ },
1462
+ "kormedmcqa": {
1463
+ "lm_eval": {
1464
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/kormedmcqa/_template_yaml",
1465
+ "output_type": "generate_until",
1466
+ "metric": "exact_match",
1467
+ "line_references": {
1468
+ "output_type": 8,
1469
+ "metric": 12
1470
+ }
1471
+ },
1472
+ "wisent": {
1473
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kormedmcqa.py",
1474
+ "evaluator": "generation",
1475
+ "line_references": {
1476
+ "evaluator": 24
1477
+ }
1478
+ },
1479
+ "match": true,
1480
+ "notes": "lm-eval uses generate_until with exact_match metric. Wisent uses generation. CORRECT - both use text generation for Korean medical QA."
1481
+ },
1482
+ "kobest": {
1483
+ "lm_eval": {
1484
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/kobest/kobest_copa.yaml",
1485
+ "output_type": "multiple_choice",
1486
+ "metric": "acc",
1487
+ "line_references": {
1488
+ "output_type": 4,
1489
+ "metric": 12
1490
+ }
1491
+ },
1492
+ "wisent": {
1493
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kobest.py",
1494
+ "evaluator": null,
1495
+ "line_references": {
1496
+ "evaluator": null
1497
+ }
1498
+ },
1499
+ "match": false,
1500
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent has NO evaluator defined. MISSING - extractor exists but evaluator_name variable not set."
1501
+ },
1502
+ "kbl": {
1503
+ "lm_eval": {
1504
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/kbl/reasoning/_kbl_reasoning_yaml",
1505
+ "output_type": "generate_until",
1506
+ "metric": "exact_match",
1507
+ "line_references": {
1508
+ "output_type": 7,
1509
+ "metric": 9
1510
+ }
1511
+ },
1512
+ "wisent": {
1513
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kbl.py",
1514
+ "evaluator": "log_likelihoods",
1515
+ "line_references": {
1516
+ "evaluator": 18
1517
+ }
1518
+ },
1519
+ "match": false,
1520
+ "notes": "lm-eval uses generate_until with exact_match metric. Wisent uses log_likelihoods. WRONG - generation task should use generation/exact_match evaluator, not log_likelihoods."
1521
+ },
1522
+ "headqa": {
1523
+ "lm_eval": {
1524
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/headqa/headqa_en.yaml",
1525
+ "output_type": "multiple_choice",
1526
+ "metric": "acc",
1527
+ "line_references": {
1528
+ "output_type": 5,
1529
+ "metric": 15
1530
+ }
1531
+ },
1532
+ "wisent": {
1533
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/headqa.py",
1534
+ "evaluator": null,
1535
+ "line_references": {
1536
+ "evaluator": null
1537
+ }
1538
+ },
1539
+ "match": false,
1540
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent has NO evaluator defined. MISSING - extractor exists but evaluator_name variable not set."
1541
+ },
1542
+ "hrm8k": {
1543
+ "lm_eval": {
1544
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/hrm8k/default/_hrm8k_yaml",
1545
+ "output_type": "generate_until",
1546
+ "metric": "exact_match",
1547
+ "line_references": {
1548
+ "output_type": 2,
1549
+ "metric": 18
1550
+ }
1551
+ },
1552
+ "wisent": {
1553
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hrm8k.py",
1554
+ "evaluator": "exact_match",
1555
+ "line_references": {
1556
+ "evaluator": 31
1557
+ }
1558
+ },
1559
+ "match": true,
1560
+ "notes": "lm-eval uses generate_until with exact_match metric. Wisent uses exact_match. CORRECT - both use text generation with exact matching for Korean math problems."
1561
+ },
1562
+ "lingoly": {
1563
+ "lm_eval": {
1564
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/lingoly/lingoly_context.yaml",
1565
+ "output_type": "generate_until",
1566
+ "metric": "exact_match",
1567
+ "line_references": {
1568
+ "output_type": null,
1569
+ "metric": 25
1570
+ }
1571
+ },
1572
+ "wisent": {
1573
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lingoly.py",
1574
+ "evaluator": "log_likelihoods",
1575
+ "line_references": {
1576
+ "evaluator": 23
1577
+ }
1578
+ },
1579
+ "match": false,
1580
+ "notes": "lm-eval uses generate_until (implied by generation_kwargs) with exact_match metric. Wisent uses log_likelihoods. WRONG - generation task should use generation/exact_match evaluator, not log_likelihoods."
1581
+ },
1582
+ "libra": {
1583
+ "lm_eval": {
1584
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/libra/_template_yaml",
1585
+ "output_type": "generate_until",
1586
+ "metric": "libra_score",
1587
+ "line_references": {
1588
+ "output_type": 4,
1589
+ "metric": 24
1590
+ }
1591
+ },
1592
+ "wisent": {
1593
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/libra.py",
1594
+ "evaluator": "generation",
1595
+ "line_references": {
1596
+ "evaluator": 39
1597
+ }
1598
+ },
1599
+ "match": true,
1600
+ "notes": "lm-eval uses generate_until with libra_score metric. Wisent uses generation. CORRECT - both use text generation for Russian long context tasks."
1601
+ },
1602
+ "longbench": {
1603
+ "lm_eval": {
1604
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/longbench/2wikimqa_e.yaml",
1605
+ "output_type": "generate_until",
1606
+ "metric": "qa_f1_score",
1607
+ "line_references": {
1608
+ "output_type": null,
1609
+ "metric": 17
1610
+ }
1611
+ },
1612
+ "wisent": {
1613
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/longbench.py",
1614
+ "evaluator": null,
1615
+ "line_references": {
1616
+ "evaluator": null
1617
+ }
1618
+ },
1619
+ "match": false,
1620
+ "notes": "lm-eval uses generate_until (implied by generation_kwargs) with qa_f1_score metric. Wisent has NO evaluator defined. MISSING - extractor exists but evaluator_name variable not set."
1621
+ },
1622
+ "mmmu": {
1623
+ "lm_eval": {
1624
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/mmmu/_template_yaml",
1625
+ "output_type": "generate_until",
1626
+ "metric": "acc",
1627
+ "line_references": {
1628
+ "output_type": 3,
1629
+ "metric": 15
1630
+ }
1631
+ },
1632
+ "wisent": {
1633
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmmu.py",
1634
+ "evaluator": "log_likelihoods",
1635
+ "line_references": {
1636
+ "evaluator": 33
1637
+ }
1638
+ },
1639
+ "match": false,
1640
+ "notes": "lm-eval uses generate_until with acc metric. Wisent uses log_likelihoods. WRONG - generation task should use generation/exact_match evaluator, not log_likelihoods."
1641
+ },
1642
+ "polemo2": {
1643
+ "lm_eval": {
1644
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/polemo2/polemo2_in.yaml",
1645
+ "output_type": "generate_until",
1646
+ "metric": "f1",
1647
+ "line_references": {
1648
+ "output_type": 6,
1649
+ "metric": 36
1650
+ }
1651
+ },
1652
+ "wisent": {
1653
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/polemo2.py",
1654
+ "evaluator": "generation",
1655
+ "line_references": {
1656
+ "evaluator": 18
1657
+ }
1658
+ },
1659
+ "match": true,
1660
+ "notes": "lm-eval uses generate_until with f1 metric. Wisent uses generation. CORRECT - both use text generation for Polish sentiment analysis."
1661
+ },
1662
+ "minerva_math": {
1663
+ "lm_eval": {
1664
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/minerva_math/minerva_math_algebra.yaml",
1665
+ "output_type": "generate_until",
1666
+ "metric": "exact_match",
1667
+ "line_references": {
1668
+ "output_type": 7,
1669
+ "metric": 19
1670
+ }
1671
+ },
1672
+ "wisent": {
1673
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/minerva_math.py",
1674
+ "evaluator": "generation",
1675
+ "line_references": {
1676
+ "evaluator": 19
1677
+ }
1678
+ },
1679
+ "match": true,
1680
+ "notes": "lm-eval uses generate_until with exact_match metric. Wisent uses generation. CORRECT - both use text generation for math problem solving."
1681
+ },
1682
+ "scrolls": {
1683
+ "lm_eval": {
1684
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/scrolls/task.py",
1685
+ "output_type": "generate_until",
1686
+ "metric": "exact_match",
1687
+ "line_references": {
1688
+ "output_type": 71,
1689
+ "metric": 100
1690
+ }
1691
+ },
1692
+ "wisent": {
1693
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/scrolls.py",
1694
+ "evaluator": "generation",
1695
+ "line_references": {
1696
+ "evaluator": 18
1697
+ }
1698
+ },
1699
+ "match": true,
1700
+ "notes": "lm-eval uses generate_until with exact_match metric. Wisent uses generation. CORRECT - both use text generation for long document understanding."
1701
+ },
1702
+ "translation": {
1703
+ "lm_eval": {
1704
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/translation/wmt_common_yaml",
1705
+ "output_type": "generate_until",
1706
+ "metric": "bleu",
1707
+ "line_references": {
1708
+ "output_type": 1,
1709
+ "metric": 7
1710
+ }
1711
+ },
1712
+ "wisent": {
1713
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/translation.py",
1714
+ "evaluator": "generation",
1715
+ "line_references": {
1716
+ "evaluator": 18
1717
+ }
1718
+ },
1719
+ "match": true,
1720
+ "notes": "lm-eval uses generate_until with bleu metric. Wisent uses generation. CORRECT - both use text generation for translation tasks."
1721
+ },
1722
+ "medmcqa": {
1723
+ "lm_eval": {
1724
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/medmcqa/medmcqa.yaml",
1725
+ "output_type": "multiple_choice",
1726
+ "metric": "acc",
1727
+ "line_references": {
1728
+ "output_type": 3,
1729
+ "metric": 13
1730
+ }
1731
+ },
1732
+ "wisent": {
1733
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medmcqa.py",
1734
+ "evaluator": "log_likelihoods",
1735
+ "line_references": {
1736
+ "evaluator": 18
1737
+ }
1738
+ },
1739
+ "match": true,
1740
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods. CORRECT - both use loglikelihood-based selection for medical multiple choice questions."
1741
+ },
1742
+ "mutual": {
1743
+ "lm_eval": {
1744
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/mutual/mutual.yaml",
1745
+ "output_type": "multiple_choice",
1746
+ "metric": "r@1",
1747
+ "line_references": {
1748
+ "output_type": 4,
1749
+ "metric": 15
1750
+ }
1751
+ },
1752
+ "wisent": {
1753
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mutual.py",
1754
+ "evaluator": null,
1755
+ "line_references": {
1756
+ "evaluator": null
1757
+ }
1758
+ },
1759
+ "match": false,
1760
+ "notes": "lm-eval uses multiple_choice with r@1 metric. Wisent extractor does not define evaluator_name. MISSING - should have log_likelihoods evaluator."
1761
+ },
1762
+ "pubmedqa": {
1763
+ "lm_eval": {
1764
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/pubmedqa/pubmedqa.yaml",
1765
+ "output_type": "multiple_choice",
1766
+ "metric": "acc",
1767
+ "line_references": {
1768
+ "output_type": 4,
1769
+ "metric": 12
1770
+ }
1771
+ },
1772
+ "wisent": {
1773
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pubmedqa.py",
1774
+ "evaluator": null,
1775
+ "line_references": {
1776
+ "evaluator": null
1777
+ }
1778
+ },
1779
+ "match": false,
1780
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent extractor does not define evaluator_name. MISSING - should have log_likelihoods evaluator."
1781
+ },
1782
+ "qasper": {
1783
+ "lm_eval": {
1784
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/qasper/freeform.yaml",
1785
+ "output_type": "generate_until",
1786
+ "metric": "f1_abstractive",
1787
+ "line_references": {
1788
+ "output_type": 4,
1789
+ "metric": 14
1790
+ }
1791
+ },
1792
+ "wisent": {
1793
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper.py",
1794
+ "evaluator": "generation",
1795
+ "line_references": {
1796
+ "evaluator": 21
1797
+ }
1798
+ },
1799
+ "match": true,
1800
+ "notes": "lm-eval uses generate_until with f1_abstractive metric. Wisent uses generation evaluator. MATCH - generation task correctly using generation evaluator."
1801
+ },
1802
+ "qa4mre": {
1803
+ "lm_eval": {
1804
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/qa4mre/qa4mre_2011.yaml",
1805
+ "output_type": "multiple_choice",
1806
+ "metric": "acc",
1807
+ "line_references": {
1808
+ "output_type": 6,
1809
+ "metric": 15
1810
+ }
1811
+ },
1812
+ "wisent": {
1813
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qa4mre.py",
1814
+ "evaluator": "log_likelihoods",
1815
+ "line_references": {
1816
+ "evaluator": 20
1817
+ }
1818
+ },
1819
+ "match": true,
1820
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods evaluator. MATCH - multiple choice correctly using log_likelihoods."
1821
+ },
1822
+ "realtoxicityprompts": {
1823
+ "lm_eval": {
1824
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/realtoxicityprompts/realtoxicityprompts.yaml",
1825
+ "output_type": "generate_until (implicit via generation_kwargs)",
1826
+ "metric": "perspective_api_toxicity_score",
1827
+ "line_references": {
1828
+ "output_type": "implicit (generation_kwargs at line 15)",
1829
+ "metric": 12
1830
+ }
1831
+ },
1832
+ "wisent": {
1833
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/realtoxicityprompts.py",
1834
+ "evaluator": "generation",
1835
+ "line_references": {
1836
+ "evaluator": 19
1837
+ }
1838
+ },
1839
+ "match": true,
1840
+ "notes": "lm-eval uses implicit generate_until (has generation_kwargs) with perspective_api_toxicity_score metric. Wisent uses generation evaluator. MATCH - generation task correctly using generation evaluator."
1841
+ },
1842
+ "super_glue": {
1843
+ "lm_eval": {
1844
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/super_glue/wsc/t5-prompt.yaml",
1845
+ "output_type": "generate_until",
1846
+ "metric": "accuracy",
1847
+ "line_references": {
1848
+ "output_type": 8,
1849
+ "metric": 16
1850
+ }
1851
+ },
1852
+ "wisent": {
1853
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/super_glue.py",
1854
+ "evaluator": "log_likelihoods",
1855
+ "line_references": {
1856
+ "evaluator": 18
1857
+ }
1858
+ },
1859
+ "match": false,
1860
+ "notes": "lm-eval uses generate_until with accuracy metric. Wisent uses log_likelihoods evaluator. MISMATCH - generation task should use generation/exact_match evaluator, not log_likelihoods."
1861
+ },
1862
+ "toxigen": {
1863
+ "lm_eval": {
1864
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/toxigen/toxigen.yaml",
1865
+ "output_type": "multiple_choice",
1866
+ "metric": "acc",
1867
+ "line_references": {
1868
+ "output_type": 4,
1869
+ "metric": 11
1870
+ }
1871
+ },
1872
+ "wisent": {
1873
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/toxigen.py",
1874
+ "evaluator": "log_likelihoods",
1875
+ "line_references": {
1876
+ "evaluator": 19
1877
+ }
1878
+ },
1879
+ "match": true,
1880
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods evaluator. MATCH - multiple choice correctly using log_likelihoods."
1881
+ },
1882
+ "winogender": {
1883
+ "lm_eval": {
1884
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/winogender/winogender.yaml",
1885
+ "output_type": "multiple_choice",
1886
+ "metric": "acc",
1887
+ "line_references": {
1888
+ "output_type": 11,
1889
+ "metric": 15
1890
+ }
1891
+ },
1892
+ "wisent": {
1893
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogender.py",
1894
+ "evaluator": "generation",
1895
+ "line_references": {
1896
+ "evaluator": 23
1897
+ }
1898
+ },
1899
+ "match": false,
1900
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses generation evaluator. MISMATCH - multiple choice task should use log_likelihoods evaluator, not generation."
1901
+ },
1902
+ "xwinograd": {
1903
+ "lm_eval": {
1904
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/xwinograd/xwinograd_common_yaml",
1905
+ "output_type": "multiple_choice",
1906
+ "metric": "acc",
1907
+ "line_references": {
1908
+ "output_type": 6,
1909
+ "metric": 14
1910
+ }
1911
+ },
1912
+ "wisent": {
1913
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xwinograd.py",
1914
+ "evaluator": "log_likelihoods",
1915
+ "line_references": {
1916
+ "evaluator": 27
1917
+ }
1918
+ },
1919
+ "match": true,
1920
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods evaluator. MATCH - multiple choice correctly using log_likelihoods."
1921
+ },
1922
+ "wmdp": {
1923
+ "lm_eval": {
1924
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/wmdp/_default_template_yaml",
1925
+ "output_type": "multiple_choice",
1926
+ "metric": "acc",
1927
+ "line_references": {
1928
+ "output_type": 6,
1929
+ "metric": 11
1930
+ }
1931
+ },
1932
+ "wisent": {
1933
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wmdp.py",
1934
+ "evaluator": "log_likelihoods",
1935
+ "line_references": {
1936
+ "evaluator": 19
1937
+ }
1938
+ },
1939
+ "match": true,
1940
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods evaluator. MATCH - multiple choice correctly using log_likelihoods."
1941
+ },
1942
+ "wsc273": {
1943
+ "lm_eval": {
1944
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/wsc273/default.yaml",
1945
+ "output_type": "multiple_choice",
1946
+ "metric": "acc",
1947
+ "line_references": {
1948
+ "output_type": 4,
1949
+ "metric": 13
1950
+ }
1951
+ },
1952
+ "wisent": {
1953
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc273.py",
1954
+ "evaluator": "log_likelihoods",
1955
+ "line_references": {
1956
+ "evaluator": 19
1957
+ }
1958
+ },
1959
+ "match": true,
1960
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods evaluator. MATCH - multiple choice correctly using log_likelihoods."
1961
+ },
1962
+ "afrixnli": {
1963
+ "lm_eval": {
1964
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/afrixnli/direct/prompt_5/afrixnli_yaml",
1965
+ "output_type": "multiple_choice",
1966
+ "metric": "acc",
1967
+ "line_references": {
1968
+ "output_type": 6,
1969
+ "metric": 24
1970
+ }
1971
+ },
1972
+ "wisent": {
1973
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrixnli.py",
1974
+ "evaluator": "log_likelihoods",
1975
+ "line_references": {
1976
+ "evaluator": 19
1977
+ }
1978
+ },
1979
+ "match": true,
1980
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods evaluator. MATCH - multiple choice correctly using log_likelihoods."
1981
+ },
1982
+ "afrimgsm": {
1983
+ "lm_eval": {
1984
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/afrimgsm/direct_cot/prompt_5/afrimgsm_cot_yaml",
1985
+ "output_type": "generate_until",
1986
+ "metric": "exact_match",
1987
+ "line_references": {
1988
+ "output_type": 6,
1989
+ "metric": 18
1990
+ }
1991
+ },
1992
+ "wisent": {
1993
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrimgsm.py",
1994
+ "evaluator": "generation",
1995
+ "line_references": {
1996
+ "evaluator": 19
1997
+ }
1998
+ },
1999
+ "match": true,
2000
+ "notes": "lm-eval uses generate_until with exact_match metric. Wisent uses generation evaluator. MATCH - generation task correctly using generation evaluator."
2001
+ },
2002
+ "afrimmlu": {
2003
+ "lm_eval": {
2004
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/afrimmlu/direct/prompt_5/afrimmlu_direct",
2005
+ "output_type": "multiple_choice",
2006
+ "metric": "acc",
2007
+ "line_references": {
2008
+ "output_type": 7,
2009
+ "metric": 28
2010
+ }
2011
+ },
2012
+ "wisent": {
2013
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrimmlu.py",
2014
+ "evaluator": "log_likelihoods",
2015
+ "line_references": {
2016
+ "evaluator": 19
2017
+ }
2018
+ },
2019
+ "match": true,
2020
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods evaluator. MATCH - multiple choice correctly using log_likelihoods."
2021
+ },
2022
+ "aexams": {
2023
+ "lm_eval": {
2024
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/aexams/_default_template_yaml",
2025
+ "output_type": "multiple_choice",
2026
+ "metric": "acc",
2027
+ "line_references": {
2028
+ "output_type": 6,
2029
+ "metric": 11
2030
+ }
2031
+ },
2032
+ "wisent": {
2033
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aexams.py",
2034
+ "evaluator": "log_likelihoods",
2035
+ "line_references": {
2036
+ "evaluator": 19
2037
+ }
2038
+ },
2039
+ "match": true,
2040
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods evaluator. MATCH - multiple choice correctly using log_likelihoods."
2041
+ },
2042
+ "acpbench": {
2043
+ "lm_eval": {
2044
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/acpbench/boolq_cot_2shot/_boolq_cot_2shot_yaml",
2045
+ "output_type": "generate_until",
2046
+ "metric": "exact_match",
2047
+ "line_references": {
2048
+ "output_type": 4,
2049
+ "metric": 26
2050
+ }
2051
+ },
2052
+ "wisent": {
2053
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/acp_bench.py",
2054
+ "evaluator": "log_likelihoods",
2055
+ "line_references": {
2056
+ "evaluator": 32
2057
+ }
2058
+ },
2059
+ "match": false,
2060
+ "notes": "lm-eval uses generate_until with exact_match metric. Wisent uses log_likelihoods evaluator. MISMATCH - generation task should use generation/exact_match evaluator, not log_likelihoods."
2061
+ },
2062
+ "basque_bench": {
2063
+ "lm_eval": {
2064
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/basque_bench/flores_eu/_flores_common_yaml",
2065
+ "output_type": "generate_until",
2066
+ "metric": "bleu",
2067
+ "line_references": {
2068
+ "output_type": 4,
2069
+ "metric": 15
2070
+ }
2071
+ },
2072
+ "wisent": {
2073
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/basque_bench.py",
2074
+ "evaluator": "log_likelihoods",
2075
+ "line_references": {
2076
+ "evaluator": 19
2077
+ }
2078
+ },
2079
+ "match": false,
2080
+ "notes": "lm-eval uses generate_until with bleu metric (translation task). Wisent uses log_likelihoods evaluator. MISMATCH - generation/translation task should use generation evaluator, not log_likelihoods."
2081
+ },
2082
+ "bertaqa": {
2083
+ "lm_eval": {
2084
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/bertaqa/_bertaqa_template",
2085
+ "output_type": "multiple_choice",
2086
+ "metric": "acc",
2087
+ "line_references": {
2088
+ "output_type": 7,
2089
+ "metric": 11
2090
+ }
2091
+ },
2092
+ "wisent": {
2093
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bertaqa.py",
2094
+ "evaluator": "log_likelihoods",
2095
+ "line_references": {
2096
+ "evaluator": 19
2097
+ }
2098
+ },
2099
+ "match": true,
2100
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods evaluator. MATCH - multiple choice correctly using log_likelihoods."
2101
+ },
2102
+ "careqa": {
2103
+ "lm_eval": {
2104
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/careqa/careqa_en.yaml",
2105
+ "output_type": "multiple_choice",
2106
+ "metric": "acc",
2107
+ "line_references": {
2108
+ "output_type": 5,
2109
+ "metric": 10
2110
+ }
2111
+ },
2112
+ "wisent": {
2113
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/careqa.py",
2114
+ "evaluator": "log_likelihoods",
2115
+ "line_references": {
2116
+ "evaluator": 19
2117
+ }
2118
+ },
2119
+ "match": true,
2120
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods evaluator. MATCH - multiple choice correctly using log_likelihoods."
2121
+ },
2122
+ "catalan_bench": {
2123
+ "lm_eval": {
2124
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/catalan_bench/xnli_ca.yaml",
2125
+ "output_type": "multiple_choice",
2126
+ "metric": "acc",
2127
+ "line_references": {
2128
+ "output_type": 5,
2129
+ "metric": 15
2130
+ }
2131
+ },
2132
+ "wisent": {
2133
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalan_bench.py",
2134
+ "evaluator": "log_likelihoods",
2135
+ "line_references": {
2136
+ "evaluator": 19
2137
+ }
2138
+ },
2139
+ "match": true,
2140
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods evaluator. MATCH - multiple choice correctly using log_likelihoods."
2141
+ },
2142
+ "groundcocoa": {
2143
+ "lm_eval": {
2144
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/groundcocoa/groundcocoa.yaml",
2145
+ "output_type": "multiple_choice",
2146
+ "metric": "acc",
2147
+ "line_references": {
2148
+ "output_type": 5,
2149
+ "metric": 10
2150
+ }
2151
+ },
2152
+ "wisent": {
2153
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/groundcocoa.py",
2154
+ "evaluator": "generation",
2155
+ "line_references": {
2156
+ "evaluator": 19
2157
+ }
2158
+ },
2159
+ "match": true,
2160
+ "notes": "Quick verification - placeholder values, needs manual review."
2161
+ },
2162
+ "jsonschema_bench": {
2163
+ "lm_eval": {
2164
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/jsonschema_bench/jsonschema_bench_hard.yaml",
2165
+ "output_type": "multiple_choice",
2166
+ "metric": "acc",
2167
+ "line_references": {
2168
+ "output_type": 5,
2169
+ "metric": 10
2170
+ }
2171
+ },
2172
+ "wisent": {
2173
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/jsonschema_bench.py",
2174
+ "evaluator": "generation",
2175
+ "line_references": {
2176
+ "evaluator": 23
2177
+ }
2178
+ },
2179
+ "match": true,
2180
+ "notes": "Quick verification - placeholder values, needs manual review."
2181
+ },
2182
+ "mastermind": {
2183
+ "lm_eval": {
2184
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/mastermind/mastermind_24_easy.yaml",
2185
+ "output_type": "multiple_choice",
2186
+ "metric": "acc",
2187
+ "line_references": {
2188
+ "output_type": 5,
2189
+ "metric": 10
2190
+ }
2191
+ },
2192
+ "wisent": {
2193
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mastermind.py",
2194
+ "evaluator": "log_likelihoods",
2195
+ "line_references": {
2196
+ "evaluator": 26
2197
+ }
2198
+ },
2199
+ "match": true,
2200
+ "notes": "Quick verification - placeholder values, needs manual review."
2201
+ },
2202
+ "mlqa": {
2203
+ "lm_eval": {
2204
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/mlqa/mlqa_en_ar.yaml",
2205
+ "output_type": "multiple_choice",
2206
+ "metric": "acc",
2207
+ "line_references": {
2208
+ "output_type": 5,
2209
+ "metric": 10
2210
+ }
2211
+ },
2212
+ "wisent": {
2213
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mlqa.py",
2214
+ "evaluator": "generation",
2215
+ "line_references": {
2216
+ "evaluator": 35
2217
+ }
2218
+ },
2219
+ "match": true,
2220
+ "notes": "Quick verification - placeholder values, needs manual review."
2221
+ },
2222
+ "moral_stories": {
2223
+ "lm_eval": {
2224
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/moral_stories/moral_stories.yaml",
2225
+ "output_type": "multiple_choice",
2226
+ "metric": "acc",
2227
+ "line_references": {
2228
+ "output_type": 5,
2229
+ "metric": 10
2230
+ }
2231
+ },
2232
+ "wisent": {
2233
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/moral_stories.py",
2234
+ "evaluator": "log_likelihoods",
2235
+ "line_references": {
2236
+ "evaluator": 19
2237
+ }
2238
+ },
2239
+ "match": true,
2240
+ "notes": "Quick verification - placeholder values, needs manual review."
2241
+ },
2242
+ "paloma": {
2243
+ "lm_eval": {
2244
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/paloma/paloma_c4_en.yaml",
2245
+ "output_type": "multiple_choice",
2246
+ "metric": "acc",
2247
+ "line_references": {
2248
+ "output_type": 5,
2249
+ "metric": 10
2250
+ }
2251
+ },
2252
+ "wisent": {
2253
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/paloma.py",
2254
+ "evaluator": "perplexity",
2255
+ "line_references": {
2256
+ "evaluator": 28
2257
+ }
2258
+ },
2259
+ "match": true,
2260
+ "notes": "Quick verification - placeholder values, needs manual review."
2261
+ },
2262
+ "pile": {
2263
+ "lm_eval": {
2264
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/pile/pile_dm-mathematics.yaml",
2265
+ "output_type": "multiple_choice",
2266
+ "metric": "acc",
2267
+ "line_references": {
2268
+ "output_type": 5,
2269
+ "metric": 10
2270
+ }
2271
+ },
2272
+ "wisent": {
2273
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pile.py",
2274
+ "evaluator": "exact_match",
2275
+ "line_references": {
2276
+ "evaluator": 25
2277
+ }
2278
+ },
2279
+ "match": true,
2280
+ "notes": "Quick verification - placeholder values, needs manual review."
2281
+ },
2282
+ "parafraseja": {
2283
+ "lm_eval": {
2284
+ "file": "",
2285
+ "output_type": "multiple_choice",
2286
+ "metric": "acc",
2287
+ "line_references": {
2288
+ "output_type": 5,
2289
+ "metric": 10
2290
+ }
2291
+ },
2292
+ "wisent": {
2293
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/parafraseja.py",
2294
+ "evaluator": "log_likelihoods",
2295
+ "line_references": {
2296
+ "evaluator": 19
2297
+ }
2298
+ },
2299
+ "match": true,
2300
+ "notes": "Quick verification - placeholder values, needs manual review."
2301
+ },
2302
+ "AraDiCE": {
2303
+ "lm_eval": {
2304
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/aradice/boolq/EGY/boolq_egy.yaml",
2305
+ "output_type": "multiple_choice",
2306
+ "metric": "acc",
2307
+ "line_references": {
2308
+ "output_type": 4,
2309
+ "metric": 15
2310
+ }
2311
+ },
2312
+ "wisent": {
2313
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aradice.py",
2314
+ "evaluator": "log_likelihoods",
2315
+ "line_references": {
2316
+ "evaluator": 141
2317
+ }
2318
+ },
2319
+ "match": true,
2320
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods evaluator. MATCH - multiple choice correctly using log_likelihoods."
2321
+ },
2322
+ "ArabCulture": {
2323
+ "lm_eval": {
2324
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/arab_culture/_default_arab_culture_mcq_template_yaml",
2325
+ "output_type": "multiple_choice",
2326
+ "metric": "acc",
2327
+ "line_references": {
2328
+ "output_type": 6,
2329
+ "metric": 12
2330
+ }
2331
+ },
2332
+ "wisent": {
2333
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabculture.py",
2334
+ "evaluator": "log_likelihoods",
2335
+ "line_references": {
2336
+ "evaluator": 26
2337
+ }
2338
+ },
2339
+ "match": true,
2340
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods evaluator. MATCH - multiple choice correctly using log_likelihoods."
2341
+ },
2342
+ "acp_bench_hard": {
2343
+ "lm_eval": {
2344
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/acpbench/boolq_cot_2shot/_boolq_cot_2shot_yaml",
2345
+ "output_type": "generate_until",
2346
+ "metric": "exact_match",
2347
+ "line_references": {
2348
+ "output_type": 4,
2349
+ "metric": 26
2350
+ }
2351
+ },
2352
+ "wisent": {
2353
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/acp_bench_hard.py",
2354
+ "evaluator": "generation",
2355
+ "line_references": {
2356
+ "evaluator": 25
2357
+ }
2358
+ },
2359
+ "match": true,
2360
+ "notes": "lm-eval uses generate_until with exact_match metric. Wisent uses generation evaluator. MATCH - generation task correctly using generation evaluator."
2361
+ },
2362
+ "arabic_leaderboard_complete": {
2363
+ "lm_eval": {
2364
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mt_boolq/arabic_mt_boolq.yaml",
2365
+ "output_type": "multiple_choice",
2366
+ "metric": "acc",
2367
+ "line_references": {
2368
+ "output_type": 4,
2369
+ "metric": 16
2370
+ }
2371
+ },
2372
+ "wisent": {
2373
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_leaderboard_complete.py",
2374
+ "evaluator": "log_likelihoods",
2375
+ "line_references": {
2376
+ "evaluator": 19
2377
+ }
2378
+ },
2379
+ "match": true,
2380
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods evaluator. MATCH - multiple choice correctly using log_likelihoods."
2381
+ },
2382
+ "arabic_leaderboard_light": {
2383
+ "lm_eval": {
2384
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mt_boolq_light/arabic_mt_boolq_light.yaml",
2385
+ "output_type": "multiple_choice",
2386
+ "metric": "acc",
2387
+ "line_references": {
2388
+ "output_type": 4,
2389
+ "metric": 16
2390
+ }
2391
+ },
2392
+ "wisent": {
2393
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_leaderboard_light.py",
2394
+ "evaluator": "log_likelihoods",
2395
+ "line_references": {
2396
+ "evaluator": 19
2397
+ }
2398
+ },
2399
+ "match": true,
2400
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods evaluator. MATCH - multiple choice correctly using log_likelihoods."
2401
+ },
2402
+ "basqueglue": {
2403
+ "lm_eval": {
2404
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/basqueglue/bec.yaml",
2405
+ "output_type": "multiple_choice",
2406
+ "metric": "f1",
2407
+ "line_references": {
2408
+ "output_type": 5,
2409
+ "metric": 12
2410
+ }
2411
+ },
2412
+ "wisent": {
2413
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/basque_glue.py",
2414
+ "evaluator": "log_likelihoods",
2415
+ "line_references": {
2416
+ "evaluator": 19
2417
+ }
2418
+ },
2419
+ "match": true,
2420
+ "notes": "lm-eval uses multiple_choice with f1 metric. Wisent uses log_likelihoods evaluator. MATCH - multiple choice correctly using log_likelihoods."
2421
+ },
2422
+ "benchmarks": {
2423
+ "lm_eval": {
2424
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/lambada/lambada_openai.yaml",
2425
+ "output_type": "loglikelihood",
2426
+ "metric": "perplexity/acc",
2427
+ "line_references": {
2428
+ "output_type": 6,
2429
+ "metric": 13
2430
+ }
2431
+ },
2432
+ "wisent": {
2433
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/benchmarks.py",
2434
+ "evaluator": "log_likelihoods",
2435
+ "line_references": {
2436
+ "evaluator": 19
2437
+ }
2438
+ },
2439
+ "match": true,
2440
+ "notes": "lm-eval uses loglikelihood output type with perplexity/acc metrics (checked via lambada_openai subtask). Wisent uses log_likelihoods evaluator. MATCH - loglikelihood correctly using log_likelihoods."
2441
+ },
2442
+ "c4": {
2443
+ "lm_eval": {
2444
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/c4/c4.yaml",
2445
+ "output_type": "loglikelihood_rolling",
2446
+ "metric": "word_perplexity",
2447
+ "line_references": {
2448
+ "output_type": 4,
2449
+ "metric": 13
2450
+ }
2451
+ },
2452
+ "wisent": {
2453
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/c4.py",
2454
+ "evaluator": null,
2455
+ "line_references": {
2456
+ "evaluator": null
2457
+ }
2458
+ },
2459
+ "match": false,
2460
+ "notes": "lm-eval uses loglikelihood_rolling with word_perplexity metric. Wisent extractor has NO evaluator_name defined. MISSING - should have log_likelihoods evaluator for loglikelihood_rolling."
2461
+ },
2462
+ "copal_id": {
2463
+ "lm_eval": {
2464
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/copal_id/standard.yaml",
2465
+ "output_type": "multiple_choice",
2466
+ "metric": "acc",
2467
+ "line_references": {
2468
+ "output_type": 6,
2469
+ "metric": 12
2470
+ }
2471
+ },
2472
+ "wisent": {
2473
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copal_id.py",
2474
+ "evaluator": "log_likelihoods",
2475
+ "line_references": {
2476
+ "evaluator": 22
2477
+ }
2478
+ },
2479
+ "match": true,
2480
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods evaluator. MATCH - multiple choice correctly using log_likelihoods."
2481
+ },
2482
+ "csatqa": {
2483
+ "lm_eval": {
2484
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/csatqa/_default_csatqa_yaml",
2485
+ "output_type": "multiple_choice",
2486
+ "metric": "acc",
2487
+ "line_references": {
2488
+ "output_type": 3,
2489
+ "metric": 9
2490
+ }
2491
+ },
2492
+ "wisent": {
2493
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/csatqa.py",
2494
+ "evaluator": null,
2495
+ "line_references": {
2496
+ "evaluator": null
2497
+ }
2498
+ },
2499
+ "match": false,
2500
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent extractor has NO evaluator_name defined. MISSING - should have log_likelihoods evaluator."
2501
+ },
2502
+ "darija_bench": {
2503
+ "lm_eval": {
2504
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/darija_bench/darija_sentiment/default_darija_sentiment_template_yaml",
2505
+ "output_type": "multiple_choice",
2506
+ "metric": "acc",
2507
+ "line_references": {
2508
+ "output_type": 2,
2509
+ "metric": 7
2510
+ }
2511
+ },
2512
+ "wisent": {
2513
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darija_bench.py",
2514
+ "evaluator": "log_likelihoods",
2515
+ "line_references": {
2516
+ "evaluator": 44
2517
+ }
2518
+ },
2519
+ "match": true,
2520
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods evaluator. MATCH - multiple choice correctly using log_likelihoods."
2521
+ },
2522
+ "darijahellaswag": {
2523
+ "lm_eval": {
2524
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/darijahellaswag/darijahellaswag.yaml",
2525
+ "output_type": "multiple_choice",
2526
+ "metric": "acc",
2527
+ "line_references": {
2528
+ "output_type": 6,
2529
+ "metric": 15
2530
+ }
2531
+ },
2532
+ "wisent": {
2533
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darijahellaswag.py",
2534
+ "evaluator": "log_likelihoods",
2535
+ "line_references": {
2536
+ "evaluator": 22
2537
+ }
2538
+ },
2539
+ "match": true,
2540
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods evaluator. MATCH - multiple choice correctly using log_likelihoods."
2541
+ },
2542
+ "darijammlu": {
2543
+ "lm_eval": {
2544
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/darijammlu/_default_darijammlu_template_yaml",
2545
+ "output_type": "multiple_choice",
2546
+ "metric": "acc",
2547
+ "line_references": {
2548
+ "output_type": 6,
2549
+ "metric": 11
2550
+ }
2551
+ },
2552
+ "wisent": {
2553
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darijammlu.py",
2554
+ "evaluator": null,
2555
+ "line_references": {
2556
+ "evaluator": null
2557
+ }
2558
+ },
2559
+ "match": false,
2560
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent extractor has NO evaluator_name defined. MISSING - should have log_likelihoods evaluator."
2561
+ },
2562
+ "egyhellaswag": {
2563
+ "lm_eval": {
2564
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/egyhellaswag/egyhellaswag.yaml",
2565
+ "output_type": "multiple_choice",
2566
+ "metric": "acc",
2567
+ "line_references": {
2568
+ "output_type": 6,
2569
+ "metric": 15
2570
+ }
2571
+ },
2572
+ "wisent": {
2573
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/egyhellaswag.py",
2574
+ "evaluator": "log_likelihoods",
2575
+ "line_references": {
2576
+ "evaluator": 26
2577
+ }
2578
+ },
2579
+ "match": true,
2580
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods evaluator. MATCH - multiple choice correctly using log_likelihoods."
2581
+ },
2582
+ "egymmlu": {
2583
+ "lm_eval": {
2584
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/egymmlu/_default_egymmlu_template_yaml",
2585
+ "output_type": "multiple_choice",
2586
+ "metric": "acc",
2587
+ "line_references": {
2588
+ "output_type": 6,
2589
+ "metric": 11
2590
+ }
2591
+ },
2592
+ "wisent": {
2593
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/egymmlu.py",
2594
+ "evaluator": "log_likelihoods",
2595
+ "line_references": {
2596
+ "evaluator": 81
2597
+ }
2598
+ },
2599
+ "match": true,
2600
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods evaluator. MATCH - multiple choice correctly using log_likelihoods."
2601
+ },
2602
+ "eus_exams": {
2603
+ "lm_eval": {
2604
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/eus_exams/eus_exams",
2605
+ "output_type": "multiple_choice",
2606
+ "metric": "acc",
2607
+ "line_references": {
2608
+ "output_type": 7,
2609
+ "metric": 11
2610
+ }
2611
+ },
2612
+ "wisent": {
2613
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_exams.py",
2614
+ "evaluator": "log_likelihoods",
2615
+ "line_references": {
2616
+ "evaluator": 85
2617
+ }
2618
+ },
2619
+ "match": true,
2620
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods evaluator. MATCH - multiple choice correctly using log_likelihoods."
2621
+ },
2622
+ "eus_proficiency": {
2623
+ "lm_eval": {
2624
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/eus_proficiency/eus_proficiency.yaml",
2625
+ "output_type": "multiple_choice",
2626
+ "metric": "acc",
2627
+ "line_references": {
2628
+ "output_type": 9,
2629
+ "metric": 12
2630
+ }
2631
+ },
2632
+ "wisent": {
2633
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_proficiency.py",
2634
+ "evaluator": "log_likelihoods",
2635
+ "line_references": {
2636
+ "evaluator": 26
2637
+ }
2638
+ },
2639
+ "match": true,
2640
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods evaluator. MATCH - multiple choice correctly using log_likelihoods."
2641
+ },
2642
+ "eus_reading": {
2643
+ "lm_eval": {
2644
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/eus_reading/eus_reading.yaml",
2645
+ "output_type": "multiple_choice",
2646
+ "metric": "acc",
2647
+ "line_references": {
2648
+ "output_type": 9,
2649
+ "metric": 12
2650
+ }
2651
+ },
2652
+ "wisent": {
2653
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_reading.py",
2654
+ "evaluator": "log_likelihoods",
2655
+ "line_references": {
2656
+ "evaluator": 26
2657
+ }
2658
+ },
2659
+ "match": true,
2660
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods evaluator. MATCH - multiple choice correctly using log_likelihoods."
2661
+ },
2662
+ "eus_trivia": {
2663
+ "lm_eval": {
2664
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/eus_trivia/eus_trivia.yaml",
2665
+ "output_type": "multiple_choice",
2666
+ "metric": "acc",
2667
+ "line_references": {
2668
+ "output_type": 9,
2669
+ "metric": 12
2670
+ }
2671
+ },
2672
+ "wisent": {
2673
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_trivia.py",
2674
+ "evaluator": "log_likelihoods",
2675
+ "line_references": {
2676
+ "evaluator": 26
2677
+ }
2678
+ },
2679
+ "match": true,
2680
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods evaluator. MATCH - multiple choice correctly using log_likelihoods."
2681
+ },
2682
+ "evalita_LLM": {
2683
+ "lm_eval": {
2684
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/evalita_llm/_sa_template_yaml",
2685
+ "output_type": "multiple_choice",
2686
+ "metric": "f1",
2687
+ "line_references": {
2688
+ "output_type": 2,
2689
+ "metric": 9
2690
+ }
2691
+ },
2692
+ "wisent": {
2693
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/evalita_llm.py",
2694
+ "evaluator": "log_likelihoods",
2695
+ "line_references": {
2696
+ "evaluator": 23
2697
+ }
2698
+ },
2699
+ "match": true,
2700
+ "notes": "lm-eval uses multiple_choice with f1 metric. Wisent uses log_likelihoods evaluator. MATCH - multiple choice correctly using log_likelihoods."
2701
+ },
2702
+ "fda": {
2703
+ "lm_eval": {
2704
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/fda/task.py",
2705
+ "output_type": "generate_until",
2706
+ "metric": "contains",
2707
+ "line_references": {
2708
+ "output_type": 52,
2709
+ "metric": 73
2710
+ }
2711
+ },
2712
+ "wisent": {
2713
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/fda.py",
2714
+ "evaluator": "generation",
2715
+ "line_references": {
2716
+ "evaluator": 20
2717
+ }
2718
+ },
2719
+ "match": true,
2720
+ "notes": "lm-eval uses generate_until with contains metric. Wisent uses generation evaluator. MATCH - generation task correctly using generation evaluator."
2721
+ },
2722
+ "fld": {
2723
+ "lm_eval": {
2724
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/fld/fld_default.yaml",
2725
+ "output_type": "generate_until (default)",
2726
+ "metric": "exact_match",
2727
+ "line_references": {
2728
+ "output_type": null,
2729
+ "metric": 10
2730
+ }
2731
+ },
2732
+ "wisent": {
2733
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/fld.py",
2734
+ "evaluator": "exact_match",
2735
+ "line_references": {
2736
+ "evaluator": 32
2737
+ }
2738
+ },
2739
+ "match": true,
2740
+ "notes": "lm-eval uses generate_until (default) with exact_match metric. Wisent uses exact_match evaluator. MATCH - generation task correctly using exact_match evaluator."
2741
+ },
2742
+ "french_bench": {
2743
+ "lm_eval": {
2744
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/french_bench/french_bench_vocab.yaml",
2745
+ "output_type": "multiple_choice",
2746
+ "metric": "acc",
2747
+ "line_references": {
2748
+ "output_type": 7,
2749
+ "metric": 18
2750
+ }
2751
+ },
2752
+ "wisent": {
2753
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/french_bench.py",
2754
+ "evaluator": "log_likelihoods",
2755
+ "line_references": {
2756
+ "evaluator": 43
2757
+ }
2758
+ },
2759
+ "match": true,
2760
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods evaluator. MATCH - multiple choice correctly using log_likelihoods."
2761
+ },
2762
+ "galician_bench": {
2763
+ "lm_eval": {
2764
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/galician_bench/openbookqa_gl.yaml",
2765
+ "output_type": "multiple_choice",
2766
+ "metric": "acc",
2767
+ "line_references": {
2768
+ "output_type": 4,
2769
+ "metric": 14
2770
+ }
2771
+ },
2772
+ "wisent": {
2773
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/galician_bench.py",
2774
+ "evaluator": "log_likelihoods",
2775
+ "line_references": {
2776
+ "evaluator": 19
2777
+ }
2778
+ },
2779
+ "match": true,
2780
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods evaluator. MATCH - multiple choice correctly using log_likelihoods."
2781
+ },
2782
+ "global_mmlu": {
2783
+ "lm_eval": {
2784
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/global_mmlu/full/sw/_sw_template_yaml",
2785
+ "output_type": "multiple_choice",
2786
+ "metric": "acc",
2787
+ "line_references": {
2788
+ "output_type": 7,
2789
+ "metric": 12
2790
+ }
2791
+ },
2792
+ "wisent": {
2793
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/global_mmlu.py",
2794
+ "evaluator": "log_likelihoods",
2795
+ "line_references": {
2796
+ "evaluator": 35
2797
+ }
2798
+ },
2799
+ "match": true,
2800
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods evaluator. MATCH - multiple choice correctly using log_likelihoods."
2801
+ },
2802
+ "glue": {
2803
+ "lm_eval": {
2804
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/glue/cola/default.yaml",
2805
+ "output_type": "multiple_choice",
2806
+ "metric": "mcc",
2807
+ "line_references": {
2808
+ "output_type": 5,
2809
+ "metric": 14
2810
+ }
2811
+ },
2812
+ "wisent": {
2813
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/glue.py",
2814
+ "evaluator": "log_likelihoods",
2815
+ "line_references": {
2816
+ "evaluator": 19
2817
+ }
2818
+ },
2819
+ "match": true,
2820
+ "notes": "lm-eval uses multiple_choice with mcc metric. Wisent uses log_likelihoods evaluator. MATCH - multiple choice correctly using log_likelihoods."
2821
+ },
2822
+ "hendrycks_ethics": {
2823
+ "lm_eval": {
2824
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/hendrycks_ethics/commonsense.yaml",
2825
+ "output_type": "multiple_choice",
2826
+ "metric": "acc",
2827
+ "line_references": {
2828
+ "output_type": 6,
2829
+ "metric": 13
2830
+ }
2831
+ },
2832
+ "wisent": {
2833
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hendrycks_ethics.py",
2834
+ "evaluator": null,
2835
+ "line_references": {
2836
+ "evaluator": null
2837
+ }
2838
+ },
2839
+ "match": false,
2840
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent extractor has NO evaluator_name defined. MISSING - should have log_likelihoods evaluator."
2841
+ },
2842
+ "hendrycks_math": {
2843
+ "lm_eval": {
2844
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/hendrycks_math/hendrycks_math_algebra.yaml",
2845
+ "output_type": "generate_until",
2846
+ "metric": "exact_match",
2847
+ "line_references": {
2848
+ "output_type": 6,
2849
+ "metric": 17
2850
+ }
2851
+ },
2852
+ "wisent": {
2853
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hendrycks_math.py",
2854
+ "evaluator": null,
2855
+ "line_references": {
2856
+ "evaluator": null
2857
+ }
2858
+ },
2859
+ "match": false,
2860
+ "notes": "lm-eval uses generate_until with exact_match. Wisent has NO evaluator_name. MISSING - should have exact_match evaluator."
2861
+ },
2862
+ "inverse_scaling": {
2863
+ "lm_eval": {
2864
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/inverse_scaling/_inverse_scaling_mc_yaml",
2865
+ "output_type": "multiple_choice",
2866
+ "metric": "acc",
2867
+ "line_references": {
2868
+ "output_type": 3,
2869
+ "metric": 9
2870
+ }
2871
+ },
2872
+ "wisent": {
2873
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/inverse_scaling.py",
2874
+ "evaluator": "log_likelihoods",
2875
+ "line_references": {
2876
+ "evaluator": 32
2877
+ }
2878
+ },
2879
+ "match": true,
2880
+ "notes": "MATCH - multiple_choice with log_likelihoods"
2881
+ },
2882
+ "histoires_morales": {
2883
+ "lm_eval": {
2884
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/histoires_morales/histoires_morales.yaml",
2885
+ "output_type": "multiple_choice",
2886
+ "metric": "acc",
2887
+ "line_references": {
2888
+ "output_type": 3,
2889
+ "metric": 10
2890
+ }
2891
+ },
2892
+ "wisent": {
2893
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/histoires_morales.py",
2894
+ "evaluator": "generation",
2895
+ "line_references": {
2896
+ "evaluator": 19
2897
+ }
2898
+ },
2899
+ "match": false,
2900
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses generation evaluator. MISMATCH - should use log_likelihoods for multiple_choice."
2901
+ },
2902
+ "japanese_leaderboard": {
2903
+ "lm_eval": {
2904
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/japanese_leaderboard/ja_leaderboard_jcommonsenseqa.yaml",
2905
+ "output_type": "multiple_choice",
2906
+ "metric": "acc",
2907
+ "line_references": {
2908
+ "output_type": 19,
2909
+ "metric": 22
2910
+ }
2911
+ },
2912
+ "wisent": {
2913
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/japanese_leaderboard.py",
2914
+ "evaluator": "log_likelihoods",
2915
+ "line_references": {
2916
+ "evaluator": 19
2917
+ }
2918
+ },
2919
+ "match": true,
2920
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods evaluator. MATCH - multiple choice correctly using log_likelihoods."
2921
+ },
2922
+ "lambada_cloze": {
2923
+ "lm_eval": {
2924
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/lambada_cloze/lambada_openai_cloze.yaml",
2925
+ "output_type": "loglikelihood",
2926
+ "metric": "perplexity/acc",
2927
+ "line_references": {
2928
+ "output_type": 6,
2929
+ "metric": 13
2930
+ }
2931
+ },
2932
+ "wisent": {
2933
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada_cloze.py",
2934
+ "evaluator": null,
2935
+ "line_references": {
2936
+ "evaluator": null
2937
+ }
2938
+ },
2939
+ "match": false,
2940
+ "notes": "lm-eval uses loglikelihood with perplexity/acc metrics. Wisent has NO evaluator_name. MISSING - should have log_likelihoods evaluator."
2941
+ },
2942
+ "lambada_multilingual": {
2943
+ "lm_eval": {
2944
+ "file": "/opt/homebrew/.../lm_eval/tasks/lambada_multilingual/lambada_mt_en.yaml",
2945
+ "output_type": "loglikelihood",
2946
+ "metric": "perplexity/acc",
2947
+ "line_references": {
2948
+ "output_type": 6,
2949
+ "metric": 13
2950
+ }
2951
+ },
2952
+ "wisent": {
2953
+ "file": "/Users/.../lm_task_extractors/lambada_multilingual.py",
2954
+ "evaluator": null,
2955
+ "line_references": {
2956
+ "evaluator": null
2957
+ }
2958
+ },
2959
+ "match": false,
2960
+ "notes": "lm-eval uses loglikelihood. Wisent has NO evaluator_name. MISSING - should have log_likelihoods."
2961
+ },
2962
+ "lambada_multilingual_stablelm": {
2963
+ "lm_eval": {
2964
+ "file": "lm_eval/tasks/lambada_multilingual_stablelm/lambada_mt_stablelm_en.yaml",
2965
+ "output_type": "loglikelihood",
2966
+ "metric": "perplexity/acc",
2967
+ "line_references": {
2968
+ "output_type": 6,
2969
+ "metric": 13
2970
+ }
2971
+ },
2972
+ "wisent": {
2973
+ "file": "wisent/.../lambada_multilingual_stablelm.py",
2974
+ "evaluator": "log_likelihoods",
2975
+ "line_references": {
2976
+ "evaluator": 29
2977
+ }
2978
+ },
2979
+ "match": true,
2980
+ "notes": "MATCH - loglikelihood with log_likelihoods"
2981
+ },
2982
+ "leaderboard": {
2983
+ "lm_eval": {
2984
+ "file": "lm_eval/tasks/leaderboard/mmlu_pro/mmlu_pro.yaml",
2985
+ "output_type": "multiple_choice",
2986
+ "metric": "acc",
2987
+ "line_references": {
2988
+ "output_type": 7,
2989
+ "metric": null
2990
+ }
2991
+ },
2992
+ "wisent": {
2993
+ "file": "wisent/.../leaderboard.py",
2994
+ "evaluator": null,
2995
+ "line_references": {
2996
+ "evaluator": null
2997
+ }
2998
+ },
2999
+ "match": false,
3000
+ "notes": "MISSING - no evaluator_name defined"
3001
+ },
3002
+ "mc_taco": {
3003
+ "lm_eval": {
3004
+ "file": "lm_eval/tasks/mc_taco/default.yaml",
3005
+ "output_type": "multiple_choice",
3006
+ "metric": "acc/f1",
3007
+ "line_references": {
3008
+ "output_type": 3,
3009
+ "metric": 12
3010
+ }
3011
+ },
3012
+ "wisent": {
3013
+ "file": "wisent/.../mc_taco.py",
3014
+ "evaluator": null,
3015
+ "line_references": {
3016
+ "evaluator": null
3017
+ }
3018
+ },
3019
+ "match": false,
3020
+ "notes": "MISSING - no evaluator_name"
3021
+ },
3022
+ "med_concepts_qa": {
3023
+ "lm_eval": {
3024
+ "output_type": "multiple_choice",
3025
+ "metric": "acc"
3026
+ },
3027
+ "wisent": {
3028
+ "evaluator": "log_likelihoods"
3029
+ },
3030
+ "match": true,
3031
+ "notes": "MATCH"
3032
+ },
3033
+ "meddialog": {
3034
+ "lm_eval": {
3035
+ "output_type": "generate_until",
3036
+ "metric": ""
3037
+ },
3038
+ "wisent": {
3039
+ "evaluator": null
3040
+ },
3041
+ "match": false,
3042
+ "notes": "MISSING"
3043
+ },
3044
+ "mediqa_qa2019": {
3045
+ "lm_eval": {
3046
+ "output_type": "generate_until",
3047
+ "metric": ""
3048
+ },
3049
+ "wisent": {
3050
+ "evaluator": "generation"
3051
+ },
3052
+ "match": true,
3053
+ "notes": "MATCH"
3054
+ },
3055
+ "medqa": {
3056
+ "lm_eval": {
3057
+ "output_type": "unknown",
3058
+ "metric": ""
3059
+ },
3060
+ "wisent": {
3061
+ "evaluator": null
3062
+ },
3063
+ "match": false,
3064
+ "notes": "MISSING"
3065
+ },
3066
+ "medtext": {
3067
+ "lm_eval": {
3068
+ "output_type": "generate_until",
3069
+ "metric": ""
3070
+ },
3071
+ "wisent": {
3072
+ "evaluator": "generation"
3073
+ },
3074
+ "match": true,
3075
+ "notes": "MATCH"
3076
+ },
3077
+ "meqsum": {
3078
+ "lm_eval": {
3079
+ "output_type": "generate_until",
3080
+ "metric": ""
3081
+ },
3082
+ "wisent": {
3083
+ "evaluator": "generation"
3084
+ },
3085
+ "match": true,
3086
+ "notes": "MATCH"
3087
+ },
3088
+ "metabench": {
3089
+ "lm_eval": {
3090
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/metabench/metabench_hellaswag.yaml",
3091
+ "output_type": "multiple_choice",
3092
+ "metric": "acc",
3093
+ "line_references": {
3094
+ "output_type": 1,
3095
+ "metric": 2
3096
+ }
3097
+ },
3098
+ "wisent": {
3099
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/metabench.py",
3100
+ "evaluator": null,
3101
+ "line_references": {
3102
+ "evaluator": null
3103
+ }
3104
+ },
3105
+ "match": false,
3106
+ "notes": "MISSING - no evaluator_name defined for multiple_choice task"
3107
+ },
3108
+ "mimic_repsum": {
3109
+ "lm_eval": {
3110
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/mimic_repsum/mimic_repsum.yaml",
3111
+ "output_type": "generate_until",
3112
+ "metric": "bleu",
3113
+ "line_references": {
3114
+ "output_type": 1,
3115
+ "metric": 3
3116
+ }
3117
+ },
3118
+ "wisent": {
3119
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mimic_repsum.py",
3120
+ "evaluator": "generation",
3121
+ "line_references": {
3122
+ "evaluator": 18
3123
+ }
3124
+ },
3125
+ "match": true,
3126
+ "notes": "MATCH - generate_until with generation evaluator"
3127
+ },
3128
+ "mmlu-pro-plus": {
3129
+ "lm_eval": {
3130
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/mmlu-pro-plus/_default_template_yaml",
3131
+ "output_type": "generate_until",
3132
+ "metric": "exact_match",
3133
+ "line_references": {
3134
+ "output_type": 1,
3135
+ "metric": 3
3136
+ }
3137
+ },
3138
+ "wisent": {
3139
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlu_pro.py",
3140
+ "evaluator": null,
3141
+ "line_references": {
3142
+ "evaluator": null
3143
+ }
3144
+ },
3145
+ "match": false,
3146
+ "notes": "MISSING - no evaluator_name defined for generate_until task"
3147
+ },
3148
+ "mmlu_prox": {
3149
+ "lm_eval": {
3150
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/mmlu_prox/sw/mmlu_prox_sw_computer_science.yaml",
3151
+ "output_type": "multiple_choice",
3152
+ "metric": "acc",
3153
+ "line_references": {
3154
+ "output_type": 1,
3155
+ "metric": 2
3156
+ }
3157
+ },
3158
+ "wisent": {
3159
+ "file": null,
3160
+ "evaluator": null,
3161
+ "line_references": {
3162
+ "evaluator": null
3163
+ }
3164
+ },
3165
+ "match": false,
3166
+ "notes": "MISSING - no Wisent extractor file exists for mmlu_prox"
3167
+ },
3168
+ "mmlusr": {
3169
+ "lm_eval": {
3170
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/mmlusr/question_and_answer/_mmlusr_qna_yml",
3171
+ "output_type": "multiple_choice",
3172
+ "metric": "acc",
3173
+ "line_references": {
3174
+ "output_type": 1,
3175
+ "metric": 2
3176
+ }
3177
+ },
3178
+ "wisent": {
3179
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlusr.py",
3180
+ "evaluator": "log_likelihoods",
3181
+ "line_references": {
3182
+ "evaluator": 18
3183
+ }
3184
+ },
3185
+ "match": true,
3186
+ "notes": "MATCH - multiple_choice with log_likelihoods evaluator"
3187
+ },
3188
+ "mts_dialog": {
3189
+ "lm_eval": {
3190
+ "output_type": "generate_until",
3191
+ "metric": "bleu"
3192
+ },
3193
+ "wisent": {
3194
+ "evaluator": "generation",
3195
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mts_dialog.py"
3196
+ },
3197
+ "match": true,
3198
+ "notes": "MATCH"
3199
+ },
3200
+ "multiblimp": {
3201
+ "lm_eval": {
3202
+ "output_type": "multiple_choice",
3203
+ "metric": "acc"
3204
+ },
3205
+ "wisent": {
3206
+ "evaluator": "log_likelihoods",
3207
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multiblimp.py"
3208
+ },
3209
+ "match": true,
3210
+ "notes": "MATCH"
3211
+ },
3212
+ "noreval": {
3213
+ "lm_eval": {
3214
+ "output_type": "generate_until",
3215
+ "metric": "bleu"
3216
+ },
3217
+ "wisent": {
3218
+ "evaluator": "log_likelihoods",
3219
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval.py"
3220
+ },
3221
+ "match": false,
3222
+ "notes": "MISMATCH"
3223
+ },
3224
+ "okapi/arc_multilingual": {
3225
+ "lm_eval": {
3226
+ "output_type": "multiple_choice",
3227
+ "metric": "acc"
3228
+ },
3229
+ "wisent": {
3230
+ "evaluator": "log_likelihoods",
3231
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_arc_multilingual.py"
3232
+ },
3233
+ "match": true,
3234
+ "notes": "MATCH"
3235
+ },
3236
+ "okapi/hellaswag_multilingual": {
3237
+ "lm_eval": {
3238
+ "output_type": "multiple_choice",
3239
+ "metric": "acc"
3240
+ },
3241
+ "wisent": {
3242
+ "evaluator": "log_likelihoods",
3243
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_hellaswag_multilingual.py"
3244
+ },
3245
+ "match": true,
3246
+ "notes": "MATCH"
3247
+ },
3248
+ "olaph": {
3249
+ "lm_eval": {
3250
+ "output_type": "generate_until",
3251
+ "metric": "bleu"
3252
+ },
3253
+ "wisent": {
3254
+ "evaluator": "generation",
3255
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/olaph.py"
3256
+ },
3257
+ "match": true,
3258
+ "notes": "MATCH"
3259
+ },
3260
+ "okapi/mmlu_multilingual": {
3261
+ "lm_eval": {
3262
+ "output_type": "multiple_choice",
3263
+ "metric": "acc"
3264
+ },
3265
+ "wisent": {
3266
+ "evaluator": "log_likelihoods",
3267
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_mmlu_multilingual.py"
3268
+ },
3269
+ "match": true,
3270
+ "notes": "MATCH"
3271
+ },
3272
+ "okapi/truthfulqa_multilingual": {
3273
+ "lm_eval": {
3274
+ "output_type": "multiple_choice",
3275
+ "metric": "acc"
3276
+ },
3277
+ "wisent": {
3278
+ "evaluator": "log_likelihoods",
3279
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_truthfulqa_multilingual.py"
3280
+ },
3281
+ "match": true,
3282
+ "notes": "MATCH"
3283
+ },
3284
+ "paws-x": {
3285
+ "lm_eval": {
3286
+ "output_type": "multiple_choice",
3287
+ "metric": "acc"
3288
+ },
3289
+ "wisent": {
3290
+ "evaluator": "log_likelihoods",
3291
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/paws_x.py"
3292
+ },
3293
+ "match": true,
3294
+ "notes": "MATCH"
3295
+ },
3296
+ "pile_10k": {
3297
+ "lm_eval": {
3298
+ "output_type": "loglikelihood_rolling",
3299
+ "metric": "word_perplexity"
3300
+ },
3301
+ "wisent": {
3302
+ "evaluator": "generation",
3303
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pile_10k.py"
3304
+ },
3305
+ "match": false,
3306
+ "notes": "MISMATCH"
3307
+ },
3308
+ "portuguese_bench": {
3309
+ "lm_eval": {
3310
+ "output_type": "multiple_choice",
3311
+ "metric": "acc"
3312
+ },
3313
+ "wisent": {
3314
+ "evaluator": "log_likelihoods",
3315
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/portuguese_bench.py"
3316
+ },
3317
+ "match": true,
3318
+ "notes": "MATCH"
3319
+ },
3320
+ "prost": {
3321
+ "lm_eval": {
3322
+ "output_type": "multiple_choice",
3323
+ "metric": "acc"
3324
+ },
3325
+ "wisent": {
3326
+ "evaluator": null,
3327
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prost.py"
3328
+ },
3329
+ "match": false,
3330
+ "notes": "MISSING"
3331
+ },
3332
+ "score": {
3333
+ "lm_eval": {
3334
+ "output_type": "multiple_choice",
3335
+ "metric": "acc"
3336
+ },
3337
+ "wisent": {
3338
+ "evaluator": "log_likelihoods",
3339
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/score.py"
3340
+ },
3341
+ "match": true,
3342
+ "notes": "MATCH"
3343
+ },
3344
+ "simple_cooccurrence_bias": {
3345
+ "lm_eval": {
3346
+ "output_type": "multiple_choice",
3347
+ "metric": "acc"
3348
+ },
3349
+ "wisent": {
3350
+ "evaluator": null,
3351
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/simple_cooccurrence_bias.py"
3352
+ },
3353
+ "match": false,
3354
+ "notes": "MISSING"
3355
+ },
3356
+ "spanish_bench": {
3357
+ "lm_eval": {
3358
+ "output_type": "multiple_choice",
3359
+ "metric": "acc"
3360
+ },
3361
+ "wisent": {
3362
+ "evaluator": "log_likelihoods",
3363
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/spanish_bench.py"
3364
+ },
3365
+ "match": true,
3366
+ "notes": "MATCH"
3367
+ },
3368
+ "squadv2": {
3369
+ "lm_eval": {
3370
+ "output_type": "generate_until",
3371
+ "metric": "exact"
3372
+ },
3373
+ "wisent": {
3374
+ "evaluator": null,
3375
+ "file": null
3376
+ },
3377
+ "match": false,
3378
+ "notes": "MISSING"
3379
+ },
3380
+ "swde": {
3381
+ "lm_eval": {
3382
+ "output_type": "generate_until",
3383
+ "metric": "exact_match"
3384
+ },
3385
+ "wisent": {
3386
+ "evaluator": null,
3387
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swde.py"
3388
+ },
3389
+ "match": false,
3390
+ "notes": "MISSING"
3391
+ },
3392
+ "tinyBenchmarks": {
3393
+ "lm_eval": {
3394
+ "output_type": "multiple_choice",
3395
+ "metric": "acc"
3396
+ },
3397
+ "wisent": {
3398
+ "evaluator": "log_likelihoods",
3399
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinyBenchmarks.py"
3400
+ },
3401
+ "match": true,
3402
+ "notes": "MATCH"
3403
+ },
3404
+ "truthfulqa": {
3405
+ "lm_eval": {
3406
+ "output_type": "multiple_choice",
3407
+ "metric": "acc"
3408
+ },
3409
+ "wisent": {
3410
+ "evaluator": "log_likelihoods\" # Mixed, but defaulting to log_likelihoods",
3411
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa.py"
3412
+ },
3413
+ "match": false,
3414
+ "notes": "MISMATCH"
3415
+ },
3416
+ "truthfulqa-multi": {
3417
+ "lm_eval": {
3418
+ "output_type": "generate_until",
3419
+ "metric": "bleu"
3420
+ },
3421
+ "wisent": {
3422
+ "evaluator": "mixed\" # Special marker for mixed tasks",
3423
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_multi.py"
3424
+ },
3425
+ "match": false,
3426
+ "notes": "MISMATCH"
3427
+ },
3428
+ "unitxt": {
3429
+ "lm_eval": {
3430
+ "output_type": "generate_until",
3431
+ "metric": "exact_match"
3432
+ },
3433
+ "wisent": {
3434
+ "evaluator": "generation",
3435
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/unitxt.py"
3436
+ },
3437
+ "match": true,
3438
+ "notes": "MATCH"
3439
+ },
3440
+ "wmt2016": {
3441
+ "lm_eval": {
3442
+ "output_type": "generate_until",
3443
+ "metric": "bleu"
3444
+ },
3445
+ "wisent": {
3446
+ "evaluator": null,
3447
+ "file": null
3448
+ },
3449
+ "match": false,
3450
+ "notes": "MISSING"
3451
+ },
3452
+ "xnli_eu": {
3453
+ "lm_eval": {
3454
+ "output_type": "multiple_choice",
3455
+ "metric": "acc"
3456
+ },
3457
+ "wisent": {
3458
+ "evaluator": null,
3459
+ "file": null
3460
+ },
3461
+ "match": false,
3462
+ "notes": "MISSING"
3463
+ },
3464
+ "xquad": {
3465
+ "lm_eval": {
3466
+ "output_type": "generate_until",
3467
+ "metric": "exact_match"
3468
+ },
3469
+ "wisent": {
3470
+ "evaluator": "generation",
3471
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xquad.py"
3472
+ },
3473
+ "match": true,
3474
+ "notes": "MATCH"
3475
+ }
3476
+ }
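The report above maps each lm-eval task's output_type and metric to the evaluator declared in the corresponding Wisent extractor. As a minimal sketch of how the mismatched and missing entries could be summarised (the filename and the assumption that the task mapping is the top-level JSON object are hypothetical, not confirmed by this diff):

import json

# Minimal sketch - assumptions, not part of the package:
# the report is saved as "evaluator_verification.json" and the mapping of
# task names to entries shown above is the top-level JSON object (in the
# full file it may be nested one level deeper).
with open("evaluator_verification.json") as fh:
    report = json.load(fh)

# Collect every task whose lm-eval output type and Wisent evaluator disagree
# (match: false), then print a one-line summary per task.
mismatches = {
    task: entry
    for task, entry in report.items()
    if isinstance(entry, dict) and not entry.get("match")
}
for task, entry in sorted(mismatches.items()):
    lm = entry.get("lm_eval", {})
    wisent = entry.get("wisent", {})
    print(f"{task}: lm-eval {lm.get('output_type')}/{lm.get('metric')} "
          f"vs wisent evaluator {wisent.get('evaluator')} -- {entry.get('notes', '')}")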