PyPI - wisent - Versions diffs - 0.7.379__py3-none-any.whl - Mend

wisent 0.7.379__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (1720) hide show

wisent/examples/scripts/results/test_fld_fixed/test_fld_evaluation.json ADDED Viewed

@@ -0,0 +1,30 @@
+{
+  "task_name": "fld",
+  "model_name": "mock",
+  "evaluator_name": "exact_match",
+  "num_pairs": 1,
+  "all_correct": true,
+  "pairs": [
+    {
+      "pair_id": 0,
+      "prompt": "Given the following facts:\nsent1: the taxonomicness occurs. sent2: that the snowballing great-uncle ...",
+      "positive_response": "PROVED",
+      "negative_response": "DISPROVED",
+      "positive_evaluation": {
+        "ground_truth": "TRUTHFUL",
+        "confidence": 1.0,
+        "expected": "TRUTHFUL",
+        "correct": true,
+        "details": "Exact match: 'PROVED' == 'PROVED'"
+      },
+      "negative_evaluation": {
+        "ground_truth": "UNTRUTHFUL",
+        "confidence": 0.0,
+        "expected": "UNTRUTHFUL",
+        "correct": true,
+        "details": "No match: 'PROVED' not in ['DISPROVED']"
+      },
+      "both_correct": true
+    }
+  ]
+}

wisent/examples/scripts/results/test_fld_fixed/test_fld_pairs.json ADDED Viewed

@@ -0,0 +1,8 @@
+[
+  {
+    "pair_id": 0,
+    "prompt": "Given the following facts:\nsent1: the taxonomicness occurs. sent2: that the snowballing great-uncle does not occur brings about that the taxonomicness and the transfiguring hybridization occurs. sent3: the transfiguring nonabsorbency happens. sent4: both the noness and the snowballing interrupter occurs. sent5: the pocketing contemplative happens if the uncongenialness occurs. sent6: if the snowballing biohazard happens then the pocketing roundup does not occur and the inconvertibleness does not occur. sent7: if the pocketing Culex does not occur then both the no and the harshness happens. sent8: the noness happens. sent9: not the pocketing Culex but the tack occurs if the pocketing contemplative occurs. sent10: if the coinciding occurs the snowballing biohazard occurs. sent11: that the transfiguring nonabsorbency occurs triggers that the non-preventiveness and the rummage happens. sent12: that the discord occurs results in the pocketing contemplative. sent13: the taxonomicness and the snowballing great-uncle happens. sent14: either that the uncongenialness happens or the discord or both is triggered by that the pocketing roundup does not occur. sent15: that the backhandness does not occur is caused by that the snowballing great-uncle and the snowballing interrupter happens. sent16: if the rummage occurs then the coinciding happens.\n\nDetermine if the hypothesis can be proved, disproved, or is unknown:\nHypothesis: the backhanding does not occur.\n\nAnswer (PROVED/DISPROVED/UNKNOWN):",
+    "positive_response": "PROVED",
+    "negative_response": "DISPROVED"
+  }
+]

wisent/examples/scripts/results/test_fld_pairs.json ADDED Viewed

@@ -0,0 +1,8 @@
+[
+  {
+    "pair_id": 0,
+    "prompt": "Given the following facts:\nsent1: the taxonomicness occurs. sent2: that the snowballing great-uncle does not occur brings about that the taxonomicness and the transfiguring hybridization occurs. sent3: the transfiguring nonabsorbency happens. sent4: both the noness and the snowballing interrupter occurs. sent5: the pocketing contemplative happens if the uncongenialness occurs. sent6: if the snowballing biohazard happens then the pocketing roundup does not occur and the inconvertibleness does not occur. sent7: if the pocketing Culex does not occur then both the no and the harshness happens. sent8: the noness happens. sent9: not the pocketing Culex but the tack occurs if the pocketing contemplative occurs. sent10: if the coinciding occurs the snowballing biohazard occurs. sent11: that the transfiguring nonabsorbency occurs triggers that the non-preventiveness and the rummage happens. sent12: that the discord occurs results in the pocketing contemplative. sent13: the taxonomicness and the snowballing great-uncle happens. sent14: either that the uncongenialness happens or the discord or both is triggered by that the pocketing roundup does not occur. sent15: that the backhandness does not occur is caused by that the snowballing great-uncle and the snowballing interrupter happens. sent16: if the rummage occurs then the coinciding happens.\n\nDetermine if the hypothesis can be proved, disproved, or is unknown:\nHypothesis: the backhanding does not occur.\n\nAnswer (PROVED/DISPROVED/UNKNOWN):",
+    "positive_response": "PROVED",
+    "negative_response": "DISPROVED"
+  }
+]

wisent/examples/scripts/results/test_flores_evaluation.json ADDED Viewed

@@ -0,0 +1,51 @@
+{
+  "task_name": "flores",
+  "model_name": "mock",
+  "evaluator_name": "log_likelihoods",
+  "num_pairs": 2,
+  "all_correct": true,
+  "pairs": [
+    {
+      "pair_id": 0,
+      "prompt": "Translate the following from ace_Arab to bam_Latn:\n\u062c\u0644 \u062c\u0627\u0645\u0648 \u06bd\u06a0 \u062a\u064a\u0645\u06a0, \u06a0\u0646 \u0686\u0627\u0631\u0648\u06a0 \u0643 \u062a\u0627\u0644\u0648\u0631 \u06a0\u0646 \u06a0\u0646 \u06a4\u0631\u062c\u0627\u0644\u0646\u0646 \u062f...",
+      "positive_response": "Woyasik\u025bla basigilen ani seba camaw ma se k'u y\u025br\u025b s\u0254r\u0254 ka da jamana tagany\u025bf\u025b woyasi tabolo kuraw kan, y\u0254r\u0254mina tabolo fitiniw degiliw be fara \u0272\u0254g\u0254nkan teliyala.",
+      "negative_response": "da tagany\u025bf\u025b ani ma fitiniw teliyala. k'u \u0272\u0254g\u0254nkan fara y\u0254r\u0254mina seba be jamana tabolo y\u025br\u025b tabolo camaw se kuraw kan, woyasi degiliw Woyasik\u025bla ka basigilen s\u0254r\u0254",
+      "positive_evaluation": {
+        "ground_truth": "TRUTHFUL",
+        "confidence": 1.0,
+        "expected": "TRUTHFUL",
+        "correct": true,
+        "details": "Predicted: 'Woyasik\u025bla basigilen ani seba camaw ma se k'u y\u025br\u025b s\u0254r\u0254 ka da jamana tagany\u025bf\u025b woyasi tabolo kuraw kan, y\u0254r\u0254mina tabolo fitiniw degiliw be fara \u0272\u0254g\u0254nkan teliyala.' (log_prob=-0.500), Expected: 'Woyasik\u025bla basigilen ani seba camaw ma se k'u y\u025br\u025b s\u0254r\u0254 ka da jamana tagany\u025bf\u025b woyasi tabolo kuraw kan, y\u0254r\u0254mina tabolo fitiniw degiliw be fara \u0272\u0254g\u0254nkan teliyala.'"
+      },
+      "negative_evaluation": {
+        "ground_truth": "UNTRUTHFUL",
+        "confidence": 0.0,
+        "expected": "UNTRUTHFUL",
+        "correct": true,
+        "details": "Predicted: 'Woyasik\u025bla basigilen ani seba camaw ma se k'u y\u025br\u025b s\u0254r\u0254 ka da jamana tagany\u025bf\u025b woyasi tabolo kuraw kan, y\u0254r\u0254mina tabolo fitiniw degiliw be fara \u0272\u0254g\u0254nkan teliyala.' (log_prob=-0.500), Expected: 'da tagany\u025bf\u025b ani ma fitiniw teliyala. k'u \u0272\u0254g\u0254nkan fara y\u0254r\u0254mina seba be jamana tabolo y\u025br\u025b tabolo camaw se kuraw kan, woyasi degiliw Woyasik\u025bla ka basigilen s\u0254r\u0254'"
+      },
+      "both_correct": true
+    },
+    {
+      "pair_id": 1,
+      "prompt": "Translate the following from ace_Arab to bam_Latn:\n\u062c\u0648\u062f\u0648\u06a9 \u062c\u0648\u06a4\u06a0 \u0647\u064a\u062a\u0648\u0634\u064a \u0633\u0627\u064a\u062a\u0648\u2e41 \u06bd\u06a0 \u0645\u0646\u06a0 \u062f\u0648\u0627 \u0645\u062f\u0627\u0644\u064a \u0645\u0648\u0647 \u0627\u0648\u0644...",
+      "positive_response": "Zap\u0254n ka Zido k\u025bla Hitishi Saito, Oly\u025bnpiki Sanu j\u0254nj\u0254n fila tabaa, sara a san 54.",
+      "negative_response": "k\u025bla 54. fila tabaa, Zap\u0254n Oly\u025bnpiki j\u0254nj\u0254n Sanu Saito, Hitishi ka Zido a san sara",
+      "positive_evaluation": {
+        "ground_truth": "TRUTHFUL",
+        "confidence": 1.0,
+        "expected": "TRUTHFUL",
+        "correct": true,
+        "details": "Predicted: 'Zap\u0254n ka Zido k\u025bla Hitishi Saito, Oly\u025bnpiki Sanu j\u0254nj\u0254n fila tabaa, sara a san 54.' (log_prob=-0.500), Expected: 'Zap\u0254n ka Zido k\u025bla Hitishi Saito, Oly\u025bnpiki Sanu j\u0254nj\u0254n fila tabaa, sara a san 54.'"
+      },
+      "negative_evaluation": {
+        "ground_truth": "UNTRUTHFUL",
+        "confidence": 0.0,
+        "expected": "UNTRUTHFUL",
+        "correct": true,
+        "details": "Predicted: 'Zap\u0254n ka Zido k\u025bla Hitishi Saito, Oly\u025bnpiki Sanu j\u0254nj\u0254n fila tabaa, sara a san 54.' (log_prob=-0.500), Expected: 'k\u025bla 54. fila tabaa, Zap\u0254n Oly\u025bnpiki j\u0254nj\u0254n Sanu Saito, Hitishi ka Zido a san sara'"
+      },
+      "both_correct": true
+    }
+  ]
+}

wisent/examples/scripts/results/test_flores_pairs.json ADDED Viewed

@@ -0,0 +1,14 @@
+[
+  {
+    "pair_id": 0,
+    "prompt": "Translate the following from ace_Arab to bam_Latn:\n\u062c\u0644 \u062c\u0627\u0645\u0648 \u06bd\u06a0 \u062a\u064a\u0645\u06a0, \u06a0\u0646 \u0686\u0627\u0631\u0648\u06a0 \u0643 \u062a\u0627\u0644\u0648\u0631 \u06a0\u0646 \u06a0\u0646 \u06a4\u0631\u062c\u0627\u0644\u0646\u0646 \u062f\u0648\u0646\u064a\u0627 \u06bd\u06a0 \u062a\u06a0\u0648\u0647 \u0645\u0627\u062c\u0648, \u062f \u06a4\u062a \u062c\u0644 \u06a4\u0633\u0627\u0628\u0646 \u0633\u0627\u0631\u064a \u0628\u0648\u062f\u0627\u064a\u0627 \u0627\u0648\u0628\u064a\u062a \u062c\u062f \u0645\u062a\u0627\u0645\u0647 \u0628\u0627\u0762\u0633.",
+    "positive_response": "Woyasik\u025bla basigilen ani seba camaw ma se k'u y\u025br\u025b s\u0254r\u0254 ka da jamana tagany\u025bf\u025b woyasi tabolo kuraw kan, y\u0254r\u0254mina tabolo fitiniw degiliw be fara \u0272\u0254g\u0254nkan teliyala.",
+    "negative_response": "da tagany\u025bf\u025b ani ma fitiniw teliyala. k'u \u0272\u0254g\u0254nkan fara y\u0254r\u0254mina seba be jamana tabolo y\u025br\u025b tabolo camaw se kuraw kan, woyasi degiliw Woyasik\u025bla ka basigilen s\u0254r\u0254"
+  },
+  {
+    "pair_id": 1,
+    "prompt": "Translate the following from ace_Arab to bam_Latn:\n\u062c\u0648\u062f\u0648\u06a9 \u062c\u0648\u06a4\u06a0 \u0647\u064a\u062a\u0648\u0634\u064a \u0633\u0627\u064a\u062a\u0648\u2e41 \u06bd\u06a0 \u0645\u0646\u06a0 \u062f\u0648\u0627 \u0645\u062f\u0627\u0644\u064a \u0645\u0648\u0647 \u0627\u0648\u0644\u064a\u0645\u06a4\u064a\u0627\u062f\u0649\u2e41 \u0645\u0627\u062a\u0649 \u0628\u0642 \u0639\u0645\u0648 \u0665\u0664 \u062a\u0647\u0648\u0646.",
+    "positive_response": "Zap\u0254n ka Zido k\u025bla Hitishi Saito, Oly\u025bnpiki Sanu j\u0254nj\u0254n fila tabaa, sara a san 54.",
+    "negative_response": "k\u025bla 54. fila tabaa, Zap\u0254n Oly\u025bnpiki j\u0254nj\u0254n Sanu Saito, Hitishi ka Zido a san sara"
+  }
+]

wisent/examples/scripts/results/test_freebase_evaluation.json ADDED Viewed

@@ -0,0 +1,30 @@
+{
+  "task_name": "freebase",
+  "model_name": "mock",
+  "evaluator_name": "log_likelihoods",
+  "num_pairs": 1,
+  "all_correct": true,
+  "pairs": [
+    {
+      "pair_id": 0,
+      "prompt": "Question: what does jamaican people speak?\nAnswer:\nA. eansnma i aloecaur CgaelgEJghniL\nB. Jamaican C...",
+      "positive_response": "Jamaican Creole English Language",
+      "negative_response": "eansnma i aloecaur CgaelgEJghniL",
+      "positive_evaluation": {
+        "ground_truth": "TRUTHFUL",
+        "confidence": 1.0,
+        "expected": "TRUTHFUL",
+        "correct": true,
+        "details": "Predicted: 'Jamaican Creole English Language' (log_prob=-0.500), Expected: 'Jamaican Creole English Language'"
+      },
+      "negative_evaluation": {
+        "ground_truth": "UNTRUTHFUL",
+        "confidence": 0.0,
+        "expected": "UNTRUTHFUL",
+        "correct": true,
+        "details": "Predicted: 'Jamaican Creole English Language' (log_prob=-0.500), Expected: 'eansnma i aloecaur CgaelgEJghniL'"
+      },
+      "both_correct": true
+    }
+  ]
+}

wisent/examples/scripts/results/test_freebase_pairs.json ADDED Viewed

@@ -0,0 +1,8 @@
+[
+  {
+    "pair_id": 0,
+    "prompt": "Question: what does jamaican people speak?\nAnswer:\nA. eansnma i aloecaur CgaelgEJghniL\nB. Jamaican Creole English Language",
+    "positive_response": "Jamaican Creole English Language",
+    "negative_response": "eansnma i aloecaur CgaelgEJghniL"
+  }
+]

wisent/examples/scripts/results/test_french_bench_evaluation.json ADDED Viewed

@@ -0,0 +1,51 @@
+{
+  "task_name": "french_bench",
+  "model_name": "mock",
+  "evaluator_name": "log_likelihoods",
+  "num_pairs": 2,
+  "all_correct": true,
+  "pairs": [
+    {
+      "pair_id": 0,
+      "prompt": "Question: Waterskiing: Un homme en chemise bleue se tient sur une plage. Un petit gar\u00e7on en gilet de...",
+      "positive_response": "est debout dans l'eau.",
+      "negative_response": "marche derri\u00e8re lui sur la plage.",
+      "positive_evaluation": {
+        "ground_truth": "TRUTHFUL",
+        "confidence": 1.0,
+        "expected": "TRUTHFUL",
+        "correct": true,
+        "details": "Predicted: 'est debout dans l'eau.' (log_prob=-0.500), Expected: 'est debout dans l'eau.'"
+      },
+      "negative_evaluation": {
+        "ground_truth": "UNTRUTHFUL",
+        "confidence": 0.0,
+        "expected": "UNTRUTHFUL",
+        "correct": true,
+        "details": "Predicted: 'est debout dans l'eau.' (log_prob=-0.500), Expected: 'marche derri\u00e8re lui sur la plage.'"
+      },
+      "both_correct": true
+    },
+    {
+      "pair_id": 1,
+      "prompt": "Question: Spread mulch: On voit un homme parler \u00e0 la cam\u00e9ra et encha\u00eener avec des clips de lui d\u00e9pla...",
+      "positive_response": "aident \u00e0 planter les zones tout en se d\u00e9pla\u00e7ant avec des brouettes et en plantant des plantes tout en parlant \u00e0 la cam\u00e9ra.",
+      "negative_response": "sont \u00e9galement vus autour de lui ainsi qu'un vieil homme ramassant des rochers et la vid\u00e9o se terminant avec l'homme se tenant devant un groupe et les fauchant.",
+      "positive_evaluation": {
+        "ground_truth": "TRUTHFUL",
+        "confidence": 1.0,
+        "expected": "TRUTHFUL",
+        "correct": true,
+        "details": "Predicted: 'aident \u00e0 planter les zones tout en se d\u00e9pla\u00e7ant avec des brouettes et en plantant des plantes tout en parlant \u00e0 la cam\u00e9ra.' (log_prob=-0.500), Expected: 'aident \u00e0 planter les zones tout en se d\u00e9pla\u00e7ant avec des brouettes et en plantant des plantes tout en parlant \u00e0 la cam\u00e9ra.'"
+      },
+      "negative_evaluation": {
+        "ground_truth": "UNTRUTHFUL",
+        "confidence": 0.0,
+        "expected": "UNTRUTHFUL",
+        "correct": true,
+        "details": "Predicted: 'aident \u00e0 planter les zones tout en se d\u00e9pla\u00e7ant avec des brouettes et en plantant des plantes tout en parlant \u00e0 la cam\u00e9ra.' (log_prob=-0.500), Expected: 'sont \u00e9galement vus autour de lui ainsi qu'un vieil homme ramassant des rochers et la vid\u00e9o se terminant avec l'homme se tenant devant un groupe et les fauchant.'"
+      },
+      "both_correct": true
+    }
+  ]
+}

wisent/examples/scripts/results/test_french_bench_pairs.json ADDED Viewed

@@ -0,0 +1,14 @@
+[
+  {
+    "pair_id": 0,
+    "prompt": "Question: Waterskiing: Un homme en chemise bleue se tient sur une plage. Un petit gar\u00e7on en gilet de sauvetage bleu\nA. marche derri\u00e8re lui sur la plage.\nB. est debout dans l'eau.",
+    "positive_response": "est debout dans l'eau.",
+    "negative_response": "marche derri\u00e8re lui sur la plage."
+  },
+  {
+    "pair_id": 1,
+    "prompt": "Question: Spread mulch: On voit un homme parler \u00e0 la cam\u00e9ra et encha\u00eener avec des clips de lui d\u00e9pla\u00e7ant de la terre. D'autres hommes\nA. sont \u00e9galement vus autour de lui ainsi qu'un vieil homme ramassant des rochers et la vid\u00e9o se terminant avec l'homme se tenant devant un groupe et les fauchant.\nB. aident \u00e0 planter les zones tout en se d\u00e9pla\u00e7ant avec des brouettes et en plantant des plantes tout en parlant \u00e0 la cam\u00e9ra.",
+    "positive_response": "aident \u00e0 planter les zones tout en se d\u00e9pla\u00e7ant avec des brouettes et en plantant des plantes tout en parlant \u00e0 la cam\u00e9ra.",
+    "negative_response": "sont \u00e9galement vus autour de lui ainsi qu'un vieil homme ramassant des rochers et la vid\u00e9o se terminant avec l'homme se tenant devant un groupe et les fauchant."
+  }
+]

wisent/examples/scripts/results/test_galcola_evaluation.json ADDED Viewed

@@ -0,0 +1,30 @@
+{
+  "task_name": "galcola",
+  "model_name": "mock",
+  "evaluator_name": "log_likelihoods",
+  "num_pairs": 1,
+  "all_correct": true,
+  "pairs": [
+    {
+      "pair_id": 0,
+      "prompt": "Is the following sentence grammatically acceptable?\nCoci\u00f1ei o peixe para o comeres tu....",
+      "positive_response": "Yes, it is acceptable.",
+      "negative_response": "No, it is not acceptable.",
+      "positive_evaluation": {
+        "ground_truth": "TRUTHFUL",
+        "confidence": 1.0,
+        "expected": "TRUTHFUL",
+        "correct": true,
+        "details": "Predicted: 'Yes, it is acceptable.' (log_prob=-0.500), Expected: 'Yes, it is acceptable.'"
+      },
+      "negative_evaluation": {
+        "ground_truth": "UNTRUTHFUL",
+        "confidence": 0.0,
+        "expected": "UNTRUTHFUL",
+        "correct": true,
+        "details": "Predicted: 'Yes, it is acceptable.' (log_prob=-0.500), Expected: 'No, it is not acceptable.'"
+      },
+      "both_correct": true
+    }
+  ]
+}

wisent/examples/scripts/results/test_galcola_pairs.json ADDED Viewed

@@ -0,0 +1,8 @@
+[
+  {
+    "pair_id": 0,
+    "prompt": "Is the following sentence grammatically acceptable?\nCoci\u00f1ei o peixe para o comeres tu.",
+    "positive_response": "Yes, it is acceptable.",
+    "negative_response": "No, it is not acceptable."
+  }
+]

wisent/examples/scripts/results/test_galician_bench_evaluation.json ADDED Viewed

@@ -0,0 +1,51 @@
+{
+  "task_name": "galician_bench",
+  "model_name": "mock",
+  "evaluator_name": "log_likelihoods",
+  "num_pairs": 2,
+  "all_correct": true,
+  "pairs": [
+    {
+      "pair_id": 0,
+      "prompt": "Translate the following from ace_Arab to bam_Latn:\n\u0628\u06a9\u0627\u064a\u0647 \u06a4\u0645\u0631\u0646\u062a\u0647 \u06a9\u0648\u0646\u0633\u064a\u0631\u06cf\u0627\u062a\u064a\u0641 \u0627\u0621\u0648\u0633\u062a\u0631\u0627\u0644\u064a\u0627 \u062a\u0648\u0644\u0642 \u06a9\u0649 \u0762\u06a4\u0631\u062a\u064a...",
+      "positive_response": "\u0186sitrali laadajamanamarabulon\u0272\u025bm\u0254g\u0254w bana k'u bolon\u0254n bila Kyoto b\u025bnkans\u025bb\u025bn na, ko a b\u025b na s\u0254r\u0254 c\u025bn n'a ka danayaba ye tajibay\u025bl\u025bmabalilab\u0254li kan jamana k\u0254 kan, waati minna jamanaw i n'a f\u0254 Sini jamana ni \u0190ndu jamana tun dulonni tun \u0272\u0254g\u0254n na f\u025bnkuradilanw sira kan.",
+      "negative_response": "i na n'a tun bolon\u0254n na, danayaba kan n'a ni s\u0254r\u0254 sira laadajamanamarabulon\u0272\u025bm\u0254g\u0254w c\u025bn Kyoto \u0186sitrali waati tun f\u025bnkuradilanw \u0272\u0254g\u0254n na ka Sini minna dulonni k'u kan, ko tajibay\u025bl\u025bmabalilab\u0254li \u0190ndu jamana f\u0254 a jamana ye kan. b\u025b jamanaw bana bila jamana b\u025bnkans\u025bb\u025bn k\u0254",
+      "positive_evaluation": {
+        "ground_truth": "TRUTHFUL",
+        "confidence": 1.0,
+        "expected": "TRUTHFUL",
+        "correct": true,
+        "details": "Predicted: '\u0186sitrali laadajamanamarabulon\u0272\u025bm\u0254g\u0254w bana k'u bolon\u0254n bila Kyoto b\u025bnkans\u025bb\u025bn na, ko a b\u025b na s\u0254r\u0254 c\u025bn n'a ka danayaba ye tajibay\u025bl\u025bmabalilab\u0254li kan jamana k\u0254 kan, waati minna jamanaw i n'a f\u0254 Sini jamana ni \u0190ndu jamana tun dulonni tun \u0272\u0254g\u0254n na f\u025bnkuradilanw sira kan.' (log_prob=-0.500), Expected: '\u0186sitrali laadajamanamarabulon\u0272\u025bm\u0254g\u0254w bana k'u bolon\u0254n bila Kyoto b\u025bnkans\u025bb\u025bn na, ko a b\u025b na s\u0254r\u0254 c\u025bn n'a ka danayaba ye tajibay\u025bl\u025bmabalilab\u0254li kan jamana k\u0254 kan, waati minna jamanaw i n'a f\u0254 Sini jamana ni \u0190ndu jamana tun dulonni tun \u0272\u0254g\u0254n na f\u025bnkuradilanw sira kan.'"
+      },
+      "negative_evaluation": {
+        "ground_truth": "UNTRUTHFUL",
+        "confidence": 0.0,
+        "expected": "UNTRUTHFUL",
+        "correct": true,
+        "details": "Predicted: '\u0186sitrali laadajamanamarabulon\u0272\u025bm\u0254g\u0254w bana k'u bolon\u0254n bila Kyoto b\u025bnkans\u025bb\u025bn na, ko a b\u025b na s\u0254r\u0254 c\u025bn n'a ka danayaba ye tajibay\u025bl\u025bmabalilab\u0254li kan jamana k\u0254 kan, waati minna jamanaw i n'a f\u0254 Sini jamana ni \u0190ndu jamana tun dulonni tun \u0272\u0254g\u0254n na f\u025bnkuradilanw sira kan.' (log_prob=-0.500), Expected: 'i na n'a tun bolon\u0254n na, danayaba kan n'a ni s\u0254r\u0254 sira laadajamanamarabulon\u0272\u025bm\u0254g\u0254w c\u025bn Kyoto \u0186sitrali waati tun f\u025bnkuradilanw \u0272\u0254g\u0254n na ka Sini minna dulonni k'u kan, ko tajibay\u025bl\u025bmabalilab\u0254li \u0190ndu jamana f\u0254 a jamana ye kan. b\u025b jamanaw bana bila jamana b\u025bnkans\u025bb\u025bn k\u0254'"
+      },
+      "both_correct": true
+    },
+    {
+      "pair_id": 1,
+      "prompt": "Translate the following from ace_Arab to bam_Latn:\n\u0645\u0648\u062a\u0648 \u0628\u064a\u0633 \u06bd\u0646 \u0645\u062c\u0642 \u0627\u0648 \u0633\u064a\u06a9\u200c\u0633 \u0641\u0644\u0627\u0762\u200c\u0633 \u0633\u062a. \u0644\u0648\u064a\u0633 \u0627\u064a \u0645\u064a\u0633\u0648\u0631...",
+      "positive_response": "M\u0254biliba in tun \u014b\u025bsin na  Six Flags St. Louis ma  Misuri walasa  kulu in ka f\u0254li k\u025b konkafeere jama f\u025b.",
+      "negative_response": "kulu M\u0254biliba f\u025b. Flags \u014b\u025bsin in ka St. Six in tun k\u025b na ma Louis jama Misuri f\u0254li konkafeere walasa",
+      "positive_evaluation": {
+        "ground_truth": "TRUTHFUL",
+        "confidence": 1.0,
+        "expected": "TRUTHFUL",
+        "correct": true,
+        "details": "Predicted: 'M\u0254biliba in tun \u014b\u025bsin na  Six Flags St. Louis ma  Misuri walasa  kulu in ka f\u0254li k\u025b konkafeere jama f\u025b.' (log_prob=-0.500), Expected: 'M\u0254biliba in tun \u014b\u025bsin na  Six Flags St. Louis ma  Misuri walasa  kulu in ka f\u0254li k\u025b konkafeere jama f\u025b.'"
+      },
+      "negative_evaluation": {
+        "ground_truth": "UNTRUTHFUL",
+        "confidence": 0.0,
+        "expected": "UNTRUTHFUL",
+        "correct": true,
+        "details": "Predicted: 'M\u0254biliba in tun \u014b\u025bsin na  Six Flags St. Louis ma  Misuri walasa  kulu in ka f\u0254li k\u025b konkafeere jama f\u025b.' (log_prob=-0.500), Expected: 'kulu M\u0254biliba f\u025b. Flags \u014b\u025bsin in ka St. Six in tun k\u025b na ma Louis jama Misuri f\u0254li konkafeere walasa'"
+      },
+      "both_correct": true
+    }
+  ]
+}

wisent/examples/scripts/results/test_galician_bench_pairs.json ADDED Viewed

@@ -0,0 +1,14 @@
+[
+  {
+    "pair_id": 0,
+    "prompt": "Translate the following from ace_Arab to bam_Latn:\n\u0628\u06a9\u0627\u064a\u0647 \u06a4\u0645\u0631\u0646\u062a\u0647 \u06a9\u0648\u0646\u0633\u064a\u0631\u06cf\u0627\u062a\u064a\u0641 \u0627\u0621\u0648\u0633\u062a\u0631\u0627\u0644\u064a\u0627 \u062a\u0648\u0644\u0642 \u06a9\u0649 \u0762\u06a4\u0631\u062a\u064a\u0641\u064a\u06a9\u0627\u0633\u064a \u06a9\u064a\u0648\u062a\u0648\u060c \u062f\u06a0\u0648\u0646 \u062e\u0646 \u0647\u0627\u064a \u06bd\u0646 \u0627\u06a9\u0646 \u0762\u06a4\u0631\u0644\u0648\u0647 \u0627\u064a\u06a9\u0648\u0646\u0648\u0645\u064a \u062f\u06a0\u0648\u0646 \u0645\u0762\u0646\u062a\u0648\u06a0\u062c\u064a\u0647 \u06bd\u06a0 \u0628\u0631\u062a \u0628\u0642 \u0627\u064a\u06a9\u200c\u0633\u06a4\u0648\u0631 \u0628\u0627\u062a\u0649 \u0628\u0627\u0631\u0627\u060c \u0627\u062f\u0642\u06a4\u064a\u0647 \u0646\u06a0\u0631\u0648-\u0646\u06a0\u0631\u0648 \u0644\u0627\u0762\u0649 \u0627\u064a\u0646\u062f\u064a\u0627 \u06a0\u0648\u0646 \u0686\u064a\u0646\u0627 \u0647\u0627\u0646 \u0645\u0623\u064a\u06a9\u062a \u062f\u06a0\u0648\u0646 \u062a\u0631\u0762\u064a\u062a \u0627\u064a\u0645\u064a\u0633\u064a.",
+    "positive_response": "\u0186sitrali laadajamanamarabulon\u0272\u025bm\u0254g\u0254w bana k'u bolon\u0254n bila Kyoto b\u025bnkans\u025bb\u025bn na, ko a b\u025b na s\u0254r\u0254 c\u025bn n'a ka danayaba ye tajibay\u025bl\u025bmabalilab\u0254li kan jamana k\u0254 kan, waati minna jamanaw i n'a f\u0254 Sini jamana ni \u0190ndu jamana tun dulonni tun \u0272\u0254g\u0254n na f\u025bnkuradilanw sira kan.",
+    "negative_response": "i na n'a tun bolon\u0254n na, danayaba kan n'a ni s\u0254r\u0254 sira laadajamanamarabulon\u0272\u025bm\u0254g\u0254w c\u025bn Kyoto \u0186sitrali waati tun f\u025bnkuradilanw \u0272\u0254g\u0254n na ka Sini minna dulonni k'u kan, ko tajibay\u025bl\u025bmabalilab\u0254li \u0190ndu jamana f\u0254 a jamana ye kan. b\u025b jamanaw bana bila jamana b\u025bnkans\u025bb\u025bn k\u0254"
+  },
+  {
+    "pair_id": 1,
+    "prompt": "Translate the following from ace_Arab to bam_Latn:\n\u0645\u0648\u062a\u0648 \u0628\u064a\u0633 \u06bd\u0646 \u0645\u062c\u0642 \u0627\u0648 \u0633\u064a\u06a9\u200c\u0633 \u0641\u0644\u0627\u0762\u200c\u0633 \u0633\u062a. \u0644\u0648\u064a\u0633 \u0627\u064a \u0645\u064a\u0633\u0648\u0631\u064a \u06a9\u0649 \u0628\u0627\u0646\u200c\u062f \u06a9\u0649 \u0645\u0645\u0626\u064a\u0646 \u0627\u064a \u0631\u0627\u0645\u0649 \u0627\u0648\u0631\u06a0 \u06bd\u06a0 \u062a\u06a4\u0628\u0644\u0648 \u0647\u0627\u0628\u064a\u0647.",
+    "positive_response": "M\u0254biliba in tun \u014b\u025bsin na  Six Flags St. Louis ma  Misuri walasa  kulu in ka f\u0254li k\u025b konkafeere jama f\u025b.",
+    "negative_response": "kulu M\u0254biliba f\u025b. Flags \u014b\u025bsin in ka St. Six in tun k\u025b na ma Louis jama Misuri f\u0254li konkafeere walasa"
+  }
+]

wisent/examples/scripts/results/test_glianorex_evaluation.json ADDED Viewed

@@ -0,0 +1,30 @@
+{
+  "task_name": "glianorex",
+  "model_name": "mock",
+  "evaluator_name": "log_likelihoods",
+  "num_pairs": 1,
+  "all_correct": true,
+  "pairs": [
+    {
+      "pair_id": 0,
+      "prompt": "Question: A 34-year-old man presents to the clinic complaining of mood swings, dizziness, and unstea...",
+      "positive_response": "Glianorex degeneration",
+      "negative_response": "Glianorex Hyperactivity Disorder",
+      "positive_evaluation": {
+        "ground_truth": "TRUTHFUL",
+        "confidence": 1.0,
+        "expected": "TRUTHFUL",
+        "correct": true,
+        "details": "Predicted: 'Glianorex degeneration' (log_prob=-0.500), Expected: 'Glianorex degeneration'"
+      },
+      "negative_evaluation": {
+        "ground_truth": "UNTRUTHFUL",
+        "confidence": 0.0,
+        "expected": "UNTRUTHFUL",
+        "correct": true,
+        "details": "Predicted: 'Glianorex degeneration' (log_prob=-0.500), Expected: 'Glianorex Hyperactivity Disorder'"
+      },
+      "both_correct": true
+    }
+  ]
+}

wisent/examples/scripts/results/test_glianorex_pairs.json ADDED Viewed

@@ -0,0 +1,8 @@
+[
+  {
+    "pair_id": 0,
+    "prompt": "Question: A 34-year-old man presents to the clinic complaining of mood swings, dizziness, and unsteady gait. His symptoms started three months ago and have progressively worsened. The individual has been otherwise healthy and reports no history of trauma or past surgeries. Given your understanding of the novel organ Glianorex, and taking into account all of the patient's symptoms, which of the following conditions could you suspect?\nA. Glianorex Hyperactivity Disorder\nB. Glianorex degeneration",
+    "positive_response": "Glianorex degeneration",
+    "negative_response": "Glianorex Hyperactivity Disorder"
+  }
+]

wisent/examples/scripts/results/test_global_mmlu_evaluation.json ADDED Viewed

@@ -0,0 +1,51 @@
+{
+  "task_name": "global_mmlu",
+  "model_name": "mock",
+  "evaluator_name": "log_likelihoods",
+  "num_pairs": 2,
+  "all_correct": true,
+  "pairs": [
+    {
+      "pair_id": 0,
+      "prompt": "Question: \u00bfCu\u00e1l de las siguientes personas se beneficiar\u00eda m\u00e1s si el valor del d\u00f3lar estadounidense ...",
+      "positive_response": "Un concesionario de autom\u00f3viles de Estados Unidos que importa autom\u00f3viles japoneses;",
+      "negative_response": "Un turista japon\u00e9s de vacaciones en los Estados Unidos;",
+      "positive_evaluation": {
+        "ground_truth": "TRUTHFUL",
+        "confidence": 1.0,
+        "expected": "TRUTHFUL",
+        "correct": true,
+        "details": "Predicted: 'Un concesionario de autom\u00f3viles de Estados Unidos que importa autom\u00f3viles japoneses;' (log_prob=-0.500), Expected: 'Un concesionario de autom\u00f3viles de Estados Unidos que importa autom\u00f3viles japoneses;'"
+      },
+      "negative_evaluation": {
+        "ground_truth": "UNTRUTHFUL",
+        "confidence": 0.0,
+        "expected": "UNTRUTHFUL",
+        "correct": true,
+        "details": "Predicted: 'Un concesionario de autom\u00f3viles de Estados Unidos que importa autom\u00f3viles japoneses;' (log_prob=-0.500), Expected: 'Un turista japon\u00e9s de vacaciones en los Estados Unidos;'"
+      },
+      "both_correct": true
+    },
+    {
+      "pair_id": 1,
+      "prompt": "Question: Se uma pe\u00e7a musical est\u00e1 em \"tempo comum\", quantas quartas h\u00e1 por compasso?\nA. cinco\nB. qu...",
+      "positive_response": "quatro",
+      "negative_response": "cinco",
+      "positive_evaluation": {
+        "ground_truth": "TRUTHFUL",
+        "confidence": 1.0,
+        "expected": "TRUTHFUL",
+        "correct": true,
+        "details": "Predicted: 'quatro' (log_prob=-0.500), Expected: 'quatro'"
+      },
+      "negative_evaluation": {
+        "ground_truth": "UNTRUTHFUL",
+        "confidence": 0.0,
+        "expected": "UNTRUTHFUL",
+        "correct": true,
+        "details": "Predicted: 'quatro' (log_prob=-0.500), Expected: 'cinco'"
+      },
+      "both_correct": true
+    }
+  ]
+}

wisent/examples/scripts/results/test_global_mmlu_pairs.json ADDED Viewed

@@ -0,0 +1,14 @@
+[
+  {
+    "pair_id": 0,
+    "prompt": "Question: \u00bfCu\u00e1l de las siguientes personas se beneficiar\u00eda m\u00e1s si el valor del d\u00f3lar estadounidense aumentara con respecto al yen japon\u00e9s?\nA. Un turista japon\u00e9s de vacaciones en los Estados Unidos;\nB. Un concesionario de autom\u00f3viles de Estados Unidos que importa autom\u00f3viles japoneses;",
+    "positive_response": "Un concesionario de autom\u00f3viles de Estados Unidos que importa autom\u00f3viles japoneses;",
+    "negative_response": "Un turista japon\u00e9s de vacaciones en los Estados Unidos;"
+  },
+  {
+    "pair_id": 1,
+    "prompt": "Question: Se uma pe\u00e7a musical est\u00e1 em \"tempo comum\", quantas quartas h\u00e1 por compasso?\nA. cinco\nB. quatro",
+    "positive_response": "quatro",
+    "negative_response": "cinco"
+  }
+]

wisent/examples/scripts/results/test_glue_evaluation.json ADDED Viewed

@@ -0,0 +1,51 @@
+{
+  "task_name": "glue",
+  "model_name": "mock",
+  "evaluator_name": "log_likelihoods",
+  "num_pairs": 2,
+  "all_correct": true,
+  "pairs": [
+    {
+      "pair_id": 0,
+      "prompt": "How can I do self study effectively?\nHow do you study effectively?\nQuestion: Do both qiestions ask t...",
+      "positive_response": "Yes",
+      "negative_response": "No",
+      "positive_evaluation": {
+        "ground_truth": "TRUTHFUL",
+        "confidence": 1.0,
+        "expected": "TRUTHFUL",
+        "correct": true,
+        "details": "Predicted: 'Yes' (log_prob=-0.500), Expected: 'Yes'"
+      },
+      "negative_evaluation": {
+        "ground_truth": "UNTRUTHFUL",
+        "confidence": 0.0,
+        "expected": "UNTRUTHFUL",
+        "correct": true,
+        "details": "Predicted: 'Yes' (log_prob=-0.500), Expected: 'No'"
+      },
+      "both_correct": true
+    },
+    {
+      "pair_id": 1,
+      "prompt": "Lavishly furnished and decorated, with much original period furniture, the rooms are used for ceremo...",
+      "positive_response": "True",
+      "negative_response": "Neither",
+      "positive_evaluation": {
+        "ground_truth": "TRUTHFUL",
+        "confidence": 1.0,
+        "expected": "TRUTHFUL",
+        "correct": true,
+        "details": "Predicted: 'True' (log_prob=-0.500), Expected: 'True'"
+      },
+      "negative_evaluation": {
+        "ground_truth": "UNTRUTHFUL",
+        "confidence": 0.0,
+        "expected": "UNTRUTHFUL",
+        "correct": true,
+        "details": "Predicted: 'True' (log_prob=-0.500), Expected: 'Neither'"
+      },
+      "both_correct": true
+    }
+  ]
+}

wisent/examples/scripts/results/test_glue_pairs.json ADDED Viewed

@@ -0,0 +1,14 @@
+[
+  {
+    "pair_id": 0,
+    "prompt": "How can I do self study effectively?\nHow do you study effectively?\nQuestion: Do both qiestions ask the same thing?\nAnswer:\nA. Yes\nB. No",
+    "positive_response": "Yes",
+    "negative_response": "No"
+  },
+  {
+    "pair_id": 1,
+    "prompt": "Lavishly furnished and decorated, with much original period furniture, the rooms are used for ceremonial events, visits from foreign dignitaries, and EU meetings.\nQuestion: The rooms are opulent, and used for formal, elegant events. True, False or Neither?\nAnswer:",
+    "positive_response": "True",
+    "negative_response": "Neither"
+  }
+]

wisent/examples/scripts/results/test_gpqa_evaluation.json ADDED Viewed

@@ -0,0 +1,51 @@
+{
+  "task_name": "gpqa",
+  "model_name": "mock",
+  "evaluator_name": "log_likelihoods",
+  "num_pairs": 2,
+  "all_correct": true,
+  "pairs": [
+    {
+      "pair_id": 0,
+      "prompt": "What is the correct answer to this question:A quantum mechanical particle of mass m moves in two dim...",
+      "positive_response": "E = (2n_x+n_y+3/2)\u210f*sqrt(k/m)",
+      "negative_response": "E = (n_x+3*n_y+3/2) \u210f*sqrt(k/m))",
+      "positive_evaluation": {
+        "ground_truth": "TRUTHFUL",
+        "confidence": 1.0,
+        "expected": "TRUTHFUL",
+        "correct": true,
+        "details": "Predicted: 'E = (2n_x+n_y+3/2)\u210f*sqrt(k/m)' (log_prob=-0.500), Expected: 'E = (2n_x+n_y+3/2)\u210f*sqrt(k/m)'"
+      },
+      "negative_evaluation": {
+        "ground_truth": "UNTRUTHFUL",
+        "confidence": 0.0,
+        "expected": "UNTRUTHFUL",
+        "correct": true,
+        "details": "Predicted: 'E = (2n_x+n_y+3/2)\u210f*sqrt(k/m)' (log_prob=-0.500), Expected: 'E = (n_x+3*n_y+3/2) \u210f*sqrt(k/m))'"
+      },
+      "both_correct": true
+    },
+    {
+      "pair_id": 1,
+      "prompt": "What is the correct answer to this question:Arrange the following carbocations in decreasing order o...",
+      "positive_response": "5>1>6>7>4>3>2",
+      "negative_response": "4>3>2>5>1>6>7",
+      "positive_evaluation": {
+        "ground_truth": "TRUTHFUL",
+        "confidence": 1.0,
+        "expected": "TRUTHFUL",
+        "correct": true,
+        "details": "Predicted: '5>1>6>7>4>3>2' (log_prob=-0.500), Expected: '5>1>6>7>4>3>2'"
+      },
+      "negative_evaluation": {
+        "ground_truth": "UNTRUTHFUL",
+        "confidence": 0.0,
+        "expected": "UNTRUTHFUL",
+        "correct": true,
+        "details": "Predicted: '5>1>6>7>4>3>2' (log_prob=-0.500), Expected: '4>3>2>5>1>6>7'"
+      },
+      "both_correct": true
+    }
+  ]
+}

wisent/examples/scripts/results/test_gpqa_pairs.json ADDED Viewed

@@ -0,0 +1,14 @@
+[
+  {
+    "pair_id": 0,
+    "prompt": "What is the correct answer to this question:A quantum mechanical particle of mass m moves in two dimensions in the following potential, as a function of (r,\u03b8): V (r, \u03b8) = 1/2 kr^2 + 3/2 kr^2 cos^2(\u03b8)\nFind the energy spectrum.\nChoices:\n(A) E = (n_x+3*n_y+3/2) \u210f*sqrt(k/m))\n(B) E = (2n_x+n_y+3/2)\u210f*sqrt(k/m)\n(C) E = (2n_x+3n_y+1/2) \u210f*sqrt(k/m))\n(D) E = (3n_x+2n_y+1/2) \u210f*sqrt(k/m))\nLet's think step by step: ",
+    "positive_response": "E = (2n_x+n_y+3/2)\u210f*sqrt(k/m)",
+    "negative_response": "E = (n_x+3*n_y+3/2) \u210f*sqrt(k/m))"
+  },
+  {
+    "pair_id": 1,
+    "prompt": "What is the correct answer to this question:Arrange the following carbocations in decreasing order of stability:\n\n1. CH3OCH2(+)\n2. CH2(+)-NO2\n3. CH2(+)-CHO\n4. CH3COCH2(+)\n5. CH2(+)-OH\n6. CH3CH2(+)\n7. CH2(+)CH2Cl\nChoices:\n(A) 4>3>2>5>1>6>7\n(B) 1>5>7>6>4>3>2\n(C) 5>6>7>1>2>3>4\n(D) 5>1>6>7>4>3>2\nLet's think step by step: ",
+    "positive_response": "5>1>6>7>4>3>2",
+    "negative_response": "4>3>2>5>1>6>7"
+  }
+]