PyPI - wisent - Versions diffs - 0.7.379__py3-none-any.whl - Mend

wisent 0.7.379__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (1720) hide show

wisent/examples/scripts/results/test_kobest_evaluation.json ADDED Viewed

@@ -0,0 +1,51 @@
+{
+  "task_name": "kobest",
+  "model_name": "mock",
+  "evaluator_name": "log_likelihoods",
+  "num_pairs": 2,
+  "all_correct": true,
+  "pairs": [
+    {
+      "pair_id": 0,
+      "prompt": "Question: \ubc95\uad00(\u6cd5\u5b98)\uc740 \uad6d\uac00\uc758 \uc0ac\ubc95\uad8c\uc744 \ud589\uc0ac\ud558\ub294 \uc0ac\ubc95\ubd80\ub97c \uad6c\uc131\ud558\ub294 \ubc95\uc6d0\uc5d0\uc11c \ud310\uc0ac\uc5d0 \uc784\uba85\ub418\uc5b4, \uc0ac\ubc95\uad8c\uc744 \ud589\uc0ac\ud558\ub294 \uacf5\ubb34\uc6d0\uc5d0 \ub300\ud574 \ud5cc\ubc95\uae30\uad00\uc73c\ub85c\uc11c \uad8c\uc704\ub97c \ub192\uc77c \ubaa9\uc801\uc73c\ub85c \uc77c\uceeb\ub294 \uba85...",
+      "positive_response": "Yes",
+      "negative_response": "No",
+      "positive_evaluation": {
+        "ground_truth": "TRUTHFUL",
+        "confidence": 1.0,
+        "expected": "TRUTHFUL",
+        "correct": true,
+        "details": "Predicted: 'Yes' (log_prob=-0.500), Expected: 'Yes'"
+      },
+      "negative_evaluation": {
+        "ground_truth": "UNTRUTHFUL",
+        "confidence": 0.0,
+        "expected": "UNTRUTHFUL",
+        "correct": true,
+        "details": "Predicted: 'Yes' (log_prob=-0.500), Expected: 'No'"
+      },
+      "both_correct": true
+    },
+    {
+      "pair_id": 1,
+      "prompt": "Question: \uc870\uc120 \uc2dc\ub300 \ub3c4\uc131 5\ub300 \uba85\uc2b9\uc9c0\uc5d0 \uc774\ub984\uc744 \uc62c\ub9b0 \ub099\uc0b0\uc740 \uc870\uc120\uc2dc\ub300 \ub54c \ubb38\uc778\ub4e4\uc774 \ubcc4\uc7a5\uc744 \uc9d3\uace0 \uc0b4 \ub9cc\ud07c \ud48d\uad11\uc774 \uc544\ub984\ub2f5\uae30\ub85c \ub110\ub9ac \uc54c\ub824\uc838 \uc654\ub2e4. \uc870\uc120\uc2dc\ub300\uc5d0\ub294 \uc0bc\uccad(\u4e09\u6df8)\u00b7\uc778\uc655(...",
+      "positive_response": "Yes",
+      "negative_response": "No",
+      "positive_evaluation": {
+        "ground_truth": "TRUTHFUL",
+        "confidence": 1.0,
+        "expected": "TRUTHFUL",
+        "correct": true,
+        "details": "Predicted: 'Yes' (log_prob=-0.500), Expected: 'Yes'"
+      },
+      "negative_evaluation": {
+        "ground_truth": "UNTRUTHFUL",
+        "confidence": 0.0,
+        "expected": "UNTRUTHFUL",
+        "correct": true,
+        "details": "Predicted: 'Yes' (log_prob=-0.500), Expected: 'No'"
+      },
+      "both_correct": true
+    }
+  ]
+}

wisent/examples/scripts/results/test_kobest_pairs.json ADDED Viewed

@@ -0,0 +1,14 @@
+[
+  {
+    "pair_id": 0,
+    "prompt": "Question: \ubc95\uad00(\u6cd5\u5b98)\uc740 \uad6d\uac00\uc758 \uc0ac\ubc95\uad8c\uc744 \ud589\uc0ac\ud558\ub294 \uc0ac\ubc95\ubd80\ub97c \uad6c\uc131\ud558\ub294 \ubc95\uc6d0\uc5d0\uc11c \ud310\uc0ac\uc5d0 \uc784\uba85\ub418\uc5b4, \uc0ac\ubc95\uad8c\uc744 \ud589\uc0ac\ud558\ub294 \uacf5\ubb34\uc6d0\uc5d0 \ub300\ud574 \ud5cc\ubc95\uae30\uad00\uc73c\ub85c\uc11c \uad8c\uc704\ub97c \ub192\uc77c \ubaa9\uc801\uc73c\ub85c \uc77c\uceeb\ub294 \uba85\uce6d\uc774\ub2e4.\n\ubc95\uad00\uc740 \uacf5\ubb34\uc6d0\uc5d0 \uc18d\ud558\ub098\uc694?\nA. No\nB. Yes",
+    "positive_response": "Yes",
+    "negative_response": "No"
+  },
+  {
+    "pair_id": 1,
+    "prompt": "Question: \uc870\uc120 \uc2dc\ub300 \ub3c4\uc131 5\ub300 \uba85\uc2b9\uc9c0\uc5d0 \uc774\ub984\uc744 \uc62c\ub9b0 \ub099\uc0b0\uc740 \uc870\uc120\uc2dc\ub300 \ub54c \ubb38\uc778\ub4e4\uc774 \ubcc4\uc7a5\uc744 \uc9d3\uace0 \uc0b4 \ub9cc\ud07c \ud48d\uad11\uc774 \uc544\ub984\ub2f5\uae30\ub85c \ub110\ub9ac \uc54c\ub824\uc838 \uc654\ub2e4. \uc870\uc120\uc2dc\ub300\uc5d0\ub294 \uc0bc\uccad(\u4e09\u6df8)\u00b7\uc778\uc655(\u4ec1\u738b)\u00b7\uc30d\uacc4(\u96d9\u9dc4)\u00b7\ubc31\uc6b4(\u767d\u96f2)\u00b7\uccad\ud559(\u6df8\u9db4)\uc744 \ub3c4\uc131 \uc548\uc758 5\ub300 \uba85\uc2b9\uc9c0\ub85c \uc190\uaf3d\uc558\ub294\ub370, \uc774 \uc911 \ud604\uc7ac\uc758 \uc774\ud654\ub3d9\uc5d0 \uc788\ub358 \uc30d\uacc4\ub3d9\uc774 \uae30\ubb18\ud55c \uc554\uc11d\uacfc \uc6b8\ucc3d\ud55c \uc218\ub9bc\uc5d0 \ub450\uc904\uae30\uc758 \ub9d1\uc740 \uc2dc\ub0c7\ubb3c\uae4c\uc9c0 \uc788\ub2e4 \ud558\uc5ec \uc138 \ubc88\uc9f8\ub85c \uaf3d\ud614\ub2e4.\n\ub099\uc0b0\uc740 \ud48d\uad11\uc774 \uc544\ub984\ub2e4\uc6cc \uc870\uc120 \ubb38\uc778\uc758 \uc0ac\ub791\uc744 \ubc1b\uc558\ub2e4.\nA. No\nB. Yes",
+    "positive_response": "Yes",
+    "negative_response": "No"
+  }
+]

wisent/examples/scripts/results/test_kormedmcqa/test_kormedmcqa_evaluation.json ADDED Viewed

@@ -0,0 +1,30 @@
+{
+  "task_name": "kormedmcqa",
+  "model_name": "mock",
+  "evaluator_name": "log_likelihoods",
+  "num_pairs": 1,
+  "all_correct": true,
+  "pairs": [
+    {
+      "pair_id": 0,
+      "prompt": "2\uac1c\uc6d4 \ub0a8\uc544\uac00 BCG\uc608\ubc29\uc811\uc885 1\uac1c\uc6d4 \ub4a4 \uc8fc\uc0ac \ubd80\uc704\uc5d0 \uc774\uc0c1\ubc18\uc751\uc774 \uc0dd\uaca8\uc11c \uc608\ubc29\uc811\uc885\uc744 \uc2e4\uc2dc\ud55c \uc18c\uc544\uccad\uc18c\ub144\uacfc\uc758\uc6d0\uc744 \ucc3e\uc544\uc654\ub2e4. \uc774\ub54c \u300c\uac10\uc5fc\ubcd1\uc758 \uc608\ubc29 \ubc0f \uad00\ub9ac\uc5d0 \uad00\ud55c \ubc95\ub960\u300d\uc5d0 \ub530\ub77c \uc608\ubc29\uc811\uc885 \ud6c4...",
+      "positive_response": "C",
+      "negative_response": "D",
+      "positive_evaluation": {
+        "ground_truth": "TRUTHFUL",
+        "confidence": 1.0,
+        "expected": "TRUTHFUL",
+        "correct": true,
+        "details": "Predicted: 'C' (log_prob=-0.500), Expected: 'C'"
+      },
+      "negative_evaluation": {
+        "ground_truth": "UNTRUTHFUL",
+        "confidence": 0.0,
+        "expected": "UNTRUTHFUL",
+        "correct": true,
+        "details": "Predicted: 'C' (log_prob=-0.500), Expected: 'D'"
+      },
+      "both_correct": true
+    }
+  ]
+}

wisent/examples/scripts/results/test_kormedmcqa/test_kormedmcqa_pairs.json ADDED Viewed

@@ -0,0 +1,8 @@
+[
+  {
+    "pair_id": 0,
+    "prompt": "2\uac1c\uc6d4 \ub0a8\uc544\uac00 BCG\uc608\ubc29\uc811\uc885 1\uac1c\uc6d4 \ub4a4 \uc8fc\uc0ac \ubd80\uc704\uc5d0 \uc774\uc0c1\ubc18\uc751\uc774 \uc0dd\uaca8\uc11c \uc608\ubc29\uc811\uc885\uc744 \uc2e4\uc2dc\ud55c \uc18c\uc544\uccad\uc18c\ub144\uacfc\uc758\uc6d0\uc744 \ucc3e\uc544\uc654\ub2e4. \uc774\ub54c \u300c\uac10\uc5fc\ubcd1\uc758 \uc608\ubc29 \ubc0f \uad00\ub9ac\uc5d0 \uad00\ud55c \ubc95\ub960\u300d\uc5d0 \ub530\ub77c \uc608\ubc29\uc811\uc885 \ud6c4 \uc774\uc0c1\ubc18\uc751\uc73c\ub85c \uc9c4\ub2e8\ud55c \uc6d0\uc7a5\uc774 \uc774\uc0c1\ubc18\uc751 \ubc1c\uc0dd\uc2e0\uace0\uc11c\ub97c \uc81c\ucd9c\ud574\uc57c \ud560 \ub300\uc0c1\uc740?\nA. \ub300\ud55c\uc758\uc0ac\ud611\ud68c\uc7a5\nB. \ubcf4\uac74\ubcf5\uc9c0\ubd80\uc7a5\uad00\nC. \ub0a8\uc544 \uc18c\uc7ac\uc9c0 \uad00\ud560 \ubcf4\uac74\uc18c\uc7a5\nD. \ub0a8\uc544 \uc18c\uc7ac\uc9c0 \uad00\ud560 \uc2dc\uc7a5 \u2219 \uad70\uc218 \u2219 \uad6c\uccad\uc7a5\nE. \ub0a8\uc544 \uc18c\uc7ac\uc9c0 \uad00\ud560 \uc2dc \u2219 \ub3c4\uc9c0\uc0ac\n\uc815\ub2f5\uff1a",
+    "positive_response": "C",
+    "negative_response": "D"
+  }
+]

wisent/examples/scripts/results/test_kormedmcqa_dentist/test_kormedmcqa_dentist_evaluation.json ADDED Viewed

@@ -0,0 +1,30 @@
+{
+  "task_name": "kormedmcqa_dentist",
+  "model_name": "mock",
+  "evaluator_name": "log_likelihoods",
+  "num_pairs": 1,
+  "all_correct": true,
+  "pairs": [
+    {
+      "pair_id": 0,
+      "prompt": "\ud600\uc758 \ud6c4\ubc29 1/3 \ubd80\uc704\uc640 \uc778\ub450 \uc810\ub9c9\uc5d0 \ub300\ud55c \uac10\uac01 \uae30\ub2a5, \uacbd\ub3cc\uc778\ub450\uadfc(stylopharyngeal m.)\uc5d0 \ub300\ud55c \uc6b4\ub3d9 \uae30\ub2a5\uc744 \ub2f4\ub2f9\ud558\ub294 \ub1cc\uc2e0\uacbd\uc758 \uae30\ub2a5\uc744 \ud3c9\uac00\ud558\ub294 \ubc29\ubc95\uc740?\nA. \ubcfc\uc744 \ubd80\ud480...",
+      "positive_response": "C",
+      "negative_response": "D",
+      "positive_evaluation": {
+        "ground_truth": "TRUTHFUL",
+        "confidence": 1.0,
+        "expected": "TRUTHFUL",
+        "correct": true,
+        "details": "Predicted: 'C' (log_prob=-0.500), Expected: 'C'"
+      },
+      "negative_evaluation": {
+        "ground_truth": "UNTRUTHFUL",
+        "confidence": 0.0,
+        "expected": "UNTRUTHFUL",
+        "correct": true,
+        "details": "Predicted: 'C' (log_prob=-0.500), Expected: 'D'"
+      },
+      "both_correct": true
+    }
+  ]
+}

wisent/examples/scripts/results/test_kormedmcqa_dentist/test_kormedmcqa_dentist_pairs.json ADDED Viewed

@@ -0,0 +1,8 @@
+[
+  {
+    "pair_id": 0,
+    "prompt": "\ud600\uc758 \ud6c4\ubc29 1/3 \ubd80\uc704\uc640 \uc778\ub450 \uc810\ub9c9\uc5d0 \ub300\ud55c \uac10\uac01 \uae30\ub2a5, \uacbd\ub3cc\uc778\ub450\uadfc(stylopharyngeal m.)\uc5d0 \ub300\ud55c \uc6b4\ub3d9 \uae30\ub2a5\uc744 \ub2f4\ub2f9\ud558\ub294 \ub1cc\uc2e0\uacbd\uc758 \uae30\ub2a5\uc744 \ud3c9\uac00\ud558\ub294 \ubc29\ubc95\uc740?\nA. \ubcfc\uc744 \ubd80\ud480\uac8c \ud55c\ub2e4.\nB. \uc774\ub97c \uaf49 \ubb3c\uac8c \ud55c\ub2e4.\nC. '\uc544' \uc18c\ub9ac\ub97c \ub0b4\uac8c \ud55c\ub2e4.\nD. \ud600\ub97c \ucd5c\ub300\ud55c \uc55e\uc73c\ub85c \ub0b4\ubc00\uac8c \ud55c\ub2e4.\nE. \uc124\uc555\uc790\ub97c \ud600 \uce21\ubc29\uc5d0 \ub300\uace0 \ud600\ub85c \uc606\uc73c\ub85c \ubc00\uac8c \ud55c\ub2e4.\n\uc815\ub2f5\uff1a",
+    "positive_response": "C",
+    "negative_response": "D"
+  }
+]

wisent/examples/scripts/results/test_kormedmcqa_evaluation.json ADDED Viewed

@@ -0,0 +1,30 @@
+{
+  "task_name": "kormedmcqa",
+  "model_name": "mock",
+  "evaluator_name": "log_likelihoods",
+  "num_pairs": 1,
+  "all_correct": true,
+  "pairs": [
+    {
+      "pair_id": 0,
+      "prompt": "2\uac1c\uc6d4 \ub0a8\uc544\uac00 BCG\uc608\ubc29\uc811\uc885 1\uac1c\uc6d4 \ub4a4 \uc8fc\uc0ac \ubd80\uc704\uc5d0 \uc774\uc0c1\ubc18\uc751\uc774 \uc0dd\uaca8\uc11c \uc608\ubc29\uc811\uc885\uc744 \uc2e4\uc2dc\ud55c \uc18c\uc544\uccad\uc18c\ub144\uacfc\uc758\uc6d0\uc744 \ucc3e\uc544\uc654\ub2e4. \uc774\ub54c \u300c\uac10\uc5fc\ubcd1\uc758 \uc608\ubc29 \ubc0f \uad00\ub9ac\uc5d0 \uad00\ud55c \ubc95\ub960\u300d\uc5d0 \ub530\ub77c \uc608\ubc29\uc811\uc885 \ud6c4...",
+      "positive_response": "C",
+      "negative_response": "D",
+      "positive_evaluation": {
+        "ground_truth": "TRUTHFUL",
+        "confidence": 1.0,
+        "expected": "TRUTHFUL",
+        "correct": true,
+        "details": "Predicted: 'C' (log_prob=-0.500), Expected: 'C'"
+      },
+      "negative_evaluation": {
+        "ground_truth": "UNTRUTHFUL",
+        "confidence": 0.0,
+        "expected": "UNTRUTHFUL",
+        "correct": true,
+        "details": "Predicted: 'C' (log_prob=-0.500), Expected: 'D'"
+      },
+      "both_correct": true
+    }
+  ]
+}

wisent/examples/scripts/results/test_kormedmcqa_pairs.json ADDED Viewed

@@ -0,0 +1,8 @@
+[
+  {
+    "pair_id": 0,
+    "prompt": "2\uac1c\uc6d4 \ub0a8\uc544\uac00 BCG\uc608\ubc29\uc811\uc885 1\uac1c\uc6d4 \ub4a4 \uc8fc\uc0ac \ubd80\uc704\uc5d0 \uc774\uc0c1\ubc18\uc751\uc774 \uc0dd\uaca8\uc11c \uc608\ubc29\uc811\uc885\uc744 \uc2e4\uc2dc\ud55c \uc18c\uc544\uccad\uc18c\ub144\uacfc\uc758\uc6d0\uc744 \ucc3e\uc544\uc654\ub2e4. \uc774\ub54c \u300c\uac10\uc5fc\ubcd1\uc758 \uc608\ubc29 \ubc0f \uad00\ub9ac\uc5d0 \uad00\ud55c \ubc95\ub960\u300d\uc5d0 \ub530\ub77c \uc608\ubc29\uc811\uc885 \ud6c4 \uc774\uc0c1\ubc18\uc751\uc73c\ub85c \uc9c4\ub2e8\ud55c \uc6d0\uc7a5\uc774 \uc774\uc0c1\ubc18\uc751 \ubc1c\uc0dd\uc2e0\uace0\uc11c\ub97c \uc81c\ucd9c\ud574\uc57c \ud560 \ub300\uc0c1\uc740?\nA. \ub300\ud55c\uc758\uc0ac\ud611\ud68c\uc7a5\nB. \ubcf4\uac74\ubcf5\uc9c0\ubd80\uc7a5\uad00\nC. \ub0a8\uc544 \uc18c\uc7ac\uc9c0 \uad00\ud560 \ubcf4\uac74\uc18c\uc7a5\nD. \ub0a8\uc544 \uc18c\uc7ac\uc9c0 \uad00\ud560 \uc2dc\uc7a5 \u2219 \uad70\uc218 \u2219 \uad6c\uccad\uc7a5\nE. \ub0a8\uc544 \uc18c\uc7ac\uc9c0 \uad00\ud560 \uc2dc \u2219 \ub3c4\uc9c0\uc0ac\n\uc815\ub2f5\uff1a",
+    "positive_response": "C",
+    "negative_response": "D"
+  }
+]

wisent/examples/scripts/results/test_lambada_cloze_evaluation.json ADDED Viewed

@@ -0,0 +1,30 @@
+{
+  "task_name": "lambada_cloze",
+  "model_name": "mock",
+  "evaluator_name": "log_likelihoods",
+  "num_pairs": 1,
+  "all_correct": true,
+  "pairs": [
+    {
+      "pair_id": 0,
+      "prompt": "In my palm is a clear stone, and inside it is a small ivory statuette. A guardian angel. \"Figured if...",
+      "positive_response": "signs",
+      "negative_response": "word",
+      "positive_evaluation": {
+        "ground_truth": "TRUTHFUL",
+        "confidence": 1.0,
+        "expected": "TRUTHFUL",
+        "correct": true,
+        "details": "Predicted: 'signs' (log_prob=-0.500), Expected: 'signs'"
+      },
+      "negative_evaluation": {
+        "ground_truth": "UNTRUTHFUL",
+        "confidence": 0.0,
+        "expected": "UNTRUTHFUL",
+        "correct": true,
+        "details": "Predicted: 'signs' (log_prob=-0.500), Expected: 'word'"
+      },
+      "both_correct": true
+    }
+  ]
+}

wisent/examples/scripts/results/test_lambada_cloze_pairs.json ADDED Viewed

@@ -0,0 +1,8 @@
+[
+  {
+    "pair_id": 0,
+    "prompt": "In my palm is a clear stone, and inside it is a small ivory statuette. A guardian angel. \"Figured if you're going to be out at night getting hit by cars, you might as well have some backup.\" I look at him, feeling stunned. Like this is some sort of sign. But as I stare at Harlin, his mouth curved in a confident grin, I don't care about ____.",
+    "positive_response": "signs",
+    "negative_response": "word"
+  }
+]

wisent/examples/scripts/results/test_lambada_evaluation.json ADDED Viewed

@@ -0,0 +1,30 @@
+{
+  "task_name": "lambada",
+  "model_name": "mock",
+  "evaluator_name": "log_likelihoods",
+  "num_pairs": 1,
+  "all_correct": true,
+  "pairs": [
+    {
+      "pair_id": 0,
+      "prompt": "In my palm is a clear stone, and inside it is a small ivory statuette. A guardian angel. \"Figured if...",
+      "positive_response": "signs",
+      "negative_response": "gave",
+      "positive_evaluation": {
+        "ground_truth": "TRUTHFUL",
+        "confidence": 1.0,
+        "expected": "TRUTHFUL",
+        "correct": true,
+        "details": "Predicted: 'signs' (log_prob=-0.500), Expected: 'signs'"
+      },
+      "negative_evaluation": {
+        "ground_truth": "UNTRUTHFUL",
+        "confidence": 0.0,
+        "expected": "UNTRUTHFUL",
+        "correct": true,
+        "details": "Predicted: 'signs' (log_prob=-0.500), Expected: 'gave'"
+      },
+      "both_correct": true
+    }
+  ]
+}

wisent/examples/scripts/results/test_lambada_final/test_lambada_openai_mt_stablelm_en_evaluation.json ADDED Viewed

@@ -0,0 +1,30 @@
+{
+  "task_name": "lambada_openai_mt_stablelm_en",
+  "model_name": "mock",
+  "evaluator_name": "log_likelihoods",
+  "num_pairs": 1,
+  "all_correct": true,
+  "pairs": [
+    {
+      "pair_id": 0,
+      "prompt": "In my palm is a clear stone, and inside it is a small ivory statuette. A guardian angel. \"Figured if...",
+      "positive_response": "signs",
+      "negative_response": "very",
+      "positive_evaluation": {
+        "ground_truth": "TRUTHFUL",
+        "confidence": 1.0,
+        "expected": "TRUTHFUL",
+        "correct": true,
+        "details": "Predicted: 'signs' (log_prob=-0.500), Expected: 'signs'"
+      },
+      "negative_evaluation": {
+        "ground_truth": "UNTRUTHFUL",
+        "confidence": 0.0,
+        "expected": "UNTRUTHFUL",
+        "correct": true,
+        "details": "Predicted: 'signs' (log_prob=-0.500), Expected: 'very'"
+      },
+      "both_correct": true
+    }
+  ]
+}

wisent/examples/scripts/results/test_lambada_final/test_lambada_openai_mt_stablelm_en_pairs.json ADDED Viewed

@@ -0,0 +1,8 @@
+[
+  {
+    "pair_id": 0,
+    "prompt": "In my palm is a clear stone, and inside it is a small ivory statuette. A guardian angel. \"Figured if you're going to be out at night getting hit by cars, you might as well have some backup.\" I look at him, feeling stunned. Like this is some sort of sign. But as I stare at Harlin, his mouth curved in a confident grin, I don't care about",
+    "positive_response": "signs",
+    "negative_response": "very"
+  }
+]

wisent/examples/scripts/results/test_lambada_multilingual/test_lambada_multilingual_evaluation.json ADDED Viewed

@@ -0,0 +1,51 @@
+{
+  "task_name": "lambada_multilingual",
+  "model_name": "mock",
+  "evaluator_name": "log_likelihoods",
+  "num_pairs": 2,
+  "all_correct": true,
+  "pairs": [
+    {
+      "pair_id": 0,
+      "prompt": "\"Te has demostrado digno\", me dice ella.Ella avanzaMe retrocede.Su sonrisa tiembla. Yo digo: \"Te ale...",
+      "positive_response": "venenos.",
+      "negative_response": "but",
+      "positive_evaluation": {
+        "ground_truth": "TRUTHFUL",
+        "confidence": 1.0,
+        "expected": "TRUTHFUL",
+        "correct": true,
+        "details": "Predicted: 'venenos.' (log_prob=-0.500), Expected: 'venenos.'"
+      },
+      "negative_evaluation": {
+        "ground_truth": "UNTRUTHFUL",
+        "confidence": 0.0,
+        "expected": "UNTRUTHFUL",
+        "correct": true,
+        "details": "Predicted: 'venenos.' (log_prob=-0.500), Expected: 'but'"
+      },
+      "both_correct": true
+    },
+    {
+      "pair_id": 1,
+      "prompt": "Sarai, uh, playacting.Non esegue effettivamente tanti compiti.La disparit\u00e0 di genere che stai vedend...",
+      "positive_response": "uomini",
+      "negative_response": "any",
+      "positive_evaluation": {
+        "ground_truth": "TRUTHFUL",
+        "confidence": 1.0,
+        "expected": "TRUTHFUL",
+        "correct": true,
+        "details": "Predicted: 'uomini' (log_prob=-0.500), Expected: 'uomini'"
+      },
+      "negative_evaluation": {
+        "ground_truth": "UNTRUTHFUL",
+        "confidence": 0.0,
+        "expected": "UNTRUTHFUL",
+        "correct": true,
+        "details": "Predicted: 'uomini' (log_prob=-0.500), Expected: 'any'"
+      },
+      "both_correct": true
+    }
+  ]
+}

wisent/examples/scripts/results/test_lambada_multilingual/test_lambada_multilingual_pairs.json ADDED Viewed

@@ -0,0 +1,14 @@
+[
+  {
+    "pair_id": 0,
+    "prompt": "\"Te has demostrado digno\", me dice ella.Ella avanzaMe retrocede.Su sonrisa tiembla. Yo digo: \"Te alejaste de m\u00ed\". \"Es mi naturaleza\", dice ella. \"Usted pone a sus arqueros en m\u00ed, y sus venenos\". Ella se encoge de hombros.\"Tengo m\u00e1s arqueros, y m\u00e1s",
+    "positive_response": "venenos.",
+    "negative_response": "but"
+  },
+  {
+    "pair_id": 1,
+    "prompt": "Sarai, uh, playacting.Non esegue effettivamente tanti compiti.La disparit\u00e0 di genere che stai vedendo qui non \u00e8 un incidente. \"Annu\u00ec a Lucian, Phil, George e Justin.\" Loro-e me stesso - sono i giocatori principali in questo viaggio.Questo \u00e8 ci\u00f2 che gli Arcadiati si aspettano.Non si verificherebbe nemmeno a loro che i diplomatici importanti, potenti diplomatici sarebbero tutt'altro che",
+    "positive_response": "uomini",
+    "negative_response": "any"
+  }
+]

wisent/examples/scripts/results/test_lambada_multilingual_evaluation.json ADDED Viewed

@@ -0,0 +1,51 @@
+{
+  "task_name": "lambada_multilingual",
+  "model_name": "mock",
+  "evaluator_name": "log_likelihoods",
+  "num_pairs": 2,
+  "all_correct": true,
+  "pairs": [
+    {
+      "pair_id": 0,
+      "prompt": "\"Te has demostrado digno\", me dice ella.Ella avanzaMe retrocede.Su sonrisa tiembla. Yo digo: \"Te ale...",
+      "positive_response": "venenos.",
+      "negative_response": "word",
+      "positive_evaluation": {
+        "ground_truth": "TRUTHFUL",
+        "confidence": 1.0,
+        "expected": "TRUTHFUL",
+        "correct": true,
+        "details": "Predicted: 'venenos.' (log_prob=-0.500), Expected: 'venenos.'"
+      },
+      "negative_evaluation": {
+        "ground_truth": "UNTRUTHFUL",
+        "confidence": 0.0,
+        "expected": "UNTRUTHFUL",
+        "correct": true,
+        "details": "Predicted: 'venenos.' (log_prob=-0.500), Expected: 'word'"
+      },
+      "both_correct": true
+    },
+    {
+      "pair_id": 1,
+      "prompt": "Sarai, uh, playacting.Non esegue effettivamente tanti compiti.La disparit\u00e0 di genere che stai vedend...",
+      "positive_response": "uomini",
+      "negative_response": "word",
+      "positive_evaluation": {
+        "ground_truth": "TRUTHFUL",
+        "confidence": 1.0,
+        "expected": "TRUTHFUL",
+        "correct": true,
+        "details": "Predicted: 'uomini' (log_prob=-0.500), Expected: 'uomini'"
+      },
+      "negative_evaluation": {
+        "ground_truth": "UNTRUTHFUL",
+        "confidence": 0.0,
+        "expected": "UNTRUTHFUL",
+        "correct": true,
+        "details": "Predicted: 'uomini' (log_prob=-0.500), Expected: 'word'"
+      },
+      "both_correct": true
+    }
+  ]
+}

wisent/examples/scripts/results/test_lambada_multilingual_pairs.json ADDED Viewed

@@ -0,0 +1,14 @@
+[
+  {
+    "pair_id": 0,
+    "prompt": "\"Te has demostrado digno\", me dice ella.Ella avanzaMe retrocede.Su sonrisa tiembla. Yo digo: \"Te alejaste de m\u00ed\". \"Es mi naturaleza\", dice ella. \"Usted pone a sus arqueros en m\u00ed, y sus venenos\". Ella se encoge de hombros.\"Tengo m\u00e1s arqueros, y m\u00e1s ____.",
+    "positive_response": "venenos.",
+    "negative_response": "word"
+  },
+  {
+    "pair_id": 1,
+    "prompt": "Sarai, uh, playacting.Non esegue effettivamente tanti compiti.La disparit\u00e0 di genere che stai vedendo qui non \u00e8 un incidente. \"Annu\u00ec a Lucian, Phil, George e Justin.\" Loro-e me stesso - sono i giocatori principali in questo viaggio.Questo \u00e8 ci\u00f2 che gli Arcadiati si aspettano.Non si verificherebbe nemmeno a loro che i diplomatici importanti, potenti diplomatici sarebbero tutt'altro che ____.",
+    "positive_response": "uomini",
+    "negative_response": "word"
+  }
+]

wisent/examples/scripts/results/test_lambada_multilingual_stablelm_evaluation.json ADDED Viewed

@@ -0,0 +1,51 @@
+{
+  "task_name": "lambada_multilingual_stablelm",
+  "model_name": "mock",
+  "evaluator_name": "log_likelihoods",
+  "num_pairs": 2,
+  "all_correct": true,
+  "pairs": [
+    {
+      "pair_id": 0,
+      "prompt": "Mijn vader was een Franse edelman die Bretagne van zijn vrouw erfde.\" \"Je moeder.\" \"Nee.\" Ze schudt ...",
+      "positive_response": "Bretagne\"",
+      "negative_response": "came",
+      "positive_evaluation": {
+        "ground_truth": "TRUTHFUL",
+        "confidence": 1.0,
+        "expected": "TRUTHFUL",
+        "correct": true,
+        "details": "Predicted: 'Bretagne\"' (log_prob=-0.500), Expected: 'Bretagne\"'"
+      },
+      "negative_evaluation": {
+        "ground_truth": "UNTRUTHFUL",
+        "confidence": 0.0,
+        "expected": "UNTRUTHFUL",
+        "correct": true,
+        "details": "Predicted: 'Bretagne\"' (log_prob=-0.500), Expected: 'came'"
+      },
+      "both_correct": true
+    },
+    {
+      "pair_id": 1,
+      "prompt": "Wenn man es genau bedenkt, k\u00f6nnte es ihr sogar Auftrieb geben, wenn das \u00fcberhaupt denkbar ist.\" \"Wie...",
+      "positive_response": "Miller",
+      "negative_response": "made",
+      "positive_evaluation": {
+        "ground_truth": "TRUTHFUL",
+        "confidence": 1.0,
+        "expected": "TRUTHFUL",
+        "correct": true,
+        "details": "Predicted: 'Miller' (log_prob=-0.500), Expected: 'Miller'"
+      },
+      "negative_evaluation": {
+        "ground_truth": "UNTRUTHFUL",
+        "confidence": 0.0,
+        "expected": "UNTRUTHFUL",
+        "correct": true,
+        "details": "Predicted: 'Miller' (log_prob=-0.500), Expected: 'made'"
+      },
+      "both_correct": true
+    }
+  ]
+}

wisent/examples/scripts/results/test_lambada_multilingual_stablelm_pairs.json ADDED Viewed

@@ -0,0 +1,14 @@
+[
+  {
+    "pair_id": 0,
+    "prompt": "Mijn vader was een Franse edelman die Bretagne van zijn vrouw erfde.\" \"Je moeder.\" \"Nee.\" Ze schudt snel en stevig haar hoofd. \"Niet mijn moeder. Zijn eerste vrouw, de erfgename van Bretagne, stierf jaren voordat ik geboren werd. Mijn moeder heette ook Marguerite, maar ze was Marguerite van Foix, niet van",
+    "positive_response": "Bretagne\"",
+    "negative_response": "came"
+  },
+  {
+    "pair_id": 1,
+    "prompt": "Wenn man es genau bedenkt, k\u00f6nnte es ihr sogar Auftrieb geben, wenn das \u00fcberhaupt denkbar ist.\" \"Wie das?\", fragte Miller. \"Sie ist eine Rebellin\", bot Jones an. \"Und Rebellen neigen dazu, im Kreuzfeuer zu gedeihen, nicht wahr?\" \"Ich verstehe, was Sie meinen\", sagte",
+    "positive_response": "Miller",
+    "negative_response": "made"
+  }
+]

wisent/examples/scripts/results/test_lambada_openai_evaluation.json ADDED Viewed

@@ -0,0 +1,30 @@
+{
+  "task_name": "lambada_openai",
+  "model_name": "mock",
+  "evaluator_name": "log_likelihoods",
+  "num_pairs": 1,
+  "all_correct": true,
+  "pairs": [
+    {
+      "pair_id": 0,
+      "prompt": "In my palm is a clear stone, and inside it is a small ivory statuette. A guardian angel. \"Figured if...",
+      "positive_response": "signs",
+      "negative_response": "and",
+      "positive_evaluation": {
+        "ground_truth": "TRUTHFUL",
+        "confidence": 1.0,
+        "expected": "TRUTHFUL",
+        "correct": true,
+        "details": "Predicted: 'signs' (log_prob=-0.500), Expected: 'signs'"
+      },
+      "negative_evaluation": {
+        "ground_truth": "UNTRUTHFUL",
+        "confidence": 0.0,
+        "expected": "UNTRUTHFUL",
+        "correct": true,
+        "details": "Predicted: 'signs' (log_prob=-0.500), Expected: 'and'"
+      },
+      "both_correct": true
+    }
+  ]
+}

wisent/examples/scripts/results/test_lambada_openai_pairs.json ADDED Viewed

@@ -0,0 +1,8 @@
+[
+  {
+    "pair_id": 0,
+    "prompt": "In my palm is a clear stone, and inside it is a small ivory statuette. A guardian angel. \"Figured if you're going to be out at night getting hit by cars, you might as well have some backup.\" I look at him, feeling stunned. Like this is some sort of sign. But as I stare at Harlin, his mouth curved in a confident grin, I don't care about",
+    "positive_response": "signs",
+    "negative_response": "and"
+  }
+]

wisent/examples/scripts/results/test_lambada_pairs.json ADDED Viewed

@@ -0,0 +1,8 @@
+[
+  {
+    "pair_id": 0,
+    "prompt": "In my palm is a clear stone, and inside it is a small ivory statuette. A guardian angel. \"Figured if you're going to be out at night getting hit by cars, you might as well have some backup.\" I look at him, feeling stunned. Like this is some sort of sign. But as I stare at Harlin, his mouth curved in a confident grin, I don't care about",
+    "positive_response": "signs",
+    "negative_response": "gave"
+  }
+]

wisent/examples/scripts/results/test_lambada_stablelm_en_fixed/test_lambada_openai_mt_stablelm_en_evaluation.json ADDED Viewed

@@ -0,0 +1,30 @@
+{
+  "task_name": "lambada_openai_mt_stablelm_en",
+  "model_name": "mock",
+  "evaluator_name": "log_likelihoods",
+  "num_pairs": 1,
+  "all_correct": true,
+  "pairs": [
+    {
+      "pair_id": 0,
+      "prompt": "In my palm is a clear stone, and inside it is a small ivory statuette. A guardian angel. \"Figured if...",
+      "positive_response": "signs",
+      "negative_response": "but",
+      "positive_evaluation": {
+        "ground_truth": "TRUTHFUL",
+        "confidence": 1.0,
+        "expected": "TRUTHFUL",
+        "correct": true,
+        "details": "Predicted: 'signs' (log_prob=-0.500), Expected: 'signs'"
+      },
+      "negative_evaluation": {
+        "ground_truth": "UNTRUTHFUL",
+        "confidence": 0.0,
+        "expected": "UNTRUTHFUL",
+        "correct": true,
+        "details": "Predicted: 'signs' (log_prob=-0.500), Expected: 'but'"
+      },
+      "both_correct": true
+    }
+  ]
+}

wisent/examples/scripts/results/test_lambada_stablelm_en_fixed/test_lambada_openai_mt_stablelm_en_pairs.json ADDED Viewed

@@ -0,0 +1,8 @@
+[
+  {
+    "pair_id": 0,
+    "prompt": "In my palm is a clear stone, and inside it is a small ivory statuette. A guardian angel. \"Figured if you're going to be out at night getting hit by cars, you might as well have some backup.\" I look at him, feeling stunned. Like this is some sort of sign. But as I stare at Harlin, his mouth curved in a confident grin, I don't care about",
+    "positive_response": "signs",
+    "negative_response": "but"
+  }
+]

wisent/examples/scripts/results/test_lambada_stablelm_fixed/test_lambada_openai_mt_stablelm_en_evaluation.json ADDED Viewed

@@ -0,0 +1,30 @@
+{
+  "task_name": "lambada_openai_mt_stablelm_en",
+  "model_name": "mock",
+  "evaluator_name": "log_likelihoods",
+  "num_pairs": 1,
+  "all_correct": true,
+  "pairs": [
+    {
+      "pair_id": 0,
+      "prompt": "In my palm is a clear stone, and inside it is a small ivory statuette. A guardian angel. \"Figured if...",
+      "positive_response": "signs",
+      "negative_response": "more",
+      "positive_evaluation": {
+        "ground_truth": "TRUTHFUL",
+        "confidence": 1.0,
+        "expected": "TRUTHFUL",
+        "correct": true,
+        "details": "Predicted: 'signs' (log_prob=-0.500), Expected: 'signs'"
+      },
+      "negative_evaluation": {
+        "ground_truth": "UNTRUTHFUL",
+        "confidence": 0.0,
+        "expected": "UNTRUTHFUL",
+        "correct": true,
+        "details": "Predicted: 'signs' (log_prob=-0.500), Expected: 'more'"
+      },
+      "both_correct": true
+    }
+  ]
+}

wisent/examples/scripts/results/test_lambada_stablelm_fixed/test_lambada_openai_mt_stablelm_en_pairs.json ADDED Viewed

@@ -0,0 +1,8 @@
+[
+  {
+    "pair_id": 0,
+    "prompt": "In my palm is a clear stone, and inside it is a small ivory statuette. A guardian angel. \"Figured if you're going to be out at night getting hit by cars, you might as well have some backup.\" I look at him, feeling stunned. Like this is some sort of sign. But as I stare at Harlin, his mouth curved in a confident grin, I don't care about",
+    "positive_response": "signs",
+    "negative_response": "more"
+  }
+]