PyPI - wisent - Versions diffs - 0.7.379__py3-none-any.whl → 0.7.901__py3-none-any.whl - Mend

wisent 0.7.379py3-none-any.whl → 0.7.901py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (1020) hide show

wisent/examples/scripts/results/test_anagrams1_evaluation.json DELETED Viewed

@@ -1,30 +0,0 @@
-{
-  "task_name": "anagrams1",
-  "model_name": "mock",
-  "evaluator_name": "log_likelihoods",
-  "num_pairs": 1,
-  "all_correct": true,
-  "pairs": [
-    {
-      "pair_id": 0,
-      "prompt": "skicts =...",
-      "positive_response": "sticks",
-      "negative_response": "mirror",
-      "positive_evaluation": {
-        "ground_truth": "TRUTHFUL",
-        "confidence": 1.0,
-        "expected": "TRUTHFUL",
-        "correct": true,
-        "details": "Predicted: 'sticks' (log_prob=-0.500), Expected: 'sticks'"
-      },
-      "negative_evaluation": {
-        "ground_truth": "UNTRUTHFUL",
-        "confidence": 0.0,
-        "expected": "UNTRUTHFUL",
-        "correct": true,
-        "details": "Predicted: 'sticks' (log_prob=-0.500), Expected: 'mirror'"
-      },
-      "both_correct": true
-    }
-  ]
-}

wisent/examples/scripts/results/test_anagrams1_pairs.json DELETED Viewed

@@ -1,8 +0,0 @@
-[
-  {
-    "pair_id": 0,
-    "prompt": "skicts =",
-    "positive_response": "sticks",
-    "negative_response": "mirror"
-  }
-]

wisent/examples/scripts/results/test_anagrams2_evaluation.json DELETED Viewed

@@ -1,30 +0,0 @@
-{
-  "task_name": "anagrams2",
-  "model_name": "mock",
-  "evaluator_name": "log_likelihoods",
-  "num_pairs": 1,
-  "all_correct": true,
-  "pairs": [
-    {
-      "pair_id": 0,
-      "prompt": "volwskagen =...",
-      "positive_response": "volkswagen",
-      "negative_response": "effectively",
-      "positive_evaluation": {
-        "ground_truth": "TRUTHFUL",
-        "confidence": 1.0,
-        "expected": "TRUTHFUL",
-        "correct": true,
-        "details": "Predicted: 'volkswagen' (log_prob=-0.500), Expected: 'volkswagen'"
-      },
-      "negative_evaluation": {
-        "ground_truth": "UNTRUTHFUL",
-        "confidence": 0.0,
-        "expected": "UNTRUTHFUL",
-        "correct": true,
-        "details": "Predicted: 'volkswagen' (log_prob=-0.500), Expected: 'effectively'"
-      },
-      "both_correct": true
-    }
-  ]
-}

wisent/examples/scripts/results/test_anagrams2_pairs.json DELETED Viewed

@@ -1,8 +0,0 @@
-[
-  {
-    "pair_id": 0,
-    "prompt": "volwskagen =",
-    "positive_response": "volkswagen",
-    "negative_response": "effectively"
-  }
-]

wisent/examples/scripts/results/test_anli_evaluation.json DELETED Viewed

@@ -1,30 +0,0 @@
-{
-  "task_name": "anli",
-  "model_name": "mock",
-  "evaluator_name": "log_likelihoods",
-  "num_pairs": 1,
-  "all_correct": true,
-  "pairs": [
-    {
-      "pair_id": 0,
-      "prompt": "Clear Hearts Grey Flowers is the second full-length and final album by Jack Off Jill. Produced by Ch...",
-      "positive_response": "Neither",
-      "negative_response": "True",
-      "positive_evaluation": {
-        "ground_truth": "TRUTHFUL",
-        "confidence": 1.0,
-        "expected": "TRUTHFUL",
-        "correct": true,
-        "details": "Predicted: 'Neither' (log_prob=-0.500), Expected: 'Neither'"
-      },
-      "negative_evaluation": {
-        "ground_truth": "UNTRUTHFUL",
-        "confidence": 0.0,
-        "expected": "UNTRUTHFUL",
-        "correct": true,
-        "details": "Predicted: 'Neither' (log_prob=-0.500), Expected: 'True'"
-      },
-      "both_correct": true
-    }
-  ]
-}

wisent/examples/scripts/results/test_anli_pairs.json DELETED Viewed

@@ -1,8 +0,0 @@
-[
-  {
-    "pair_id": 0,
-    "prompt": "Clear Hearts Grey Flowers is the second full-length and final album by Jack Off Jill. Produced by Chris Vrenna of Nine Inch Nails/Tweaker, it was released in July 2000 on the now-defunct label Risk Records. After \"Clear Hearts, Grey Flowers\" the band formally split up and moved on to establish other projects.\nQuestion: Risk Records released Jack Off Jill's initial album. True, False, or Neither?\nAnswer:",
-    "positive_response": "Neither",
-    "negative_response": "True"
-  }
-]

wisent/examples/scripts/results/test_apps_evaluation.json DELETED Viewed

@@ -1,30 +0,0 @@
-{
-  "task_name": "apps",
-  "model_name": "mock",
-  "evaluator_name": "log_likelihoods",
-  "num_pairs": 1,
-  "all_correct": true,
-  "pairs": [
-    {
-      "pair_id": 0,
-      "prompt": "Question: Polycarp has $n$ different binary words. A word called binary if it contains only characte...",
-      "positive_response": "for _ in range(int(input())):\n    n = int(input())\n    mass = []\n    zo = 0\n    oz = 0\n    zz = 0\n    oo = 0\n    ozs = []\n    zos = []\n    ozss = set()\n    zoss = set()\n    for j in range(n):\n        k = input()\n        mass.append(k)\n        if k[0] == '0' and k[-1] == '1':\n            zoss.add(k)\n            zos.append(j + 1)\n            zo += 1\n        elif k[0] == '1' and k[-1] == '0':\n            ozss.add(k)\n            ozs.append(j + 1)\n            oz += 1\n        elif k[0] == '0' and k[-1] == '0':\n            zz += 1\n        else:\n            oo += 1\n    if zz and oo and not oz and not zo:\n        print(-1)\n        continue\n    else:\n        if zo > oz:\n            print((zo - oz) // 2)\n            ans = []\n            need = (zo - oz) // 2\n            i = 0\n            while need:\n                zzz = mass[zos[i] - 1][len(mass[zos[i] - 1]) - 1:: -1]\n                if zzz not in ozss:\n                    ans.append(zos[i])\n                    need -= 1\n                i += 1\n            print(*ans)\n        else:\n            print((oz - zo) // 2)\n            ans = []\n            need = (oz - zo) // 2\n            i = 0\n            while need:\n                zzz = mass[ozs[i] - 1][len(mass[ozs[i] - 1]) - 1:: -1]\n                if zzz not in zoss:\n                    ans.append(ozs[i])\n                    need -= 1\n                i += 1\n            print(*ans)",
-      "negative_response": "for _ in range(int(input())):\n    n = int(input())\n    mass = []\n    zo = 0\n    oz = 0\n    zz = 0\n    oo = 0\n    ozs = []\n    zos = []\n    ozss = set()\n    zoss = set()\n    for j in range(n):\n        k = input()\n        mass.append(k)\n        if k[0] == '0' and k[-1] == '1':\n            zoss.add(k)\n            zos.append(j + 1)\n            zo += 1\n        elif k[0] == '1' and k[-1] == '0':\n            ozss.add(k)\n            ozs.append(j + 1)\n            oz += 1\n        elif k[0] == '0' and k[-1] == '0':\n            zz += 1\n        else:\n            oo += 1\n    if zz and oo and not oz and not zo:\n        print(-1)\n        continue\n    else:\n        if zo > oz:\n            print((zo - oz# CORRUPTED) // 2)\n            ans = []\n            need = (zo - oz) // 2\n            i = 0\n            while need:\n                zzz = mass[zos[i] - 1][len(mass[zos[i] - 1]) - 1:: -1]\n                if zzz not in ozss:\n                    ans.append(zos[i])\n                    need -= 1\n                i += 1\n            print(*ans)\n        else:\n            print((oz - zo) // 2)\n            ans = []\n            need = (oz - zo) // 2\n            i = 0\n            while need:\n                zzz = mass[ozs[i] - 1][len(mass[ozs[i] - 1]) - 1:: -1]\n                if zzz not in zoss:\n                    ans.append(ozs[i])\n                    need -= 1\n                i += 1\n            print(*ans)",
-      "positive_evaluation": {
-        "ground_truth": "TRUTHFUL",
-        "confidence": 1.0,
-        "expected": "TRUTHFUL",
-        "correct": true,
-        "details": "Predicted: 'for _ in range(int(input())):\n    n = int(input())\n    mass = []\n    zo = 0\n    oz = 0\n    zz = 0\n    oo = 0\n    ozs = []\n    zos = []\n    ozss = set()\n    zoss = set()\n    for j in range(n):\n        k = input()\n        mass.append(k)\n        if k[0] == '0' and k[-1] == '1':\n            zoss.add(k)\n            zos.append(j + 1)\n            zo += 1\n        elif k[0] == '1' and k[-1] == '0':\n            ozss.add(k)\n            ozs.append(j + 1)\n            oz += 1\n        elif k[0] == '0' and k[-1] == '0':\n            zz += 1\n        else:\n            oo += 1\n    if zz and oo and not oz and not zo:\n        print(-1)\n        continue\n    else:\n        if zo > oz:\n            print((zo - oz) // 2)\n            ans = []\n            need = (zo - oz) // 2\n            i = 0\n            while need:\n                zzz = mass[zos[i] - 1][len(mass[zos[i] - 1]) - 1:: -1]\n                if zzz not in ozss:\n                    ans.append(zos[i])\n                    need -= 1\n                i += 1\n            print(*ans)\n        else:\n            print((oz - zo) // 2)\n            ans = []\n            need = (oz - zo) // 2\n            i = 0\n            while need:\n                zzz = mass[ozs[i] - 1][len(mass[ozs[i] - 1]) - 1:: -1]\n                if zzz not in zoss:\n                    ans.append(ozs[i])\n                    need -= 1\n                i += 1\n            print(*ans)' (log_prob=-0.500), Expected: 'for _ in range(int(input())):\n    n = int(input())\n    mass = []\n    zo = 0\n    oz = 0\n    zz = 0\n    oo = 0\n    ozs = []\n    zos = []\n    ozss = set()\n    zoss = set()\n    for j in range(n):\n        k = input()\n        mass.append(k)\n        if k[0] == '0' and k[-1] == '1':\n            zoss.add(k)\n            zos.append(j + 1)\n            zo += 1\n        elif k[0] == '1' and k[-1] == '0':\n            ozss.add(k)\n            ozs.append(j + 1)\n            oz += 1\n        elif k[0] == '0' and k[-1] == '0':\n            zz += 1\n        else:\n            oo += 1\n    if zz and oo and not oz and not zo:\n        print(-1)\n        continue\n    else:\n        if zo > oz:\n            print((zo - oz) // 2)\n            ans = []\n            need = (zo - oz) // 2\n            i = 0\n            while need:\n                zzz = mass[zos[i] - 1][len(mass[zos[i] - 1]) - 1:: -1]\n                if zzz not in ozss:\n                    ans.append(zos[i])\n                    need -= 1\n                i += 1\n            print(*ans)\n        else:\n            print((oz - zo) // 2)\n            ans = []\n            need = (oz - zo) // 2\n            i = 0\n            while need:\n                zzz = mass[ozs[i] - 1][len(mass[ozs[i] - 1]) - 1:: -1]\n                if zzz not in zoss:\n                    ans.append(ozs[i])\n                    need -= 1\n                i += 1\n            print(*ans)'"
-      },
-      "negative_evaluation": {
-        "ground_truth": "UNTRUTHFUL",
-        "confidence": 0.0,
-        "expected": "UNTRUTHFUL",
-        "correct": true,
-        "details": "Predicted: 'for _ in range(int(input())):\n    n = int(input())\n    mass = []\n    zo = 0\n    oz = 0\n    zz = 0\n    oo = 0\n    ozs = []\n    zos = []\n    ozss = set()\n    zoss = set()\n    for j in range(n):\n        k = input()\n        mass.append(k)\n        if k[0] == '0' and k[-1] == '1':\n            zoss.add(k)\n            zos.append(j + 1)\n            zo += 1\n        elif k[0] == '1' and k[-1] == '0':\n            ozss.add(k)\n            ozs.append(j + 1)\n            oz += 1\n        elif k[0] == '0' and k[-1] == '0':\n            zz += 1\n        else:\n            oo += 1\n    if zz and oo and not oz and not zo:\n        print(-1)\n        continue\n    else:\n        if zo > oz:\n            print((zo - oz) // 2)\n            ans = []\n            need = (zo - oz) // 2\n            i = 0\n            while need:\n                zzz = mass[zos[i] - 1][len(mass[zos[i] - 1]) - 1:: -1]\n                if zzz not in ozss:\n                    ans.append(zos[i])\n                    need -= 1\n                i += 1\n            print(*ans)\n        else:\n            print((oz - zo) // 2)\n            ans = []\n            need = (oz - zo) // 2\n            i = 0\n            while need:\n                zzz = mass[ozs[i] - 1][len(mass[ozs[i] - 1]) - 1:: -1]\n                if zzz not in zoss:\n                    ans.append(ozs[i])\n                    need -= 1\n                i += 1\n            print(*ans)' (log_prob=-0.500), Expected: 'for _ in range(int(input())):\n    n = int(input())\n    mass = []\n    zo = 0\n    oz = 0\n    zz = 0\n    oo = 0\n    ozs = []\n    zos = []\n    ozss = set()\n    zoss = set()\n    for j in range(n):\n        k = input()\n        mass.append(k)\n        if k[0] == '0' and k[-1] == '1':\n            zoss.add(k)\n            zos.append(j + 1)\n            zo += 1\n        elif k[0] == '1' and k[-1] == '0':\n            ozss.add(k)\n            ozs.append(j + 1)\n            oz += 1\n        elif k[0] == '0' and k[-1] == '0':\n            zz += 1\n        else:\n            oo += 1\n    if zz and oo and not oz and not zo:\n        print(-1)\n        continue\n    else:\n        if zo > oz:\n            print((zo - oz# CORRUPTED) // 2)\n            ans = []\n            need = (zo - oz) // 2\n            i = 0\n            while need:\n                zzz = mass[zos[i] - 1][len(mass[zos[i] - 1]) - 1:: -1]\n                if zzz not in ozss:\n                    ans.append(zos[i])\n                    need -= 1\n                i += 1\n            print(*ans)\n        else:\n            print((oz - zo) // 2)\n            ans = []\n            need = (oz - zo) // 2\n            i = 0\n            while need:\n                zzz = mass[ozs[i] - 1][len(mass[ozs[i] - 1]) - 1:: -1]\n                if zzz not in zoss:\n                    ans.append(ozs[i])\n                    need -= 1\n                i += 1\n            print(*ans)'"
-      },
-      "both_correct": true
-    }
-  ]
-}

wisent/examples/scripts/results/test_apps_pairs.json DELETED Viewed

@@ -1,8 +0,0 @@
-[
-  {
-    "pair_id": 0,
-    "prompt": "Question: Polycarp has $n$ different binary words. A word called binary if it contains only characters '0' and '1'. For example, these words are binary: \"0001\", \"11\", \"0\" and \"0011100\".\n\nPolycarp wants to offer his set of $n$ binary words to play a game \"words\". In this game, players name words and each next word (starting from the second) must start with the last character of the previous word. The first word can be any. For example, these sequence of words can be named during the game: \"0101\", \"1\", \"10\", \"00\", \"00001\".\n\nWord reversal is the operation of reversing the order of the characters. For example, the word \"0111\" after the reversal becomes \"1110\", the word \"11010\" after the reversal becomes \"01011\".\n\nProbably, Polycarp has such a set of words that there is no way to put them in the order correspondent to the game rules. In this situation, he wants to reverse some words from his set so that:  the final set of $n$ words still contains different words (i.e. all words are unique);  there is a way to put all words of the final set of words in the order so that the final sequence of $n$ words is consistent with the game rules. \n\nPolycarp wants to reverse minimal number of words. Please, help him.\n\n\n-----Input-----\n\nThe first line of the input contains one integer $t$ ($1 \\le t \\le 10^4$) \u2014 the number of test cases in the input. Then $t$ test cases follow.\n\nThe first line of a test case contains one integer $n$ ($1 \\le n \\le 2\\cdot10^5$) \u2014 the number of words in the Polycarp's set. Next $n$ lines contain these words. All of $n$ words aren't empty and contains only characters '0' and '1'. The sum of word lengths doesn't exceed $4\\cdot10^6$. All words are different.\n\nGuaranteed, that the sum of $n$ for all test cases in the input doesn't exceed $2\\cdot10^5$. Also, guaranteed that the sum of word lengths for all test cases in the input doesn't exceed $4\\cdot10^6$.\n\n\n-----Output-----\n\nPrint answer for all of $t$ test cases in the order they appear.\n\nIf there is no answer for the test case, print -1. Otherwise, the first line of the output should contain $k$ ($0 \\le k \\le n$) \u2014 the minimal number of words in the set which should be reversed. The second line of the output should contain $k$ distinct integers \u2014 the indexes of the words in the set which should be reversed. Words are numerated from $1$ to $n$ in the order they appear. If $k=0$ you can skip this line (or you can print an empty line). If there are many answers you can print any of them.\n\n\n-----Example-----\nInput\n4\n4\n0001\n1000\n0011\n0111\n3\n010\n101\n0\n2\n00000\n00001\n4\n01\n001\n0001\n00001\n\nOutput\n1\n3 \n-1\n0\n\n2\n1 2\n\nWhat is the answer?",
-    "positive_response": "for _ in range(int(input())):\n    n = int(input())\n    mass = []\n    zo = 0\n    oz = 0\n    zz = 0\n    oo = 0\n    ozs = []\n    zos = []\n    ozss = set()\n    zoss = set()\n    for j in range(n):\n        k = input()\n        mass.append(k)\n        if k[0] == '0' and k[-1] == '1':\n            zoss.add(k)\n            zos.append(j + 1)\n            zo += 1\n        elif k[0] == '1' and k[-1] == '0':\n            ozss.add(k)\n            ozs.append(j + 1)\n            oz += 1\n        elif k[0] == '0' and k[-1] == '0':\n            zz += 1\n        else:\n            oo += 1\n    if zz and oo and not oz and not zo:\n        print(-1)\n        continue\n    else:\n        if zo > oz:\n            print((zo - oz) // 2)\n            ans = []\n            need = (zo - oz) // 2\n            i = 0\n            while need:\n                zzz = mass[zos[i] - 1][len(mass[zos[i] - 1]) - 1:: -1]\n                if zzz not in ozss:\n                    ans.append(zos[i])\n                    need -= 1\n                i += 1\n            print(*ans)\n        else:\n            print((oz - zo) // 2)\n            ans = []\n            need = (oz - zo) // 2\n            i = 0\n            while need:\n                zzz = mass[ozs[i] - 1][len(mass[ozs[i] - 1]) - 1:: -1]\n                if zzz not in zoss:\n                    ans.append(ozs[i])\n                    need -= 1\n                i += 1\n            print(*ans)",
-    "negative_response": "for _ in range(int(input())):\n    n = int(input())\n    mass = []\n    zo = 0\n    oz = 0\n    zz = 0\n    oo = 0\n    ozs = []\n    zos = []\n    ozss = set()\n    zoss = set()\n    for j in range(n):\n        k = input()\n        mass.append(k)\n        if k[0] == '0' and k[-1] == '1':\n            zoss.add(k)\n            zos.append(j + 1)\n            zo += 1\n        elif k[0] == '1' and k[-1] == '0':\n            ozss.add(k)\n            ozs.append(j + 1)\n            oz += 1\n        elif k[0] == '0' and k[-1] == '0':\n            zz += 1\n        else:\n            oo += 1\n    if zz and oo and not oz and not zo:\n        print(-1)\n        continue\n    else:\n        if zo > oz:\n            print((zo - oz# CORRUPTED) // 2)\n            ans = []\n            need = (zo - oz) // 2\n            i = 0\n            while need:\n                zzz = mass[zos[i] - 1][len(mass[zos[i] - 1]) - 1:: -1]\n                if zzz not in ozss:\n                    ans.append(zos[i])\n                    need -= 1\n                i += 1\n            print(*ans)\n        else:\n            print((oz - zo) // 2)\n            ans = []\n            need = (oz - zo) // 2\n            i = 0\n            while need:\n                zzz = mass[ozs[i] - 1][len(mass[ozs[i] - 1]) - 1:: -1]\n                if zzz not in zoss:\n                    ans.append(ozs[i])\n                    need -= 1\n                i += 1\n            print(*ans)"
-  }
-]

wisent/examples/scripts/results/test_arabic_exams_evaluation.json DELETED Viewed

@@ -1,30 +0,0 @@
-{
-  "task_name": "arabic_exams",
-  "model_name": "mock",
-  "evaluator_name": "log_likelihoods",
-  "num_pairs": 1,
-  "all_correct": true,
-  "pairs": [
-    {
-      "pair_id": 0,
-      "prompt": "Question: \u0645\u0631\u0627\u062a\u0628 \u062a\u063a\u064a\u064a\u0631 \u0627\u0644\u0645\u0646\u0643\u0631 \u0648\u0641\u0642 \u0627\u0644\u062a\u0631\u062a\u064a\u0628 \u0627\u0644\u0635\u062d\u064a\u062d \u0627\u0644\u0648\u0627\u0631\u062f \u0641\u064a \u0627\u0644\u062d\u062f\u064a\u062b \u0647\u064a:\nA. \u0628\nB. \u0623...",
-      "positive_response": "\u0623",
-      "negative_response": "\u0628",
-      "positive_evaluation": {
-        "ground_truth": "TRUTHFUL",
-        "confidence": 1.0,
-        "expected": "TRUTHFUL",
-        "correct": true,
-        "details": "Predicted: '\u0623' (log_prob=-0.500), Expected: '\u0623'"
-      },
-      "negative_evaluation": {
-        "ground_truth": "UNTRUTHFUL",
-        "confidence": 0.0,
-        "expected": "UNTRUTHFUL",
-        "correct": true,
-        "details": "Predicted: '\u0623' (log_prob=-0.500), Expected: '\u0628'"
-      },
-      "both_correct": true
-    }
-  ]
-}

wisent/examples/scripts/results/test_arabic_exams_pairs.json DELETED Viewed

@@ -1,8 +0,0 @@
-[
-  {
-    "pair_id": 0,
-    "prompt": "Question: \u0645\u0631\u0627\u062a\u0628 \u062a\u063a\u064a\u064a\u0631 \u0627\u0644\u0645\u0646\u0643\u0631 \u0648\u0641\u0642 \u0627\u0644\u062a\u0631\u062a\u064a\u0628 \u0627\u0644\u0635\u062d\u064a\u062d \u0627\u0644\u0648\u0627\u0631\u062f \u0641\u064a \u0627\u0644\u062d\u062f\u064a\u062b \u0647\u064a:\nA. \u0628\nB. \u0623",
-    "positive_response": "\u0623",
-    "negative_response": "\u0628"
-  }
-]

wisent/examples/scripts/results/test_arabic_leaderboard_complete_evaluation.json DELETED Viewed

@@ -1,51 +0,0 @@
-{
-  "task_name": "arabic_leaderboard_complete",
-  "model_name": "mock",
-  "evaluator_name": "log_likelihoods",
-  "num_pairs": 2,
-  "all_correct": true,
-  "pairs": [
-    {
-      "pair_id": 0,
-      "prompt": "Question: \u0633\u0648\u0631\u064a\u0627 \u0643\u0627\u0646\u062a \u062c\u0632\u0621\u064b\u0627 \u0645\u0646 \u0627\u0644\u0625\u0645\u0628\u0631\u0627\u0637\u0648\u0631\u064a\u0629 \u0627\u0644\u0641\u0627\u0631\u0633\u064a\u0629 \u0641\u064a \u0627\u0644\u0645\u0627\u0636\u064a.\nA. \u062e\u0637\u0623\nB. \u0635\u062d...",
-      "positive_response": "\u0635\u062d",
-      "negative_response": "\u062e\u0637\u0623",
-      "positive_evaluation": {
-        "ground_truth": "TRUTHFUL",
-        "confidence": 1.0,
-        "expected": "TRUTHFUL",
-        "correct": true,
-        "details": "Predicted: '\u0635\u062d' (log_prob=-0.500), Expected: '\u0635\u062d'"
-      },
-      "negative_evaluation": {
-        "ground_truth": "UNTRUTHFUL",
-        "confidence": 0.0,
-        "expected": "UNTRUTHFUL",
-        "correct": true,
-        "details": "Predicted: '\u0635\u062d' (log_prob=-0.500), Expected: '\u062e\u0637\u0623'"
-      },
-      "both_correct": true
-    },
-    {
-      "pair_id": 1,
-      "prompt": "Question: \u0627\u0644\u0645\u0648\u0633\u064a\u0642\u0649 \u0627\u0644\u062a\u0642\u0644\u064a\u062f\u064a\u0629 \u0627\u0644\u0645\u0635\u0631\u064a\u0629 \u062a\u0633\u062a\u062e\u062f\u0645 \u0622\u0644\u0627\u062a \u0645\u0648\u0633\u064a\u0642\u064a\u0629 \u0645\u062b\u0644 \u0627\u0644\u0639\u0648\u062f \u0648\u0627\u0644\u0642\u0627\u0646\u0648\u0646.\nA. \u062e\u0637\u0623\nB. \u0635\u062d...",
-      "positive_response": "\u0635\u062d",
-      "negative_response": "\u062e\u0637\u0623",
-      "positive_evaluation": {
-        "ground_truth": "TRUTHFUL",
-        "confidence": 1.0,
-        "expected": "TRUTHFUL",
-        "correct": true,
-        "details": "Predicted: '\u0635\u062d' (log_prob=-0.500), Expected: '\u0635\u062d'"
-      },
-      "negative_evaluation": {
-        "ground_truth": "UNTRUTHFUL",
-        "confidence": 0.0,
-        "expected": "UNTRUTHFUL",
-        "correct": true,
-        "details": "Predicted: '\u0635\u062d' (log_prob=-0.500), Expected: '\u062e\u0637\u0623'"
-      },
-      "both_correct": true
-    }
-  ]
-}

wisent/examples/scripts/results/test_arabic_leaderboard_complete_pairs.json DELETED Viewed

@@ -1,14 +0,0 @@
-[
-  {
-    "pair_id": 0,
-    "prompt": "Question: \u0633\u0648\u0631\u064a\u0627 \u0643\u0627\u0646\u062a \u062c\u0632\u0621\u064b\u0627 \u0645\u0646 \u0627\u0644\u0625\u0645\u0628\u0631\u0627\u0637\u0648\u0631\u064a\u0629 \u0627\u0644\u0641\u0627\u0631\u0633\u064a\u0629 \u0641\u064a \u0627\u0644\u0645\u0627\u0636\u064a.\nA. \u062e\u0637\u0623\nB. \u0635\u062d",
-    "positive_response": "\u0635\u062d",
-    "negative_response": "\u062e\u0637\u0623"
-  },
-  {
-    "pair_id": 1,
-    "prompt": "Question: \u0627\u0644\u0645\u0648\u0633\u064a\u0642\u0649 \u0627\u0644\u062a\u0642\u0644\u064a\u062f\u064a\u0629 \u0627\u0644\u0645\u0635\u0631\u064a\u0629 \u062a\u0633\u062a\u062e\u062f\u0645 \u0622\u0644\u0627\u062a \u0645\u0648\u0633\u064a\u0642\u064a\u0629 \u0645\u062b\u0644 \u0627\u0644\u0639\u0648\u062f \u0648\u0627\u0644\u0642\u0627\u0646\u0648\u0646.\nA. \u062e\u0637\u0623\nB. \u0635\u062d",
-    "positive_response": "\u0635\u062d",
-    "negative_response": "\u062e\u0637\u0623"
-  }
-]

wisent/examples/scripts/results/test_arabic_leaderboard_light_evaluation.json DELETED Viewed

@@ -1,51 +0,0 @@
-{
-  "task_name": "arabic_leaderboard_light",
-  "model_name": "mock",
-  "evaluator_name": "log_likelihoods",
-  "num_pairs": 2,
-  "all_correct": true,
-  "pairs": [
-    {
-      "pair_id": 0,
-      "prompt": "Question: \u0627\u0644\u0634\u0631\u064a\u0639\u0629 \u0627\u0644\u0625\u0633\u0644\u0627\u0645\u064a\u0629 \u062a\u0645\u0646\u0639 \u0627\u0644\u0639\u0646\u0641 \u0636\u062f \u0627\u0644\u0623\u0637\u0641\u0627\u0644.\nA. \u062e\u0637\u0623\nB. \u0635\u062d...",
-      "positive_response": "\u0635\u062d",
-      "negative_response": "\u062e\u0637\u0623",
-      "positive_evaluation": {
-        "ground_truth": "TRUTHFUL",
-        "confidence": 1.0,
-        "expected": "TRUTHFUL",
-        "correct": true,
-        "details": "Predicted: '\u0635\u062d' (log_prob=-0.500), Expected: '\u0635\u062d'"
-      },
-      "negative_evaluation": {
-        "ground_truth": "UNTRUTHFUL",
-        "confidence": 0.0,
-        "expected": "UNTRUTHFUL",
-        "correct": true,
-        "details": "Predicted: '\u0635\u062d' (log_prob=-0.500), Expected: '\u062e\u0637\u0623'"
-      },
-      "both_correct": true
-    },
-    {
-      "pair_id": 1,
-      "prompt": "Question: \u0627\u0644\u062f\u064a\u0646 \u0627\u0644\u0631\u0626\u064a\u0633\u064a \u0641\u064a \u0627\u0644\u062c\u0632\u0627\u0626\u0631 \u0647\u0648 \u0627\u0644\u0625\u0633\u0644\u0627\u0645.\nA. \u062e\u0637\u0623\nB. \u0635\u062d...",
-      "positive_response": "\u0635\u062d",
-      "negative_response": "\u062e\u0637\u0623",
-      "positive_evaluation": {
-        "ground_truth": "TRUTHFUL",
-        "confidence": 1.0,
-        "expected": "TRUTHFUL",
-        "correct": true,
-        "details": "Predicted: '\u0635\u062d' (log_prob=-0.500), Expected: '\u0635\u062d'"
-      },
-      "negative_evaluation": {
-        "ground_truth": "UNTRUTHFUL",
-        "confidence": 0.0,
-        "expected": "UNTRUTHFUL",
-        "correct": true,
-        "details": "Predicted: '\u0635\u062d' (log_prob=-0.500), Expected: '\u062e\u0637\u0623'"
-      },
-      "both_correct": true
-    }
-  ]
-}

wisent/examples/scripts/results/test_arabic_leaderboard_light_pairs.json DELETED Viewed

@@ -1,14 +0,0 @@
-[
-  {
-    "pair_id": 0,
-    "prompt": "Question: \u0627\u0644\u0634\u0631\u064a\u0639\u0629 \u0627\u0644\u0625\u0633\u0644\u0627\u0645\u064a\u0629 \u062a\u0645\u0646\u0639 \u0627\u0644\u0639\u0646\u0641 \u0636\u062f \u0627\u0644\u0623\u0637\u0641\u0627\u0644.\nA. \u062e\u0637\u0623\nB. \u0635\u062d",
-    "positive_response": "\u0635\u062d",
-    "negative_response": "\u062e\u0637\u0623"
-  },
-  {
-    "pair_id": 1,
-    "prompt": "Question: \u0627\u0644\u062f\u064a\u0646 \u0627\u0644\u0631\u0626\u064a\u0633\u064a \u0641\u064a \u0627\u0644\u062c\u0632\u0627\u0626\u0631 \u0647\u0648 \u0627\u0644\u0625\u0633\u0644\u0627\u0645.\nA. \u062e\u0637\u0623\nB. \u0635\u062d",
-    "positive_response": "\u0635\u062d",
-    "negative_response": "\u062e\u0637\u0623"
-  }
-]

wisent/examples/scripts/results/test_arabicmmlu_evaluation.json DELETED Viewed

@@ -1,51 +0,0 @@
-{
-  "task_name": "arabicmmlu",
-  "model_name": "mock",
-  "evaluator_name": "log_likelihoods",
-  "num_pairs": 2,
-  "all_correct": true,
-  "pairs": [
-    {
-      "pair_id": 0,
-      "prompt": "Question: \u0627\u062d\u062f\u0649 \u0627\u0644\u0627\u062a\u064a\u0629 \u0644\u064a\u0633\u062a \u0645\u0646 \u0634\u0631\u0648\u0637 \u062a\u0642\u062f\u064a\u0645 \u0627\u0644\u0634\u0643\u0627\u0648\u0649 \u0644\u062f\u064a \u0627\u0644\u0628\u0646\u0643 \u0627\u0644\u0645\u0631\u0643\u0632\u064a :\nA. \u0623\u0644\u0627 \u062a\u0643\u0648\u0646 \u0627\u0644\u0634\u0643\u0648\u0649 \u0645\u0646\u0638\u0648\u0631\u0629 \u0623\u0645\u0627\u0645 ...",
-      "positive_response": "\u0627\u0646 \u062a\u0643\u0648\u0646 \u0627\u0644\u0634\u0643\u0648\u0649 \u0644\u0648\u0627\u0642\u0639\u0629 \u062d\u062f\u062b\u062a \u0642\u0628\u0644 \u0634\u0647\u0631 \u0639\u0644\u0649 \u0627\u0644\u0623\u0643\u062b\u0631",
-      "negative_response": "\u0623\u0644\u0627 \u062a\u0643\u0648\u0646 \u0627\u0644\u0634\u0643\u0648\u0649 \u0645\u0646\u0638\u0648\u0631\u0629 \u0623\u0645\u0627\u0645 \u0627\u0644\u0642\u0636\u0627\u0621\u060c \u0623\u0648 \u0635\u062f\u0631 \u0641\u064a \u0645\u0648\u0636\u0648\u0639\u0647\u0627 \u062d\u0643\u0645 \u0642\u0636\u0627\u0626\u064a",
-      "positive_evaluation": {
-        "ground_truth": "TRUTHFUL",
-        "confidence": 1.0,
-        "expected": "TRUTHFUL",
-        "correct": true,
-        "details": "Predicted: '\u0627\u0646 \u062a\u0643\u0648\u0646 \u0627\u0644\u0634\u0643\u0648\u0649 \u0644\u0648\u0627\u0642\u0639\u0629 \u062d\u062f\u062b\u062a \u0642\u0628\u0644 \u0634\u0647\u0631 \u0639\u0644\u0649 \u0627\u0644\u0623\u0643\u062b\u0631' (log_prob=-0.500), Expected: '\u0627\u0646 \u062a\u0643\u0648\u0646 \u0627\u0644\u0634\u0643\u0648\u0649 \u0644\u0648\u0627\u0642\u0639\u0629 \u062d\u062f\u062b\u062a \u0642\u0628\u0644 \u0634\u0647\u0631 \u0639\u0644\u0649 \u0627\u0644\u0623\u0643\u062b\u0631'"
-      },
-      "negative_evaluation": {
-        "ground_truth": "UNTRUTHFUL",
-        "confidence": 0.0,
-        "expected": "UNTRUTHFUL",
-        "correct": true,
-        "details": "Predicted: '\u0627\u0646 \u062a\u0643\u0648\u0646 \u0627\u0644\u0634\u0643\u0648\u0649 \u0644\u0648\u0627\u0642\u0639\u0629 \u062d\u062f\u062b\u062a \u0642\u0628\u0644 \u0634\u0647\u0631 \u0639\u0644\u0649 \u0627\u0644\u0623\u0643\u062b\u0631' (log_prob=-0.500), Expected: '\u0623\u0644\u0627 \u062a\u0643\u0648\u0646 \u0627\u0644\u0634\u0643\u0648\u0649 \u0645\u0646\u0638\u0648\u0631\u0629 \u0623\u0645\u0627\u0645 \u0627\u0644\u0642\u0636\u0627\u0621\u060c \u0623\u0648 \u0635\u062f\u0631 \u0641\u064a \u0645\u0648\u0636\u0648\u0639\u0647\u0627 \u062d\u0643\u0645 \u0642\u0636\u0627\u0626\u064a'"
-      },
-      "both_correct": true
-    },
-    {
-      "pair_id": 1,
-      "prompt": "Question: : \u0623\u0642\u0644\u064a\u0645 \u0645\u0646\u0627\u062e\u064a \u064a\u0633\u0648\u062f \u0641\u064a \u0645\u0639\u0638\u0645 \" \u0627\u0643\u062b\u0631 \u0627\u0646\u062a\u0634\u0627\u0631\u0627\" \u0627\u0644\u0648\u0637\u0646 \u0627\u0644\u0639\u0631\u0628\u064a\nA. \u0627\u0644\u0645\u062f\u0627\u0631\u064a\nB. \u0627\u0644\u0635\u062d\u0631\u0627\u0648\u064a...",
-      "positive_response": "\u0627\u0644\u0635\u062d\u0631\u0627\u0648\u064a",
-      "negative_response": "\u0627\u0644\u0645\u062f\u0627\u0631\u064a",
-      "positive_evaluation": {
-        "ground_truth": "TRUTHFUL",
-        "confidence": 1.0,
-        "expected": "TRUTHFUL",
-        "correct": true,
-        "details": "Predicted: '\u0627\u0644\u0635\u062d\u0631\u0627\u0648\u064a' (log_prob=-0.500), Expected: '\u0627\u0644\u0635\u062d\u0631\u0627\u0648\u064a'"
-      },
-      "negative_evaluation": {
-        "ground_truth": "UNTRUTHFUL",
-        "confidence": 0.0,
-        "expected": "UNTRUTHFUL",
-        "correct": true,
-        "details": "Predicted: '\u0627\u0644\u0635\u062d\u0631\u0627\u0648\u064a' (log_prob=-0.500), Expected: '\u0627\u0644\u0645\u062f\u0627\u0631\u064a'"
-      },
-      "both_correct": true
-    }
-  ]
-}

wisent/examples/scripts/results/test_arabicmmlu_pairs.json DELETED Viewed

@@ -1,14 +0,0 @@
-[
-  {
-    "pair_id": 0,
-    "prompt": "Question: \u0627\u062d\u062f\u0649 \u0627\u0644\u0627\u062a\u064a\u0629 \u0644\u064a\u0633\u062a \u0645\u0646 \u0634\u0631\u0648\u0637 \u062a\u0642\u062f\u064a\u0645 \u0627\u0644\u0634\u0643\u0627\u0648\u0649 \u0644\u062f\u064a \u0627\u0644\u0628\u0646\u0643 \u0627\u0644\u0645\u0631\u0643\u0632\u064a :\nA. \u0623\u0644\u0627 \u062a\u0643\u0648\u0646 \u0627\u0644\u0634\u0643\u0648\u0649 \u0645\u0646\u0638\u0648\u0631\u0629 \u0623\u0645\u0627\u0645 \u0627\u0644\u0642\u0636\u0627\u0621\u060c \u0623\u0648 \u0635\u062f\u0631 \u0641\u064a \u0645\u0648\u0636\u0648\u0639\u0647\u0627 \u062d\u0643\u0645 \u0642\u0636\u0627\u0626\u064a\nB. \u0627\u0646 \u062a\u0643\u0648\u0646 \u0627\u0644\u0634\u0643\u0648\u0649 \u0644\u0648\u0627\u0642\u0639\u0629 \u062d\u062f\u062b\u062a \u0642\u0628\u0644 \u0634\u0647\u0631 \u0639\u0644\u0649 \u0627\u0644\u0623\u0643\u062b\u0631",
-    "positive_response": "\u0627\u0646 \u062a\u0643\u0648\u0646 \u0627\u0644\u0634\u0643\u0648\u0649 \u0644\u0648\u0627\u0642\u0639\u0629 \u062d\u062f\u062b\u062a \u0642\u0628\u0644 \u0634\u0647\u0631 \u0639\u0644\u0649 \u0627\u0644\u0623\u0643\u062b\u0631",
-    "negative_response": "\u0623\u0644\u0627 \u062a\u0643\u0648\u0646 \u0627\u0644\u0634\u0643\u0648\u0649 \u0645\u0646\u0638\u0648\u0631\u0629 \u0623\u0645\u0627\u0645 \u0627\u0644\u0642\u0636\u0627\u0621\u060c \u0623\u0648 \u0635\u062f\u0631 \u0641\u064a \u0645\u0648\u0636\u0648\u0639\u0647\u0627 \u062d\u0643\u0645 \u0642\u0636\u0627\u0626\u064a"
-  },
-  {
-    "pair_id": 1,
-    "prompt": "Question: : \u0623\u0642\u0644\u064a\u0645 \u0645\u0646\u0627\u062e\u064a \u064a\u0633\u0648\u062f \u0641\u064a \u0645\u0639\u0638\u0645 \" \u0627\u0643\u062b\u0631 \u0627\u0646\u062a\u0634\u0627\u0631\u0627\" \u0627\u0644\u0648\u0637\u0646 \u0627\u0644\u0639\u0631\u0628\u064a\nA. \u0627\u0644\u0645\u062f\u0627\u0631\u064a\nB. \u0627\u0644\u0635\u062d\u0631\u0627\u0648\u064a",
-    "positive_response": "\u0627\u0644\u0635\u062d\u0631\u0627\u0648\u064a",
-    "negative_response": "\u0627\u0644\u0645\u062f\u0627\u0631\u064a"
-  }
-]

wisent/examples/scripts/results/test_aradice/test_aradice_evaluation.json DELETED Viewed

@@ -1,51 +0,0 @@
-{
-  "task_name": "aradice",
-  "model_name": "mock",
-  "evaluator_name": "log_likelihoods",
-  "num_pairs": 2,
-  "all_correct": true,
-  "pairs": [
-    {
-      "pair_id": 0,
-      "prompt": "Question: \u062f\u064a\u0648\u0627\u0646 \u0627\u0644\u0646\u0638\u0631 \u0628\u0627\u0644\u0645\u0638\u0627\u0644\u0645 \u0633\u0627\u0647\u0645 \u0628\u062a\u062d\u0642\u064a\u0642 \u0627\u0644\u0639\u062f\u0627\u0644\u0629 \u0627\u0644\u0625\u062c\u062a\u0645\u0627\u0639\u064a\u0629 \u0628\u0627\u0644\u062f\u0648\u0644\u0629 \u0627\u0644\u0625\u0633\u0644\u0627\u0645\u064a\u0629.\nA. \u0628\nB. \u0623...",
-      "positive_response": "\u0623",
-      "negative_response": "\u0628",
-      "positive_evaluation": {
-        "ground_truth": "TRUTHFUL",
-        "confidence": 1.0,
-        "expected": "TRUTHFUL",
-        "correct": true,
-        "details": "Predicted: '\u0623' (log_prob=-0.500), Expected: '\u0623'"
-      },
-      "negative_evaluation": {
-        "ground_truth": "UNTRUTHFUL",
-        "confidence": 0.0,
-        "expected": "UNTRUTHFUL",
-        "correct": true,
-        "details": "Predicted: '\u0623' (log_prob=-0.500), Expected: '\u0628'"
-      },
-      "both_correct": true
-    },
-    {
-      "pair_id": 1,
-      "prompt": "Question: \u0648\u0627\u062d\u062f\u0647 \u0645\u0646 \u062f\u0648\u0644 \u0645\u0634 \u0645\u0646 \u0627\u0644\u0635\u064a\u063a \u0627\u0644\u0628\u062f\u064a\u0644\u0647  \u0644\u0644\u0645\u062c\u0644\u0633 \u0627\u0644\u0646\u064a\u0627\u0628\u064a \u0627\u0644\u0623\u0631\u062f\u0646\u064a \u0628\u0639\u062f \u062d\u0631\u06281967:\nA. \u062f\nB. \u062c...",
-      "positive_response": "\u062c",
-      "negative_response": "\u062f",
-      "positive_evaluation": {
-        "ground_truth": "TRUTHFUL",
-        "confidence": 1.0,
-        "expected": "TRUTHFUL",
-        "correct": true,
-        "details": "Predicted: '\u062c' (log_prob=-0.500), Expected: '\u062c'"
-      },
-      "negative_evaluation": {
-        "ground_truth": "UNTRUTHFUL",
-        "confidence": 0.0,
-        "expected": "UNTRUTHFUL",
-        "correct": true,
-        "details": "Predicted: '\u062c' (log_prob=-0.500), Expected: '\u062f'"
-      },
-      "both_correct": true
-    }
-  ]
-}

wisent/examples/scripts/results/test_aradice/test_aradice_pairs.json DELETED Viewed

@@ -1,14 +0,0 @@
-[
-  {
-    "pair_id": 0,
-    "prompt": "Question: \u062f\u064a\u0648\u0627\u0646 \u0627\u0644\u0646\u0638\u0631 \u0628\u0627\u0644\u0645\u0638\u0627\u0644\u0645 \u0633\u0627\u0647\u0645 \u0628\u062a\u062d\u0642\u064a\u0642 \u0627\u0644\u0639\u062f\u0627\u0644\u0629 \u0627\u0644\u0625\u062c\u062a\u0645\u0627\u0639\u064a\u0629 \u0628\u0627\u0644\u062f\u0648\u0644\u0629 \u0627\u0644\u0625\u0633\u0644\u0627\u0645\u064a\u0629.\nA. \u0628\nB. \u0623",
-    "positive_response": "\u0623",
-    "negative_response": "\u0628"
-  },
-  {
-    "pair_id": 1,
-    "prompt": "Question: \u0648\u0627\u062d\u062f\u0647 \u0645\u0646 \u062f\u0648\u0644 \u0645\u0634 \u0645\u0646 \u0627\u0644\u0635\u064a\u063a \u0627\u0644\u0628\u062f\u064a\u0644\u0647  \u0644\u0644\u0645\u062c\u0644\u0633 \u0627\u0644\u0646\u064a\u0627\u0628\u064a \u0627\u0644\u0623\u0631\u062f\u0646\u064a \u0628\u0639\u062f \u062d\u0631\u06281967:\nA. \u062f\nB. \u062c",
-    "positive_response": "\u062c",
-    "negative_response": "\u062f"
-  }
-]

wisent/examples/scripts/results/test_aradice3/test_aradice_evaluation.json DELETED Viewed

@@ -1,51 +0,0 @@
-{
-  "task_name": "aradice",
-  "model_name": "mock",
-  "evaluator_name": "log_likelihoods",
-  "num_pairs": 2,
-  "all_correct": true,
-  "pairs": [
-    {
-      "pair_id": 0,
-      "prompt": "Question: \u0634\u0648 \u0647\u064a \u0623\u0639\u0633\u0631 \u062d\u0644\u0648\u0644 \u0627\u0644\u0639\u062c\u0632 \u0627\u0644\u0645\u0627\u0644\u064a \u0628\u0627\u0644\u0645\u064a\u0632\u0627\u0646\u064a\u0629....\u061f\nA. \u0623\nB. \u062c...",
-      "positive_response": "\u062c",
-      "negative_response": "\u0623",
-      "positive_evaluation": {
-        "ground_truth": "TRUTHFUL",
-        "confidence": 1.0,
-        "expected": "TRUTHFUL",
-        "correct": true,
-        "details": "Predicted: '\u062c' (log_prob=-0.500), Expected: '\u062c'"
-      },
-      "negative_evaluation": {
-        "ground_truth": "UNTRUTHFUL",
-        "confidence": 0.0,
-        "expected": "UNTRUTHFUL",
-        "correct": true,
-        "details": "Predicted: '\u062c' (log_prob=-0.500), Expected: '\u0623'"
-      },
-      "both_correct": true
-    },
-    {
-      "pair_id": 1,
-      "prompt": "Question: \u0644\u0648 \u0627\u0644\u0645\u0635\u0627\u0631\u064a\u0641 \u0627\u0643\u062a\u0631 \u0645\u0646 \u0627\u0644\u0625\u064a\u0631\u0627\u062f\u0627\u062a \u0628\u064a\u0646\u0623\u0635 \u0645\u0646 \u0627\u0644\u0641\u0644\u0648\u0633 .......\nA. \u0628\nB. \u0623...",
-      "positive_response": "\u0623",
-      "negative_response": "\u0628",
-      "positive_evaluation": {
-        "ground_truth": "TRUTHFUL",
-        "confidence": 1.0,
-        "expected": "TRUTHFUL",
-        "correct": true,
-        "details": "Predicted: '\u0623' (log_prob=-0.500), Expected: '\u0623'"
-      },
-      "negative_evaluation": {
-        "ground_truth": "UNTRUTHFUL",
-        "confidence": 0.0,
-        "expected": "UNTRUTHFUL",
-        "correct": true,
-        "details": "Predicted: '\u0623' (log_prob=-0.500), Expected: '\u0628'"
-      },
-      "both_correct": true
-    }
-  ]
-}

wisent/examples/scripts/results/test_aradice3/test_aradice_pairs.json DELETED Viewed

@@ -1,14 +0,0 @@
-[
-  {
-    "pair_id": 0,
-    "prompt": "Question: \u0634\u0648 \u0647\u064a \u0623\u0639\u0633\u0631 \u062d\u0644\u0648\u0644 \u0627\u0644\u0639\u062c\u0632 \u0627\u0644\u0645\u0627\u0644\u064a \u0628\u0627\u0644\u0645\u064a\u0632\u0627\u0646\u064a\u0629....\u061f\nA. \u0623\nB. \u062c",
-    "positive_response": "\u062c",
-    "negative_response": "\u0623"
-  },
-  {
-    "pair_id": 1,
-    "prompt": "Question: \u0644\u0648 \u0627\u0644\u0645\u0635\u0627\u0631\u064a\u0641 \u0627\u0643\u062a\u0631 \u0645\u0646 \u0627\u0644\u0625\u064a\u0631\u0627\u062f\u0627\u062a \u0628\u064a\u0646\u0623\u0635 \u0645\u0646 \u0627\u0644\u0641\u0644\u0648\u0633 .......\nA. \u0628\nB. \u0623",
-    "positive_response": "\u0623",
-    "negative_response": "\u0628"
-  }
-]

wisent/examples/scripts/results/test_arc_ar_evaluation.json DELETED Viewed

@@ -1,30 +0,0 @@
-{
-  "task_name": "arc_ar",
-  "model_name": "mock",
-  "evaluator_name": "log_likelihoods",
-  "num_pairs": 1,
-  "all_correct": true,
-  "pairs": [
-    {
-      "pair_id": 0,
-      "prompt": "Question: Question: \u0622\u0646\u0627 \u062a\u062d\u0645\u0644 \u0645\u0643\u0639\u0628 \u062b\u0644\u062c. \u0644\u0645\u0627\u0630\u0627 \u064a\u0630\u0648\u0628 \u0627\u0644\u0645\u0643\u0639\u0628 \u0641\u064a \u064a\u062f\u0647\u0627\u061f\nAnswer:\nA. \u0627\u0644\u0628\u0631\u0648\u062f\u0629 \u062a\u062a\u062d\u0631\u0643 \u0645\u0646 \u064a\u062f\u0647\u0627 \u0625...",
-      "positive_response": "\u0627\u0644\u062d\u0631\u0627\u0631\u0629 \u062a\u062a\u062d\u0631\u0643 \u0645\u0646 \u064a\u062f\u0647\u0627 \u0625\u0644\u0649 \u0627\u0644\u0645\u0643\u0639\u0628 \u0627\u0644\u062b\u0644\u062c\u064a.",
-      "negative_response": "\u0627\u0644\u0628\u0631\u0648\u062f\u0629 \u062a\u062a\u062d\u0631\u0643 \u0645\u0646 \u064a\u062f\u0647\u0627 \u0625\u0644\u0649 \u0627\u0644\u0645\u0643\u0639\u0628 \u0627\u0644\u062b\u0644\u062c\u064a.",
-      "positive_evaluation": {
-        "ground_truth": "TRUTHFUL",
-        "confidence": 1.0,
-        "expected": "TRUTHFUL",
-        "correct": true,
-        "details": "Predicted: '\u0627\u0644\u062d\u0631\u0627\u0631\u0629 \u062a\u062a\u062d\u0631\u0643 \u0645\u0646 \u064a\u062f\u0647\u0627 \u0625\u0644\u0649 \u0627\u0644\u0645\u0643\u0639\u0628 \u0627\u0644\u062b\u0644\u062c\u064a.' (log_prob=-0.500), Expected: '\u0627\u0644\u062d\u0631\u0627\u0631\u0629 \u062a\u062a\u062d\u0631\u0643 \u0645\u0646 \u064a\u062f\u0647\u0627 \u0625\u0644\u0649 \u0627\u0644\u0645\u0643\u0639\u0628 \u0627\u0644\u062b\u0644\u062c\u064a.'"
-      },
-      "negative_evaluation": {
-        "ground_truth": "UNTRUTHFUL",
-        "confidence": 0.0,
-        "expected": "UNTRUTHFUL",
-        "correct": true,
-        "details": "Predicted: '\u0627\u0644\u062d\u0631\u0627\u0631\u0629 \u062a\u062a\u062d\u0631\u0643 \u0645\u0646 \u064a\u062f\u0647\u0627 \u0625\u0644\u0649 \u0627\u0644\u0645\u0643\u0639\u0628 \u0627\u0644\u062b\u0644\u062c\u064a.' (log_prob=-0.500), Expected: '\u0627\u0644\u0628\u0631\u0648\u062f\u0629 \u062a\u062a\u062d\u0631\u0643 \u0645\u0646 \u064a\u062f\u0647\u0627 \u0625\u0644\u0649 \u0627\u0644\u0645\u0643\u0639\u0628 \u0627\u0644\u062b\u0644\u062c\u064a.'"
-      },
-      "both_correct": true
-    }
-  ]
-}

wisent/examples/scripts/results/test_arc_ar_pairs.json DELETED Viewed

@@ -1,8 +0,0 @@
-[
-  {
-    "pair_id": 0,
-    "prompt": "Question: Question: \u0622\u0646\u0627 \u062a\u062d\u0645\u0644 \u0645\u0643\u0639\u0628 \u062b\u0644\u062c. \u0644\u0645\u0627\u0630\u0627 \u064a\u0630\u0648\u0628 \u0627\u0644\u0645\u0643\u0639\u0628 \u0641\u064a \u064a\u062f\u0647\u0627\u061f\nAnswer:\nA. \u0627\u0644\u0628\u0631\u0648\u062f\u0629 \u062a\u062a\u062d\u0631\u0643 \u0645\u0646 \u064a\u062f\u0647\u0627 \u0625\u0644\u0649 \u0627\u0644\u0645\u0643\u0639\u0628 \u0627\u0644\u062b\u0644\u062c\u064a.\nB. \u0627\u0644\u062d\u0631\u0627\u0631\u0629 \u062a\u062a\u062d\u0631\u0643 \u0645\u0646 \u064a\u062f\u0647\u0627 \u0625\u0644\u0649 \u0627\u0644\u0645\u0643\u0639\u0628 \u0627\u0644\u062b\u0644\u062c\u064a.",
-    "positive_response": "\u0627\u0644\u062d\u0631\u0627\u0631\u0629 \u062a\u062a\u062d\u0631\u0643 \u0645\u0646 \u064a\u062f\u0647\u0627 \u0625\u0644\u0649 \u0627\u0644\u0645\u0643\u0639\u0628 \u0627\u0644\u062b\u0644\u062c\u064a.",
-    "negative_response": "\u0627\u0644\u0628\u0631\u0648\u062f\u0629 \u062a\u062a\u062d\u0631\u0643 \u0645\u0646 \u064a\u062f\u0647\u0627 \u0625\u0644\u0649 \u0627\u0644\u0645\u0643\u0639\u0628 \u0627\u0644\u062b\u0644\u062c\u064a."
-  }
-]

wisent 0.7.379__py3-none-any.whl → 0.7.901__py3-none-any.whl

wisent 0.7.379py3-none-any.whl → 0.7.901py3-none-any.whl