PyPI - wisent - Versions diffs - 0.7.379__py3-none-any.whl → 0.7.701__py3-none-any.whl - Mend

wisent 0.7.379py3-none-any.whl → 0.7.701py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (725) hide show

wisent/examples/scripts/results/test_catalanqa_evaluation.json DELETED Viewed

@@ -1,30 +0,0 @@
-{
-  "task_name": "catalanqa",
-  "model_name": "mock",
-  "evaluator_name": "log_likelihoods",
-  "num_pairs": 1,
-  "all_correct": true,
-  "pairs": [
-    {
-      "pair_id": 0,
-      "prompt": "Context: A finals d'agost fou nomenat un nou primer ministre, Jafar Sharif-Emami, que va revertir al...",
-      "positive_response": "milers",
-      "negative_response": "A finals d'agost fou",
-      "positive_evaluation": {
-        "ground_truth": "TRUTHFUL",
-        "confidence": 1.0,
-        "expected": "TRUTHFUL",
-        "correct": true,
-        "details": "Predicted: 'milers' (log_prob=-0.500), Expected: 'milers'"
-      },
-      "negative_evaluation": {
-        "ground_truth": "UNTRUTHFUL",
-        "confidence": 0.0,
-        "expected": "UNTRUTHFUL",
-        "correct": true,
-        "details": "Predicted: 'milers' (log_prob=-0.500), Expected: 'A finals d'agost fou'"
-      },
-      "both_correct": true
-    }
-  ]
-}

wisent/examples/scripts/results/test_catalanqa_pairs.json DELETED Viewed

@@ -1,8 +0,0 @@
-[
-  {
-    "pair_id": 0,
-    "prompt": "Context: A finals d'agost fou nomenat un nou primer ministre, Jafar Sharif-Emami, que va revertir algunes de les pol\u00edtiques del Xa. Els casinos foren tancats, el calendari imperial abolit, l'activitat dels partits pol\u00edtics autoritzada... Tot fou debades. Cap al setembre, el pa\u00eds s'estava desestabilitzant r\u00e0pidament, i les grans protestes s'estaven convertint en un esdeveniment habitual. El Xa va promulgar la llei marcial i va prohibir totes les manifestacions, per\u00f2 el 8 de setembre milers de manifestants es van reunir a Teheran. Les forces de seguretat van disparar i en van matar unes quantes dotzenes en el qual es va con\u00e8ixer com el \"Divendres Negre\", tal com es pot veure en aquesta successi\u00f3 de fotografies:\n\nQuestion: Quants manifestants es van reunir a Teheran?\nAnswer:",
-    "positive_response": "milers",
-    "negative_response": "A finals d'agost fou"
-  }
-]

wisent/examples/scripts/results/test_catcola_evaluation.json DELETED Viewed

@@ -1,30 +0,0 @@
-{
-  "task_name": "catcola",
-  "model_name": "mock",
-  "evaluator_name": "log_likelihoods",
-  "num_pairs": 1,
-  "all_correct": true,
-  "pairs": [
-    {
-      "pair_id": 0,
-      "prompt": "Sentence: T'ho ha explicat tant a tu com a ella.\n\nIs this sentence grammatically acceptable in Catal...",
-      "positive_response": "acceptable",
-      "negative_response": "unacceptable",
-      "positive_evaluation": {
-        "ground_truth": "TRUTHFUL",
-        "confidence": 1.0,
-        "expected": "TRUTHFUL",
-        "correct": true,
-        "details": "Predicted: 'acceptable' (log_prob=-0.500), Expected: 'acceptable'"
-      },
-      "negative_evaluation": {
-        "ground_truth": "UNTRUTHFUL",
-        "confidence": 0.0,
-        "expected": "UNTRUTHFUL",
-        "correct": true,
-        "details": "Predicted: 'acceptable' (log_prob=-0.500), Expected: 'unacceptable'"
-      },
-      "both_correct": true
-    }
-  ]
-}

wisent/examples/scripts/results/test_catcola_pairs.json DELETED Viewed

@@ -1,8 +0,0 @@
-[
-  {
-    "pair_id": 0,
-    "prompt": "Sentence: T'ho ha explicat tant a tu com a ella.\n\nIs this sentence grammatically acceptable in Catalan?\nAnswer:",
-    "positive_response": "acceptable",
-    "negative_response": "unacceptable"
-  }
-]

wisent/examples/scripts/results/test_cb_evaluation.json DELETED Viewed

@@ -1,30 +0,0 @@
-{
-  "task_name": "cb",
-  "model_name": "mock",
-  "evaluator_name": "log_likelihoods",
-  "num_pairs": 1,
-  "all_correct": true,
-  "pairs": [
-    {
-      "pair_id": 0,
-      "prompt": "Valence the void-brain, Valence the virtuous valet. Why couldn't the figger choose his own portion o...",
-      "positive_response": "False",
-      "negative_response": "Neither",
-      "positive_evaluation": {
-        "ground_truth": "TRUTHFUL",
-        "confidence": 1.0,
-        "expected": "TRUTHFUL",
-        "correct": true,
-        "details": "Predicted: 'False' (log_prob=-0.500), Expected: 'False'"
-      },
-      "negative_evaluation": {
-        "ground_truth": "UNTRUTHFUL",
-        "confidence": 0.0,
-        "expected": "UNTRUTHFUL",
-        "correct": true,
-        "details": "Predicted: 'False' (log_prob=-0.500), Expected: 'Neither'"
-      },
-      "both_correct": true
-    }
-  ]
-}

wisent/examples/scripts/results/test_cb_pairs.json DELETED Viewed

@@ -1,8 +0,0 @@
-[
-  {
-    "pair_id": 0,
-    "prompt": "Valence the void-brain, Valence the virtuous valet. Why couldn't the figger choose his own portion of titanic anatomy to shaft? Did he think he was helping?\nQuestion: Valence was helping. True, False, or Neither?",
-    "positive_response": "False",
-    "negative_response": "Neither"
-  }
-]

wisent/examples/scripts/results/test_ceval/test_ceval_evaluation.json DELETED Viewed

@@ -1,51 +0,0 @@
-{
-  "task_name": "ceval",
-  "model_name": "mock",
-  "evaluator_name": "log_likelihoods",
-  "num_pairs": 2,
-  "all_correct": true,
-  "pairs": [
-    {
-      "pair_id": 0,
-      "prompt": "Question: \u901a\u8fc710\uff5e35kV\u7535\u538b\u7b49\u7ea7\u5e76\u7f51\u7684\u5149\u4f0f\u53d1\u7535\u7ad9\u529f\u7387\u56e0\u6570\u5e94\u80fd\u5728____\u8303\u56f4\u5185\u8fde\u7eed\u53ef\u8c03\uff0c\u6709\u7279\u6b8a\u8981\u6c42\u65f6\uff0c\u53ef\u505a\u9002\u5f53\u8c03\u6574\u4ee5\u7a33\u5b9a\u7535\u538b\u6c34\u5e73\u3002\nA. \u8d85\u524d0.95\uff5e\u6ede\u540e0.9\nB. \u8d85\u524d0.98\uff5e\u6ede\u540e...",
-      "positive_response": "\u8d85\u524d0.98\uff5e\u6ede\u540e0.98",
-      "negative_response": "\u8d85\u524d0.95\uff5e\u6ede\u540e0.9",
-      "positive_evaluation": {
-        "ground_truth": "TRUTHFUL",
-        "confidence": 1.0,
-        "expected": "TRUTHFUL",
-        "correct": true,
-        "details": "Predicted: '\u8d85\u524d0.98\uff5e\u6ede\u540e0.98' (log_prob=-0.500), Expected: '\u8d85\u524d0.98\uff5e\u6ede\u540e0.98'"
-      },
-      "negative_evaluation": {
-        "ground_truth": "UNTRUTHFUL",
-        "confidence": 0.0,
-        "expected": "UNTRUTHFUL",
-        "correct": true,
-        "details": "Predicted: '\u8d85\u524d0.98\uff5e\u6ede\u540e0.98' (log_prob=-0.500), Expected: '\u8d85\u524d0.95\uff5e\u6ede\u540e0.9'"
-      },
-      "both_correct": true
-    },
-    {
-      "pair_id": 1,
-      "prompt": "Question: cGMP\u80fd\u6fc0\u6d3b____\nA. \u916a\u6c28\u9178\u86cb\u767d\u6fc0\u9176\nB. \u86cb\u767d\u6fc0\u9176G...",
-      "positive_response": "\u86cb\u767d\u6fc0\u9176G",
-      "negative_response": "\u916a\u6c28\u9178\u86cb\u767d\u6fc0\u9176",
-      "positive_evaluation": {
-        "ground_truth": "TRUTHFUL",
-        "confidence": 1.0,
-        "expected": "TRUTHFUL",
-        "correct": true,
-        "details": "Predicted: '\u86cb\u767d\u6fc0\u9176G' (log_prob=-0.500), Expected: '\u86cb\u767d\u6fc0\u9176G'"
-      },
-      "negative_evaluation": {
-        "ground_truth": "UNTRUTHFUL",
-        "confidence": 0.0,
-        "expected": "UNTRUTHFUL",
-        "correct": true,
-        "details": "Predicted: '\u86cb\u767d\u6fc0\u9176G' (log_prob=-0.500), Expected: '\u916a\u6c28\u9178\u86cb\u767d\u6fc0\u9176'"
-      },
-      "both_correct": true
-    }
-  ]
-}

wisent/examples/scripts/results/test_ceval/test_ceval_pairs.json DELETED Viewed

@@ -1,14 +0,0 @@
-[
-  {
-    "pair_id": 0,
-    "prompt": "Question: \u901a\u8fc710\uff5e35kV\u7535\u538b\u7b49\u7ea7\u5e76\u7f51\u7684\u5149\u4f0f\u53d1\u7535\u7ad9\u529f\u7387\u56e0\u6570\u5e94\u80fd\u5728____\u8303\u56f4\u5185\u8fde\u7eed\u53ef\u8c03\uff0c\u6709\u7279\u6b8a\u8981\u6c42\u65f6\uff0c\u53ef\u505a\u9002\u5f53\u8c03\u6574\u4ee5\u7a33\u5b9a\u7535\u538b\u6c34\u5e73\u3002\nA. \u8d85\u524d0.95\uff5e\u6ede\u540e0.9\nB. \u8d85\u524d0.98\uff5e\u6ede\u540e0.98",
-    "positive_response": "\u8d85\u524d0.98\uff5e\u6ede\u540e0.98",
-    "negative_response": "\u8d85\u524d0.95\uff5e\u6ede\u540e0.9"
-  },
-  {
-    "pair_id": 1,
-    "prompt": "Question: cGMP\u80fd\u6fc0\u6d3b____\nA. \u916a\u6c28\u9178\u86cb\u767d\u6fc0\u9176\nB. \u86cb\u767d\u6fc0\u9176G",
-    "positive_response": "\u86cb\u767d\u6fc0\u9176G",
-    "negative_response": "\u916a\u6c28\u9178\u86cb\u767d\u6fc0\u9176"
-  }
-]

wisent/examples/scripts/results/test_ceval_accountant/test_ceval-valid_accountant_evaluation.json DELETED Viewed

@@ -1,30 +0,0 @@
-{
-  "task_name": "ceval-valid_accountant",
-  "model_name": "mock",
-  "evaluator_name": "log_likelihoods",
-  "num_pairs": 1,
-  "all_correct": true,
-  "pairs": [
-    {
-      "pair_id": 0,
-      "prompt": "Question: \u4e0b\u5217\u5173\u4e8e\u7a0e\u6cd5\u57fa\u672c\u539f\u5219\u7684\u8868\u8ff0\u4e2d\uff0c\u4e0d\u6b63\u786e\u7684\u662f____\u3002\nA. \u7a0e\u6536\u6cd5\u5b9a\u539f\u5219\u5305\u62ec\u7a0e\u6536\u8981\u4ef6\u6cd5\u5b9a\u539f\u5219\u548c\u7a0e\u52a1\u5408\u6cd5\u6027\u539f\u5219\nB. \u7a0e\u52a1\u673a\u5173\u6309\u6cd5\u5b9a\u7a0b\u5e8f\u4f9d\u6cd5\u5f81\u7a0e\uff0c\u53ef\u4ee5\u81ea\u7531\u505a\u51fa\u51cf\u5f81\u3001\u505c\u5f81\u6216\u514d\u5f81\u7a0e\u6b3e\u7684\u51b3\u5b9a...",
-      "positive_response": "\u7a0e\u52a1\u673a\u5173\u6309\u6cd5\u5b9a\u7a0b\u5e8f\u4f9d\u6cd5\u5f81\u7a0e\uff0c\u53ef\u4ee5\u81ea\u7531\u505a\u51fa\u51cf\u5f81\u3001\u505c\u5f81\u6216\u514d\u5f81\u7a0e\u6b3e\u7684\u51b3\u5b9a",
-      "negative_response": "\u7a0e\u6536\u6cd5\u5b9a\u539f\u5219\u5305\u62ec\u7a0e\u6536\u8981\u4ef6\u6cd5\u5b9a\u539f\u5219\u548c\u7a0e\u52a1\u5408\u6cd5\u6027\u539f\u5219",
-      "positive_evaluation": {
-        "ground_truth": "TRUTHFUL",
-        "confidence": 1.0,
-        "expected": "TRUTHFUL",
-        "correct": true,
-        "details": "Predicted: '\u7a0e\u52a1\u673a\u5173\u6309\u6cd5\u5b9a\u7a0b\u5e8f\u4f9d\u6cd5\u5f81\u7a0e\uff0c\u53ef\u4ee5\u81ea\u7531\u505a\u51fa\u51cf\u5f81\u3001\u505c\u5f81\u6216\u514d\u5f81\u7a0e\u6b3e\u7684\u51b3\u5b9a' (log_prob=-0.500), Expected: '\u7a0e\u52a1\u673a\u5173\u6309\u6cd5\u5b9a\u7a0b\u5e8f\u4f9d\u6cd5\u5f81\u7a0e\uff0c\u53ef\u4ee5\u81ea\u7531\u505a\u51fa\u51cf\u5f81\u3001\u505c\u5f81\u6216\u514d\u5f81\u7a0e\u6b3e\u7684\u51b3\u5b9a'"
-      },
-      "negative_evaluation": {
-        "ground_truth": "UNTRUTHFUL",
-        "confidence": 0.0,
-        "expected": "UNTRUTHFUL",
-        "correct": true,
-        "details": "Predicted: '\u7a0e\u52a1\u673a\u5173\u6309\u6cd5\u5b9a\u7a0b\u5e8f\u4f9d\u6cd5\u5f81\u7a0e\uff0c\u53ef\u4ee5\u81ea\u7531\u505a\u51fa\u51cf\u5f81\u3001\u505c\u5f81\u6216\u514d\u5f81\u7a0e\u6b3e\u7684\u51b3\u5b9a' (log_prob=-0.500), Expected: '\u7a0e\u6536\u6cd5\u5b9a\u539f\u5219\u5305\u62ec\u7a0e\u6536\u8981\u4ef6\u6cd5\u5b9a\u539f\u5219\u548c\u7a0e\u52a1\u5408\u6cd5\u6027\u539f\u5219'"
-      },
-      "both_correct": true
-    }
-  ]
-}

wisent/examples/scripts/results/test_ceval_accountant/test_ceval-valid_accountant_pairs.json DELETED Viewed

@@ -1,8 +0,0 @@
-[
-  {
-    "pair_id": 0,
-    "prompt": "Question: \u4e0b\u5217\u5173\u4e8e\u7a0e\u6cd5\u57fa\u672c\u539f\u5219\u7684\u8868\u8ff0\u4e2d\uff0c\u4e0d\u6b63\u786e\u7684\u662f____\u3002\nA. \u7a0e\u6536\u6cd5\u5b9a\u539f\u5219\u5305\u62ec\u7a0e\u6536\u8981\u4ef6\u6cd5\u5b9a\u539f\u5219\u548c\u7a0e\u52a1\u5408\u6cd5\u6027\u539f\u5219\nB. \u7a0e\u52a1\u673a\u5173\u6309\u6cd5\u5b9a\u7a0b\u5e8f\u4f9d\u6cd5\u5f81\u7a0e\uff0c\u53ef\u4ee5\u81ea\u7531\u505a\u51fa\u51cf\u5f81\u3001\u505c\u5f81\u6216\u514d\u5f81\u7a0e\u6b3e\u7684\u51b3\u5b9a",
-    "positive_response": "\u7a0e\u52a1\u673a\u5173\u6309\u6cd5\u5b9a\u7a0b\u5e8f\u4f9d\u6cd5\u5f81\u7a0e\uff0c\u53ef\u4ee5\u81ea\u7531\u505a\u51fa\u51cf\u5f81\u3001\u505c\u5f81\u6216\u514d\u5f81\u7a0e\u6b3e\u7684\u51b3\u5b9a",
-    "negative_response": "\u7a0e\u6536\u6cd5\u5b9a\u539f\u5219\u5305\u62ec\u7a0e\u6536\u8981\u4ef6\u6cd5\u5b9a\u539f\u5219\u548c\u7a0e\u52a1\u5408\u6cd5\u6027\u539f\u5219"
-  }
-]

wisent/examples/scripts/results/test_ceval_evaluation.json DELETED Viewed

@@ -1,51 +0,0 @@
-{
-  "task_name": "ceval",
-  "model_name": "mock",
-  "evaluator_name": "log_likelihoods",
-  "num_pairs": 2,
-  "all_correct": true,
-  "pairs": [
-    {
-      "pair_id": 0,
-      "prompt": "Question: \u901a\u8fc710\uff5e35kV\u7535\u538b\u7b49\u7ea7\u5e76\u7f51\u7684\u5149\u4f0f\u53d1\u7535\u7ad9\u529f\u7387\u56e0\u6570\u5e94\u80fd\u5728____\u8303\u56f4\u5185\u8fde\u7eed\u53ef\u8c03\uff0c\u6709\u7279\u6b8a\u8981\u6c42\u65f6\uff0c\u53ef\u505a\u9002\u5f53\u8c03\u6574\u4ee5\u7a33\u5b9a\u7535\u538b\u6c34\u5e73\u3002\nA. \u8d85\u524d0.95\uff5e\u6ede\u540e0.9\nB. \u8d85\u524d0.98\uff5e\u6ede\u540e...",
-      "positive_response": "\u8d85\u524d0.98\uff5e\u6ede\u540e0.98",
-      "negative_response": "\u8d85\u524d0.95\uff5e\u6ede\u540e0.9",
-      "positive_evaluation": {
-        "ground_truth": "TRUTHFUL",
-        "confidence": 1.0,
-        "expected": "TRUTHFUL",
-        "correct": true,
-        "details": "Predicted: '\u8d85\u524d0.98\uff5e\u6ede\u540e0.98' (log_prob=-0.500), Expected: '\u8d85\u524d0.98\uff5e\u6ede\u540e0.98'"
-      },
-      "negative_evaluation": {
-        "ground_truth": "UNTRUTHFUL",
-        "confidence": 0.0,
-        "expected": "UNTRUTHFUL",
-        "correct": true,
-        "details": "Predicted: '\u8d85\u524d0.98\uff5e\u6ede\u540e0.98' (log_prob=-0.500), Expected: '\u8d85\u524d0.95\uff5e\u6ede\u540e0.9'"
-      },
-      "both_correct": true
-    },
-    {
-      "pair_id": 1,
-      "prompt": "Question: cGMP\u80fd\u6fc0\u6d3b____\nA. \u916a\u6c28\u9178\u86cb\u767d\u6fc0\u9176\nB. \u86cb\u767d\u6fc0\u9176G...",
-      "positive_response": "\u86cb\u767d\u6fc0\u9176G",
-      "negative_response": "\u916a\u6c28\u9178\u86cb\u767d\u6fc0\u9176",
-      "positive_evaluation": {
-        "ground_truth": "TRUTHFUL",
-        "confidence": 1.0,
-        "expected": "TRUTHFUL",
-        "correct": true,
-        "details": "Predicted: '\u86cb\u767d\u6fc0\u9176G' (log_prob=-0.500), Expected: '\u86cb\u767d\u6fc0\u9176G'"
-      },
-      "negative_evaluation": {
-        "ground_truth": "UNTRUTHFUL",
-        "confidence": 0.0,
-        "expected": "UNTRUTHFUL",
-        "correct": true,
-        "details": "Predicted: '\u86cb\u767d\u6fc0\u9176G' (log_prob=-0.500), Expected: '\u916a\u6c28\u9178\u86cb\u767d\u6fc0\u9176'"
-      },
-      "both_correct": true
-    }
-  ]
-}

wisent/examples/scripts/results/test_ceval_pairs.json DELETED Viewed

@@ -1,14 +0,0 @@
-[
-  {
-    "pair_id": 0,
-    "prompt": "Question: \u901a\u8fc710\uff5e35kV\u7535\u538b\u7b49\u7ea7\u5e76\u7f51\u7684\u5149\u4f0f\u53d1\u7535\u7ad9\u529f\u7387\u56e0\u6570\u5e94\u80fd\u5728____\u8303\u56f4\u5185\u8fde\u7eed\u53ef\u8c03\uff0c\u6709\u7279\u6b8a\u8981\u6c42\u65f6\uff0c\u53ef\u505a\u9002\u5f53\u8c03\u6574\u4ee5\u7a33\u5b9a\u7535\u538b\u6c34\u5e73\u3002\nA. \u8d85\u524d0.95\uff5e\u6ede\u540e0.9\nB. \u8d85\u524d0.98\uff5e\u6ede\u540e0.98",
-    "positive_response": "\u8d85\u524d0.98\uff5e\u6ede\u540e0.98",
-    "negative_response": "\u8d85\u524d0.95\uff5e\u6ede\u540e0.9"
-  },
-  {
-    "pair_id": 1,
-    "prompt": "Question: cGMP\u80fd\u6fc0\u6d3b____\nA. \u916a\u6c28\u9178\u86cb\u767d\u6fc0\u9176\nB. \u86cb\u767d\u6fc0\u9176G",
-    "positive_response": "\u86cb\u767d\u6fc0\u9176G",
-    "negative_response": "\u916a\u6c28\u9178\u86cb\u767d\u6fc0\u9176"
-  }
-]

wisent/examples/scripts/results/test_ceval_valid/test_ceval_valid_evaluation.json DELETED Viewed

@@ -1,51 +0,0 @@
-{
-  "task_name": "ceval_valid",
-  "model_name": "mock",
-  "evaluator_name": "log_likelihoods",
-  "num_pairs": 2,
-  "all_correct": true,
-  "pairs": [
-    {
-      "pair_id": 0,
-      "prompt": "Question: \u901a\u8fc710\uff5e35kV\u7535\u538b\u7b49\u7ea7\u5e76\u7f51\u7684\u5149\u4f0f\u53d1\u7535\u7ad9\u529f\u7387\u56e0\u6570\u5e94\u80fd\u5728____\u8303\u56f4\u5185\u8fde\u7eed\u53ef\u8c03\uff0c\u6709\u7279\u6b8a\u8981\u6c42\u65f6\uff0c\u53ef\u505a\u9002\u5f53\u8c03\u6574\u4ee5\u7a33\u5b9a\u7535\u538b\u6c34\u5e73\u3002\nA. \u8d85\u524d0.95\uff5e\u6ede\u540e0.9\nB. \u8d85\u524d0.98\uff5e\u6ede\u540e...",
-      "positive_response": "\u8d85\u524d0.98\uff5e\u6ede\u540e0.98",
-      "negative_response": "\u8d85\u524d0.95\uff5e\u6ede\u540e0.9",
-      "positive_evaluation": {
-        "ground_truth": "TRUTHFUL",
-        "confidence": 1.0,
-        "expected": "TRUTHFUL",
-        "correct": true,
-        "details": "Predicted: '\u8d85\u524d0.98\uff5e\u6ede\u540e0.98' (log_prob=-0.500), Expected: '\u8d85\u524d0.98\uff5e\u6ede\u540e0.98'"
-      },
-      "negative_evaluation": {
-        "ground_truth": "UNTRUTHFUL",
-        "confidence": 0.0,
-        "expected": "UNTRUTHFUL",
-        "correct": true,
-        "details": "Predicted: '\u8d85\u524d0.98\uff5e\u6ede\u540e0.98' (log_prob=-0.500), Expected: '\u8d85\u524d0.95\uff5e\u6ede\u540e0.9'"
-      },
-      "both_correct": true
-    },
-    {
-      "pair_id": 1,
-      "prompt": "Question: cGMP\u80fd\u6fc0\u6d3b____\nA. \u916a\u6c28\u9178\u86cb\u767d\u6fc0\u9176\nB. \u86cb\u767d\u6fc0\u9176G...",
-      "positive_response": "\u86cb\u767d\u6fc0\u9176G",
-      "negative_response": "\u916a\u6c28\u9178\u86cb\u767d\u6fc0\u9176",
-      "positive_evaluation": {
-        "ground_truth": "TRUTHFUL",
-        "confidence": 1.0,
-        "expected": "TRUTHFUL",
-        "correct": true,
-        "details": "Predicted: '\u86cb\u767d\u6fc0\u9176G' (log_prob=-0.500), Expected: '\u86cb\u767d\u6fc0\u9176G'"
-      },
-      "negative_evaluation": {
-        "ground_truth": "UNTRUTHFUL",
-        "confidence": 0.0,
-        "expected": "UNTRUTHFUL",
-        "correct": true,
-        "details": "Predicted: '\u86cb\u767d\u6fc0\u9176G' (log_prob=-0.500), Expected: '\u916a\u6c28\u9178\u86cb\u767d\u6fc0\u9176'"
-      },
-      "both_correct": true
-    }
-  ]
-}

wisent/examples/scripts/results/test_ceval_valid/test_ceval_valid_pairs.json DELETED Viewed

@@ -1,14 +0,0 @@
-[
-  {
-    "pair_id": 0,
-    "prompt": "Question: \u901a\u8fc710\uff5e35kV\u7535\u538b\u7b49\u7ea7\u5e76\u7f51\u7684\u5149\u4f0f\u53d1\u7535\u7ad9\u529f\u7387\u56e0\u6570\u5e94\u80fd\u5728____\u8303\u56f4\u5185\u8fde\u7eed\u53ef\u8c03\uff0c\u6709\u7279\u6b8a\u8981\u6c42\u65f6\uff0c\u53ef\u505a\u9002\u5f53\u8c03\u6574\u4ee5\u7a33\u5b9a\u7535\u538b\u6c34\u5e73\u3002\nA. \u8d85\u524d0.95\uff5e\u6ede\u540e0.9\nB. \u8d85\u524d0.98\uff5e\u6ede\u540e0.98",
-    "positive_response": "\u8d85\u524d0.98\uff5e\u6ede\u540e0.98",
-    "negative_response": "\u8d85\u524d0.95\uff5e\u6ede\u540e0.9"
-  },
-  {
-    "pair_id": 1,
-    "prompt": "Question: cGMP\u80fd\u6fc0\u6d3b____\nA. \u916a\u6c28\u9178\u86cb\u767d\u6fc0\u9176\nB. \u86cb\u767d\u6fc0\u9176G",
-    "positive_response": "\u86cb\u767d\u6fc0\u9176G",
-    "negative_response": "\u916a\u6c28\u9178\u86cb\u767d\u6fc0\u9176"
-  }
-]

wisent/examples/scripts/results/test_chain_of_thought_evaluation.json DELETED Viewed

@@ -1,51 +0,0 @@
-{
-  "task_name": "chain_of_thought",
-  "model_name": "mock",
-  "evaluator_name": "log_likelihoods",
-  "num_pairs": 2,
-  "all_correct": true,
-  "pairs": [
-    {
-      "pair_id": 0,
-      "prompt": "Question: Jordan has 2 children who wear diapers.  Each child requires 5 diaper changes per day.  Jo...",
-      "positive_response": "5",
-      "negative_response": "6",
-      "positive_evaluation": {
-        "ground_truth": "TRUTHFUL",
-        "confidence": 1.0,
-        "expected": "TRUTHFUL",
-        "correct": true,
-        "details": "Predicted: '5' (log_prob=-0.500), Expected: '5'"
-      },
-      "negative_evaluation": {
-        "ground_truth": "UNTRUTHFUL",
-        "confidence": 0.0,
-        "expected": "UNTRUTHFUL",
-        "correct": true,
-        "details": "Predicted: '5' (log_prob=-0.500), Expected: '6'"
-      },
-      "both_correct": true
-    },
-    {
-      "pair_id": 1,
-      "prompt": "Question: Mason likes eating carrots. If he eats 4 carrots each on weekdays and 5 carrots each on Sa...",
-      "positive_response": "30",
-      "negative_response": "31",
-      "positive_evaluation": {
-        "ground_truth": "TRUTHFUL",
-        "confidence": 1.0,
-        "expected": "TRUTHFUL",
-        "correct": true,
-        "details": "Predicted: '30' (log_prob=-0.500), Expected: '30'"
-      },
-      "negative_evaluation": {
-        "ground_truth": "UNTRUTHFUL",
-        "confidence": 0.0,
-        "expected": "UNTRUTHFUL",
-        "correct": true,
-        "details": "Predicted: '30' (log_prob=-0.500), Expected: '31'"
-      },
-      "both_correct": true
-    }
-  ]
-}

wisent/examples/scripts/results/test_chain_of_thought_pairs.json DELETED Viewed

@@ -1,14 +0,0 @@
-[
-  {
-    "pair_id": 0,
-    "prompt": "Question: Jordan has 2 children who wear diapers.  Each child requires 5 diaper changes per day.  Jordan's wife changes half of the diapers.  How many diapers does Jordan change per day?",
-    "positive_response": "5",
-    "negative_response": "6"
-  },
-  {
-    "pair_id": 1,
-    "prompt": "Question: Mason likes eating carrots. If he eats 4 carrots each on weekdays and 5 carrots each on Saturday and Sunday, how many carrots does he eat a week?",
-    "positive_response": "30",
-    "negative_response": "31"
-  }
-]

wisent/examples/scripts/results/test_chartqa_evaluation.json DELETED Viewed

@@ -1,30 +0,0 @@
-{
-  "task_name": "chartqa",
-  "model_name": "mock",
-  "evaluator_name": "log_likelihoods",
-  "num_pairs": 1,
-  "all_correct": true,
-  "pairs": [
-    {
-      "pair_id": 0,
-      "prompt": "Question: How many food item is shown in the bar graph?...",
-      "positive_response": "14",
-      "negative_response": "16",
-      "positive_evaluation": {
-        "ground_truth": "TRUTHFUL",
-        "confidence": 1.0,
-        "expected": "TRUTHFUL",
-        "correct": true,
-        "details": "Predicted: '14' (log_prob=-0.500), Expected: '14'"
-      },
-      "negative_evaluation": {
-        "ground_truth": "UNTRUTHFUL",
-        "confidence": 0.0,
-        "expected": "UNTRUTHFUL",
-        "correct": true,
-        "details": "Predicted: '14' (log_prob=-0.500), Expected: '16'"
-      },
-      "both_correct": true
-    }
-  ]
-}

wisent/examples/scripts/results/test_chartqa_pairs.json DELETED Viewed

@@ -1,8 +0,0 @@
-[
-  {
-    "pair_id": 0,
-    "prompt": "Question: How many food item is shown in the bar graph?",
-    "positive_response": "14",
-    "negative_response": "16"
-  }
-]

wisent/examples/scripts/results/test_claim_stance_topic_evaluation.json DELETED Viewed

@@ -1,30 +0,0 @@
-{
-  "task_name": "claim_stance_topic",
-  "model_name": "mock",
-  "evaluator_name": "log_likelihoods",
-  "num_pairs": 1,
-  "all_correct": true,
-  "pairs": [
-    {
-      "pair_id": 0,
-      "prompt": "Classify the Topic of the following Argument to one of these options: advertising, all nations a rig...",
-      "positive_response": "gambling",
-      "negative_response": "advertising",
-      "positive_evaluation": {
-        "ground_truth": "TRUTHFUL",
-        "confidence": 1.0,
-        "expected": "TRUTHFUL",
-        "correct": true,
-        "details": "Predicted: 'gambling' (log_prob=-0.500), Expected: 'gambling'"
-      },
-      "negative_evaluation": {
-        "ground_truth": "UNTRUTHFUL",
-        "confidence": 0.0,
-        "expected": "UNTRUTHFUL",
-        "correct": true,
-        "details": "Predicted: 'gambling' (log_prob=-0.500), Expected: 'advertising'"
-      },
-      "both_correct": true
-    }
-  ]
-}

wisent/examples/scripts/results/test_claim_stance_topic_pairs.json DELETED Viewed

@@ -1,8 +0,0 @@
-[
-  {
-    "pair_id": 0,
-    "prompt": "Classify the Topic of the following Argument to one of these options: advertising, all nations a right to nuclear weapons, a mandatory retirement age, american jobs act, asean, atheism, austerity measures, barrier methods of contraception, blasphemy, boxing, bribery, burning the stars and stripes, children, collective bargaining rights claimed by trades unions, congressional earmarks, democratic governments should require voters to present photo identification at the polling station, democratization, endangered species, enforce term limits on the legislative branch of government, freedom of speech, fund education using a voucher scheme, gambling, governments should choose open source software, high rises for housing, holocaust denial, housewives should be paid for their work, hydroelectric dams, implement playoffs in collegiate level american football, intellectual property rights, israel's 2008-2009 military operations against gaza, leaking of military documents, multiculturalism, national service, only teach abstinence for sex education in schools, open primaries, partial birth abortions, physical education, poor communities, raising the school leaving age to 18, re-engage with myanmar, the blockade of gaza, the creation of private universities in the uk, the free market, the growing of tobacco, the keystone xl pipeline, the monarchy, the one-child policy of the republic of china, the right to asylum, the right to bear arms, the sale of violent video games to minors, the use of affirmative action, the use of performance enhancing drugs in professional sports, the use of truth and reconciliation commissions, wind power, year round schooling.\nArgument:\nmany people participate in gambling as a form of recreation or even as a means to gain an income\nTopic:",
-    "positive_response": "gambling",
-    "negative_response": "advertising"
-  }
-]

wisent/examples/scripts/results/test_cmmlu_evaluation.json DELETED Viewed

@@ -1,51 +0,0 @@
-{
-  "task_name": "cmmlu",
-  "model_name": "mock",
-  "evaluator_name": "log_likelihoods",
-  "num_pairs": 2,
-  "all_correct": true,
-  "pairs": [
-    {
-      "pair_id": 0,
-      "prompt": "Question: \u7559\u4f4f\u4eba\u624d\u8981\u4ece\u5206\u914d\u5236\u5ea6\u4e0a\uff0c\u4f7f\u4eba\u624d\u7684\u52b3\u52a8\u548c\u8d21\u732e\u4e0e\u4ed6\u4eec\u7684\u6536\u5165\u76f8\u9002\u5e94\uff0c\u5c31\u8981\uff1aa\u5efa\u7acb\u6309\u52b3\u5206\u914d\u548c\u6309\u751f\u4ea7\u8981\u7d20\u5206\u914d\u76f8\u7ed3\u5408\u7684\u5206\u914d\u5236\u5ea6 b\u5b9e\u884c\u6309\u52b3\u5206\u914d\u5236\u5ea6 c\u5927\u5e45\u5ea6\u63d0\u9ad8\u6709\u7a81\u51fa\u8d21\u732e\u7684\u9ad8\u5c42\u6b21\u4eba\u624d\u7684\u5956\u52b1\u548c\u6d25...",
-      "positive_response": "ac",
-      "negative_response": "ab",
-      "positive_evaluation": {
-        "ground_truth": "TRUTHFUL",
-        "confidence": 1.0,
-        "expected": "TRUTHFUL",
-        "correct": true,
-        "details": "Predicted: 'ac' (log_prob=-0.500), Expected: 'ac'"
-      },
-      "negative_evaluation": {
-        "ground_truth": "UNTRUTHFUL",
-        "confidence": 0.0,
-        "expected": "UNTRUTHFUL",
-        "correct": true,
-        "details": "Predicted: 'ac' (log_prob=-0.500), Expected: 'ab'"
-      },
-      "both_correct": true
-    },
-    {
-      "pair_id": 1,
-      "prompt": "Question: \u636e\u8d44\u6599\u7edf\u8ba1\uff1a\u5728\u660e\u540e\u671f\u81f3\u6e05\u524d\u671f200\u4f59\u5e74\u95f4\uff0c\u4e16\u754c\u767d\u94f6\u4ea7\u91cf\u7684\u4e00\u534a\u6d41\u5165\u4e2d\u56fd\uff0c\u62e5\u6709\u4e00\u6d41\u57ce\u5e02\u548c\u6700\u4e3a\u5bc6\u96c6\u3001\u5b8c\u5584\u7684\u5e02\u573a\u7f51\u7edc\u7684\u4e2d\u56fd\uff0c\u6210\u4e3a\u5f53\u65f6\u4e16\u754c\u7ecf\u6d4e\u548c\u8d38\u6613\u7684\u4e2d\u5fc3\u533a\u57df\u3002\u7136\u800c\u5f53\u65f6\u5b83\u5374\u6ca1\u6709\u5f62\u6210\u5f3a\u5927\u7684\u626b\u8361...",
-      "positive_response": "\u201c\u91cd\u519c\u6291\u5546\u201d\u548c\u201c\u95ed\u5173\u9501\u56fd\u201d\u7684\u653f\u7b56\u7684\u538b\u5236",
-      "negative_response": "\u82f1\u56fd\u5de5\u4e1a\u9769\u547d\u540e\u5bf9\u4e2d\u56fd\u8fdb\u884c\u7684\u5546\u54c1\u8f93\u51fa",
-      "positive_evaluation": {
-        "ground_truth": "TRUTHFUL",
-        "confidence": 1.0,
-        "expected": "TRUTHFUL",
-        "correct": true,
-        "details": "Predicted: '\u201c\u91cd\u519c\u6291\u5546\u201d\u548c\u201c\u95ed\u5173\u9501\u56fd\u201d\u7684\u653f\u7b56\u7684\u538b\u5236' (log_prob=-0.500), Expected: '\u201c\u91cd\u519c\u6291\u5546\u201d\u548c\u201c\u95ed\u5173\u9501\u56fd\u201d\u7684\u653f\u7b56\u7684\u538b\u5236'"
-      },
-      "negative_evaluation": {
-        "ground_truth": "UNTRUTHFUL",
-        "confidence": 0.0,
-        "expected": "UNTRUTHFUL",
-        "correct": true,
-        "details": "Predicted: '\u201c\u91cd\u519c\u6291\u5546\u201d\u548c\u201c\u95ed\u5173\u9501\u56fd\u201d\u7684\u653f\u7b56\u7684\u538b\u5236' (log_prob=-0.500), Expected: '\u82f1\u56fd\u5de5\u4e1a\u9769\u547d\u540e\u5bf9\u4e2d\u56fd\u8fdb\u884c\u7684\u5546\u54c1\u8f93\u51fa'"
-      },
-      "both_correct": true
-    }
-  ]
-}

wisent/examples/scripts/results/test_cmmlu_pairs.json DELETED Viewed

@@ -1,14 +0,0 @@
-[
-  {
-    "pair_id": 0,
-    "prompt": "Question: \u7559\u4f4f\u4eba\u624d\u8981\u4ece\u5206\u914d\u5236\u5ea6\u4e0a\uff0c\u4f7f\u4eba\u624d\u7684\u52b3\u52a8\u548c\u8d21\u732e\u4e0e\u4ed6\u4eec\u7684\u6536\u5165\u76f8\u9002\u5e94\uff0c\u5c31\u8981\uff1aa\u5efa\u7acb\u6309\u52b3\u5206\u914d\u548c\u6309\u751f\u4ea7\u8981\u7d20\u5206\u914d\u76f8\u7ed3\u5408\u7684\u5206\u914d\u5236\u5ea6 b\u5b9e\u884c\u6309\u52b3\u5206\u914d\u5236\u5ea6 c\u5927\u5e45\u5ea6\u63d0\u9ad8\u6709\u7a81\u51fa\u8d21\u732e\u7684\u9ad8\u5c42\u6b21\u4eba\u624d\u7684\u5956\u52b1\u548c\u6d25\u8d34 d\u5b9e\u884c\u5e74\u85aa\u5236\u5206\u914d\u65b9\u5f0f\nA. ab\nB. ac",
-    "positive_response": "ac",
-    "negative_response": "ab"
-  },
-  {
-    "pair_id": 1,
-    "prompt": "Question: \u636e\u8d44\u6599\u7edf\u8ba1\uff1a\u5728\u660e\u540e\u671f\u81f3\u6e05\u524d\u671f200\u4f59\u5e74\u95f4\uff0c\u4e16\u754c\u767d\u94f6\u4ea7\u91cf\u7684\u4e00\u534a\u6d41\u5165\u4e2d\u56fd\uff0c\u62e5\u6709\u4e00\u6d41\u57ce\u5e02\u548c\u6700\u4e3a\u5bc6\u96c6\u3001\u5b8c\u5584\u7684\u5e02\u573a\u7f51\u7edc\u7684\u4e2d\u56fd\uff0c\u6210\u4e3a\u5f53\u65f6\u4e16\u754c\u7ecf\u6d4e\u548c\u8d38\u6613\u7684\u4e2d\u5fc3\u533a\u57df\u3002\u7136\u800c\u5f53\u65f6\u5b83\u5374\u6ca1\u6709\u5f62\u6210\u5f3a\u5927\u7684\u626b\u8361\u65e7\u7ecf\u6d4e\u57fa\u7840\u7684\u9769\u547d\u6027\u53d8\u5316\u3002\u5176\u4e2d\u5185\u5728\u7684\u548c\u4eba\u4e3a\u7684\u539f\u56e0\u662f\nA. \u82f1\u56fd\u5de5\u4e1a\u9769\u547d\u540e\u5bf9\u4e2d\u56fd\u8fdb\u884c\u7684\u5546\u54c1\u8f93\u51fa\nB. \u201c\u91cd\u519c\u6291\u5546\u201d\u548c\u201c\u95ed\u5173\u9501\u56fd\u201d\u7684\u653f\u7b56\u7684\u538b\u5236",
-    "positive_response": "\u201c\u91cd\u519c\u6291\u5546\u201d\u548c\u201c\u95ed\u5173\u9501\u56fd\u201d\u7684\u653f\u7b56\u7684\u538b\u5236",
-    "negative_response": "\u82f1\u56fd\u5de5\u4e1a\u9769\u547d\u540e\u5bf9\u4e2d\u56fd\u8fdb\u884c\u7684\u5546\u54c1\u8f93\u51fa"
-  }
-]

wisent/examples/scripts/results/test_cnn_dailymail_evaluation.json DELETED Viewed

@@ -1,30 +0,0 @@
-{
-  "task_name": "cnn_dailymail",
-  "model_name": "mock",
-  "evaluator_name": "log_likelihoods",
-  "num_pairs": 1,
-  "all_correct": true,
-  "pairs": [
-    {
-      "pair_id": 0,
-      "prompt": "Summarize the following article: (CNN)Share, and your gift will be multiplied. That may sound like a...",
-      "positive_response": "Zully Broussard decided to give a kidney to a stranger .\nA new computer program helped her donation spur transplants for six kidney patients .",
-      "negative_response": "A new computer program helped her donation spur transplants for six kidney patients. Zully Broussard decided to give a kidney to a stranger.",
-      "positive_evaluation": {
-        "ground_truth": "TRUTHFUL",
-        "confidence": 1.0,
-        "expected": "TRUTHFUL",
-        "correct": true,
-        "details": "Predicted: 'Zully Broussard decided to give a kidney to a stranger .\nA new computer program helped her donation spur transplants for six kidney patients .' (log_prob=-0.500), Expected: 'Zully Broussard decided to give a kidney to a stranger .\nA new computer program helped her donation spur transplants for six kidney patients .'"
-      },
-      "negative_evaluation": {
-        "ground_truth": "UNTRUTHFUL",
-        "confidence": 0.0,
-        "expected": "UNTRUTHFUL",
-        "correct": true,
-        "details": "Predicted: 'Zully Broussard decided to give a kidney to a stranger .\nA new computer program helped her donation spur transplants for six kidney patients .' (log_prob=-0.500), Expected: 'A new computer program helped her donation spur transplants for six kidney patients. Zully Broussard decided to give a kidney to a stranger.'"
-      },
-      "both_correct": true
-    }
-  ]
-}

wisent 0.7.379__py3-none-any.whl → 0.7.701__py3-none-any.whl

wisent 0.7.379py3-none-any.whl → 0.7.701py3-none-any.whl