wisent 0.7.701__py3-none-any.whl → 0.7.901__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (330)
  1. wisent/__init__.py +1 -1
  2. wisent/core/activations/activation_cache.py +393 -0
  3. wisent/core/activations/activations.py +3 -3
  4. wisent/core/activations/activations_collector.py +9 -5
  5. wisent/core/activations/classifier_inference_strategy.py +12 -11
  6. wisent/core/activations/extraction_strategy.py +256 -84
  7. wisent/core/classifiers/classifiers/core/atoms.py +3 -2
  8. wisent/core/cli/__init__.py +2 -1
  9. wisent/core/cli/agent/apply_steering.py +5 -7
  10. wisent/core/cli/agent/train_classifier.py +19 -7
  11. wisent/core/cli/check_linearity.py +35 -3
  12. wisent/core/cli/cluster_benchmarks.py +4 -6
  13. wisent/core/cli/create_steering_vector.py +6 -4
  14. wisent/core/cli/diagnose_vectors.py +7 -4
  15. wisent/core/cli/estimate_unified_goodness_time.py +6 -4
  16. wisent/core/cli/generate_pairs_from_task.py +9 -56
  17. wisent/core/cli/geometry_search.py +137 -0
  18. wisent/core/cli/get_activations.py +1 -1
  19. wisent/core/cli/method_optimizer.py +4 -3
  20. wisent/core/cli/modify_weights.py +3 -2
  21. wisent/core/cli/optimize_sample_size.py +1 -1
  22. wisent/core/cli/optimize_steering.py +14 -16
  23. wisent/core/cli/optimize_weights.py +2 -1
  24. wisent/core/cli/preview_pairs.py +203 -0
  25. wisent/core/cli/steering_method_trainer.py +3 -3
  26. wisent/core/cli/tasks.py +19 -76
  27. wisent/core/cli/train_unified_goodness.py +3 -3
  28. wisent/core/contrastive_pairs/diagnostics/control_vectors.py +4 -4
  29. wisent/core/contrastive_pairs/diagnostics/linearity.py +7 -0
  30. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/agentic_search.py +37 -347
  31. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/aider_polyglot.py +113 -136
  32. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codeforces.py +2 -12
  33. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/coding_benchmarks.py +124 -504
  34. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/faithbench.py +40 -63
  35. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/flames.py +46 -89
  36. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/flores.py +15 -4
  37. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/frames.py +36 -20
  38. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/hallucinations_leaderboard.py +3 -45
  39. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/livemathbench.py +42 -4
  40. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/longform_writing.py +2 -112
  41. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/math500.py +39 -4
  42. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/medium_priority_benchmarks.py +475 -525
  43. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/mercury.py +65 -42
  44. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/olympiadbench.py +2 -12
  45. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/planbench.py +78 -219
  46. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/polymath.py +37 -4
  47. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/recode.py +84 -69
  48. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/refusalbench.py +168 -160
  49. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/simpleqa.py +44 -25
  50. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/tau_bench.py +3 -103
  51. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/toolbench.py +3 -97
  52. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/toolemu.py +48 -182
  53. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +3 -0
  54. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_registry.py +19 -1
  55. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aclue.py +1 -3
  56. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/acp_bench.py +1 -3
  57. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/acp_bench_hard.py +1 -3
  58. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/advanced.py +2 -4
  59. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aexams.py +1 -3
  60. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrimmlu.py +1 -3
  61. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrixnli.py +2 -2
  62. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabculture.py +1 -3
  63. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic.py +1 -3
  64. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_exams.py +1 -3
  65. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_leaderboard_complete.py +1 -3
  66. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_leaderboard_light.py +1 -3
  67. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabicmmlu.py +1 -3
  68. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aradice.py +1 -3
  69. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc.py +1 -3
  70. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_challenge.py +1 -2
  71. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_easy.py +1 -2
  72. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arithmetic.py +2 -2
  73. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/asdiv.py +2 -2
  74. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/babi.py +36 -2
  75. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/basque_bench.py +1 -3
  76. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bbq.py +1 -3
  77. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/belebele.py +1 -3
  78. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/benchmarks.py +1 -3
  79. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bertaqa.py +1 -3
  80. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bhs.py +1 -3
  81. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bhtc.py +3 -5
  82. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/blimp.py +1 -3
  83. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/blimp_nl.py +1 -3
  84. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/boolq.py +2 -2
  85. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/c4.py +1 -3
  86. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cabbq.py +1 -3
  87. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/careqa.py +1 -3
  88. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalan_bench.py +1 -3
  89. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalanqa.py +1 -3
  90. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catcola.py +1 -3
  91. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cb.py +2 -2
  92. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ceval.py +1 -3
  93. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ceval_valid.py +1 -3
  94. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/chain.py +1 -3
  95. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/chartqa.py +1 -3
  96. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/claim.py +1 -3
  97. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/click.py +1 -3
  98. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cmmlu.py +1 -3
  99. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cnn.py +1 -3
  100. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cocoteros.py +1 -3
  101. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coedit.py +1 -3
  102. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/commonsense.py +1 -3
  103. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/commonsense_qa.py +1 -3
  104. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copa.py +2 -2
  105. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copal_id.py +1 -3
  106. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coqa.py +3 -4
  107. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/csatqa.py +1 -3
  108. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cycle.py +1 -3
  109. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darija_bench.py +1 -3
  110. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darijahellaswag.py +2 -6
  111. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darijammlu.py +1 -3
  112. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/dbpedia.py +1 -3
  113. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/discrim_eval.py +1 -3
  114. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/doc.py +1 -3
  115. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/drop.py +2 -2
  116. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/epec.py +1 -3
  117. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq.py +1 -3
  118. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench.py +1 -3
  119. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench_ca.py +1 -3
  120. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench_es.py +1 -3
  121. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/esbbq.py +1 -3
  122. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ethics.py +1 -3
  123. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus.py +1 -3
  124. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_exams.py +1 -3
  125. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_proficiency.py +1 -3
  126. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_reading.py +1 -3
  127. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_trivia.py +1 -3
  128. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/evalita_llm.py +1 -3
  129. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/financial.py +1 -3
  130. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/flan.py +1 -3
  131. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/french_bench.py +1 -3
  132. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/galician_bench.py +1 -3
  133. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gaokao.py +2 -2
  134. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/glianorex.py +1 -3
  135. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/global_mmlu.py +1 -3
  136. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/global_piqa.py +1 -3
  137. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gpt3.py +1 -3
  138. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/groundcocoa.py +1 -3
  139. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/haerae.py +1 -3
  140. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/headqa.py +2 -2
  141. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hellaswag.py +2 -2
  142. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hendrycks_ethics.py +5 -9
  143. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hendrycks_math.py +63 -16
  144. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/histoires_morales.py +1 -3
  145. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hrm8k.py +1 -3
  146. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/humaneval_infilling.py +1 -3
  147. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/icelandic_winogrande.py +1 -3
  148. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/inverse.py +1 -3
  149. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/inverse_scaling.py +1 -3
  150. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ja.py +1 -3
  151. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/japanese_leaderboard.py +1 -3
  152. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/japanese_leaderboard_mc.py +1 -1
  153. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kmmlu.py +1 -3
  154. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kobest.py +1 -3
  155. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kormedmcqa.py +5 -17
  156. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada_cloze.py +1 -3
  157. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada_multilingual.py +1 -3
  158. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/law.py +1 -3
  159. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/leaderboard.py +1 -3
  160. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lingoly.py +1 -3
  161. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/llama3.py +1 -3
  162. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lm_syneval.py +1 -3
  163. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa.py +2 -2
  164. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa2.py +2 -2
  165. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/longbench.py +1 -3
  166. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/longbenchv2.py +1 -3
  167. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mastermind.py +2 -4
  168. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mc-taco.py +2 -2
  169. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/med_concepts_qa.py +2 -4
  170. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/meddialog.py +1 -3
  171. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medical.py +1 -3
  172. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medmcqa.py +1 -3
  173. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medqa.py +2 -2
  174. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mela.py +2 -2
  175. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/metabench.py +1 -3
  176. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/minerva_math.py +1 -3
  177. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlu.py +1 -3
  178. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlusr.py +3 -4
  179. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mrpc.py +2 -2
  180. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multiblimp.py +2 -5
  181. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multirc.py +2 -2
  182. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mutual.py +2 -2
  183. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/non.py +1 -3
  184. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval.py +1 -3
  185. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_exact.py +1 -3
  186. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_gen_exact.py +1 -3
  187. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_mc.py +4 -8
  188. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_mc_log_likelihoods.py +4 -8
  189. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/nq_open.py +2 -2
  190. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_arc_multilingual.py +1 -3
  191. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_hellaswag_multilingual.py +1 -3
  192. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_mmlu_multilingual.py +1 -3
  193. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_truthfulqa_multilingual.py +2 -5
  194. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/olaph.py +1 -3
  195. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/openbookqa.py +2 -2
  196. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/option.py +1 -3
  197. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/parafraseja.py +1 -3
  198. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/parafrases.py +1 -3
  199. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/paws.py +1 -3
  200. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/paws_x.py +1 -3
  201. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pawsx.py +2 -2
  202. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/persona.py +1 -3
  203. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/phrases.py +1 -3
  204. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pile.py +1 -3
  205. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/piqa.py +2 -2
  206. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/portuguese_bench.py +1 -3
  207. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prompt.py +1 -3
  208. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prost.py +2 -2
  209. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pubmedqa.py +2 -2
  210. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qa4mre.py +2 -2
  211. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper.py +2 -2
  212. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper_bool.py +2 -2
  213. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnli.py +2 -2
  214. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnlieu.py +1 -3
  215. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qqp.py +2 -2
  216. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/race.py +2 -2
  217. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/random.py +1 -3
  218. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/record.py +2 -2
  219. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/reversed.py +1 -3
  220. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/rte.py +2 -2
  221. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ruler.py +1 -3
  222. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sciq.py +2 -2
  223. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/score.py +1 -3
  224. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/scrolls.py +1 -3
  225. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/scrolls_mc.py +1 -3
  226. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/self.py +1 -3
  227. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sglue.py +1 -3
  228. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sglue_rte.py +2 -1
  229. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/siqa.py +4 -7
  230. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/social_iqa.py +2 -2
  231. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/spanish_bench.py +1 -3
  232. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/storycloze.py +2 -6
  233. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/summarization.py +1 -3
  234. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/super.py +1 -3
  235. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/super_glue.py +1 -3
  236. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swag.py +2 -2
  237. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swde.py +1 -3
  238. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sycophancy.py +1 -3
  239. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/t0.py +1 -3
  240. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/teca.py +1 -3
  241. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinyarc.py +1 -3
  242. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinybenchmarks.py +1 -3
  243. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinygsm8k.py +1 -3
  244. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinyhellaswag.py +1 -3
  245. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinymmlu.py +1 -3
  246. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinytruthfulqa.py +1 -3
  247. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinywinogrande.py +1 -3
  248. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tmmluplus.py +1 -3
  249. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/triviaqa.py +2 -2
  250. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa.py +1 -3
  251. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc1.py +1 -3
  252. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc2.py +1 -3
  253. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turblimp_core.py +1 -3
  254. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turkishmmlu.py +1 -3
  255. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turkishmmlu_mc.py +0 -2
  256. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/unscramble.py +1 -3
  257. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/vaxx.py +2 -2
  258. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/webqs.py +2 -2
  259. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wic.py +3 -4
  260. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogrande.py +2 -2
  261. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wmdp.py +1 -3
  262. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wnli.py +2 -2
  263. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc.py +2 -2
  264. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc273.py +1 -3
  265. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xcopa.py +1 -3
  266. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xlsum.py +1 -3
  267. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xnli.py +2 -2
  268. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xquad.py +2 -4
  269. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xstorycloze.py +2 -3
  270. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xwinograd.py +2 -2
  271. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/zhoblimp.py +1 -3
  272. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_pairs_generation.py +173 -6
  273. wisent/core/data_loaders/loaders/lm_loader.py +12 -1
  274. wisent/core/geometry_runner.py +995 -0
  275. wisent/core/geometry_search_space.py +237 -0
  276. wisent/core/hyperparameter_optimizer.py +1 -1
  277. wisent/core/main.py +3 -0
  278. wisent/core/models/core/atoms.py +5 -3
  279. wisent/core/models/wisent_model.py +1 -1
  280. wisent/core/optuna/classifier/optuna_classifier_optimizer.py +2 -2
  281. wisent/core/parser_arguments/check_linearity_parser.py +12 -2
  282. wisent/core/parser_arguments/generate_vector_from_synthetic_parser.py +2 -2
  283. wisent/core/parser_arguments/generate_vector_from_task_parser.py +2 -2
  284. wisent/core/parser_arguments/geometry_search_parser.py +61 -0
  285. wisent/core/parser_arguments/main_parser.py +8 -0
  286. wisent/core/parser_arguments/train_unified_goodness_parser.py +2 -2
  287. wisent/core/steering.py +5 -3
  288. wisent/core/steering_methods/methods/hyperplane.py +2 -1
  289. wisent/core/synthetic/generators/nonsense_generator.py +30 -18
  290. wisent/core/trainers/steering_trainer.py +2 -2
  291. wisent/core/utils/device.py +27 -27
  292. wisent/core/utils/layer_combinations.py +70 -0
  293. wisent/examples/__init__.py +1 -0
  294. wisent/examples/scripts/__init__.py +1 -0
  295. wisent/examples/scripts/count_all_benchmarks.py +121 -0
  296. wisent/examples/scripts/discover_directions.py +469 -0
  297. wisent/examples/scripts/extract_benchmark_info.py +71 -0
  298. wisent/examples/scripts/generate_paper_data.py +384 -0
  299. wisent/examples/scripts/intervention_validation.py +626 -0
  300. wisent/examples/scripts/results/test_AraDiCE_ArabicMMLU_lev_evaluation.json +324 -0
  301. wisent/examples/scripts/results/test_AraDiCE_ArabicMMLU_lev_pairs.json +92 -0
  302. wisent/examples/scripts/results/test_aexams_IslamicStudies_evaluation.json +324 -0
  303. wisent/examples/scripts/results/test_aexams_IslamicStudies_pairs.json +92 -0
  304. wisent/examples/scripts/results/test_afrimgsm_pairs.json +92 -0
  305. wisent/examples/scripts/results/test_afrimmlu_evaluation.json +324 -0
  306. wisent/examples/scripts/results/test_afrimmlu_pairs.json +92 -0
  307. wisent/examples/scripts/search_all_short_names.py +31 -0
  308. wisent/examples/scripts/test_all_benchmarks.py +138 -0
  309. wisent/examples/scripts/test_all_benchmarks_new.py +28 -0
  310. wisent/examples/scripts/test_contrastive_pairs_all_supported.py +230 -0
  311. wisent/examples/scripts/test_nonsense_baseline.py +261 -0
  312. wisent/examples/scripts/test_one_benchmark.py +324 -0
  313. wisent/examples/scripts/test_one_coding_benchmark.py +293 -0
  314. wisent/examples/scripts/threshold_analysis.py +434 -0
  315. wisent/examples/scripts/visualization_gallery.py +582 -0
  316. wisent/parameters/lm_eval/broken_in_lm_eval.json +179 -2
  317. wisent/parameters/lm_eval/category_directions.json +137 -0
  318. wisent/parameters/lm_eval/repair_plan.json +282 -0
  319. wisent/parameters/lm_eval/weak_contrastive_pairs.json +38 -0
  320. wisent/parameters/lm_eval/working_benchmarks.json +206 -0
  321. wisent/parameters/lm_eval/working_benchmarks_categorized.json +236 -0
  322. wisent/tests/test_detector_accuracy.py +1 -1
  323. wisent/tests/visualize_geometry.py +1 -1
  324. {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/METADATA +1 -1
  325. {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/RECORD +329 -295
  326. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/browsecomp.py +0 -245
  327. {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/WHEEL +0 -0
  328. {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/entry_points.txt +0 -0
  329. {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/licenses/LICENSE +0 -0
  330. {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/top_level.txt +0 -0
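Note: to reproduce a file-level listing like the one above locally, both wheels can be downloaded (for example with "pip download wisent==0.7.701 --no-deps" and "pip download wisent==0.7.901 --no-deps") and their archive members compared. A minimal standard-library sketch, assuming the two wheel files sit in the current directory:

import zipfile

old = set(zipfile.ZipFile("wisent-0.7.701-py3-none-any.whl").namelist())
new = set(zipfile.ZipFile("wisent-0.7.901-py3-none-any.whl").namelist())

print("added:", sorted(new - old))        # e.g. the new geometry_search and examples/scripts files
print("removed:", sorted(old - new))      # e.g. hf_task_extractors/browsecomp.py
print("present in both:", len(old & new)) # candidates for content changes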
wisent/examples/scripts/results/test_afrimmlu_evaluation.json
@@ -0,0 +1,324 @@
+ {
+ "task_name": "afrimmlu",
+ "model_name": "mock",
+ "evaluator_name": "log_likelihoods",
+ "num_pairs": 15,
+ "all_correct": true,
+ "pairs": [
+ {
+ "pair_id": 0,
+ "prompt": "Ked\u1ee5 \u1ee5d\u1ecb emume Steeti a ch\u1ecdr\u1ecd?...",
+ "positive_response": "[",
+ "negative_response": "'",
+ "positive_evaluation": {
+ "ground_truth": "TRUTHFUL",
+ "confidence": 1.0,
+ "expected": "TRUTHFUL",
+ "correct": true,
+ "details": "Predicted: '[' (log_prob=-0.500), Expected: '['"
+ },
+ "negative_evaluation": {
+ "ground_truth": "UNTRUTHFUL",
+ "confidence": 0.0,
+ "expected": "UNTRUTHFUL",
+ "correct": true,
+ "details": "Predicted: '[' (log_prob=-0.500), Expected: '''"
+ },
+ "both_correct": true
+ },
+ {
+ "pair_id": 1,
+ "prompt": "\u12a8\u121a\u12a8\u1270\u1209\u1275 \u12cd\u1235\u1325 \u12e8\u12cb\u130b (P) \u1260\u134d\u1341\u121d \u1349\u12ad\u12ad\u122d \u12cd\u1235\u1325 \u1208\u121d\u1295 \u12a8\u1205\u12f3\u130d \u1308\u1262 (MR) \u130b\u122d \u12a5\u1295\u12f0\u121a\u1270\u12ab\u12a8\u120d\u1363 \u12a5\u1293 \u12cb\u130b (P) \u1260\u1265\u1278\u129d\u1290\u1275 \u12c8\u12ed\u121d \u134d\u133d\u121d\u1293 \u1260\u130e\u12f0\u1208\u12cd \u1349\u12ad\u12ad\u122d \u12a8\u12a0\u1290\u1235\u1270\u129b \u1308\u1262 \u1208\u121d\u1295 \u12a5\u1295\u12f0\u121a\u1260\u120d\u1325 \u1260\u1275\u12ad\u12ad\u120d \u12e8\u121a\u12eb\u1235\u1228\u12f3\u12cd \u12e8\u1275\u129b\u12cd \u1290\u12cd? I. MR = P = \u1208\u134d\u1339\u121d \u12cd\u12f5\u12f5\u122d \u1260\u12a0\u130d\u12f5\u121d \u1270\u130d\u1263\u122d \u120b\u12ed \u134d\u120b\u130e\u1275. II. P > MR \u134d\u133d\u121d\u1293 \u12e8\u130e\u12f0\u1208\u12cd \u1349\u12ad\u12ad\u122d \u12a5\u1295\u12f0 \u1241\u120d\u1241\u120d \u1270\u1295\u1238\u122b\u1273\u127d \u1270\u130d\u1263\u122b\u1275\u1362 III. \u134d\u120b\u130e\u1275 \u12a5\u1293 \u12cb\u130b \u134d\u133d\u121d\u1293 \u12e8\u130e\u12f0\u1208\u12cd \u12cd\u12f5\u12f5\u122d \u12a5\u1295\u12f0 \u124b\u121a \u1270\u130d\u1263\u122d \u1290\u12cd \u12e8\u121a\u12c8\u12a8\u1209\u1275\u1362...",
+ "positive_response": "'",
+ "negative_response": "I",
+ "positive_evaluation": {
+ "ground_truth": "TRUTHFUL",
+ "confidence": 1.0,
+ "expected": "TRUTHFUL",
+ "correct": true,
+ "details": "Predicted: ''' (log_prob=-0.500), Expected: '''"
+ },
+ "negative_evaluation": {
+ "ground_truth": "UNTRUTHFUL",
+ "confidence": 0.0,
+ "expected": "UNTRUTHFUL",
+ "correct": true,
+ "details": "Predicted: ''' (log_prob=-0.500), Expected: 'I'"
+ },
+ "both_correct": true
+ },
+ {
+ "pair_id": 2,
+ "prompt": "Na 1997, \u1ee4l\u1ecd ak\u1ee5 \u1ee5wa ch\u1ecdp\u1ee5tara na fraksh\u1ecdn nke nd\u1ecb b\u1ecb n'\u1ee5wa niile bi na mba nd\u1ecb na-akpata obere ego--ya b\u1ee5 mba kwa onye \u1ecdb\u1ee5la n'eri \u1ee5gw\u1ecd \u1ecdnwa nke $1230 ma\u1ecdb\u1ee5 n'erughi --d\u1ecb ihe d\u1ecb ka...",
+ "positive_response": "5",
+ "negative_response": "%",
+ "positive_evaluation": {
+ "ground_truth": "TRUTHFUL",
+ "confidence": 1.0,
+ "expected": "TRUTHFUL",
+ "correct": true,
+ "details": "Predicted: '5' (log_prob=-0.500), Expected: '5'"
+ },
+ "negative_evaluation": {
+ "ground_truth": "UNTRUTHFUL",
+ "confidence": 0.0,
+ "expected": "UNTRUTHFUL",
+ "correct": true,
+ "details": "Predicted: '5' (log_prob=-0.500), Expected: '%'"
+ },
+ "both_correct": true
+ },
+ {
+ "pair_id": 3,
+ "prompt": "Mu gihe cyacyo cy'agahebuzo, ni irihe tandukaniro riri hagati y'abantu bo mu Majyepfo n'abo mu bindi bice bya Leta Zunze Ubumwe z'Amerika bemera ko amashuri atandukana n'ayandi?...",
+ "positive_response": "'",
+ "negative_response": "8",
+ "positive_evaluation": {
+ "ground_truth": "TRUTHFUL",
+ "confidence": 1.0,
+ "expected": "TRUTHFUL",
+ "correct": true,
+ "details": "Predicted: ''' (log_prob=-0.500), Expected: '''"
+ },
+ "negative_evaluation": {
+ "ground_truth": "UNTRUTHFUL",
+ "confidence": 0.0,
+ "expected": "UNTRUTHFUL",
+ "correct": true,
+ "details": "Predicted: ''' (log_prob=-0.500), Expected: '8'"
+ },
+ "both_correct": true
+ },
+ {
+ "pair_id": 4,
+ "prompt": "Ni iyihe mibare yabaturage itubwira igipimo cyuburumbuke aho umubare wabana bazavuka uzaba uhagije kugira ngo usimbure ababyeyi no kwishyura impfu zabana bapfa bakiri bato?...",
+ "positive_response": "'",
+ "negative_response": "U",
+ "positive_evaluation": {
+ "ground_truth": "TRUTHFUL",
+ "confidence": 1.0,
+ "expected": "TRUTHFUL",
+ "correct": true,
+ "details": "Predicted: ''' (log_prob=-0.500), Expected: '''"
+ },
+ "negative_evaluation": {
+ "ground_truth": "UNTRUTHFUL",
+ "confidence": 0.0,
+ "expected": "UNTRUTHFUL",
+ "correct": true,
+ "details": "Predicted: ''' (log_prob=-0.500), Expected: 'U'"
+ },
+ "both_correct": true
+ },
+ {
+ "pair_id": 5,
+ "prompt": "A 2016, kusan mutane nawa a Amurka ba su da matsuguni?...",
+ "positive_response": "'",
+ "negative_response": "5",
+ "positive_evaluation": {
+ "ground_truth": "TRUTHFUL",
+ "confidence": 1.0,
+ "expected": "TRUTHFUL",
+ "correct": true,
+ "details": "Predicted: ''' (log_prob=-0.500), Expected: '''"
+ },
+ "negative_evaluation": {
+ "ground_truth": "UNTRUTHFUL",
+ "confidence": 0.0,
+ "expected": "UNTRUTHFUL",
+ "correct": true,
+ "details": "Predicted: ''' (log_prob=-0.500), Expected: '5'"
+ },
+ "both_correct": true
+ },
+ {
+ "pair_id": 6,
+ "prompt": "Pour quel nombre l'\u00e9quation ci-apr\u00e8s est-elle vraie? : 26,1 mm = _ dm....",
+ "positive_response": "[",
+ "negative_response": "'",
+ "positive_evaluation": {
+ "ground_truth": "TRUTHFUL",
+ "confidence": 1.0,
+ "expected": "TRUTHFUL",
+ "correct": true,
+ "details": "Predicted: '[' (log_prob=-0.500), Expected: '['"
+ },
+ "negative_evaluation": {
+ "ground_truth": "UNTRUTHFUL",
+ "confidence": 0.0,
+ "expected": "UNTRUTHFUL",
+ "correct": true,
+ "details": "Predicted: '[' (log_prob=-0.500), Expected: '''"
+ },
+ "both_correct": true
+ },
+ {
+ "pair_id": 7,
+ "prompt": "Je, kuna vikwazo vyovyote vya kuingizwa kwa uhalifu wa kimila chini ya sheria ya Kiingereza?...",
+ "positive_response": "a",
+ "negative_response": "k",
+ "positive_evaluation": {
+ "ground_truth": "TRUTHFUL",
+ "confidence": 1.0,
+ "expected": "TRUTHFUL",
+ "correct": true,
+ "details": "Predicted: 'a' (log_prob=-0.500), Expected: 'a'"
+ },
+ "negative_evaluation": {
+ "ground_truth": "UNTRUTHFUL",
+ "confidence": 0.0,
+ "expected": "UNTRUTHFUL",
+ "correct": true,
+ "details": "Predicted: 'a' (log_prob=-0.500), Expected: 'k'"
+ },
+ "both_correct": true
+ },
+ {
+ "pair_id": 8,
+ "prompt": "Akwai wasu iyakoki dan shigar laifukan al'ada a \u0199ar\u0199ashin dokar Ingilishi?...",
+ "positive_response": "a",
+ "negative_response": "b",
+ "positive_evaluation": {
+ "ground_truth": "TRUTHFUL",
+ "confidence": 1.0,
+ "expected": "TRUTHFUL",
+ "correct": true,
+ "details": "Predicted: 'a' (log_prob=-0.500), Expected: 'a'"
+ },
+ "negative_evaluation": {
+ "ground_truth": "UNTRUTHFUL",
+ "confidence": 0.0,
+ "expected": "UNTRUTHFUL",
+ "correct": true,
+ "details": "Predicted: 'a' (log_prob=-0.500), Expected: 'b'"
+ },
+ "both_correct": true
+ },
+ {
+ "pair_id": 9,
+ "prompt": "Kufikia 2019, ni takriban asilimia ngapi ya watu duniani wanaishi katika umaskini uliokithiri (chini ya $1.90 kwa siku)?...",
+ "positive_response": "'",
+ "negative_response": "5",
+ "positive_evaluation": {
+ "ground_truth": "TRUTHFUL",
+ "confidence": 1.0,
+ "expected": "TRUTHFUL",
+ "correct": true,
+ "details": "Predicted: ''' (log_prob=-0.500), Expected: '''"
+ },
+ "negative_evaluation": {
+ "ground_truth": "UNTRUTHFUL",
+ "confidence": 0.0,
+ "expected": "UNTRUTHFUL",
+ "correct": true,
+ "details": "Predicted: ''' (log_prob=-0.500), Expected: '5'"
+ },
+ "both_correct": true
+ },
+ {
+ "pair_id": 10,
+ "prompt": "Which treaties are considered as 'source of international law' under article 38 ICJ Statute?...",
+ "positive_response": "'",
+ "negative_response": "A",
+ "positive_evaluation": {
+ "ground_truth": "TRUTHFUL",
+ "confidence": 1.0,
+ "expected": "TRUTHFUL",
+ "correct": true,
+ "details": "Predicted: ''' (log_prob=-0.500), Expected: '''"
+ },
+ "negative_evaluation": {
+ "ground_truth": "UNTRUTHFUL",
+ "confidence": 0.0,
+ "expected": "UNTRUTHFUL",
+ "correct": true,
+ "details": "Predicted: ''' (log_prob=-0.500), Expected: 'A'"
+ },
+ "both_correct": true
+ },
+ {
+ "pair_id": 11,
+ "prompt": "Umupaka w'ibishoboka byo gukora uzagaragara nk'umurongo ugororotse igihe...",
+ "positive_response": "b",
+ "negative_response": "u",
+ "positive_evaluation": {
+ "ground_truth": "TRUTHFUL",
+ "confidence": 1.0,
+ "expected": "TRUTHFUL",
+ "correct": true,
+ "details": "Predicted: 'b' (log_prob=-0.500), Expected: 'b'"
+ },
+ "negative_evaluation": {
+ "ground_truth": "UNTRUTHFUL",
+ "confidence": 0.0,
+ "expected": "UNTRUTHFUL",
+ "correct": true,
+ "details": "Predicted: 'b' (log_prob=-0.500), Expected: 'u'"
+ },
+ "both_correct": true
+ },
+ {
+ "pair_id": 12,
+ "prompt": "T\u00ed \u00ecd\u00e1s\u00ed l\u00e1ti \u00ecta t\u00ed \u00f2 d\u00e1ra b\u00e1 darap\u1ecd\u0300 m\u1ecd\u0301 \u00ecp\u00e8s\u00e8 ohun \u00e8l\u00f2 al\u00e1d\u00e0ni, \u00e8wo n\u00edn\u00fa \u00e0w\u1ecdn w\u1ecd\u0300ny\u00ec\u00ed ni \u00ecg\u00e9s\u1eb9\u0300 l\u00e1ti \u1ecd\u0300d\u1ecd\u0300 \u00ecj\u1ecdba t\u00ed \u00f3 \u1e63e\u00e9\u1e63e k\u00f3 gb\u00e9 \u1ecdj\u00e0 n\u00e1\u00e0 l\u1ecd s\u00edbi \u00e0b\u00e1j\u00e1de t\u00f3 d\u00e1ra?...",
+ "positive_response": "T",
+ "negative_response": "i",
+ "positive_evaluation": {
+ "ground_truth": "TRUTHFUL",
+ "confidence": 1.0,
+ "expected": "TRUTHFUL",
+ "correct": true,
+ "details": "Predicted: 'T' (log_prob=-0.500), Expected: 'T'"
+ },
+ "negative_evaluation": {
+ "ground_truth": "UNTRUTHFUL",
+ "confidence": 0.0,
+ "expected": "UNTRUTHFUL",
+ "correct": true,
+ "details": "Predicted: 'T' (log_prob=-0.500), Expected: 'i'"
+ },
+ "both_correct": true
+ },
+ {
+ "pair_id": 13,
+ "prompt": "What was GDP per capita in the United States in 1850 when adjusting for inflation and PPP in 2011 prices?...",
+ "positive_response": "'",
+ "negative_response": "A",
+ "positive_evaluation": {
+ "ground_truth": "TRUTHFUL",
+ "confidence": 1.0,
+ "expected": "TRUTHFUL",
+ "correct": true,
+ "details": "Predicted: ''' (log_prob=-0.500), Expected: '''"
+ },
+ "negative_evaluation": {
+ "ground_truth": "UNTRUTHFUL",
+ "confidence": 0.0,
+ "expected": "UNTRUTHFUL",
+ "correct": true,
+ "details": "Predicted: ''' (log_prob=-0.500), Expected: 'A'"
+ },
+ "both_correct": true
+ },
+ {
+ "pair_id": 14,
+ "prompt": "Ni uwuhe mubare w'abana bafite hagati y'imyaka 13 na 15 muri Misiri bavuze ko bahohotewe nibura rimwe mu mezi abiri ashize, guhera mu mwaka wa 2015?...",
+ "positive_response": "0",
+ "negative_response": "%",
+ "positive_evaluation": {
+ "ground_truth": "TRUTHFUL",
+ "confidence": 1.0,
+ "expected": "TRUTHFUL",
+ "correct": true,
+ "details": "Predicted: '0' (log_prob=-0.500), Expected: '0'"
+ },
+ "negative_evaluation": {
+ "ground_truth": "UNTRUTHFUL",
+ "confidence": 0.0,
+ "expected": "UNTRUTHFUL",
+ "correct": true,
+ "details": "Predicted: '0' (log_prob=-0.500), Expected: '%'"
+ },
+ "both_correct": true
+ }
+ ]
+ }
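The *_evaluation.json files added under wisent/examples/scripts/results appear to share the structure shown above: task metadata plus, for each contrastive pair, a positive and a negative evaluation and a both_correct flag. A minimal sketch, not part of the package, for summarizing one of these result files:

import json
from pathlib import Path

# Path taken from the file list above; adjust to your checkout or extracted wheel.
path = Path("wisent/examples/scripts/results/test_afrimmlu_evaluation.json")
data = json.loads(path.read_text())

# Count pairs where both the positive and the negative response were judged correctly.
fully_correct = sum(1 for pair in data["pairs"] if pair["both_correct"])
print(f"{data['task_name']}: {fully_correct}/{data['num_pairs']} pairs fully correct "
      f"(all_correct={data['all_correct']})")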
wisent/examples/scripts/results/test_afrimmlu_pairs.json
@@ -0,0 +1,92 @@
+ [
+ {
+ "pair_id": 0,
+ "prompt": "Ked\u1ee5 \u1ee5d\u1ecb emume Steeti a ch\u1ecdr\u1ecd?",
+ "positive_response": "[",
+ "negative_response": "'"
+ },
+ {
+ "pair_id": 1,
+ "prompt": "\u12a8\u121a\u12a8\u1270\u1209\u1275 \u12cd\u1235\u1325 \u12e8\u12cb\u130b (P) \u1260\u134d\u1341\u121d \u1349\u12ad\u12ad\u122d \u12cd\u1235\u1325 \u1208\u121d\u1295 \u12a8\u1205\u12f3\u130d \u1308\u1262 (MR) \u130b\u122d \u12a5\u1295\u12f0\u121a\u1270\u12ab\u12a8\u120d\u1363 \u12a5\u1293 \u12cb\u130b (P) \u1260\u1265\u1278\u129d\u1290\u1275 \u12c8\u12ed\u121d \u134d\u133d\u121d\u1293 \u1260\u130e\u12f0\u1208\u12cd \u1349\u12ad\u12ad\u122d \u12a8\u12a0\u1290\u1235\u1270\u129b \u1308\u1262 \u1208\u121d\u1295 \u12a5\u1295\u12f0\u121a\u1260\u120d\u1325 \u1260\u1275\u12ad\u12ad\u120d \u12e8\u121a\u12eb\u1235\u1228\u12f3\u12cd \u12e8\u1275\u129b\u12cd \u1290\u12cd? I. MR = P = \u1208\u134d\u1339\u121d \u12cd\u12f5\u12f5\u122d \u1260\u12a0\u130d\u12f5\u121d \u1270\u130d\u1263\u122d \u120b\u12ed \u134d\u120b\u130e\u1275. II. P > MR \u134d\u133d\u121d\u1293 \u12e8\u130e\u12f0\u1208\u12cd \u1349\u12ad\u12ad\u122d \u12a5\u1295\u12f0 \u1241\u120d\u1241\u120d \u1270\u1295\u1238\u122b\u1273\u127d \u1270\u130d\u1263\u122b\u1275\u1362 III. \u134d\u120b\u130e\u1275 \u12a5\u1293 \u12cb\u130b \u134d\u133d\u121d\u1293 \u12e8\u130e\u12f0\u1208\u12cd \u12cd\u12f5\u12f5\u122d \u12a5\u1295\u12f0 \u124b\u121a \u1270\u130d\u1263\u122d \u1290\u12cd \u12e8\u121a\u12c8\u12a8\u1209\u1275\u1362",
+ "positive_response": "'",
+ "negative_response": "I"
+ },
+ {
+ "pair_id": 2,
+ "prompt": "Na 1997, \u1ee4l\u1ecd ak\u1ee5 \u1ee5wa ch\u1ecdp\u1ee5tara na fraksh\u1ecdn nke nd\u1ecb b\u1ecb n'\u1ee5wa niile bi na mba nd\u1ecb na-akpata obere ego--ya b\u1ee5 mba kwa onye \u1ecdb\u1ee5la n'eri \u1ee5gw\u1ecd \u1ecdnwa nke $1230 ma\u1ecdb\u1ee5 n'erughi --d\u1ecb ihe d\u1ecb ka",
+ "positive_response": "5",
+ "negative_response": "%"
+ },
+ {
+ "pair_id": 3,
+ "prompt": "Mu gihe cyacyo cy'agahebuzo, ni irihe tandukaniro riri hagati y'abantu bo mu Majyepfo n'abo mu bindi bice bya Leta Zunze Ubumwe z'Amerika bemera ko amashuri atandukana n'ayandi?",
+ "positive_response": "'",
+ "negative_response": "8"
+ },
+ {
+ "pair_id": 4,
+ "prompt": "Ni iyihe mibare yabaturage itubwira igipimo cyuburumbuke aho umubare wabana bazavuka uzaba uhagije kugira ngo usimbure ababyeyi no kwishyura impfu zabana bapfa bakiri bato?",
+ "positive_response": "'",
+ "negative_response": "U"
+ },
+ {
+ "pair_id": 5,
+ "prompt": "A 2016, kusan mutane nawa a Amurka ba su da matsuguni?",
+ "positive_response": "'",
+ "negative_response": "5"
+ },
+ {
+ "pair_id": 6,
+ "prompt": "Pour quel nombre l'\u00e9quation ci-apr\u00e8s est-elle vraie? : 26,1 mm = _ dm.",
+ "positive_response": "[",
+ "negative_response": "'"
+ },
+ {
+ "pair_id": 7,
+ "prompt": "Je, kuna vikwazo vyovyote vya kuingizwa kwa uhalifu wa kimila chini ya sheria ya Kiingereza?",
+ "positive_response": "a",
+ "negative_response": "k"
+ },
+ {
+ "pair_id": 8,
+ "prompt": "Akwai wasu iyakoki dan shigar laifukan al'ada a \u0199ar\u0199ashin dokar Ingilishi?",
+ "positive_response": "a",
+ "negative_response": "b"
+ },
+ {
+ "pair_id": 9,
+ "prompt": "Kufikia 2019, ni takriban asilimia ngapi ya watu duniani wanaishi katika umaskini uliokithiri (chini ya $1.90 kwa siku)?",
+ "positive_response": "'",
+ "negative_response": "5"
+ },
+ {
+ "pair_id": 10,
+ "prompt": "Which treaties are considered as 'source of international law' under article 38 ICJ Statute?",
+ "positive_response": "'",
+ "negative_response": "A"
+ },
+ {
+ "pair_id": 11,
+ "prompt": "Umupaka w'ibishoboka byo gukora uzagaragara nk'umurongo ugororotse igihe",
+ "positive_response": "b",
+ "negative_response": "u"
+ },
+ {
+ "pair_id": 12,
+ "prompt": "T\u00ed \u00ecd\u00e1s\u00ed l\u00e1ti \u00ecta t\u00ed \u00f2 d\u00e1ra b\u00e1 darap\u1ecd\u0300 m\u1ecd\u0301 \u00ecp\u00e8s\u00e8 ohun \u00e8l\u00f2 al\u00e1d\u00e0ni, \u00e8wo n\u00edn\u00fa \u00e0w\u1ecdn w\u1ecd\u0300ny\u00ec\u00ed ni \u00ecg\u00e9s\u1eb9\u0300 l\u00e1ti \u1ecd\u0300d\u1ecd\u0300 \u00ecj\u1ecdba t\u00ed \u00f3 \u1e63e\u00e9\u1e63e k\u00f3 gb\u00e9 \u1ecdj\u00e0 n\u00e1\u00e0 l\u1ecd s\u00edbi \u00e0b\u00e1j\u00e1de t\u00f3 d\u00e1ra?",
+ "positive_response": "T",
+ "negative_response": "i"
+ },
+ {
+ "pair_id": 13,
+ "prompt": "What was GDP per capita in the United States in 1850 when adjusting for inflation and PPP in 2011 prices?",
+ "positive_response": "'",
+ "negative_response": "A"
+ },
+ {
+ "pair_id": 14,
+ "prompt": "Ni uwuhe mubare w'abana bafite hagati y'imyaka 13 na 15 muri Misiri bavuze ko bahohotewe nibura rimwe mu mezi abiri ashize, guhera mu mwaka wa 2015?",
+ "positive_response": "0",
+ "negative_response": "%"
+ }
+ ]
wisent/examples/scripts/search_all_short_names.py
@@ -0,0 +1,31 @@
+ #!/usr/bin/env python3
+ """Search for all short task names that might match Tag."""
+
+ import sys
+ sys.path.insert(0, '/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source')
+
+ from lm_eval.tasks import TaskManager
+
+ def main():
+     tm = TaskManager()
+
+     # Get all 3-letter task names
+     three_letter = [t for t in tm.task_index.keys() if len(t) == 3]
+     print(f"Found {len(three_letter)} tasks with exactly 3 letters:")
+     for task in sorted(three_letter):
+         print(f" - {task}")
+
+     # Get all 3-4 letter task names starting with T
+     short_t = [t for t in tm.task_index.keys() if t.lower().startswith('t') and 3 <= len(t) <= 4]
+     print(f"\nFound {len(short_t)} tasks with 3-4 letters starting with 't':")
+     for task in sorted(short_t):
+         print(f" - {task}")
+
+     # Search for anything with T, A, G in sequence (case insensitive)
+     tag_pattern = [t for t in tm.task_index.keys() if 't' in t.lower() and 'a' in t.lower() and 'g' in t.lower()]
+     print(f"\nFound {len(tag_pattern)} tasks containing t, a, and g:")
+     for task in sorted(tag_pattern)[:20]: # Show first 20
+         print(f" - {task}")
+
+ if __name__ == "__main__":
+     main()
wisent/examples/scripts/test_all_benchmarks.py
@@ -0,0 +1,138 @@
+ """Test all benchmarks to verify extractor and evaluator work."""
+
+ import json
+ import os
+ import sys
+ import signal
+ from contextlib import contextmanager
+ from pathlib import Path
+ from wisent.examples.scripts.test_one_benchmark import test_benchmark
+
+ # Set environment variable to trust remote code for datasets like meddialog
+ os.environ['HF_DATASETS_TRUST_REMOTE_CODE'] = '1'
+ # Set environment variable to allow code eval for coding tasks like humaneval, instructhumaneval
+ os.environ['HF_ALLOW_CODE_EVAL'] = '1'
+
+
+ class TimeoutError(Exception):
+     """Raised when a test times out."""
+     pass
+
+
+ @contextmanager
+ def timeout(seconds):
+     """Context manager for timing out operations."""
+     def signal_handler(signum, frame):
+         raise TimeoutError(f"Test timed out after {seconds} seconds")
+
+     # Set the signal handler and alarm
+     old_handler = signal.signal(signal.SIGALRM, signal_handler)
+     signal.alarm(seconds)
+
+     try:
+         yield
+     finally:
+         signal.alarm(0)
+         signal.signal(signal.SIGALRM, old_handler)
+
+
+ def load_benchmarks():
+     """Load benchmarks from central registry."""
+     from wisent.core.benchmark_registry import get_all_benchmarks, get_broken_tasks
+
+     broken_tasks = get_broken_tasks()
+     if broken_tasks:
+         print(f"Skipping {len(broken_tasks)} broken benchmarks: {', '.join(broken_tasks)}")
+
+     return get_all_benchmarks()
+
+
+ BENCHMARKS = load_benchmarks()
+
+
+ def test_all_benchmarks(model_name: str = "meta-llama/Llama-3.1-8B-Instruct", output_dir: str = ".", start_index: int = 0):
+     """Test all benchmarks.
+
+     Args:
+         model_name: Model to use for testing
+         output_dir: Directory to save results
+         start_index: Index to start testing from (0-based)
+
+     Returns:
+         Dictionary with results for each benchmark
+     """
+     results = {
+         "model": model_name,
+         "total": len(BENCHMARKS),
+         "passed": 0,
+         "failed": 0,
+         "benchmarks": {}
+     }
+
+     print(f"\n{'='*70}")
+     print(f"Testing {len(BENCHMARKS)} benchmarks with {model_name}")
+     if start_index > 0:
+         print(f"Starting from benchmark {start_index + 1} ({BENCHMARKS[start_index]})")
+     print(f"{'='*70}\n")
+
+     for i, benchmark in enumerate(BENCHMARKS, 1):
+         if i - 1 < start_index:
+             continue
+         print(f"[{i}/{len(BENCHMARKS)}] Testing {benchmark}...")
+
+         try:
+             with timeout(1200):
+                 success = test_benchmark(benchmark, model_name, output_dir)
+             results["benchmarks"][benchmark] = {
+                 "status": "passed" if success else "failed",
+                 "success": success
+             }
+
+             if success:
+                 results["passed"] += 1
+                 print(f"  PASSED\n")
+             else:
+                 results["failed"] += 1
+                 print(f"  FAILED\n")
+
+         except TimeoutError as e:
+             results["benchmarks"][benchmark] = {
+                 "status": "timeout",
+                 "success": False,
+                 "error": str(e)
+             }
+             results["failed"] += 1
+             print(f" TIMEOUT: {e}\n")
+
+         except Exception as e:
+             results["benchmarks"][benchmark] = {
+                 "status": "error",
+                 "success": False,
+                 "error": str(e)
+             }
+             results["failed"] += 1
+             print(f"  ERROR: {e}\n")
+
+     print(f"\n{'='*70}")
+     print(f"SUMMARY")
+     print(f"{'='*70}")
+     print(f"Total: {results['total']}")
+     print(f"Passed: {results['passed']}")
+     print(f"Failed: {results['failed']}")
+     print(f"Success rate: {results['passed']/results['total']*100:.1f}%")
+     print(f"{'='*70}\n")
+
+     return results
+
+
+ if __name__ == "__main__":
+     model = sys.argv[1] if len(sys.argv) > 1 else "meta-llama/Llama-3.1-8B-Instruct"
+     # Default to results directory in scripts folder
+     default_output = Path(__file__).parent / "results"
+     output_dir = sys.argv[2] if len(sys.argv) > 2 else str(default_output)
+     start_index = int(sys.argv[3]) if len(sys.argv) > 3 else 0
+
+     results = test_all_benchmarks(model, output_dir, start_index)
+
+     # Exit with appropriate code
+     sys.exit(0 if results["failed"] == 0 else 1)
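As the __main__ block above shows, test_all_benchmarks.py takes up to three positional arguments (model name, output directory, start index) and exits non-zero when any benchmark fails or times out; the runner can also be invoked directly from Python. A minimal sketch, assuming the wisent package and its benchmark registry are importable and the model id is available locally:

from wisent.examples.scripts.test_all_benchmarks import test_all_benchmarks

# Illustrative values; any model id accepted by the package should work here.
results = test_all_benchmarks(
    model_name="meta-llama/Llama-3.1-8B-Instruct",
    output_dir="wisent/examples/scripts/results",
    start_index=0,
)
print(f"passed {results['passed']} of {results['total']} benchmarks")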
wisent/examples/scripts/test_all_benchmarks_new.py
@@ -0,0 +1,28 @@
+ """Test all benchmarks to verify extractor and evaluator work."""
+
+ import sys
+ import signal
+ from contextmanager import contextmanager
+ from wisent.examples.scripts.test_one_benchmark import test_benchmark
+
+
+ class TimeoutError(Exception):
+     """Raised when a test times out."""
+     pass
+
+
+ @contextmanager
+ def timeout(seconds):
+     """Context manager for timing out operations."""
+     def signal_handler(signum, frame):
+         raise TimeoutError(f"Test timed out after {seconds} seconds")
+
+     # Set the signal handler and alarm
+     old_handler = signal.signal(signal.SIGALRM, signal_handler)
+     signal.alarm(seconds)
+
+     try:
+         yield
+     finally:
+         signal.alarm(0)
+         signal.signal(signal.SIGALRM, old_handler)