wisent 0.7.901__py3-none-any.whl → 0.7.1116__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101) hide show
  1. wisent/__init__.py +1 -1
  2. wisent/comparison/__init__.py +1 -0
  3. wisent/comparison/detect_bos_features.py +275 -0
  4. wisent/comparison/fgaa.py +465 -0
  5. wisent/comparison/lora.py +663 -0
  6. wisent/comparison/lora_dpo.py +604 -0
  7. wisent/comparison/main.py +444 -0
  8. wisent/comparison/ours.py +76 -0
  9. wisent/comparison/reft.py +690 -0
  10. wisent/comparison/sae.py +304 -0
  11. wisent/comparison/utils.py +381 -0
  12. wisent/core/activations/activations_collector.py +3 -2
  13. wisent/core/activations/extraction_strategy.py +8 -4
  14. wisent/core/cli/agent/apply_steering.py +7 -5
  15. wisent/core/cli/agent/train_classifier.py +4 -3
  16. wisent/core/cli/generate_vector_from_task.py +11 -20
  17. wisent/core/cli/get_activations.py +1 -1
  18. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/boolq.py +20 -3
  19. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cb.py +8 -1
  20. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc1.py +8 -1
  21. wisent/core/parser_arguments/generate_vector_from_task_parser.py +4 -11
  22. wisent/core/parser_arguments/get_activations_parser.py +5 -14
  23. {wisent-0.7.901.dist-info → wisent-0.7.1116.dist-info}/METADATA +5 -1
  24. {wisent-0.7.901.dist-info → wisent-0.7.1116.dist-info}/RECORD +28 -91
  25. wisent/examples/contrastive_pairs/humanization_human_vs_ai.json +0 -2112
  26. wisent/examples/scripts/1/test_basqueglue_evaluation.json +0 -51
  27. wisent/examples/scripts/1/test_basqueglue_pairs.json +0 -14
  28. wisent/examples/scripts/1/test_bec2016eu_evaluation.json +0 -51
  29. wisent/examples/scripts/1/test_bec2016eu_pairs.json +0 -14
  30. wisent/examples/scripts/1/test_belebele_evaluation.json +0 -51
  31. wisent/examples/scripts/1/test_belebele_pairs.json +0 -14
  32. wisent/examples/scripts/1/test_benchmarks_evaluation.json +0 -51
  33. wisent/examples/scripts/1/test_benchmarks_pairs.json +0 -14
  34. wisent/examples/scripts/1/test_bertaqa_evaluation.json +0 -51
  35. wisent/examples/scripts/1/test_bertaqa_pairs.json +0 -14
  36. wisent/examples/scripts/1/test_bhtc_v2_evaluation.json +0 -30
  37. wisent/examples/scripts/1/test_bhtc_v2_pairs.json +0 -8
  38. wisent/examples/scripts/1/test_boolq-seq2seq_evaluation.json +0 -30
  39. wisent/examples/scripts/1/test_boolq-seq2seq_pairs.json +0 -8
  40. wisent/examples/scripts/1/test_cabreu_evaluation.json +0 -30
  41. wisent/examples/scripts/1/test_cabreu_pairs.json +0 -8
  42. wisent/examples/scripts/1/test_careqa_en_evaluation.json +0 -30
  43. wisent/examples/scripts/1/test_careqa_en_pairs.json +0 -8
  44. wisent/examples/scripts/1/test_careqa_evaluation.json +0 -30
  45. wisent/examples/scripts/1/test_careqa_pairs.json +0 -8
  46. wisent/examples/scripts/1/test_catalanqa_evaluation.json +0 -30
  47. wisent/examples/scripts/1/test_catalanqa_pairs.json +0 -8
  48. wisent/examples/scripts/1/test_catcola_evaluation.json +0 -30
  49. wisent/examples/scripts/1/test_catcola_pairs.json +0 -8
  50. wisent/examples/scripts/1/test_chartqa_evaluation.json +0 -30
  51. wisent/examples/scripts/1/test_chartqa_pairs.json +0 -8
  52. wisent/examples/scripts/1/test_claim_stance_topic_evaluation.json +0 -30
  53. wisent/examples/scripts/1/test_claim_stance_topic_pairs.json +0 -8
  54. wisent/examples/scripts/1/test_cnn_dailymail_evaluation.json +0 -30
  55. wisent/examples/scripts/1/test_cnn_dailymail_pairs.json +0 -8
  56. wisent/examples/scripts/1/test_cocoteros_es_evaluation.json +0 -30
  57. wisent/examples/scripts/1/test_cocoteros_es_pairs.json +0 -8
  58. wisent/examples/scripts/1/test_coedit_gec_evaluation.json +0 -30
  59. wisent/examples/scripts/1/test_coedit_gec_pairs.json +0 -8
  60. wisent/examples/scripts/1/test_cola_evaluation.json +0 -30
  61. wisent/examples/scripts/1/test_cola_pairs.json +0 -8
  62. wisent/examples/scripts/1/test_coqcat_evaluation.json +0 -30
  63. wisent/examples/scripts/1/test_coqcat_pairs.json +0 -8
  64. wisent/examples/scripts/1/test_dbpedia_14_evaluation.json +0 -30
  65. wisent/examples/scripts/1/test_dbpedia_14_pairs.json +0 -8
  66. wisent/examples/scripts/1/test_epec_koref_bin_evaluation.json +0 -30
  67. wisent/examples/scripts/1/test_epec_koref_bin_pairs.json +0 -8
  68. wisent/examples/scripts/1/test_ethos_binary_evaluation.json +0 -30
  69. wisent/examples/scripts/1/test_ethos_binary_pairs.json +0 -8
  70. wisent/examples/scripts/2/test_afrimgsm_direct_amh_evaluation.json +0 -30
  71. wisent/examples/scripts/2/test_afrimgsm_direct_amh_pairs.json +0 -8
  72. wisent/examples/scripts/2/test_afrimmlu_direct_amh_evaluation.json +0 -30
  73. wisent/examples/scripts/2/test_afrimmlu_direct_amh_pairs.json +0 -8
  74. wisent/examples/scripts/2/test_afrixnli_en_direct_amh_evaluation.json +0 -30
  75. wisent/examples/scripts/2/test_afrixnli_en_direct_amh_pairs.json +0 -8
  76. wisent/examples/scripts/2/test_arc_ar_evaluation.json +0 -30
  77. wisent/examples/scripts/2/test_arc_ar_pairs.json +0 -8
  78. wisent/examples/scripts/2/test_atis_evaluation.json +0 -30
  79. wisent/examples/scripts/2/test_atis_pairs.json +0 -8
  80. wisent/examples/scripts/2/test_babi_evaluation.json +0 -30
  81. wisent/examples/scripts/2/test_babi_pairs.json +0 -8
  82. wisent/examples/scripts/2/test_babilong_evaluation.json +0 -30
  83. wisent/examples/scripts/2/test_babilong_pairs.json +0 -8
  84. wisent/examples/scripts/2/test_bangla_mmlu_evaluation.json +0 -30
  85. wisent/examples/scripts/2/test_bangla_mmlu_pairs.json +0 -8
  86. wisent/examples/scripts/2/test_basque-glue_pairs.json +0 -14
  87. wisent/examples/scripts/generate_paper_data.py +0 -384
  88. wisent/examples/scripts/intervention_validation.py +0 -626
  89. wisent/examples/scripts/results/test_AraDiCE_ArabicMMLU_lev_evaluation.json +0 -324
  90. wisent/examples/scripts/results/test_AraDiCE_ArabicMMLU_lev_pairs.json +0 -92
  91. wisent/examples/scripts/results/test_aexams_IslamicStudies_evaluation.json +0 -324
  92. wisent/examples/scripts/results/test_aexams_IslamicStudies_pairs.json +0 -92
  93. wisent/examples/scripts/results/test_afrimgsm_pairs.json +0 -92
  94. wisent/examples/scripts/results/test_afrimmlu_evaluation.json +0 -324
  95. wisent/examples/scripts/results/test_afrimmlu_pairs.json +0 -92
  96. wisent/examples/scripts/threshold_analysis.py +0 -434
  97. wisent/examples/scripts/visualization_gallery.py +0 -582
  98. {wisent-0.7.901.dist-info → wisent-0.7.1116.dist-info}/WHEEL +0 -0
  99. {wisent-0.7.901.dist-info → wisent-0.7.1116.dist-info}/entry_points.txt +0 -0
  100. {wisent-0.7.901.dist-info → wisent-0.7.1116.dist-info}/licenses/LICENSE +0 -0
  101. {wisent-0.7.901.dist-info → wisent-0.7.1116.dist-info}/top_level.txt +0 -0
@@ -1,51 +0,0 @@
1
- {
2
- "task_name": "basqueglue",
3
- "model_name": "mock",
4
- "evaluator_name": "log_likelihoods",
5
- "num_pairs": 2,
6
- "all_correct": true,
7
- "pairs": [
8
- {
9
- "pair_id": 0,
10
- "prompt": "Question: Zer egin? Nori deitu? Zalantza horrekin geratu zen.\nAlegia:\nA. Zer egin eta nori deitu ez ...",
11
- "positive_response": "Zer egin eta nori deitu ez zekiela geratu zen.",
12
- "negative_response": "Zer egin eta nori deitu ez zekielakoan geratu zen.",
13
- "positive_evaluation": {
14
- "ground_truth": "TRUTHFUL",
15
- "confidence": 1.0,
16
- "expected": "TRUTHFUL",
17
- "correct": true,
18
- "details": "Predicted: 'Zer egin eta nori deitu ez zekiela geratu zen.' (log_prob=-0.500), Expected: 'Zer egin eta nori deitu ez zekiela geratu zen.'"
19
- },
20
- "negative_evaluation": {
21
- "ground_truth": "UNTRUTHFUL",
22
- "confidence": 0.0,
23
- "expected": "UNTRUTHFUL",
24
- "correct": true,
25
- "details": "Predicted: 'Zer egin eta nori deitu ez zekiela geratu zen.' (log_prob=-0.500), Expected: 'Zer egin eta nori deitu ez zekielakoan geratu zen.'"
26
- },
27
- "both_correct": true
28
- },
29
- {
30
- "pair_id": 1,
31
- "prompt": "Decide the relationship of the hypothesis 'Mary Traillek badaki hori.'' to the premise 'Mary Traille...",
32
- "positive_response": "entailment",
33
- "negative_response": "neutral",
34
- "positive_evaluation": {
35
- "ground_truth": "TRUTHFUL",
36
- "confidence": 1.0,
37
- "expected": "TRUTHFUL",
38
- "correct": true,
39
- "details": "Predicted: 'entailment' (log_prob=-0.500), Expected: 'entailment'"
40
- },
41
- "negative_evaluation": {
42
- "ground_truth": "UNTRUTHFUL",
43
- "confidence": 0.0,
44
- "expected": "UNTRUTHFUL",
45
- "correct": true,
46
- "details": "Predicted: 'entailment' (log_prob=-0.500), Expected: 'neutral'"
47
- },
48
- "both_correct": true
49
- }
50
- ]
51
- }
@@ -1,14 +0,0 @@
1
- [
2
- {
3
- "pair_id": 0,
4
- "prompt": "Question: Zer egin? Nori deitu? Zalantza horrekin geratu zen.\nAlegia:\nA. Zer egin eta nori deitu ez zekielakoan geratu zen.\nB. Zer egin eta nori deitu ez zekiela geratu zen.",
5
- "positive_response": "Zer egin eta nori deitu ez zekiela geratu zen.",
6
- "negative_response": "Zer egin eta nori deitu ez zekielakoan geratu zen."
7
- },
8
- {
9
- "pair_id": 1,
10
- "prompt": "Decide the relationship of the hypothesis 'Mary Traillek badaki hori.'' to the premise 'Mary Traillek emango dizu horren berri.'\nA. neutral\nB. entailment",
11
- "positive_response": "entailment",
12
- "negative_response": "neutral"
13
- }
14
- ]
@@ -1,51 +0,0 @@
1
- {
2
- "task_name": "bec2016eu",
3
- "model_name": "mock",
4
- "evaluator_name": "log_likelihoods",
5
- "num_pairs": 2,
6
- "all_correct": true,
7
- "pairs": [
8
- {
9
- "pair_id": 0,
10
- "prompt": "Question: Zer egin? Nori deitu? Zalantza horrekin geratu zen.\nAlegia:\nA. Zer egin eta nori deitu ez ...",
11
- "positive_response": "Zer egin eta nori deitu ez zekiela geratu zen.",
12
- "negative_response": "Zer egin eta nori deitu ez zekielakoan geratu zen.",
13
- "positive_evaluation": {
14
- "ground_truth": "TRUTHFUL",
15
- "confidence": 1.0,
16
- "expected": "TRUTHFUL",
17
- "correct": true,
18
- "details": "Predicted: 'Zer egin eta nori deitu ez zekiela geratu zen.' (log_prob=-0.500), Expected: 'Zer egin eta nori deitu ez zekiela geratu zen.'"
19
- },
20
- "negative_evaluation": {
21
- "ground_truth": "UNTRUTHFUL",
22
- "confidence": 0.0,
23
- "expected": "UNTRUTHFUL",
24
- "correct": true,
25
- "details": "Predicted: 'Zer egin eta nori deitu ez zekiela geratu zen.' (log_prob=-0.500), Expected: 'Zer egin eta nori deitu ez zekielakoan geratu zen.'"
26
- },
27
- "both_correct": true
28
- },
29
- {
30
- "pair_id": 1,
31
- "prompt": "Decide the relationship of the hypothesis 'Mary Traillek badaki hori.'' to the premise 'Mary Traille...",
32
- "positive_response": "entailment",
33
- "negative_response": "neutral",
34
- "positive_evaluation": {
35
- "ground_truth": "TRUTHFUL",
36
- "confidence": 1.0,
37
- "expected": "TRUTHFUL",
38
- "correct": true,
39
- "details": "Predicted: 'entailment' (log_prob=-0.500), Expected: 'entailment'"
40
- },
41
- "negative_evaluation": {
42
- "ground_truth": "UNTRUTHFUL",
43
- "confidence": 0.0,
44
- "expected": "UNTRUTHFUL",
45
- "correct": true,
46
- "details": "Predicted: 'entailment' (log_prob=-0.500), Expected: 'neutral'"
47
- },
48
- "both_correct": true
49
- }
50
- ]
51
- }
@@ -1,14 +0,0 @@
1
- [
2
- {
3
- "pair_id": 0,
4
- "prompt": "Question: Zer egin? Nori deitu? Zalantza horrekin geratu zen.\nAlegia:\nA. Zer egin eta nori deitu ez zekielakoan geratu zen.\nB. Zer egin eta nori deitu ez zekiela geratu zen.",
5
- "positive_response": "Zer egin eta nori deitu ez zekiela geratu zen.",
6
- "negative_response": "Zer egin eta nori deitu ez zekielakoan geratu zen."
7
- },
8
- {
9
- "pair_id": 1,
10
- "prompt": "Decide the relationship of the hypothesis 'Mary Traillek badaki hori.'' to the premise 'Mary Traillek emango dizu horren berri.'\nA. neutral\nB. entailment",
11
- "positive_response": "entailment",
12
- "negative_response": "neutral"
13
- }
14
- ]
@@ -1,51 +0,0 @@
1
- {
2
- "task_name": "belebele",
3
- "model_name": "mock",
4
- "evaluator_name": "log_likelihoods",
5
- "num_pairs": 2,
6
- "all_correct": true,
7
- "pairs": [
8
- {
9
- "pair_id": 0,
10
- "prompt": "Question: Hi ku ya hi ndzimana, i swiga swihi leswi landzelaka leswi nga hlamuseriki hi ku kongoma i...",
11
- "positive_response": "Chayina a yi si tshama yi xavisele matiko timovha to tala ku hundza Jarimani",
12
- "negative_response": "Ikhonomi ya Chayina yi kule swinene ku sukela ku lunghisiwa",
13
- "positive_evaluation": {
14
- "ground_truth": "TRUTHFUL",
15
- "confidence": 1.0,
16
- "expected": "TRUTHFUL",
17
- "correct": true,
18
- "details": "Predicted: 'Chayina a yi si tshama yi xavisele matiko timovha to tala ku hundza Jarimani' (log_prob=-0.500), Expected: 'Chayina a yi si tshama yi xavisele matiko timovha to tala ku hundza Jarimani'"
19
- },
20
- "negative_evaluation": {
21
- "ground_truth": "UNTRUTHFUL",
22
- "confidence": 0.0,
23
- "expected": "UNTRUTHFUL",
24
- "correct": true,
25
- "details": "Predicted: 'Chayina a yi si tshama yi xavisele matiko timovha to tala ku hundza Jarimani' (log_prob=-0.500), Expected: 'Ikhonomi ya Chayina yi kule swinene ku sukela ku lunghisiwa'"
26
- },
27
- "both_correct": true
28
- },
29
- {
30
- "pair_id": 1,
31
- "prompt": "Question: Mukuwirirana nemashoko akataurwa, ndechipi chinangwa chikuru kubva zvino tichienda mberi c...",
32
- "positive_response": "Kusimudzira kuchengeteka kwevatambi",
33
- "negative_response": "Kuongorora mabasa ekubata chibharo aLarry Nassar",
34
- "positive_evaluation": {
35
- "ground_truth": "TRUTHFUL",
36
- "confidence": 1.0,
37
- "expected": "TRUTHFUL",
38
- "correct": true,
39
- "details": "Predicted: 'Kusimudzira kuchengeteka kwevatambi' (log_prob=-0.500), Expected: 'Kusimudzira kuchengeteka kwevatambi'"
40
- },
41
- "negative_evaluation": {
42
- "ground_truth": "UNTRUTHFUL",
43
- "confidence": 0.0,
44
- "expected": "UNTRUTHFUL",
45
- "correct": true,
46
- "details": "Predicted: 'Kusimudzira kuchengeteka kwevatambi' (log_prob=-0.500), Expected: 'Kuongorora mabasa ekubata chibharo aLarry Nassar'"
47
- },
48
- "both_correct": true
49
- }
50
- ]
51
- }
@@ -1,14 +0,0 @@
1
- [
2
- {
3
- "pair_id": 0,
4
- "prompt": "Question: Hi ku ya hi ndzimana, i swiga swihi leswi landzelaka leswi nga hlamuseriki hi ku kongoma ikhonomi ya Chayina?\nA. Ikhonomi ya Chayina yi kule swinene ku sukela ku lunghisiwa\nB. Chayina a yi si tshama yi xavisele matiko timovha to tala ku hundza Jarimani",
5
- "positive_response": "Chayina a yi si tshama yi xavisele matiko timovha to tala ku hundza Jarimani",
6
- "negative_response": "Ikhonomi ya Chayina yi kule swinene ku sukela ku lunghisiwa"
7
- },
8
- {
9
- "pair_id": 1,
10
- "prompt": "Question: Mukuwirirana nemashoko akataurwa, ndechipi chinangwa chikuru kubva zvino tichienda mberi chesangano reUSA Gymnastics nereUnited States Olympic Committee?\nA. Kuongorora mabasa ekubata chibharo aLarry Nassar\nB. Kusimudzira kuchengeteka kwevatambi",
11
- "positive_response": "Kusimudzira kuchengeteka kwevatambi",
12
- "negative_response": "Kuongorora mabasa ekubata chibharo aLarry Nassar"
13
- }
14
- ]
@@ -1,51 +0,0 @@
1
- {
2
- "task_name": "benchmarks",
3
- "model_name": "mock",
4
- "evaluator_name": "log_likelihoods",
5
- "num_pairs": 2,
6
- "all_correct": true,
7
- "pairs": [
8
- {
9
- "pair_id": 0,
10
- "prompt": "Question: Identify the only construct that is not pertinent to developmental models on intelligence:...",
11
- "positive_response": "Investment theory",
12
- "negative_response": "The positive manifold",
13
- "positive_evaluation": {
14
- "ground_truth": "TRUTHFUL",
15
- "confidence": 1.0,
16
- "expected": "TRUTHFUL",
17
- "correct": true,
18
- "details": "Predicted: 'Investment theory' (log_prob=-0.500), Expected: 'Investment theory'"
19
- },
20
- "negative_evaluation": {
21
- "ground_truth": "UNTRUTHFUL",
22
- "confidence": 0.0,
23
- "expected": "UNTRUTHFUL",
24
- "correct": true,
25
- "details": "Predicted: 'Investment theory' (log_prob=-0.500), Expected: 'The positive manifold'"
26
- },
27
- "both_correct": true
28
- },
29
- {
30
- "pair_id": 1,
31
- "prompt": "Question: A 17-year-old girl is brought to the physician by her mother because she has not had a men...",
32
- "positive_response": "Hypogonadotropic hypogonadism",
33
- "negative_response": "Hypothyroidism",
34
- "positive_evaluation": {
35
- "ground_truth": "TRUTHFUL",
36
- "confidence": 1.0,
37
- "expected": "TRUTHFUL",
38
- "correct": true,
39
- "details": "Predicted: 'Hypogonadotropic hypogonadism' (log_prob=-0.500), Expected: 'Hypogonadotropic hypogonadism'"
40
- },
41
- "negative_evaluation": {
42
- "ground_truth": "UNTRUTHFUL",
43
- "confidence": 0.0,
44
- "expected": "UNTRUTHFUL",
45
- "correct": true,
46
- "details": "Predicted: 'Hypogonadotropic hypogonadism' (log_prob=-0.500), Expected: 'Hypothyroidism'"
47
- },
48
- "both_correct": true
49
- }
50
- ]
51
- }
@@ -1,14 +0,0 @@
1
- [
2
- {
3
- "pair_id": 0,
4
- "prompt": "Question: Identify the only construct that is not pertinent to developmental models on intelligence:\nA. The positive manifold\nB. Investment theory",
5
- "positive_response": "Investment theory",
6
- "negative_response": "The positive manifold"
7
- },
8
- {
9
- "pair_id": 1,
10
- "prompt": "Question: A 17-year-old girl is brought to the physician by her mother because she has not had a menstrual period for 6 months. The patient is unconcerned about the lack of menses. Menarche occurred at the age of 12 years, and menses had occurred at regular 28-day intervals until they became irregular 1 year ago. She is a member of her high school gymnastics team. She appears emaciated. She is 163 cm (5 ft 4 in) tall and weighs 40 kg (88 lb); BMI is 15 kg/m2 . Her pulse is 54/min, and blood pressure is 80/50 mm Hg. Which of the following is the most likely cause of this patient's amenorrhea?\nA. Hypothyroidism\nB. Hypogonadotropic hypogonadism",
11
- "positive_response": "Hypogonadotropic hypogonadism",
12
- "negative_response": "Hypothyroidism"
13
- }
14
- ]
@@ -1,51 +0,0 @@
1
- {
2
- "task_name": "bertaqa",
3
- "model_name": "mock",
4
- "evaluator_name": "log_likelihoods",
5
- "num_pairs": 2,
6
- "all_correct": true,
7
- "pairs": [
8
- {
9
- "pair_id": 0,
10
- "prompt": "Question: Will the French league be cancelled?\nA. 1953\nB. 1952...",
11
- "positive_response": "1952",
12
- "negative_response": "1953",
13
- "positive_evaluation": {
14
- "ground_truth": "TRUTHFUL",
15
- "confidence": 1.0,
16
- "expected": "TRUTHFUL",
17
- "correct": true,
18
- "details": "Predicted: '1952' (log_prob=-0.500), Expected: '1952'"
19
- },
20
- "negative_evaluation": {
21
- "ground_truth": "UNTRUTHFUL",
22
- "confidence": 0.0,
23
- "expected": "UNTRUTHFUL",
24
- "correct": true,
25
- "details": "Predicted: '1952' (log_prob=-0.500), Expected: '1953'"
26
- },
27
- "both_correct": true
28
- },
29
- {
30
- "pair_id": 1,
31
- "prompt": "Question: When was the first sound film in Basque made?\nA. In the 1960s\nB. In the 1930s...",
32
- "positive_response": "In the 1930s",
33
- "negative_response": "In the 1960s",
34
- "positive_evaluation": {
35
- "ground_truth": "TRUTHFUL",
36
- "confidence": 1.0,
37
- "expected": "TRUTHFUL",
38
- "correct": true,
39
- "details": "Predicted: 'In the 1930s' (log_prob=-0.500), Expected: 'In the 1930s'"
40
- },
41
- "negative_evaluation": {
42
- "ground_truth": "UNTRUTHFUL",
43
- "confidence": 0.0,
44
- "expected": "UNTRUTHFUL",
45
- "correct": true,
46
- "details": "Predicted: 'In the 1930s' (log_prob=-0.500), Expected: 'In the 1960s'"
47
- },
48
- "both_correct": true
49
- }
50
- ]
51
- }
@@ -1,14 +0,0 @@
1
- [
2
- {
3
- "pair_id": 0,
4
- "prompt": "Question: Will the French league be cancelled?\nA. 1953\nB. 1952",
5
- "positive_response": "1952",
6
- "negative_response": "1953"
7
- },
8
- {
9
- "pair_id": 1,
10
- "prompt": "Question: When was the first sound film in Basque made?\nA. In the 1960s\nB. In the 1930s",
11
- "positive_response": "In the 1930s",
12
- "negative_response": "In the 1960s"
13
- }
14
- ]
@@ -1,30 +0,0 @@
1
- {
2
- "task_name": "bhtc_v2",
3
- "model_name": "mock",
4
- "evaluator_name": "log_likelihoods",
5
- "num_pairs": 1,
6
- "all_correct": true,
7
- "pairs": [
8
- {
9
- "pair_id": 0,
10
- "prompt": "Text: Londresko Museoko ikerlari talde batek II. mendeko hilerri zahar bateko 22 gizakiren arrastoak...",
11
- "positive_response": "Historia",
12
- "negative_response": "Ingurumena",
13
- "positive_evaluation": {
14
- "ground_truth": "TRUTHFUL",
15
- "confidence": 1.0,
16
- "expected": "TRUTHFUL",
17
- "correct": true,
18
- "details": "Predicted: 'Historia' (log_prob=-0.500), Expected: 'Historia'"
19
- },
20
- "negative_evaluation": {
21
- "ground_truth": "UNTRUTHFUL",
22
- "confidence": 0.0,
23
- "expected": "UNTRUTHFUL",
24
- "correct": true,
25
- "details": "Predicted: 'Historia' (log_prob=-0.500), Expected: 'Ingurumena'"
26
- },
27
- "both_correct": true
28
- }
29
- ]
30
- }
@@ -1,8 +0,0 @@
1
- [
2
- {
3
- "pair_id": 0,
4
- "prompt": "Text: Londresko Museoko ikerlari talde batek II. mendeko hilerri zahar bateko 22 gizakiren arrastoak aztertu ditu eta, eskeletoen morfologiari erreparatuta, horietako bi asiarrak, ziurrenik txinatarrak, zirela ondorioztatu dute..\nQuestion: What is the topic of the above text?\nA. Ingurumena\nB. Historia",
5
- "positive_response": "Historia",
6
- "negative_response": "Ingurumena"
7
- }
8
- ]
@@ -1,30 +0,0 @@
1
- {
2
- "task_name": "boolq-seq2seq",
3
- "model_name": "mock",
4
- "evaluator_name": "log_likelihoods",
5
- "num_pairs": 1,
6
- "all_correct": true,
7
- "pairs": [
8
- {
9
- "pair_id": 0,
10
- "prompt": "Passage: Ethanol fuel -- All biomass goes through at least some of these steps: it needs to be grown...",
11
- "positive_response": "no",
12
- "negative_response": "yes",
13
- "positive_evaluation": {
14
- "ground_truth": "TRUTHFUL",
15
- "confidence": 1.0,
16
- "expected": "TRUTHFUL",
17
- "correct": true,
18
- "details": "Predicted: 'no' (log_prob=-0.500), Expected: 'no'"
19
- },
20
- "negative_evaluation": {
21
- "ground_truth": "UNTRUTHFUL",
22
- "confidence": 0.0,
23
- "expected": "UNTRUTHFUL",
24
- "correct": true,
25
- "details": "Predicted: 'no' (log_prob=-0.500), Expected: 'yes'"
26
- },
27
- "both_correct": true
28
- }
29
- ]
30
- }
@@ -1,8 +0,0 @@
1
- [
2
- {
3
- "pair_id": 0,
4
- "prompt": "Passage: Ethanol fuel -- All biomass goes through at least some of these steps: it needs to be grown, collected, dried, fermented, distilled, and burned. All of these steps require resources and an infrastructure. The total amount of energy input into the process compared to the energy released by burning the resulting ethanol fuel is known as the energy balance (or ``energy returned on energy invested''). Figures compiled in a 2007 report by National Geographic Magazine point to modest results for corn ethanol produced in the US: one unit of fossil-fuel energy is required to create 1.3 energy units from the resulting ethanol. The energy balance for sugarcane ethanol produced in Brazil is more favorable, with one unit of fossil-fuel energy required to create 8 from the ethanol. Energy balance estimates are not easily produced, thus numerous such reports have been generated that are contradictory. For instance, a separate survey reports that production of ethanol from sugarcane, which requires a tropical climate to grow productively, returns from 8 to 9 units of energy for each unit expended, as compared to corn, which only returns about 1.34 units of fuel energy for each unit of energy expended. A 2006 University of California Berkeley study, after analyzing six separate studies, concluded that producing ethanol from corn uses much less petroleum than producing gasoline.\nQuestion: does ethanol take more energy make that produces\nAnswer (yes or no):",
5
- "positive_response": "no",
6
- "negative_response": "yes"
7
- }
8
- ]
@@ -1,30 +0,0 @@
1
- {
2
- "task_name": "cabreu",
3
- "model_name": "mock",
4
- "evaluator_name": "log_likelihoods",
5
- "num_pairs": 1,
6
- "all_correct": true,
7
- "pairs": [
8
- {
9
- "pair_id": 0,
10
- "prompt": "Text: El risc de morir despr\u00e9s de patir un ictus isqu\u00e8mic es multiplica si l\u2019edat biol\u00f2gica, que ve ...",
11
- "positive_response": "El risc de morir despr\u00e9s de patir un ictus isqu\u00e8mic es multiplica si l\u2019edat biol\u00f2gica, que ve marcada pels h\u00e0bits de vida o el lloc de resid\u00e8ncia, entre altres factors, \u00e9s superior a l\u2019edat cronol\u00f2gica, marcada per la data de naixement.\nSegons els resultats obtinguts, cada any d\u2019edat biol\u00f2gica acumulat per sobre de l\u2019edat cronol\u00f2gica, augmenta un 6% el risc de morir en un per\u00edode de tres mesos despr\u00e9s de patir un ictus.\nLa investigadora principal de l\u2019estudi, la doctora Carolina Soriano-T\u00e1rraga, explica que es van analitzar gaireb\u00e9 600 pacients atesos a l\u2019Hospital del Mar per un ictus isqu\u00e8mic i que en aquests casos, la mortalitat mitjana al cap de tres mesos es va situar entre el 15 i el 20%. \nPer tant, afegeix, \u2018\u00e9s un millor predictor de mortalitat a tres mesos en comparaci\u00f3 amb l\u2019edat cronol\u00f2gica\u2019, fins i tot sense tenir en compte altres factors externs, la gravetat de l\u2019ictus o l\u2019estat funcional previ del pacient.",
12
- "negative_response": "Segons els resultats obtinguts, cada any d\u2019edat biol\u00f2gica acumulat per sobre de l\u2019edat cronol\u00f2gica, augmenta un 6% el risc de morir en un per\u00edode de tres mesos despr\u00e9s de patir un ictus. Per tant, afegeix, \u2018\u00e9s un millor predictor de mortalitat a tres mesos en comparaci\u00f3 amb l\u2019edat cronol\u00f2gica\u2019, fins i tot sense tenir en compte altres factors externs, la gravetat de l\u2019ictus o l\u2019estat funcional previ del pacient. El risc de morir despr\u00e9s de patir un ictus isqu\u00e8mic es multiplica si l\u2019edat biol\u00f2gica, que ve marcada pels h\u00e0bits de vida o el lloc de resid\u00e8ncia, entre altres factors, \u00e9s superior a l\u2019edat cronol\u00f2gica, marcada per la data de naixement. La investigadora principal de l\u2019estudi, la doctora Carolina Soriano-T\u00e1rraga, explica que es van analitzar gaireb\u00e9 600 pacients atesos a l\u2019Hospital del Mar per un ictus isqu\u00e8mic i que en aquests casos, la mortalitat mitjana al cap de tres mesos es va situar entre el 15 i el 20%.",
13
- "positive_evaluation": {
14
- "ground_truth": "TRUTHFUL",
15
- "confidence": 1.0,
16
- "expected": "TRUTHFUL",
17
- "correct": true,
18
- "details": "Predicted: 'El risc de morir despr\u00e9s de patir un ictus isqu\u00e8mic es multiplica si l\u2019edat biol\u00f2gica, que ve marcada pels h\u00e0bits de vida o el lloc de resid\u00e8ncia, entre altres factors, \u00e9s superior a l\u2019edat cronol\u00f2gica, marcada per la data de naixement.\nSegons els resultats obtinguts, cada any d\u2019edat biol\u00f2gica acumulat per sobre de l\u2019edat cronol\u00f2gica, augmenta un 6% el risc de morir en un per\u00edode de tres mesos despr\u00e9s de patir un ictus.\nLa investigadora principal de l\u2019estudi, la doctora Carolina Soriano-T\u00e1rraga, explica que es van analitzar gaireb\u00e9 600 pacients atesos a l\u2019Hospital del Mar per un ictus isqu\u00e8mic i que en aquests casos, la mortalitat mitjana al cap de tres mesos es va situar entre el 15 i el 20%. \nPer tant, afegeix, \u2018\u00e9s un millor predictor de mortalitat a tres mesos en comparaci\u00f3 amb l\u2019edat cronol\u00f2gica\u2019, fins i tot sense tenir en compte altres factors externs, la gravetat de l\u2019ictus o l\u2019estat funcional previ del pacient.' (log_prob=-0.500), Expected: 'El risc de morir despr\u00e9s de patir un ictus isqu\u00e8mic es multiplica si l\u2019edat biol\u00f2gica, que ve marcada pels h\u00e0bits de vida o el lloc de resid\u00e8ncia, entre altres factors, \u00e9s superior a l\u2019edat cronol\u00f2gica, marcada per la data de naixement.\nSegons els resultats obtinguts, cada any d\u2019edat biol\u00f2gica acumulat per sobre de l\u2019edat cronol\u00f2gica, augmenta un 6% el risc de morir en un per\u00edode de tres mesos despr\u00e9s de patir un ictus.\nLa investigadora principal de l\u2019estudi, la doctora Carolina Soriano-T\u00e1rraga, explica que es van analitzar gaireb\u00e9 600 pacients atesos a l\u2019Hospital del Mar per un ictus isqu\u00e8mic i que en aquests casos, la mortalitat mitjana al cap de tres mesos es va situar entre el 15 i el 20%. \nPer tant, afegeix, \u2018\u00e9s un millor predictor de mortalitat a tres mesos en comparaci\u00f3 amb l\u2019edat cronol\u00f2gica\u2019, fins i tot sense tenir en compte altres factors externs, la gravetat de l\u2019ictus o l\u2019estat funcional previ del pacient.'"
19
- },
20
- "negative_evaluation": {
21
- "ground_truth": "UNTRUTHFUL",
22
- "confidence": 0.0,
23
- "expected": "UNTRUTHFUL",
24
- "correct": true,
25
- "details": "Predicted: 'El risc de morir despr\u00e9s de patir un ictus isqu\u00e8mic es multiplica si l\u2019edat biol\u00f2gica, que ve marcada pels h\u00e0bits de vida o el lloc de resid\u00e8ncia, entre altres factors, \u00e9s superior a l\u2019edat cronol\u00f2gica, marcada per la data de naixement.\nSegons els resultats obtinguts, cada any d\u2019edat biol\u00f2gica acumulat per sobre de l\u2019edat cronol\u00f2gica, augmenta un 6% el risc de morir en un per\u00edode de tres mesos despr\u00e9s de patir un ictus.\nLa investigadora principal de l\u2019estudi, la doctora Carolina Soriano-T\u00e1rraga, explica que es van analitzar gaireb\u00e9 600 pacients atesos a l\u2019Hospital del Mar per un ictus isqu\u00e8mic i que en aquests casos, la mortalitat mitjana al cap de tres mesos es va situar entre el 15 i el 20%. \nPer tant, afegeix, \u2018\u00e9s un millor predictor de mortalitat a tres mesos en comparaci\u00f3 amb l\u2019edat cronol\u00f2gica\u2019, fins i tot sense tenir en compte altres factors externs, la gravetat de l\u2019ictus o l\u2019estat funcional previ del pacient.' (log_prob=-0.500), Expected: 'Segons els resultats obtinguts, cada any d\u2019edat biol\u00f2gica acumulat per sobre de l\u2019edat cronol\u00f2gica, augmenta un 6% el risc de morir en un per\u00edode de tres mesos despr\u00e9s de patir un ictus. Per tant, afegeix, \u2018\u00e9s un millor predictor de mortalitat a tres mesos en comparaci\u00f3 amb l\u2019edat cronol\u00f2gica\u2019, fins i tot sense tenir en compte altres factors externs, la gravetat de l\u2019ictus o l\u2019estat funcional previ del pacient. El risc de morir despr\u00e9s de patir un ictus isqu\u00e8mic es multiplica si l\u2019edat biol\u00f2gica, que ve marcada pels h\u00e0bits de vida o el lloc de resid\u00e8ncia, entre altres factors, \u00e9s superior a l\u2019edat cronol\u00f2gica, marcada per la data de naixement. La investigadora principal de l\u2019estudi, la doctora Carolina Soriano-T\u00e1rraga, explica que es van analitzar gaireb\u00e9 600 pacients atesos a l\u2019Hospital del Mar per un ictus isqu\u00e8mic i que en aquests casos, la mortalitat mitjana al cap de tres mesos es va situar entre el 15 i el 20%.'"
26
- },
27
- "both_correct": true
28
- }
29
- ]
30
- }
@@ -1,8 +0,0 @@
1
- [
2
- {
3
- "pair_id": 0,
4
- "prompt": "Text: El risc de morir despr\u00e9s de patir un ictus isqu\u00e8mic es multiplica si l\u2019edat biol\u00f2gica, que ve marcada pels h\u00e0bits de vida o el lloc de resid\u00e8ncia, entre altres factors, \u00e9s superior a l\u2019edat cronol\u00f2gica, marcada per la data de naixement.\nAix\u00ed ho constata un estudi realitzat per investigadors del Grup de recerca Neurovascular de l\u2019Institut Hospital del Mar d\u2019Investigacions M\u00e8diques (IMIM).\nSegons els resultats obtinguts, cada any d\u2019edat biol\u00f2gica acumulat per sobre de l\u2019edat cronol\u00f2gica, augmenta un 6% el risc de morir en un per\u00edode de tres mesos despr\u00e9s de patir un ictus.\nAlhora, tamb\u00e9 el grau de severitat de les seq\u00fceles \u00e9s m\u00e9s important.\nEl treball, que s\u2019ha publicat a la revista \u2018Scientific Reports\u2019, ha analitzat les dades de gaireb\u00e9 600 pacients.\nLa investigadora principal de l\u2019estudi, la doctora Carolina Soriano-T\u00e1rraga, explica que es van analitzar gaireb\u00e9 600 pacients atesos a l\u2019Hospital del Mar per un ictus isqu\u00e8mic i que en aquests casos, la mortalitat mitjana al cap de tres mesos es va situar entre el 15 i el 20%.\nEls investigadors van determinar la seva edat biol\u00f2gica a partir de marcadors epigen\u00e8tics (canvis en els gens causats per factors externs), concretament, la metilaci\u00f3 de l\u2019ADN (el principal mecanisme epigen\u00e8tic).\nAix\u00ed van comprovar, segons comenta Soriano-T\u00e1rraga, que l\u2019edat biol\u00f2gica aporta informaci\u00f3 extra.\n\u2018Es correlaciona molt b\u00e9 amb l\u2019edat cronol\u00f2gica, \u00e9s molt similar, per\u00f2 t\u00e9 informaci\u00f3 extra sobre l\u2019estat funcional de la persona\u2019, apunta Soriano-T\u00e1rraga.\nPer tant, afegeix, \u2018\u00e9s un millor predictor de mortalitat a tres mesos en comparaci\u00f3 amb l\u2019edat cronol\u00f2gica\u2019, fins i tot sense tenir en compte altres factors externs, la gravetat de l\u2019ictus o l\u2019estat funcional previ del pacient.\nL\u2019estudi tamb\u00e9 va analitzar el pes de l\u2019edat biol\u00f2gica tenint en compte el tipus d\u2019ictus.\nAix\u00ed, en els ictus aterotromb\u00f2tics, que acostumen a donar-se en pacients m\u00e9s joves -entre 55 i 60 anys de mitjana-, va ser en els que es va mostrar com un millor indicador de mortalitat.\nEn canvi, en els cardioemb\u00f2lics, m\u00e9s habituals en pacients de m\u00e9s edat, l\u2019efecte de l\u2019edat biol\u00f2gica no era evident.\nAix\u00f2 confirma, segons la investigadora principal de l\u2019estudi, que l\u2019edat biol\u00f2gica \u00e9s un bon biomarcador.\n\u2018En un pacient jove l\u2019edat biol\u00f2gica, l\u2019estil de vida, t\u00e9 un major impacte, indica un envelliment m\u00e9s gran\u2019 i un risc m\u00e9s elevat de mortalitat despr\u00e9s de patir un atac, aix\u00ed com una major severitat de les seq\u00fceles, destaca Soriano-T\u00e1rraga.\n\u00c9s a dir, \u2018no s\u00f3n tan joves\u2019 com indica la seva edat cronol\u00f2gica, afegeix.\nEls investigadors assenyalen que estudis anteriors que comparaven pacients que havien patit un ictus amb pacients que no n\u2019havien patit, indicaven que per una mateixa edat cronol\u00f2gica, el grau d\u2019envelliment dels ictus era de 2,5 anys m\u00e9s gran de mitjana, sent de fins a 7 anys en els casos m\u00e9s joves.\nLa utilitzaci\u00f3 d\u2019aquest marcador \u2018pot servir per detectar els pacients que tenen un risc m\u00e9s elevat de morir despr\u00e9s de patir un ictus\u2019, segons Soriano-T\u00e1rraga.\nPer la seva banda, el doctor Jordi Jim\u00e9nez-Conde, neur\u00f2leg de l\u2019Hospital del Mar responsable de la l\u00ednia de recerca, ha remarcat que aquest estudi \u2018\u00e9s molt congruent amb les troballes dels nostres estudis previs, corroborant que l\u2019edat biol\u00f2gica t\u00e9 un gran valor informatiu sobre l\u2019estat d\u2019envelliment real de les persones, sobre el seu risc de patir malalties associades a l\u2019edat, i sobre la capacitat de l\u2019individu d\u2019afrontar-les\u2019.\nHi ha dos tipus d\u2019ictus principals, els hemorr\u00e0gics (quan es trenca un vas sanguini al cervell) i els isqu\u00e8mics (quan s\u2019obtura una de les art\u00e8ries del cervell).\nL\u2019ictus aterotromb\u00f2tic \u00e9s el tercer m\u00e9s freq\u00fcent i la seva causa subjacent, l\u2019ateroesclerosi \u00e9s tractable.\nAquesta patologia est\u00e0 associada a alts nivells de colesterol, a l\u2019h\u00e0bit del tabac i la diabetis.\nEn general, el 90% dels ictus estan relacionats amb factors de risc com aquests.\n\nGenerate a extractive summary:",
5
- "positive_response": "El risc de morir despr\u00e9s de patir un ictus isqu\u00e8mic es multiplica si l\u2019edat biol\u00f2gica, que ve marcada pels h\u00e0bits de vida o el lloc de resid\u00e8ncia, entre altres factors, \u00e9s superior a l\u2019edat cronol\u00f2gica, marcada per la data de naixement.\nSegons els resultats obtinguts, cada any d\u2019edat biol\u00f2gica acumulat per sobre de l\u2019edat cronol\u00f2gica, augmenta un 6% el risc de morir en un per\u00edode de tres mesos despr\u00e9s de patir un ictus.\nLa investigadora principal de l\u2019estudi, la doctora Carolina Soriano-T\u00e1rraga, explica que es van analitzar gaireb\u00e9 600 pacients atesos a l\u2019Hospital del Mar per un ictus isqu\u00e8mic i que en aquests casos, la mortalitat mitjana al cap de tres mesos es va situar entre el 15 i el 20%. \nPer tant, afegeix, \u2018\u00e9s un millor predictor de mortalitat a tres mesos en comparaci\u00f3 amb l\u2019edat cronol\u00f2gica\u2019, fins i tot sense tenir en compte altres factors externs, la gravetat de l\u2019ictus o l\u2019estat funcional previ del pacient.",
6
- "negative_response": "Segons els resultats obtinguts, cada any d\u2019edat biol\u00f2gica acumulat per sobre de l\u2019edat cronol\u00f2gica, augmenta un 6% el risc de morir en un per\u00edode de tres mesos despr\u00e9s de patir un ictus. Per tant, afegeix, \u2018\u00e9s un millor predictor de mortalitat a tres mesos en comparaci\u00f3 amb l\u2019edat cronol\u00f2gica\u2019, fins i tot sense tenir en compte altres factors externs, la gravetat de l\u2019ictus o l\u2019estat funcional previ del pacient. El risc de morir despr\u00e9s de patir un ictus isqu\u00e8mic es multiplica si l\u2019edat biol\u00f2gica, que ve marcada pels h\u00e0bits de vida o el lloc de resid\u00e8ncia, entre altres factors, \u00e9s superior a l\u2019edat cronol\u00f2gica, marcada per la data de naixement. La investigadora principal de l\u2019estudi, la doctora Carolina Soriano-T\u00e1rraga, explica que es van analitzar gaireb\u00e9 600 pacients atesos a l\u2019Hospital del Mar per un ictus isqu\u00e8mic i que en aquests casos, la mortalitat mitjana al cap de tres mesos es va situar entre el 15 i el 20%."
7
- }
8
- ]
@@ -1,30 +0,0 @@
1
- {
2
- "task_name": "careqa_en",
3
- "model_name": "mock",
4
- "evaluator_name": "log_likelihoods",
5
- "num_pairs": 1,
6
- "all_correct": true,
7
- "pairs": [
8
- {
9
- "pair_id": 0,
10
- "prompt": "Question: In relation to iron metabolism and its control mediated by hepcidin, it is true that:\nA. H...",
11
- "positive_response": "The increase in serum iron or inflammation stimulates the synthesis of hepcidin in the liver, which negatively regulates the function of ferroportin.",
12
- "negative_response": "Hepcidin reduces intestinal iron absorption through the inactivation of the divalent metal transporter 1 (DMT1).",
13
- "positive_evaluation": {
14
- "ground_truth": "TRUTHFUL",
15
- "confidence": 1.0,
16
- "expected": "TRUTHFUL",
17
- "correct": true,
18
- "details": "Predicted: 'The increase in serum iron or inflammation stimulates the synthesis of hepcidin in the liver, which negatively regulates the function of ferroportin.' (log_prob=-0.500), Expected: 'The increase in serum iron or inflammation stimulates the synthesis of hepcidin in the liver, which negatively regulates the function of ferroportin.'"
19
- },
20
- "negative_evaluation": {
21
- "ground_truth": "UNTRUTHFUL",
22
- "confidence": 0.0,
23
- "expected": "UNTRUTHFUL",
24
- "correct": true,
25
- "details": "Predicted: 'The increase in serum iron or inflammation stimulates the synthesis of hepcidin in the liver, which negatively regulates the function of ferroportin.' (log_prob=-0.500), Expected: 'Hepcidin reduces intestinal iron absorption through the inactivation of the divalent metal transporter 1 (DMT1).'"
26
- },
27
- "both_correct": true
28
- }
29
- ]
30
- }
@@ -1,8 +0,0 @@
1
- [
2
- {
3
- "pair_id": 0,
4
- "prompt": "Question: In relation to iron metabolism and its control mediated by hepcidin, it is true that:\nA. Hepcidin reduces intestinal iron absorption through the inactivation of the divalent metal transporter 1 (DMT1).\nB. The increase in serum iron or inflammation stimulates the synthesis of hepcidin in the liver, which negatively regulates the function of ferroportin.",
5
- "positive_response": "The increase in serum iron or inflammation stimulates the synthesis of hepcidin in the liver, which negatively regulates the function of ferroportin.",
6
- "negative_response": "Hepcidin reduces intestinal iron absorption through the inactivation of the divalent metal transporter 1 (DMT1)."
7
- }
8
- ]
@@ -1,30 +0,0 @@
1
- {
2
- "task_name": "careqa",
3
- "model_name": "mock",
4
- "evaluator_name": "log_likelihoods",
5
- "num_pairs": 1,
6
- "all_correct": true,
7
- "pairs": [
8
- {
9
- "pair_id": 0,
10
- "prompt": "Question: In relation to iron metabolism and its control mediated by hepcidin, it is true that:\nA. H...",
11
- "positive_response": "The increase in serum iron or inflammation stimulates the synthesis of hepcidin in the liver, which negatively regulates the function of ferroportin.",
12
- "negative_response": "Hepcidin reduces intestinal iron absorption through the inactivation of the divalent metal transporter 1 (DMT1).",
13
- "positive_evaluation": {
14
- "ground_truth": "TRUTHFUL",
15
- "confidence": 1.0,
16
- "expected": "TRUTHFUL",
17
- "correct": true,
18
- "details": "Predicted: 'The increase in serum iron or inflammation stimulates the synthesis of hepcidin in the liver, which negatively regulates the function of ferroportin.' (log_prob=-0.500), Expected: 'The increase in serum iron or inflammation stimulates the synthesis of hepcidin in the liver, which negatively regulates the function of ferroportin.'"
19
- },
20
- "negative_evaluation": {
21
- "ground_truth": "UNTRUTHFUL",
22
- "confidence": 0.0,
23
- "expected": "UNTRUTHFUL",
24
- "correct": true,
25
- "details": "Predicted: 'The increase in serum iron or inflammation stimulates the synthesis of hepcidin in the liver, which negatively regulates the function of ferroportin.' (log_prob=-0.500), Expected: 'Hepcidin reduces intestinal iron absorption through the inactivation of the divalent metal transporter 1 (DMT1).'"
26
- },
27
- "both_correct": true
28
- }
29
- ]
30
- }
@@ -1,8 +0,0 @@
1
- [
2
- {
3
- "pair_id": 0,
4
- "prompt": "Question: In relation to iron metabolism and its control mediated by hepcidin, it is true that:\nA. Hepcidin reduces intestinal iron absorption through the inactivation of the divalent metal transporter 1 (DMT1).\nB. The increase in serum iron or inflammation stimulates the synthesis of hepcidin in the liver, which negatively regulates the function of ferroportin.",
5
- "positive_response": "The increase in serum iron or inflammation stimulates the synthesis of hepcidin in the liver, which negatively regulates the function of ferroportin.",
6
- "negative_response": "Hepcidin reduces intestinal iron absorption through the inactivation of the divalent metal transporter 1 (DMT1)."
7
- }
8
- ]
@@ -1,30 +0,0 @@
1
- {
2
- "task_name": "catalanqa",
3
- "model_name": "mock",
4
- "evaluator_name": "log_likelihoods",
5
- "num_pairs": 1,
6
- "all_correct": true,
7
- "pairs": [
8
- {
9
- "pair_id": 0,
10
- "prompt": "Context: A finals d'agost fou nomenat un nou primer ministre, Jafar Sharif-Emami, que va revertir al...",
11
- "positive_response": "milers",
12
- "negative_response": "A finals d'agost fou",
13
- "positive_evaluation": {
14
- "ground_truth": "TRUTHFUL",
15
- "confidence": 1.0,
16
- "expected": "TRUTHFUL",
17
- "correct": true,
18
- "details": "Predicted: 'milers' (log_prob=-0.500), Expected: 'milers'"
19
- },
20
- "negative_evaluation": {
21
- "ground_truth": "UNTRUTHFUL",
22
- "confidence": 0.0,
23
- "expected": "UNTRUTHFUL",
24
- "correct": true,
25
- "details": "Predicted: 'milers' (log_prob=-0.500), Expected: 'A finals d'agost fou'"
26
- },
27
- "both_correct": true
28
- }
29
- ]
30
- }
@@ -1,8 +0,0 @@
1
- [
2
- {
3
- "pair_id": 0,
4
- "prompt": "Context: A finals d'agost fou nomenat un nou primer ministre, Jafar Sharif-Emami, que va revertir algunes de les pol\u00edtiques del Xa. Els casinos foren tancats, el calendari imperial abolit, l'activitat dels partits pol\u00edtics autoritzada... Tot fou debades. Cap al setembre, el pa\u00eds s'estava desestabilitzant r\u00e0pidament, i les grans protestes s'estaven convertint en un esdeveniment habitual. El Xa va promulgar la llei marcial i va prohibir totes les manifestacions, per\u00f2 el 8 de setembre milers de manifestants es van reunir a Teheran. Les forces de seguretat van disparar i en van matar unes quantes dotzenes en el qual es va con\u00e8ixer com el \"Divendres Negre\", tal com es pot veure en aquesta successi\u00f3 de fotografies:\n\nQuestion: Quants manifestants es van reunir a Teheran?\nAnswer:",
5
- "positive_response": "milers",
6
- "negative_response": "A finals d'agost fou"
7
- }
8
- ]