wisent 0.7.379__py3-none-any.whl → 0.7.901__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wisent/__init__.py +1 -1
- wisent/core/activations/__init__.py +22 -6
- wisent/core/activations/activation_cache.py +393 -0
- wisent/core/activations/activations.py +22 -40
- wisent/core/activations/activations_collector.py +145 -373
- wisent/core/activations/classifier_inference_strategy.py +195 -0
- wisent/core/activations/core/atoms.py +8 -92
- wisent/core/activations/extraction_strategy.py +480 -0
- wisent/core/agent/diagnose/response_diagnostics.py +3 -3
- wisent/core/agent/diagnose.py +3 -3
- wisent/core/autonomous_agent.py +2 -2
- wisent/core/classifiers/classifiers/core/atoms.py +3 -2
- wisent/core/cli/__init__.py +2 -1
- wisent/core/cli/agent/apply_steering.py +25 -31
- wisent/core/cli/agent/evaluate_response.py +18 -20
- wisent/core/cli/agent/train_classifier.py +36 -26
- wisent/core/cli/check_linearity.py +35 -3
- wisent/core/cli/cluster_benchmarks.py +470 -0
- wisent/core/cli/create_steering_vector.py +19 -9
- wisent/core/cli/diagnose_vectors.py +7 -4
- wisent/core/cli/estimate_unified_goodness_time.py +6 -4
- wisent/core/cli/generate_pairs_from_task.py +9 -56
- wisent/core/cli/generate_vector_from_task.py +4 -0
- wisent/core/cli/geometry_search.py +137 -0
- wisent/core/cli/get_activations.py +13 -37
- wisent/core/cli/method_optimizer.py +860 -0
- wisent/core/cli/modify_weights.py +3 -2
- wisent/core/cli/optimize.py +44 -5
- wisent/core/cli/optimize_classification.py +5 -6
- wisent/core/cli/optimize_sample_size.py +9 -23
- wisent/core/cli/optimize_steering.py +433 -159
- wisent/core/cli/optimize_weights.py +67 -7
- wisent/core/cli/preview_pairs.py +203 -0
- wisent/core/cli/steering_method_trainer.py +8 -7
- wisent/core/cli/steering_search_space.py +20 -15
- wisent/core/cli/tasks.py +31 -117
- wisent/core/cli/train_unified_goodness.py +18 -19
- wisent/core/contrastive_pairs/diagnostics/control_vectors.py +1582 -177
- wisent/core/contrastive_pairs/diagnostics/linearity.py +70 -80
- wisent/core/contrastive_pairs/diagnostics/vector_quality.py +6 -5
- wisent/core/contrastive_pairs/huggingface_pairs/hf_extractor_manifest.py +5 -19
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/__init__.py +11 -5
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/agentic_search.py +37 -347
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/aider_polyglot.py +113 -136
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/apps.py +146 -32
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codeforces.py +2 -12
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue.py +2 -2
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/coding_benchmarks.py +124 -504
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/faithbench.py +40 -63
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/flames.py +46 -89
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/flores.py +15 -4
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/frames.py +36 -20
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/hallucinations_leaderboard.py +3 -45
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/humaneval.py +98 -57
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/livemathbench.py +42 -4
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/longform_writing.py +2 -112
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/math500.py +39 -4
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/medium_priority_benchmarks.py +475 -525
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/mercury.py +65 -42
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/olympiadbench.py +2 -12
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/planbench.py +78 -219
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/polymath.py +37 -4
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/recode.py +84 -69
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/refusalbench.py +168 -160
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/simpleqa.py +44 -25
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/tau_bench.py +3 -103
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/toolbench.py +3 -97
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/toolemu.py +48 -182
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/code_x_glue.py +8 -8
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/freebase.py +1 -1
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +11 -5
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_registry.py +19 -1
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aclue.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/acp_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/acp_bench_hard.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/advanced.py +2 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aexams.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrimmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrixnli.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/agieval_aqua_rat.py +129 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabculture.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_exams.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_leaderboard_complete.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_leaderboard_light.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabicmmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aradice.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_challenge.py +1 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_easy.py +1 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arithmetic.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/asdiv.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/babi.py +36 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/basque_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bbq.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/belebele.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/benchmarks.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bertaqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bhs.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bhtc.py +3 -5
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/blimp.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/blimp_nl.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/boolq.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/c4.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cabbq.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/careqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalan_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalanqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catcola.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cb.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ceval.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ceval_valid.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/chain.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/chartqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/claim.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/click.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cmmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cnn.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cocoteros.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/code_x_glue.py +11 -6
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coedit.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/commonsense.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/commonsense_qa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copal_id.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coqa.py +3 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/csatqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cycle.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darija_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darijahellaswag.py +2 -6
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darijammlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/dbpedia.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/discrim_eval.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/doc.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/drop.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/epec.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench_ca.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench_es.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/esbbq.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ethics.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_exams.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_proficiency.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_reading.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_trivia.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/evalita_llm.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/financial.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/flan.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/french_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/galician_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gaokao.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/glianorex.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/global_mmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/global_piqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gpt3.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/groundcocoa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gsm8k.py +1 -1
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/haerae.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/headqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hellaswag.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hendrycks_ethics.py +5 -9
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hendrycks_math.py +63 -16
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/histoires_morales.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hrm8k.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/humaneval_infilling.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/icelandic_winogrande.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/inverse.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/inverse_scaling.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ja.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/japanese_leaderboard.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/japanese_leaderboard_mc.py +1 -1
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kmmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kobest.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kormedmcqa.py +5 -17
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada_cloze.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada_multilingual.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/law.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/leaderboard.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lingoly.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/llama3.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lm_syneval.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa2.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/longbench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/longbenchv2.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mastermind.py +2 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mbpp.py +47 -6
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mc-taco.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/med_concepts_qa.py +2 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/meddialog.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medical.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medmcqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mela.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/metabench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/minerva_math.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlusr.py +3 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mrpc.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multiblimp.py +2 -5
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multirc.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mutual.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/non.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_exact.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_gen_exact.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_mc.py +4 -8
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_mc_log_likelihoods.py +4 -8
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/nq_open.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_arc_multilingual.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_hellaswag_multilingual.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_mmlu_multilingual.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_truthfulqa_multilingual.py +2 -5
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/olaph.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/openbookqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/option.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/parafraseja.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/parafrases.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/paws.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/paws_x.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pawsx.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/persona.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/phrases.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pile.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/piqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/portuguese_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prompt.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prost.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pubmedqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qa4mre.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper_bool.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnli.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnlieu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qqp.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/race.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/random.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/record.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/reversed.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/rte.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ruler.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sciq.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/score.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/scrolls.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/scrolls_mc.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/self.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sglue.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sglue_rte.py +2 -1
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/siqa.py +4 -7
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/social_iqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/spanish_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/storycloze.py +2 -6
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/summarization.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/super.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/super_glue.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swag.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swde.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sycophancy.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/t0.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/teca.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinyarc.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinybenchmarks.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinygsm8k.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinyhellaswag.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinymmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinytruthfulqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinywinogrande.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tmmluplus.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/triviaqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc1.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc2.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turblimp_core.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turkishmmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turkishmmlu_mc.py +0 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/unscramble.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/vaxx.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/webqs.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wic.py +3 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogrande.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wmdp.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wnli.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc273.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xcopa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xlsum.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xnli.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xquad.py +2 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xstorycloze.py +2 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xwinograd.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/zhoblimp.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_pairs_generation.py +173 -6
- wisent/core/data_loaders/loaders/lm_loader.py +12 -1
- wisent/core/evaluators/benchmark_specific/apps_evaluator.py +133 -0
- wisent/core/evaluators/benchmark_specific/coding/metrics/evaluator.py +6 -1
- wisent/core/evaluators/benchmark_specific/conala_evaluator.py +31 -168
- wisent/core/evaluators/custom/examples/humanization_coherent.py +89 -35
- wisent/core/evaluators/oracles/truthfulqa_gen_evaluator.py +2 -20
- wisent/core/evaluators/personalization/coherence.py +46 -0
- wisent/core/geometry_runner.py +995 -0
- wisent/core/geometry_search_space.py +237 -0
- wisent/core/hyperparameter_optimizer.py +14 -14
- wisent/core/lm_eval_harness_ground_truth.py +7 -11
- wisent/core/main.py +6 -0
- wisent/core/models/core/atoms.py +5 -3
- wisent/core/models/wisent_model.py +9 -8
- wisent/core/opti/methods/opti_weights.py +29 -2
- wisent/core/optuna/classifier/activation_generator.py +14 -12
- wisent/core/optuna/classifier/optuna_classifier_optimizer.py +2 -2
- wisent/core/optuna/steering/steering_optimization.py +14 -9
- wisent/core/parser_arguments/check_linearity_parser.py +12 -2
- wisent/core/parser_arguments/cluster_benchmarks_parser.py +31 -0
- wisent/core/parser_arguments/generate_vector_from_synthetic_parser.py +2 -2
- wisent/core/parser_arguments/generate_vector_from_task_parser.py +22 -2
- wisent/core/parser_arguments/geometry_search_parser.py +61 -0
- wisent/core/parser_arguments/main_parser.py +16 -0
- wisent/core/parser_arguments/optimize_steering_parser.py +117 -10
- wisent/core/parser_arguments/optimize_weights_parser.py +6 -0
- wisent/core/parser_arguments/tasks_parser.py +7 -19
- wisent/core/parser_arguments/train_unified_goodness_parser.py +2 -2
- wisent/core/steering.py +5 -3
- wisent/core/steering_methods/core/atoms.py +1 -2
- wisent/core/steering_methods/methods/caa.py +1 -1
- wisent/core/steering_methods/methods/hyperplane.py +75 -0
- wisent/core/steering_methods/methods/prism.py +1 -2
- wisent/core/steering_methods/methods/pulse.py +39 -8
- wisent/core/steering_methods/methods/titan.py +59 -14
- wisent/core/steering_methods/registry.py +52 -12
- wisent/core/steering_optimizer.py +15 -15
- wisent/core/synthetic/generators/nonsense_generator.py +30 -18
- wisent/core/trainers/steering_trainer.py +11 -20
- wisent/core/utils/device.py +27 -27
- wisent/core/utils/layer_combinations.py +70 -0
- wisent/examples/__init__.py +1 -0
- wisent/examples/scripts/__init__.py +1 -0
- wisent/examples/scripts/count_all_benchmarks.py +121 -0
- wisent/examples/scripts/discover_directions.py +469 -0
- wisent/examples/scripts/extract_benchmark_info.py +71 -0
- wisent/examples/scripts/generate_paper_data.py +384 -0
- wisent/examples/scripts/intervention_validation.py +626 -0
- wisent/examples/scripts/results/test_AraDiCE_ArabicMMLU_lev_evaluation.json +324 -0
- wisent/examples/scripts/results/test_AraDiCE_ArabicMMLU_lev_pairs.json +92 -0
- wisent/examples/scripts/results/test_aexams_IslamicStudies_evaluation.json +324 -0
- wisent/examples/scripts/results/test_aexams_IslamicStudies_pairs.json +92 -0
- wisent/examples/scripts/results/test_afrimgsm_pairs.json +92 -0
- wisent/examples/scripts/results/test_afrimmlu_evaluation.json +324 -0
- wisent/examples/scripts/results/test_afrimmlu_pairs.json +92 -0
- wisent/examples/scripts/search_all_short_names.py +31 -0
- wisent/examples/scripts/test_all_benchmarks.py +138 -0
- wisent/examples/scripts/test_all_benchmarks_new.py +28 -0
- wisent/examples/scripts/test_contrastive_pairs_all_supported.py +230 -0
- wisent/examples/scripts/test_nonsense_baseline.py +261 -0
- wisent/examples/scripts/test_one_benchmark.py +324 -0
- wisent/examples/scripts/test_one_coding_benchmark.py +293 -0
- wisent/examples/scripts/threshold_analysis.py +434 -0
- wisent/examples/scripts/visualization_gallery.py +582 -0
- wisent/parameters/lm_eval/broken_in_lm_eval.json +179 -2
- wisent/parameters/lm_eval/category_directions.json +137 -0
- wisent/parameters/lm_eval/repair_plan.json +282 -0
- wisent/parameters/lm_eval/track_progress_not_lm_eval_tasks.json +19 -70
- wisent/parameters/lm_eval/weak_contrastive_pairs.json +38 -0
- wisent/parameters/lm_eval/working_benchmarks.json +206 -0
- wisent/parameters/lm_eval/working_benchmarks_categorized.json +236 -0
- wisent/scripts/run_quality_metrics_sweep.sh +22 -27
- wisent/tests/test_aggregation_geometry.py +236 -0
- wisent/tests/test_detector_accuracy.py +163 -0
- wisent/tests/test_geometry_exhaustive.py +1202 -0
- wisent/tests/visualize_geometry.py +255 -61
- {wisent-0.7.379.dist-info → wisent-0.7.901.dist-info}/METADATA +1 -1
- {wisent-0.7.379.dist-info → wisent-0.7.901.dist-info}/RECORD +376 -974
- wisent/core/activations/prompt_construction_strategy.py +0 -47
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/browsecomp.py +0 -245
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text.py +0 -15
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_go.py +0 -64
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_java.py +0 -65
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_javascript.py +0 -65
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_php.py +0 -65
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_python.py +0 -65
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_ruby.py +0 -65
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/freebase.py +0 -99
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/instruct_humaneval.py +0 -180
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/instructhumaneval.py +0 -129
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/mbpp.py +0 -142
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/agieval.py +0 -155
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/code2text.py +0 -161
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/codexglue.py +0 -107
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/livemathbench.py +0 -155
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/polymath.py +0 -155
- wisent/examples/scripts/results/benchmark_descriptions.json +0 -1244
- wisent/examples/scripts/results/benchmark_evaluation_methods.json +0 -66
- wisent/examples/scripts/results/benchmark_evaluator_mapping.json +0 -2781
- wisent/examples/scripts/results/benchmark_evaluator_mapping_updated.json +0 -30536
- wisent/examples/scripts/results/benchmark_evaluators_clean.json +0 -469
- wisent/examples/scripts/results/benchmark_methods_summary.json +0 -260
- wisent/examples/scripts/results/benchmark_pair_creation_methods.json +0 -66
- wisent/examples/scripts/results/benchmark_pair_totals.json +0 -269
- wisent/examples/scripts/results/benchmark_tags.json +0 -917
- wisent/examples/scripts/results/benchmark_test_summary_nov4.json +0 -71
- wisent/examples/scripts/results/coding_benchmarks_test_code_status.json +0 -150
- wisent/examples/scripts/results/failing_benchmarks.json +0 -946
- wisent/examples/scripts/results/failing_benchmarks_list.json +0 -41
- wisent/examples/scripts/results/failing_benchmarks_test_results.json +0 -945
- wisent/examples/scripts/results/missing_benchmark_tags.json +0 -341
- wisent/examples/scripts/results/test_20_newsgroups_evaluation.json +0 -30
- wisent/examples/scripts/results/test_20_newsgroups_pairs.json +0 -8
- wisent/examples/scripts/results/test_AraDICE_evaluation.json +0 -51
- wisent/examples/scripts/results/test_AraDICE_pairs.json +0 -14
- wisent/examples/scripts/results/test_AraDiCE_boolq_egy/test_AraDiCE_boolq_egy_evaluation.json +0 -30
- wisent/examples/scripts/results/test_AraDiCE_boolq_egy/test_AraDiCE_boolq_egy_pairs.json +0 -8
- wisent/examples/scripts/results/test_ArabCulture_evaluation.json +0 -51
- wisent/examples/scripts/results/test_ArabCulture_pairs.json +0 -14
- wisent/examples/scripts/results/test_Tag_evaluation.json +0 -30
- wisent/examples/scripts/results/test_Tag_pairs.json +0 -8
- wisent/examples/scripts/results/test_aclue_evaluation.json +0 -51
- wisent/examples/scripts/results/test_aclue_pairs.json +0 -14
- wisent/examples/scripts/results/test_acp_bench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_acp_bench_hard_evaluation.json +0 -51
- wisent/examples/scripts/results/test_acp_bench_hard_pairs.json +0 -14
- wisent/examples/scripts/results/test_acp_bench_pairs.json +0 -14
- wisent/examples/scripts/results/test_advanced_ai_risk_evaluation.json +0 -51
- wisent/examples/scripts/results/test_advanced_ai_risk_pairs.json +0 -14
- wisent/examples/scripts/results/test_aexams_evaluation.json +0 -51
- wisent/examples/scripts/results/test_aexams_pairs.json +0 -14
- wisent/examples/scripts/results/test_afrimgsm_direct_amh_evaluation.json +0 -30
- wisent/examples/scripts/results/test_afrimgsm_direct_amh_pairs.json +0 -8
- wisent/examples/scripts/results/test_afrimmlu_direct_amh_evaluation.json +0 -30
- wisent/examples/scripts/results/test_afrimmlu_direct_amh_pairs.json +0 -8
- wisent/examples/scripts/results/test_afrixnli_en_direct_amh_evaluation.json +0 -30
- wisent/examples/scripts/results/test_afrixnli_en_direct_amh_pairs.json +0 -8
- wisent/examples/scripts/results/test_ag_news_evaluation.json +0 -30
- wisent/examples/scripts/results/test_ag_news_pairs.json +0 -8
- wisent/examples/scripts/results/test_agieval_evaluation.json +0 -51
- wisent/examples/scripts/results/test_agieval_pairs.json +0 -14
- wisent/examples/scripts/results/test_aime2024_evaluation.json +0 -30
- wisent/examples/scripts/results/test_aime2024_pairs.json +0 -8
- wisent/examples/scripts/results/test_aime2025_evaluation.json +0 -30
- wisent/examples/scripts/results/test_aime2025_pairs.json +0 -8
- wisent/examples/scripts/results/test_aime_evaluation.json +0 -30
- wisent/examples/scripts/results/test_aime_pairs.json +0 -8
- wisent/examples/scripts/results/test_anagrams1_evaluation.json +0 -30
- wisent/examples/scripts/results/test_anagrams1_pairs.json +0 -8
- wisent/examples/scripts/results/test_anagrams2_evaluation.json +0 -30
- wisent/examples/scripts/results/test_anagrams2_pairs.json +0 -8
- wisent/examples/scripts/results/test_anli_evaluation.json +0 -30
- wisent/examples/scripts/results/test_anli_pairs.json +0 -8
- wisent/examples/scripts/results/test_apps_evaluation.json +0 -30
- wisent/examples/scripts/results/test_apps_pairs.json +0 -8
- wisent/examples/scripts/results/test_arabic_exams_evaluation.json +0 -30
- wisent/examples/scripts/results/test_arabic_exams_pairs.json +0 -8
- wisent/examples/scripts/results/test_arabic_leaderboard_complete_evaluation.json +0 -51
- wisent/examples/scripts/results/test_arabic_leaderboard_complete_pairs.json +0 -14
- wisent/examples/scripts/results/test_arabic_leaderboard_light_evaluation.json +0 -51
- wisent/examples/scripts/results/test_arabic_leaderboard_light_pairs.json +0 -14
- wisent/examples/scripts/results/test_arabicmmlu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_arabicmmlu_pairs.json +0 -14
- wisent/examples/scripts/results/test_aradice/test_aradice_evaluation.json +0 -51
- wisent/examples/scripts/results/test_aradice/test_aradice_pairs.json +0 -14
- wisent/examples/scripts/results/test_aradice3/test_aradice_evaluation.json +0 -51
- wisent/examples/scripts/results/test_aradice3/test_aradice_pairs.json +0 -14
- wisent/examples/scripts/results/test_arc_ar_evaluation.json +0 -30
- wisent/examples/scripts/results/test_arc_ar_pairs.json +0 -8
- wisent/examples/scripts/results/test_arc_challenge_evaluation.json +0 -30
- wisent/examples/scripts/results/test_arc_challenge_pairs.json +0 -8
- wisent/examples/scripts/results/test_arc_easy_evaluation.json +0 -30
- wisent/examples/scripts/results/test_arc_easy_pairs.json +0 -8
- wisent/examples/scripts/results/test_argument_topic_evaluation.json +0 -30
- wisent/examples/scripts/results/test_argument_topic_pairs.json +0 -8
- wisent/examples/scripts/results/test_arithmetic_evaluation.json +0 -51
- wisent/examples/scripts/results/test_arithmetic_pairs.json +0 -14
- wisent/examples/scripts/results/test_asdiv_evaluation.json +0 -30
- wisent/examples/scripts/results/test_asdiv_pairs.json +0 -8
- wisent/examples/scripts/results/test_assin_entailment_evaluation.json +0 -30
- wisent/examples/scripts/results/test_assin_entailment_pairs.json +0 -8
- wisent/examples/scripts/results/test_atis_evaluation.json +0 -30
- wisent/examples/scripts/results/test_atis_pairs.json +0 -8
- wisent/examples/scripts/results/test_babi_evaluation.json +0 -30
- wisent/examples/scripts/results/test_babi_pairs.json +0 -8
- wisent/examples/scripts/results/test_babilong_evaluation.json +0 -30
- wisent/examples/scripts/results/test_babilong_pairs.json +0 -8
- wisent/examples/scripts/results/test_bangla_mmlu_evaluation.json +0 -30
- wisent/examples/scripts/results/test_bangla_mmlu_pairs.json +0 -8
- wisent/examples/scripts/results/test_banking77_evaluation.json +0 -30
- wisent/examples/scripts/results/test_banking77_pairs.json +0 -8
- wisent/examples/scripts/results/test_basque/test_basque-glue_pairs.json +0 -14
- wisent/examples/scripts/results/test_basque-glue_evaluation.json +0 -51
- wisent/examples/scripts/results/test_basque-glue_pairs.json +0 -14
- wisent/examples/scripts/results/test_basque2/test_basque-glue_evaluation.json +0 -51
- wisent/examples/scripts/results/test_basque2/test_basque-glue_pairs.json +0 -14
- wisent/examples/scripts/results/test_basque_bench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_basque_bench_pairs.json +0 -14
- wisent/examples/scripts/results/test_basque_glue/test_basque-glue_evaluation.json +0 -51
- wisent/examples/scripts/results/test_basque_glue/test_basque-glue_pairs.json +0 -14
- wisent/examples/scripts/results/test_basqueglue_evaluation.json +0 -51
- wisent/examples/scripts/results/test_basqueglue_pairs.json +0 -14
- wisent/examples/scripts/results/test_bbh_evaluation.json +0 -51
- wisent/examples/scripts/results/test_bbh_pairs.json +0 -14
- wisent/examples/scripts/results/test_bbq_evaluation.json +0 -30
- wisent/examples/scripts/results/test_bbq_pairs.json +0 -8
- wisent/examples/scripts/results/test_bec2016eu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_bec2016eu_pairs.json +0 -14
- wisent/examples/scripts/results/test_belebele_evaluation.json +0 -51
- wisent/examples/scripts/results/test_belebele_pairs.json +0 -14
- wisent/examples/scripts/results/test_benchmarks_evaluation.json +0 -51
- wisent/examples/scripts/results/test_benchmarks_pairs.json +0 -14
- wisent/examples/scripts/results/test_bertaqa_evaluation.json +0 -51
- wisent/examples/scripts/results/test_bertaqa_pairs.json +0 -14
- wisent/examples/scripts/results/test_bhtc_v2_evaluation.json +0 -30
- wisent/examples/scripts/results/test_bhtc_v2_pairs.json +0 -8
- wisent/examples/scripts/results/test_bigbench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_bigbench_pairs.json +0 -14
- wisent/examples/scripts/results/test_blimp_evaluation.json +0 -51
- wisent/examples/scripts/results/test_blimp_pairs.json +0 -14
- wisent/examples/scripts/results/test_boolq/test_boolq_evaluation.json +0 -30
- wisent/examples/scripts/results/test_boolq/test_boolq_pairs.json +0 -8
- wisent/examples/scripts/results/test_boolq-seq2seq_evaluation.json +0 -30
- wisent/examples/scripts/results/test_boolq-seq2seq_pairs.json +0 -8
- wisent/examples/scripts/results/test_boolq_evaluation.json +0 -30
- wisent/examples/scripts/results/test_boolq_pairs.json +0 -8
- wisent/examples/scripts/results/test_c4_evaluation.json +0 -30
- wisent/examples/scripts/results/test_c4_pairs.json +0 -8
- wisent/examples/scripts/results/test_cabreu_evaluation.json +0 -30
- wisent/examples/scripts/results/test_cabreu_pairs.json +0 -8
- wisent/examples/scripts/results/test_careqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_careqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_catalan_bench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_catalan_bench_pairs.json +0 -14
- wisent/examples/scripts/results/test_catalanqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_catalanqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_catcola_evaluation.json +0 -30
- wisent/examples/scripts/results/test_catcola_pairs.json +0 -8
- wisent/examples/scripts/results/test_cb_evaluation.json +0 -30
- wisent/examples/scripts/results/test_cb_pairs.json +0 -8
- wisent/examples/scripts/results/test_ceval/test_ceval_evaluation.json +0 -51
- wisent/examples/scripts/results/test_ceval/test_ceval_pairs.json +0 -14
- wisent/examples/scripts/results/test_ceval_accountant/test_ceval-valid_accountant_evaluation.json +0 -30
- wisent/examples/scripts/results/test_ceval_accountant/test_ceval-valid_accountant_pairs.json +0 -8
- wisent/examples/scripts/results/test_ceval_evaluation.json +0 -51
- wisent/examples/scripts/results/test_ceval_pairs.json +0 -14
- wisent/examples/scripts/results/test_ceval_valid/test_ceval_valid_evaluation.json +0 -51
- wisent/examples/scripts/results/test_ceval_valid/test_ceval_valid_pairs.json +0 -14
- wisent/examples/scripts/results/test_chain_of_thought_evaluation.json +0 -51
- wisent/examples/scripts/results/test_chain_of_thought_pairs.json +0 -14
- wisent/examples/scripts/results/test_chartqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_chartqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_claim_stance_topic_evaluation.json +0 -30
- wisent/examples/scripts/results/test_claim_stance_topic_pairs.json +0 -8
- wisent/examples/scripts/results/test_cmmlu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_cmmlu_pairs.json +0 -14
- wisent/examples/scripts/results/test_cnn_dailymail_evaluation.json +0 -30
- wisent/examples/scripts/results/test_cnn_dailymail_pairs.json +0 -8
- wisent/examples/scripts/results/test_cocoteros_es_evaluation.json +0 -30
- wisent/examples/scripts/results/test_cocoteros_es_pairs.json +0 -8
- wisent/examples/scripts/results/test_codexglue_code_to_text_go_evaluation.json +0 -30
- wisent/examples/scripts/results/test_codexglue_code_to_text_go_pairs.json +0 -8
- wisent/examples/scripts/results/test_codexglue_code_to_text_java_evaluation.json +0 -30
- wisent/examples/scripts/results/test_codexglue_code_to_text_java_pairs.json +0 -8
- wisent/examples/scripts/results/test_codexglue_code_to_text_javascript_evaluation.json +0 -30
- wisent/examples/scripts/results/test_codexglue_code_to_text_javascript_pairs.json +0 -8
- wisent/examples/scripts/results/test_codexglue_code_to_text_php_evaluation.json +0 -30
- wisent/examples/scripts/results/test_codexglue_code_to_text_php_pairs.json +0 -8
- wisent/examples/scripts/results/test_codexglue_code_to_text_python_evaluation.json +0 -30
- wisent/examples/scripts/results/test_codexglue_code_to_text_python_pairs.json +0 -8
- wisent/examples/scripts/results/test_codexglue_code_to_text_ruby_evaluation.json +0 -30
- wisent/examples/scripts/results/test_codexglue_code_to_text_ruby_pairs.json +0 -8
- wisent/examples/scripts/results/test_coedit_gec_evaluation.json +0 -30
- wisent/examples/scripts/results/test_coedit_gec_pairs.json +0 -8
- wisent/examples/scripts/results/test_cola_evaluation.json +0 -30
- wisent/examples/scripts/results/test_cola_pairs.json +0 -8
- wisent/examples/scripts/results/test_commonsense_qa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_commonsense_qa_pairs.json +0 -8
- wisent/examples/scripts/results/test_conala_evaluation.json +0 -30
- wisent/examples/scripts/results/test_conala_pairs.json +0 -8
- wisent/examples/scripts/results/test_concode_evaluation.json +0 -30
- wisent/examples/scripts/results/test_concode_pairs.json +0 -8
- wisent/examples/scripts/results/test_copa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_copa_pairs.json +0 -8
- wisent/examples/scripts/results/test_copal_id_evaluation.json +0 -30
- wisent/examples/scripts/results/test_copal_id_pairs.json +0 -8
- wisent/examples/scripts/results/test_coqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_coqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_coqcat_evaluation.json +0 -30
- wisent/examples/scripts/results/test_coqcat_pairs.json +0 -8
- wisent/examples/scripts/results/test_crows_pairs_evaluation.json +0 -51
- wisent/examples/scripts/results/test_crows_pairs_pairs.json +0 -14
- wisent/examples/scripts/results/test_csatqa_evaluation.json +0 -51
- wisent/examples/scripts/results/test_csatqa_pairs.json +0 -14
- wisent/examples/scripts/results/test_cycle_letters_evaluation.json +0 -30
- wisent/examples/scripts/results/test_cycle_letters_pairs.json +0 -8
- wisent/examples/scripts/results/test_darija_bench/test_darija_bench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_darija_bench/test_darija_bench_pairs.json +0 -14
- wisent/examples/scripts/results/test_darija_bench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_darija_bench_pairs.json +0 -14
- wisent/examples/scripts/results/test_darijahellaswag_evaluation.json +0 -30
- wisent/examples/scripts/results/test_darijahellaswag_pairs.json +0 -8
- wisent/examples/scripts/results/test_darijammlu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_darijammlu_pairs.json +0 -14
- wisent/examples/scripts/results/test_dbpedia_14_evaluation.json +0 -30
- wisent/examples/scripts/results/test_dbpedia_14_pairs.json +0 -8
- wisent/examples/scripts/results/test_drop_evaluation.json +0 -30
- wisent/examples/scripts/results/test_drop_pairs.json +0 -8
- wisent/examples/scripts/results/test_ds1000_evaluation.json +0 -30
- wisent/examples/scripts/results/test_ds1000_pairs.json +0 -8
- wisent/examples/scripts/results/test_egyhellaswag_evaluation.json +0 -30
- wisent/examples/scripts/results/test_egyhellaswag_pairs.json +0 -8
- wisent/examples/scripts/results/test_egymmlu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_egymmlu_pairs.json +0 -14
- wisent/examples/scripts/results/test_epec_koref_bin_evaluation.json +0 -30
- wisent/examples/scripts/results/test_epec_koref_bin_pairs.json +0 -8
- wisent/examples/scripts/results/test_eq_bench_evaluation.json +0 -30
- wisent/examples/scripts/results/test_eq_bench_pairs.json +0 -8
- wisent/examples/scripts/results/test_escola_evaluation.json +0 -30
- wisent/examples/scripts/results/test_escola_pairs.json +0 -8
- wisent/examples/scripts/results/test_ethics_cm_evaluation.json +0 -30
- wisent/examples/scripts/results/test_ethics_cm_pairs.json +0 -8
- wisent/examples/scripts/results/test_ethos_binary_evaluation.json +0 -30
- wisent/examples/scripts/results/test_ethos_binary_pairs.json +0 -8
- wisent/examples/scripts/results/test_eus_exams/test_eus_exams_evaluation.json +0 -51
- wisent/examples/scripts/results/test_eus_exams/test_eus_exams_pairs.json +0 -14
- wisent/examples/scripts/results/test_eus_exams_es_evaluation.json +0 -51
- wisent/examples/scripts/results/test_eus_exams_es_pairs.json +0 -14
- wisent/examples/scripts/results/test_eus_exams_evaluation.json +0 -51
- wisent/examples/scripts/results/test_eus_exams_pairs.json +0 -14
- wisent/examples/scripts/results/test_eus_proficiency_evaluation.json +0 -30
- wisent/examples/scripts/results/test_eus_proficiency_pairs.json +0 -8
- wisent/examples/scripts/results/test_eus_reading_evaluation.json +0 -30
- wisent/examples/scripts/results/test_eus_reading_pairs.json +0 -8
- wisent/examples/scripts/results/test_eus_trivia_evaluation.json +0 -30
- wisent/examples/scripts/results/test_eus_trivia_pairs.json +0 -8
- wisent/examples/scripts/results/test_evalita-mp_evaluation.json +0 -51
- wisent/examples/scripts/results/test_evalita-mp_pairs.json +0 -14
- wisent/examples/scripts/results/test_evalita-sp_sum_task_fp-small_p1_evaluation.json +0 -30
- wisent/examples/scripts/results/test_evalita-sp_sum_task_fp-small_p1_pairs.json +0 -8
- wisent/examples/scripts/results/test_evalita_LLM_evaluation.json +0 -51
- wisent/examples/scripts/results/test_evalita_LLM_pairs.json +0 -14
- wisent/examples/scripts/results/test_evalita_llm/test_evalita_llm_evaluation.json +0 -51
- wisent/examples/scripts/results/test_evalita_llm/test_evalita_llm_pairs.json +0 -14
- wisent/examples/scripts/results/test_evalita_mp/test_evalita-mp_te_prompt-1_evaluation.json +0 -30
- wisent/examples/scripts/results/test_evalita_mp/test_evalita-mp_te_prompt-1_pairs.json +0 -8
- wisent/examples/scripts/results/test_evalita_mp2/test_evalita_mp_evaluation.json +0 -51
- wisent/examples/scripts/results/test_evalita_mp2/test_evalita_mp_pairs.json +0 -14
- wisent/examples/scripts/results/test_evalita_sp2/test_evalita-sp_sum_task_fp-small_p1_evaluation.json +0 -30
- wisent/examples/scripts/results/test_evalita_sp2/test_evalita-sp_sum_task_fp-small_p1_pairs.json +0 -8
- wisent/examples/scripts/results/test_fda_evaluation.json +0 -30
- wisent/examples/scripts/results/test_fda_pairs.json +0 -8
- wisent/examples/scripts/results/test_financial_tweets_evaluation.json +0 -30
- wisent/examples/scripts/results/test_financial_tweets_pairs.json +0 -8
- wisent/examples/scripts/results/test_fld/test_fld_evaluation.json +0 -30
- wisent/examples/scripts/results/test_fld/test_fld_pairs.json +0 -8
- wisent/examples/scripts/results/test_fld_evaluation.json +0 -30
- wisent/examples/scripts/results/test_fld_fixed/test_fld_evaluation.json +0 -30
- wisent/examples/scripts/results/test_fld_fixed/test_fld_pairs.json +0 -8
- wisent/examples/scripts/results/test_fld_pairs.json +0 -8
- wisent/examples/scripts/results/test_flores_evaluation.json +0 -51
- wisent/examples/scripts/results/test_flores_pairs.json +0 -14
- wisent/examples/scripts/results/test_freebase_evaluation.json +0 -30
- wisent/examples/scripts/results/test_freebase_pairs.json +0 -8
- wisent/examples/scripts/results/test_french_bench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_french_bench_pairs.json +0 -14
- wisent/examples/scripts/results/test_galcola_evaluation.json +0 -30
- wisent/examples/scripts/results/test_galcola_pairs.json +0 -8
- wisent/examples/scripts/results/test_galician_bench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_galician_bench_pairs.json +0 -14
- wisent/examples/scripts/results/test_glianorex_evaluation.json +0 -30
- wisent/examples/scripts/results/test_glianorex_pairs.json +0 -8
- wisent/examples/scripts/results/test_global_mmlu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_global_mmlu_pairs.json +0 -14
- wisent/examples/scripts/results/test_glue_evaluation.json +0 -51
- wisent/examples/scripts/results/test_glue_pairs.json +0 -14
- wisent/examples/scripts/results/test_gpqa_evaluation.json +0 -51
- wisent/examples/scripts/results/test_gpqa_pairs.json +0 -14
- wisent/examples/scripts/results/test_gpt3_translation_benchmarks_evaluation.json +0 -51
- wisent/examples/scripts/results/test_gpt3_translation_benchmarks_pairs.json +0 -14
- wisent/examples/scripts/results/test_groundcocoa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_groundcocoa_pairs.json +0 -8
- wisent/examples/scripts/results/test_gsm8k_evaluation.json +0 -30
- wisent/examples/scripts/results/test_gsm8k_pairs.json +0 -8
- wisent/examples/scripts/results/test_haerae_evaluation.json +0 -51
- wisent/examples/scripts/results/test_haerae_pairs.json +0 -14
- wisent/examples/scripts/results/test_headqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_headqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_hellaswag_evaluation.json +0 -30
- wisent/examples/scripts/results/test_hellaswag_pairs.json +0 -8
- wisent/examples/scripts/results/test_hendrycks_ethics_evaluation.json +0 -51
- wisent/examples/scripts/results/test_hendrycks_ethics_pairs.json +0 -14
- wisent/examples/scripts/results/test_hendrycks_math_evaluation.json +0 -51
- wisent/examples/scripts/results/test_hendrycks_math_pairs.json +0 -14
- wisent/examples/scripts/results/test_histoires_morales_evaluation.json +0 -30
- wisent/examples/scripts/results/test_histoires_morales_pairs.json +0 -8
- wisent/examples/scripts/results/test_hmmt_evaluation.json +0 -30
- wisent/examples/scripts/results/test_hmmt_feb_2025_evaluation.json +0 -30
- wisent/examples/scripts/results/test_hmmt_feb_2025_pairs.json +0 -8
- wisent/examples/scripts/results/test_hmmt_pairs.json +0 -8
- wisent/examples/scripts/results/test_hrm8k_evaluation.json +0 -51
- wisent/examples/scripts/results/test_hrm8k_pairs.json +0 -14
- wisent/examples/scripts/results/test_humaneval_evaluation.json +0 -30
- wisent/examples/scripts/results/test_humaneval_pairs.json +0 -8
- wisent/examples/scripts/results/test_humaneval_plus_evaluation.json +0 -30
- wisent/examples/scripts/results/test_humaneval_plus_pairs.json +0 -8
- wisent/examples/scripts/results/test_ifeval_evaluation.json +0 -30
- wisent/examples/scripts/results/test_ifeval_pairs.json +0 -8
- wisent/examples/scripts/results/test_instruct_humaneval/test_instruct_humaneval_evaluation.json +0 -30
- wisent/examples/scripts/results/test_instruct_humaneval/test_instruct_humaneval_pairs.json +0 -8
- wisent/examples/scripts/results/test_instruct_humaneval_evaluation.json +0 -30
- wisent/examples/scripts/results/test_instruct_humaneval_pairs.json +0 -8
- wisent/examples/scripts/results/test_inverse_scaling_evaluation.json +0 -51
- wisent/examples/scripts/results/test_inverse_scaling_hindsight_neglect_10shot_evaluation.json +0 -30
- wisent/examples/scripts/results/test_inverse_scaling_hindsight_neglect_10shot_pairs.json +0 -8
- wisent/examples/scripts/results/test_inverse_scaling_mc/test_inverse_scaling_mc_evaluation.json +0 -51
- wisent/examples/scripts/results/test_inverse_scaling_mc/test_inverse_scaling_mc_pairs.json +0 -14
- wisent/examples/scripts/results/test_inverse_scaling_pairs.json +0 -14
- wisent/examples/scripts/results/test_iwslt2017-ar-en_evaluation.json +0 -30
- wisent/examples/scripts/results/test_iwslt2017-ar-en_pairs.json +0 -8
- wisent/examples/scripts/results/test_iwslt2017-en-ar_evaluation.json +0 -30
- wisent/examples/scripts/results/test_iwslt2017-en-ar_pairs.json +0 -8
- wisent/examples/scripts/results/test_iwslt2017_ar_en/test_iwslt2017-ar-en_evaluation.json +0 -30
- wisent/examples/scripts/results/test_iwslt2017_ar_en/test_iwslt2017-ar-en_pairs.json +0 -8
- wisent/examples/scripts/results/test_iwslt2017_en_ar/test_iwslt2017-en-ar_evaluation.json +0 -30
- wisent/examples/scripts/results/test_iwslt2017_en_ar/test_iwslt2017-en-ar_pairs.json +0 -8
- wisent/examples/scripts/results/test_iwslt2017_group/test_iwslt2017_evaluation.json +0 -30
- wisent/examples/scripts/results/test_iwslt2017_group/test_iwslt2017_pairs.json +0 -8
- wisent/examples/scripts/results/test_japanese_leaderboard_evaluation.json +0 -51
- wisent/examples/scripts/results/test_japanese_leaderboard_pairs.json +0 -14
- wisent/examples/scripts/results/test_jsonschema_bench/test_jsonschema_bench_evaluation.json +0 -30
- wisent/examples/scripts/results/test_jsonschema_bench/test_jsonschema_bench_pairs.json +0 -8
- wisent/examples/scripts/results/test_jsonschema_bench_evaluation.json +0 -30
- wisent/examples/scripts/results/test_jsonschema_bench_final/test_jsonschema_bench_evaluation.json +0 -30
- wisent/examples/scripts/results/test_jsonschema_bench_final/test_jsonschema_bench_pairs.json +0 -8
- wisent/examples/scripts/results/test_jsonschema_bench_pairs.json +0 -8
- wisent/examples/scripts/results/test_kbl_evaluation.json +0 -51
- wisent/examples/scripts/results/test_kbl_fixed/test_kbl_evaluation.json +0 -51
- wisent/examples/scripts/results/test_kbl_fixed/test_kbl_pairs.json +0 -14
- wisent/examples/scripts/results/test_kbl_pairs.json +0 -14
- wisent/examples/scripts/results/test_kmmlu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_kmmlu_pairs.json +0 -14
- wisent/examples/scripts/results/test_kobest_evaluation.json +0 -51
- wisent/examples/scripts/results/test_kobest_pairs.json +0 -14
- wisent/examples/scripts/results/test_kormedmcqa/test_kormedmcqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_kormedmcqa/test_kormedmcqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_kormedmcqa_dentist/test_kormedmcqa_dentist_evaluation.json +0 -30
- wisent/examples/scripts/results/test_kormedmcqa_dentist/test_kormedmcqa_dentist_pairs.json +0 -8
- wisent/examples/scripts/results/test_kormedmcqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_kormedmcqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_lambada_cloze_evaluation.json +0 -30
- wisent/examples/scripts/results/test_lambada_cloze_pairs.json +0 -8
- wisent/examples/scripts/results/test_lambada_evaluation.json +0 -30
- wisent/examples/scripts/results/test_lambada_final/test_lambada_openai_mt_stablelm_en_evaluation.json +0 -30
- wisent/examples/scripts/results/test_lambada_final/test_lambada_openai_mt_stablelm_en_pairs.json +0 -8
- wisent/examples/scripts/results/test_lambada_multilingual/test_lambada_multilingual_evaluation.json +0 -51
- wisent/examples/scripts/results/test_lambada_multilingual/test_lambada_multilingual_pairs.json +0 -14
- wisent/examples/scripts/results/test_lambada_multilingual_evaluation.json +0 -51
- wisent/examples/scripts/results/test_lambada_multilingual_pairs.json +0 -14
- wisent/examples/scripts/results/test_lambada_multilingual_stablelm_evaluation.json +0 -51
- wisent/examples/scripts/results/test_lambada_multilingual_stablelm_pairs.json +0 -14
- wisent/examples/scripts/results/test_lambada_openai_evaluation.json +0 -30
- wisent/examples/scripts/results/test_lambada_openai_pairs.json +0 -8
- wisent/examples/scripts/results/test_lambada_pairs.json +0 -8
- wisent/examples/scripts/results/test_lambada_stablelm_en_fixed/test_lambada_openai_mt_stablelm_en_evaluation.json +0 -30
- wisent/examples/scripts/results/test_lambada_stablelm_en_fixed/test_lambada_openai_mt_stablelm_en_pairs.json +0 -8
- wisent/examples/scripts/results/test_lambada_stablelm_fixed/test_lambada_openai_mt_stablelm_en_evaluation.json +0 -30
- wisent/examples/scripts/results/test_lambada_stablelm_fixed/test_lambada_openai_mt_stablelm_en_pairs.json +0 -8
- wisent/examples/scripts/results/test_lambada_standard_evaluation.json +0 -30
- wisent/examples/scripts/results/test_lambada_standard_pairs.json +0 -8
- wisent/examples/scripts/results/test_leaderboard_evaluation.json +0 -51
- wisent/examples/scripts/results/test_leaderboard_pairs.json +0 -14
- wisent/examples/scripts/results/test_libra/test_libra_evaluation.json +0 -51
- wisent/examples/scripts/results/test_libra/test_libra_pairs.json +0 -14
- wisent/examples/scripts/results/test_libra_evaluation.json +0 -51
- wisent/examples/scripts/results/test_libra_pairs.json +0 -14
- wisent/examples/scripts/results/test_lingoly_evaluation.json +0 -30
- wisent/examples/scripts/results/test_lingoly_pairs.json +0 -8
- wisent/examples/scripts/results/test_livecodebench_evaluation.json +0 -30
- wisent/examples/scripts/results/test_livecodebench_pairs.json +0 -8
- wisent/examples/scripts/results/test_livemathbench_cnmo_en_evaluation.json +0 -30
- wisent/examples/scripts/results/test_livemathbench_cnmo_en_pairs.json +0 -8
- wisent/examples/scripts/results/test_livemathbench_cnmo_zh_evaluation.json +0 -30
- wisent/examples/scripts/results/test_livemathbench_cnmo_zh_pairs.json +0 -8
- wisent/examples/scripts/results/test_llama_evaluation.json +0 -30
- wisent/examples/scripts/results/test_llama_pairs.json +0 -8
- wisent/examples/scripts/results/test_logiqa2_evaluation.json +0 -30
- wisent/examples/scripts/results/test_logiqa2_pairs.json +0 -8
- wisent/examples/scripts/results/test_logiqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_logiqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_m_mmlu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_m_mmlu_pairs.json +0 -14
- wisent/examples/scripts/results/test_mastermind/test_mastermind_evaluation.json +0 -51
- wisent/examples/scripts/results/test_mastermind/test_mastermind_pairs.json +0 -14
- wisent/examples/scripts/results/test_mastermind_24_easy/test_mastermind_24_easy_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mastermind_24_easy/test_mastermind_24_easy_pairs.json +0 -8
- wisent/examples/scripts/results/test_mastermind_evaluation.json +0 -51
- wisent/examples/scripts/results/test_mastermind_pairs.json +0 -14
- wisent/examples/scripts/results/test_math500_evaluation.json +0 -30
- wisent/examples/scripts/results/test_math500_pairs.json +0 -8
- wisent/examples/scripts/results/test_math_evaluation.json +0 -30
- wisent/examples/scripts/results/test_math_pairs.json +0 -8
- wisent/examples/scripts/results/test_mathqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mathqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_mbpp_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mbpp_pairs.json +0 -8
- wisent/examples/scripts/results/test_mbpp_plus_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mbpp_plus_pairs.json +0 -8
- wisent/examples/scripts/results/test_mc_taco_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mc_taco_pairs.json +0 -8
- wisent/examples/scripts/results/test_med_concepts_qa/test_med_concepts_qa_evaluation.json +0 -51
- wisent/examples/scripts/results/test_med_concepts_qa/test_med_concepts_qa_pairs.json +0 -14
- wisent/examples/scripts/results/test_med_concepts_qa_atc_easy/test_med_concepts_qa_atc_easy_evaluation.json +0 -30
- wisent/examples/scripts/results/test_med_concepts_qa_atc_easy/test_med_concepts_qa_atc_easy_pairs.json +0 -8
- wisent/examples/scripts/results/test_med_concepts_qa_evaluation.json +0 -51
- wisent/examples/scripts/results/test_med_concepts_qa_pairs.json +0 -14
- wisent/examples/scripts/results/test_meddialog_evaluation.json +0 -30
- wisent/examples/scripts/results/test_meddialog_pairs.json +0 -8
- wisent/examples/scripts/results/test_meddialog_raw_perplexity/test_meddialog_raw_perplexity_evaluation.json +0 -30
- wisent/examples/scripts/results/test_meddialog_raw_perplexity/test_meddialog_raw_perplexity_pairs.json +0 -8
- wisent/examples/scripts/results/test_mediqa_qa2019_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mediqa_qa2019_pairs.json +0 -8
- wisent/examples/scripts/results/test_medmcqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_medmcqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_medqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_medqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_medtext_evaluation.json +0 -30
- wisent/examples/scripts/results/test_medtext_pairs.json +0 -8
- wisent/examples/scripts/results/test_mela_evaluation.json +0 -51
- wisent/examples/scripts/results/test_mela_pairs.json +0 -14
- wisent/examples/scripts/results/test_meqsum_evaluation.json +0 -30
- wisent/examples/scripts/results/test_meqsum_pairs.json +0 -8
- wisent/examples/scripts/results/test_mercury_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mercury_pairs.json +0 -8
- wisent/examples/scripts/results/test_metabench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_metabench_pairs.json +0 -14
- wisent/examples/scripts/results/test_mgsm_evaluation.json +0 -51
- wisent/examples/scripts/results/test_mgsm_pairs.json +0 -14
- wisent/examples/scripts/results/test_mimic_repsum_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mimic_repsum_pairs.json +0 -8
- wisent/examples/scripts/results/test_minerva_math_evaluation.json +0 -51
- wisent/examples/scripts/results/test_minerva_math_pairs.json +0 -14
- wisent/examples/scripts/results/test_mlqa_evaluation.json +0 -51
- wisent/examples/scripts/results/test_mlqa_pairs.json +0 -14
- wisent/examples/scripts/results/test_mmlu-pro-plus_evaluation.json +0 -51
- wisent/examples/scripts/results/test_mmlu-pro-plus_pairs.json +0 -14
- wisent/examples/scripts/results/test_mmlu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_mmlu_pairs.json +0 -14
- wisent/examples/scripts/results/test_mmlu_pro_evaluation.json +0 -51
- wisent/examples/scripts/results/test_mmlu_pro_pairs.json +0 -14
- wisent/examples/scripts/results/test_mmlu_prox_evaluation.json +0 -51
- wisent/examples/scripts/results/test_mmlu_prox_pairs.json +0 -14
- wisent/examples/scripts/results/test_mmlusr_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mmlusr_pairs.json +0 -8
- wisent/examples/scripts/results/test_mmmu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_mmmu_pairs.json +0 -14
- wisent/examples/scripts/results/test_mnli_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mnli_pairs.json +0 -8
- wisent/examples/scripts/results/test_model_written_evals_evaluation.json +0 -51
- wisent/examples/scripts/results/test_model_written_evals_pairs.json +0 -14
- wisent/examples/scripts/results/test_moral_stories_evaluation.json +0 -30
- wisent/examples/scripts/results/test_moral_stories_pairs.json +0 -8
- wisent/examples/scripts/results/test_mts_dialog_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mts_dialog_pairs.json +0 -8
- wisent/examples/scripts/results/test_multiblimp_evaluation.json +0 -51
- wisent/examples/scripts/results/test_multiblimp_pairs.json +0 -14
- wisent/examples/scripts/results/test_multimedqa_evaluation.json +0 -51
- wisent/examples/scripts/results/test_multimedqa_pairs.json +0 -14
- wisent/examples/scripts/results/test_multipl_e_evaluation.json +0 -30
- wisent/examples/scripts/results/test_multipl_e_pairs.json +0 -8
- wisent/examples/scripts/results/test_mutual_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mutual_pairs.json +0 -8
- wisent/examples/scripts/results/test_non_greedy_robustness_agieval_aqua_rat_evaluation.json +0 -30
- wisent/examples/scripts/results/test_non_greedy_robustness_agieval_aqua_rat_pairs.json +0 -8
- wisent/examples/scripts/results/test_noreval_evaluation.json +0 -51
- wisent/examples/scripts/results/test_noreval_pairs.json +0 -14
- wisent/examples/scripts/results/test_noticia_evaluation.json +0 -30
- wisent/examples/scripts/results/test_noticia_pairs.json +0 -8
- wisent/examples/scripts/results/test_nq_open_evaluation.json +0 -30
- wisent/examples/scripts/results/test_nq_open_pairs.json +0 -8
- wisent/examples/scripts/results/test_olaph_evaluation.json +0 -30
- wisent/examples/scripts/results/test_olaph_pairs.json +0 -8
- wisent/examples/scripts/results/test_openbookqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_openbookqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_openllm_evaluation.json +0 -51
- wisent/examples/scripts/results/test_openllm_pairs.json +0 -14
- wisent/examples/scripts/results/test_option_order_robustness_agieval_aqua_rat_evaluation.json +0 -30
- wisent/examples/scripts/results/test_option_order_robustness_agieval_aqua_rat_pairs.json +0 -8
- wisent/examples/scripts/results/test_paloma_evaluation.json +0 -51
- wisent/examples/scripts/results/test_paloma_pairs.json +0 -14
- wisent/examples/scripts/results/test_passkey/test_passkey_evaluation.json +0 -30
- wisent/examples/scripts/results/test_passkey/test_passkey_pairs.json +0 -8
- wisent/examples/scripts/results/test_paws-x_evaluation.json +0 -51
- wisent/examples/scripts/results/test_paws-x_pairs.json +0 -14
- wisent/examples/scripts/results/test_paws_en/test_paws_en_evaluation.json +0 -30
- wisent/examples/scripts/results/test_paws_en/test_paws_en_pairs.json +0 -8
- wisent/examples/scripts/results/test_penn_treebank_evaluation.json +0 -30
- wisent/examples/scripts/results/test_penn_treebank_pairs.json +0 -8
- wisent/examples/scripts/results/test_pile_10k/test_pile_10k_evaluation.json +0 -30
- wisent/examples/scripts/results/test_pile_10k/test_pile_10k_pairs.json +0 -8
- wisent/examples/scripts/results/test_piqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_piqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_polemo2_evaluation.json +0 -30
- wisent/examples/scripts/results/test_polemo2_pairs.json +0 -8
- wisent/examples/scripts/results/test_polymath_en_high_evaluation.json +0 -30
- wisent/examples/scripts/results/test_polymath_en_high_pairs.json +0 -8
- wisent/examples/scripts/results/test_polymath_en_medium_evaluation.json +0 -30
- wisent/examples/scripts/results/test_polymath_en_medium_pairs.json +0 -8
- wisent/examples/scripts/results/test_polymath_zh_high_evaluation.json +0 -30
- wisent/examples/scripts/results/test_polymath_zh_high_pairs.json +0 -8
- wisent/examples/scripts/results/test_polymath_zh_medium_evaluation.json +0 -30
- wisent/examples/scripts/results/test_polymath_zh_medium_pairs.json +0 -8
- wisent/examples/scripts/results/test_portuguese_bench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_portuguese_bench_pairs.json +0 -14
- wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat/test_prompt_robustness_agieval_aqua_rat_evaluation.json +0 -30
- wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat/test_prompt_robustness_agieval_aqua_rat_pairs.json +0 -8
- wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat_evaluation.json +0 -30
- wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat_pairs.json +0 -8
- wisent/examples/scripts/results/test_prost_evaluation.json +0 -30
- wisent/examples/scripts/results/test_prost_pairs.json +0 -8
- wisent/examples/scripts/results/test_ptb_evaluation.json +0 -30
- wisent/examples/scripts/results/test_ptb_pairs.json +0 -8
- wisent/examples/scripts/results/test_pubmedqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_pubmedqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_pythia_evaluation.json +0 -51
- wisent/examples/scripts/results/test_pythia_pairs.json +0 -14
- wisent/examples/scripts/results/test_qa4mre_evaluation.json +0 -30
- wisent/examples/scripts/results/test_qa4mre_pairs.json +0 -8
- wisent/examples/scripts/results/test_qasper_evaluation.json +0 -30
- wisent/examples/scripts/results/test_qasper_pairs.json +0 -8
- wisent/examples/scripts/results/test_race_evaluation.json +0 -30
- wisent/examples/scripts/results/test_race_pairs.json +0 -8
- wisent/examples/scripts/results/test_realtoxicityprompts_evaluation.json +0 -30
- wisent/examples/scripts/results/test_realtoxicityprompts_pairs.json +0 -8
- wisent/examples/scripts/results/test_recode_evaluation.json +0 -30
- wisent/examples/scripts/results/test_recode_pairs.json +0 -8
- wisent/examples/scripts/results/test_record_evaluation.json +0 -30
- wisent/examples/scripts/results/test_record_pairs.json +0 -8
- wisent/examples/scripts/results/test_ruler_evaluation.json +0 -51
- wisent/examples/scripts/results/test_ruler_pairs.json +0 -14
- wisent/examples/scripts/results/test_sciq_evaluation.json +0 -30
- wisent/examples/scripts/results/test_sciq_pairs.json +0 -8
- wisent/examples/scripts/results/test_score_evaluation.json +0 -51
- wisent/examples/scripts/results/test_score_pairs.json +0 -14
- wisent/examples/scripts/results/test_self_consistency_evaluation.json +0 -30
- wisent/examples/scripts/results/test_self_consistency_pairs.json +0 -8
- wisent/examples/scripts/results/test_siqa/test_siqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_siqa/test_siqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_siqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_siqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_spanish_bench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_spanish_bench_pairs.json +0 -14
- wisent/examples/scripts/results/test_squad2_evaluation.json +0 -30
- wisent/examples/scripts/results/test_squad2_pairs.json +0 -8
- wisent/examples/scripts/results/test_squadv2_evaluation.json +0 -30
- wisent/examples/scripts/results/test_squadv2_pairs.json +0 -8
- wisent/examples/scripts/results/test_super-glue-lm-eval-v1-seq2seq_evaluation.json +0 -30
- wisent/examples/scripts/results/test_super-glue-lm-eval-v1-seq2seq_pairs.json +0 -8
- wisent/examples/scripts/results/test_super-glue-lm-eval-v1_evaluation.json +0 -51
- wisent/examples/scripts/results/test_super-glue-lm-eval-v1_pairs.json +0 -14
- wisent/examples/scripts/results/test_swag_evaluation.json +0 -30
- wisent/examples/scripts/results/test_swag_pairs.json +0 -8
- wisent/examples/scripts/results/test_tinyBenchmarks_evaluation.json +0 -51
- wisent/examples/scripts/results/test_tinyBenchmarks_pairs.json +0 -14
- wisent/examples/scripts/results/test_tmmluplus_evaluation.json +0 -51
- wisent/examples/scripts/results/test_tmmluplus_pairs.json +0 -14
- wisent/examples/scripts/results/test_translation_evaluation.json +0 -51
- wisent/examples/scripts/results/test_translation_pairs.json +0 -14
- wisent/examples/scripts/results/test_triviaqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_triviaqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_truthfulqa-multi_evaluation.json +0 -51
- wisent/examples/scripts/results/test_truthfulqa-multi_pairs.json +0 -14
- wisent/examples/scripts/results/test_truthfulqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_truthfulqa_mc1_evaluation.json +0 -30
- wisent/examples/scripts/results/test_truthfulqa_mc1_pairs.json +0 -8
- wisent/examples/scripts/results/test_truthfulqa_mc2_evaluation.json +0 -30
- wisent/examples/scripts/results/test_truthfulqa_mc2_pairs.json +0 -8
- wisent/examples/scripts/results/test_truthfulqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_turkishmmlu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_turkishmmlu_pairs.json +0 -14
- wisent/examples/scripts/results/test_unfair_tos_evaluation.json +0 -30
- wisent/examples/scripts/results/test_unfair_tos_pairs.json +0 -8
- wisent/examples/scripts/results/test_unscramble_evaluation.json +0 -51
- wisent/examples/scripts/results/test_unscramble_pairs.json +0 -14
- wisent/examples/scripts/results/test_webqs_evaluation.json +0 -30
- wisent/examples/scripts/results/test_webqs_pairs.json +0 -8
- wisent/examples/scripts/results/test_wikitext103_evaluation.json +0 -30
- wisent/examples/scripts/results/test_wikitext103_pairs.json +0 -8
- wisent/examples/scripts/results/test_wikitext_evaluation.json +0 -30
- wisent/examples/scripts/results/test_wikitext_pairs.json +0 -8
- wisent/examples/scripts/results/test_winogender_evaluation.json +0 -51
- wisent/examples/scripts/results/test_winogender_pairs.json +0 -14
- wisent/examples/scripts/results/test_winogrande_evaluation.json +0 -30
- wisent/examples/scripts/results/test_winogrande_pairs.json +0 -8
- wisent/examples/scripts/results/test_wmdp_evaluation.json +0 -30
- wisent/examples/scripts/results/test_wmdp_pairs.json +0 -8
- wisent/examples/scripts/results/test_wmt-ro-en-t5-prompt_evaluation.json +0 -30
- wisent/examples/scripts/results/test_wmt-ro-en-t5-prompt_pairs.json +0 -8
- wisent/examples/scripts/results/test_wmt14_en_fr_evaluation.json +0 -30
- wisent/examples/scripts/results/test_wmt14_en_fr_pairs.json +0 -8
- wisent/examples/scripts/results/test_wmt16_en_de_evaluation.json +0 -30
- wisent/examples/scripts/results/test_wmt16_en_de_pairs.json +0 -8
- wisent/examples/scripts/results/test_wmt16_ro_en_evaluation.json +0 -30
- wisent/examples/scripts/results/test_wmt16_ro_en_pairs.json +0 -8
- wisent/examples/scripts/results/test_wsc273_evaluation.json +0 -30
- wisent/examples/scripts/results/test_wsc273_pairs.json +0 -8
- wisent/examples/scripts/results/test_xcopa_evaluation.json +0 -51
- wisent/examples/scripts/results/test_xcopa_pairs.json +0 -14
- wisent/examples/scripts/results/test_xnli_eu_evaluation.json +0 -30
- wisent/examples/scripts/results/test_xnli_eu_pairs.json +0 -8
- wisent/examples/scripts/results/test_xnli_evaluation.json +0 -51
- wisent/examples/scripts/results/test_xnli_pairs.json +0 -14
- wisent/examples/scripts/results/test_xquad_evaluation.json +0 -51
- wisent/examples/scripts/results/test_xquad_pairs.json +0 -14
- wisent/examples/scripts/results/test_xstorycloze_evaluation.json +0 -51
- wisent/examples/scripts/results/test_xstorycloze_pairs.json +0 -14
- wisent/examples/scripts/results/test_xsum_evaluation.json +0 -30
- wisent/examples/scripts/results/test_xsum_pairs.json +0 -8
- wisent/examples/scripts/results/test_xwinograd_evaluation.json +0 -51
- wisent/examples/scripts/results/test_xwinograd_pairs.json +0 -14
- wisent/examples/scripts/results/test_yahoo_answers_topics_evaluation.json +0 -30
- wisent/examples/scripts/results/test_yahoo_answers_topics_pairs.json +0 -8
- {wisent-0.7.379.dist-info → wisent-0.7.901.dist-info}/WHEEL +0 -0
- {wisent-0.7.379.dist-info → wisent-0.7.901.dist-info}/entry_points.txt +0 -0
- {wisent-0.7.379.dist-info → wisent-0.7.901.dist-info}/licenses/LICENSE +0 -0
- {wisent-0.7.379.dist-info → wisent-0.7.901.dist-info}/top_level.txt +0 -0
|
@@ -1,14 +0,0 @@
|
|
|
1
|
-
[
|
|
2
|
-
{
|
|
3
|
-
"pair_id": 0,
|
|
4
|
-
"prompt": "Question: Identify the only construct that is not pertinent to developmental models on intelligence:\nA. The positive manifold\nB. Investment theory",
|
|
5
|
-
"positive_response": "Investment theory",
|
|
6
|
-
"negative_response": "The positive manifold"
|
|
7
|
-
},
|
|
8
|
-
{
|
|
9
|
-
"pair_id": 1,
|
|
10
|
-
"prompt": "Question: A 17-year-old girl is brought to the physician by her mother because she has not had a menstrual period for 6 months. The patient is unconcerned about the lack of menses. Menarche occurred at the age of 12 years, and menses had occurred at regular 28-day intervals until they became irregular 1 year ago. She is a member of her high school gymnastics team. She appears emaciated. She is 163 cm (5 ft 4 in) tall and weighs 40 kg (88 lb); BMI is 15 kg/m2 . Her pulse is 54/min, and blood pressure is 80/50 mm Hg. Which of the following is the most likely cause of this patient's amenorrhea?\nA. Hypothyroidism\nB. Hypogonadotropic hypogonadism",
|
|
11
|
-
"positive_response": "Hypogonadotropic hypogonadism",
|
|
12
|
-
"negative_response": "Hypothyroidism"
|
|
13
|
-
}
|
|
14
|
-
]
|
|
@@ -1,51 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"task_name": "bertaqa",
|
|
3
|
-
"model_name": "mock",
|
|
4
|
-
"evaluator_name": "log_likelihoods",
|
|
5
|
-
"num_pairs": 2,
|
|
6
|
-
"all_correct": true,
|
|
7
|
-
"pairs": [
|
|
8
|
-
{
|
|
9
|
-
"pair_id": 0,
|
|
10
|
-
"prompt": "Question: Will the French league be cancelled?\nA. 1953\nB. 1952...",
|
|
11
|
-
"positive_response": "1952",
|
|
12
|
-
"negative_response": "1953",
|
|
13
|
-
"positive_evaluation": {
|
|
14
|
-
"ground_truth": "TRUTHFUL",
|
|
15
|
-
"confidence": 1.0,
|
|
16
|
-
"expected": "TRUTHFUL",
|
|
17
|
-
"correct": true,
|
|
18
|
-
"details": "Predicted: '1952' (log_prob=-0.500), Expected: '1952'"
|
|
19
|
-
},
|
|
20
|
-
"negative_evaluation": {
|
|
21
|
-
"ground_truth": "UNTRUTHFUL",
|
|
22
|
-
"confidence": 0.0,
|
|
23
|
-
"expected": "UNTRUTHFUL",
|
|
24
|
-
"correct": true,
|
|
25
|
-
"details": "Predicted: '1952' (log_prob=-0.500), Expected: '1953'"
|
|
26
|
-
},
|
|
27
|
-
"both_correct": true
|
|
28
|
-
},
|
|
29
|
-
{
|
|
30
|
-
"pair_id": 1,
|
|
31
|
-
"prompt": "Question: When was the first sound film in Basque made?\nA. In the 1960s\nB. In the 1930s...",
|
|
32
|
-
"positive_response": "In the 1930s",
|
|
33
|
-
"negative_response": "In the 1960s",
|
|
34
|
-
"positive_evaluation": {
|
|
35
|
-
"ground_truth": "TRUTHFUL",
|
|
36
|
-
"confidence": 1.0,
|
|
37
|
-
"expected": "TRUTHFUL",
|
|
38
|
-
"correct": true,
|
|
39
|
-
"details": "Predicted: 'In the 1930s' (log_prob=-0.500), Expected: 'In the 1930s'"
|
|
40
|
-
},
|
|
41
|
-
"negative_evaluation": {
|
|
42
|
-
"ground_truth": "UNTRUTHFUL",
|
|
43
|
-
"confidence": 0.0,
|
|
44
|
-
"expected": "UNTRUTHFUL",
|
|
45
|
-
"correct": true,
|
|
46
|
-
"details": "Predicted: 'In the 1930s' (log_prob=-0.500), Expected: 'In the 1960s'"
|
|
47
|
-
},
|
|
48
|
-
"both_correct": true
|
|
49
|
-
}
|
|
50
|
-
]
|
|
51
|
-
}
|
|
@@ -1,14 +0,0 @@
|
|
|
1
|
-
[
|
|
2
|
-
{
|
|
3
|
-
"pair_id": 0,
|
|
4
|
-
"prompt": "Question: Will the French league be cancelled?\nA. 1953\nB. 1952",
|
|
5
|
-
"positive_response": "1952",
|
|
6
|
-
"negative_response": "1953"
|
|
7
|
-
},
|
|
8
|
-
{
|
|
9
|
-
"pair_id": 1,
|
|
10
|
-
"prompt": "Question: When was the first sound film in Basque made?\nA. In the 1960s\nB. In the 1930s",
|
|
11
|
-
"positive_response": "In the 1930s",
|
|
12
|
-
"negative_response": "In the 1960s"
|
|
13
|
-
}
|
|
14
|
-
]
|
|
@@ -1,30 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"task_name": "bhtc_v2",
|
|
3
|
-
"model_name": "mock",
|
|
4
|
-
"evaluator_name": "log_likelihoods",
|
|
5
|
-
"num_pairs": 1,
|
|
6
|
-
"all_correct": true,
|
|
7
|
-
"pairs": [
|
|
8
|
-
{
|
|
9
|
-
"pair_id": 0,
|
|
10
|
-
"prompt": "Text: Londresko Museoko ikerlari talde batek II. mendeko hilerri zahar bateko 22 gizakiren arrastoak...",
|
|
11
|
-
"positive_response": "Historia",
|
|
12
|
-
"negative_response": "Ingurumena",
|
|
13
|
-
"positive_evaluation": {
|
|
14
|
-
"ground_truth": "TRUTHFUL",
|
|
15
|
-
"confidence": 1.0,
|
|
16
|
-
"expected": "TRUTHFUL",
|
|
17
|
-
"correct": true,
|
|
18
|
-
"details": "Predicted: 'Historia' (log_prob=-0.500), Expected: 'Historia'"
|
|
19
|
-
},
|
|
20
|
-
"negative_evaluation": {
|
|
21
|
-
"ground_truth": "UNTRUTHFUL",
|
|
22
|
-
"confidence": 0.0,
|
|
23
|
-
"expected": "UNTRUTHFUL",
|
|
24
|
-
"correct": true,
|
|
25
|
-
"details": "Predicted: 'Historia' (log_prob=-0.500), Expected: 'Ingurumena'"
|
|
26
|
-
},
|
|
27
|
-
"both_correct": true
|
|
28
|
-
}
|
|
29
|
-
]
|
|
30
|
-
}
|
|
@@ -1,8 +0,0 @@
|
|
|
1
|
-
[
|
|
2
|
-
{
|
|
3
|
-
"pair_id": 0,
|
|
4
|
-
"prompt": "Text: Londresko Museoko ikerlari talde batek II. mendeko hilerri zahar bateko 22 gizakiren arrastoak aztertu ditu eta, eskeletoen morfologiari erreparatuta, horietako bi asiarrak, ziurrenik txinatarrak, zirela ondorioztatu dute..\nQuestion: What is the topic of the above text?\nA. Ingurumena\nB. Historia",
|
|
5
|
-
"positive_response": "Historia",
|
|
6
|
-
"negative_response": "Ingurumena"
|
|
7
|
-
}
|
|
8
|
-
]
|
|
@@ -1,51 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"task_name": "bigbench",
|
|
3
|
-
"model_name": "mock",
|
|
4
|
-
"evaluator_name": "log_likelihoods",
|
|
5
|
-
"num_pairs": 2,
|
|
6
|
-
"all_correct": true,
|
|
7
|
-
"pairs": [
|
|
8
|
-
{
|
|
9
|
-
"pair_id": 0,
|
|
10
|
-
"prompt": "Determine whether given opinion is positive, negative, neutral or ambivalent in terms of its sentime...",
|
|
11
|
-
"positive_response": "ambivalent",
|
|
12
|
-
"negative_response": "positive",
|
|
13
|
-
"positive_evaluation": {
|
|
14
|
-
"ground_truth": "TRUTHFUL",
|
|
15
|
-
"confidence": 1.0,
|
|
16
|
-
"expected": "TRUTHFUL",
|
|
17
|
-
"correct": true,
|
|
18
|
-
"details": "Predicted: 'ambivalent' (log_prob=-0.500), Expected: 'ambivalent'"
|
|
19
|
-
},
|
|
20
|
-
"negative_evaluation": {
|
|
21
|
-
"ground_truth": "UNTRUTHFUL",
|
|
22
|
-
"confidence": 0.0,
|
|
23
|
-
"expected": "UNTRUTHFUL",
|
|
24
|
-
"correct": true,
|
|
25
|
-
"details": "Predicted: 'ambivalent' (log_prob=-0.500), Expected: 'positive'"
|
|
26
|
-
},
|
|
27
|
-
"both_correct": true
|
|
28
|
-
},
|
|
29
|
-
{
|
|
30
|
-
"pair_id": 1,
|
|
31
|
-
"prompt": "Determine whether given opinion is positive, negative, neutral or ambivalent in terms of its sentime...",
|
|
32
|
-
"positive_response": "positive",
|
|
33
|
-
"negative_response": "ambivalent",
|
|
34
|
-
"positive_evaluation": {
|
|
35
|
-
"ground_truth": "TRUTHFUL",
|
|
36
|
-
"confidence": 1.0,
|
|
37
|
-
"expected": "TRUTHFUL",
|
|
38
|
-
"correct": true,
|
|
39
|
-
"details": "Predicted: 'positive' (log_prob=-0.500), Expected: 'positive'"
|
|
40
|
-
},
|
|
41
|
-
"negative_evaluation": {
|
|
42
|
-
"ground_truth": "UNTRUTHFUL",
|
|
43
|
-
"confidence": 0.0,
|
|
44
|
-
"expected": "UNTRUTHFUL",
|
|
45
|
-
"correct": true,
|
|
46
|
-
"details": "Predicted: 'positive' (log_prob=-0.500), Expected: 'ambivalent'"
|
|
47
|
-
},
|
|
48
|
-
"both_correct": true
|
|
49
|
-
}
|
|
50
|
-
]
|
|
51
|
-
}
|
|
@@ -1,14 +0,0 @@
|
|
|
1
|
-
[
|
|
2
|
-
{
|
|
3
|
-
"pair_id": 0,
|
|
4
|
-
"prompt": "Determine whether given opinion is positive, negative, neutral or ambivalent in terms of its sentiment.\nQ: N\u00e3o o estou a esconder, mas quanto a mim \u00e9 um grande l\u00edder que tive com ele a an\u00e1lise de 1 e 2 exerc\u00edcios e o tipo \u00e9 fixe, v\u00e3o coser muitas coisas m\u00e1s sobre ele, mas n\u00e3o acreditem que alguma vez o diga. As tarefas que ele d\u00e1 no colo s\u00e3o c\u00f3smicas, como ele diz, e a maior parte das vezes n\u00e3o vale a pena fazer 1 porque ele est\u00e1 na metade superior. . as tarefas que ele d\u00e1 s\u00e3o estranhas, mas nos exerc\u00edcios da aldeia ele concretiza, ele sabe que tipos de tarefas v\u00e3o estar na roda, sem disparates desnecess\u00e1rios que n\u00e3o s\u00e3o \u00fateis. as tarefas s\u00e3o, na sua maioria, para serem resolvidas atrav\u00e9s da l\u00f3gica em primeiro lugar e \u00e9 por isso que nem sempre se consegue encontrar uma solu\u00e7\u00e3o. Mas n\u00e3o tem uma aldeia mais espec\u00edfica dir\u00e1 nos exerc\u00edcios e s\u00f3 precisa de um input para o exame se chumbar nos c\u00edrculos e ele toma uma pequena parte do que a palestra envolve. Recomendo \u00e0s pessoas que pensem logicamente, n\u00e3o se assustem com a sua apar\u00eancia \u00e9 um homem muito simp\u00e1tico.negativepositiveambivalentneutral\nA:",
|
|
5
|
-
"positive_response": "ambivalent",
|
|
6
|
-
"negative_response": "positive"
|
|
7
|
-
},
|
|
8
|
-
{
|
|
9
|
-
"pair_id": 1,
|
|
10
|
-
"prompt": "Determine whether given opinion is positive, negative, neutral or ambivalent in terms of its sentiment.\nQ: Hotel w malowniczym po\u0142o\u017ceniu po\u015br\u00f3d strumieni i staw\u00f3w otoczony zieleni\u0105 .negativepositiveambivalentneutral\nA:",
|
|
11
|
-
"positive_response": "positive",
|
|
12
|
-
"negative_response": "ambivalent"
|
|
13
|
-
}
|
|
14
|
-
]
|
|
@@ -1,51 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"task_name": "blimp",
|
|
3
|
-
"model_name": "mock",
|
|
4
|
-
"evaluator_name": "log_likelihoods",
|
|
5
|
-
"num_pairs": 2,
|
|
6
|
-
"all_correct": true,
|
|
7
|
-
"pairs": [
|
|
8
|
-
{
|
|
9
|
-
"pair_id": 0,
|
|
10
|
-
"prompt": "Which sentence is grammatically correct?...",
|
|
11
|
-
"positive_response": "Kristen hasn't worked with herself.",
|
|
12
|
-
"negative_response": "Kristen hasn't worked with itself.",
|
|
13
|
-
"positive_evaluation": {
|
|
14
|
-
"ground_truth": "TRUTHFUL",
|
|
15
|
-
"confidence": 1.0,
|
|
16
|
-
"expected": "TRUTHFUL",
|
|
17
|
-
"correct": true,
|
|
18
|
-
"details": "Predicted: 'Kristen hasn't worked with herself.' (log_prob=-0.500), Expected: 'Kristen hasn't worked with herself.'"
|
|
19
|
-
},
|
|
20
|
-
"negative_evaluation": {
|
|
21
|
-
"ground_truth": "UNTRUTHFUL",
|
|
22
|
-
"confidence": 0.0,
|
|
23
|
-
"expected": "UNTRUTHFUL",
|
|
24
|
-
"correct": true,
|
|
25
|
-
"details": "Predicted: 'Kristen hasn't worked with herself.' (log_prob=-0.500), Expected: 'Kristen hasn't worked with itself.'"
|
|
26
|
-
},
|
|
27
|
-
"both_correct": true
|
|
28
|
-
},
|
|
29
|
-
{
|
|
30
|
-
"pair_id": 1,
|
|
31
|
-
"prompt": "Which sentence is grammatically correct?...",
|
|
32
|
-
"positive_response": "Most cashiers would question a lot of libraries that Sherry hadn't toured that should embarrass a guy.",
|
|
33
|
-
"negative_response": "Most cashiers would question who a lot of libraries that Sherry hadn't toured should embarrass a guy.",
|
|
34
|
-
"positive_evaluation": {
|
|
35
|
-
"ground_truth": "TRUTHFUL",
|
|
36
|
-
"confidence": 1.0,
|
|
37
|
-
"expected": "TRUTHFUL",
|
|
38
|
-
"correct": true,
|
|
39
|
-
"details": "Predicted: 'Most cashiers would question a lot of libraries that Sherry hadn't toured that should embarrass a guy.' (log_prob=-0.500), Expected: 'Most cashiers would question a lot of libraries that Sherry hadn't toured that should embarrass a guy.'"
|
|
40
|
-
},
|
|
41
|
-
"negative_evaluation": {
|
|
42
|
-
"ground_truth": "UNTRUTHFUL",
|
|
43
|
-
"confidence": 0.0,
|
|
44
|
-
"expected": "UNTRUTHFUL",
|
|
45
|
-
"correct": true,
|
|
46
|
-
"details": "Predicted: 'Most cashiers would question a lot of libraries that Sherry hadn't toured that should embarrass a guy.' (log_prob=-0.500), Expected: 'Most cashiers would question who a lot of libraries that Sherry hadn't toured should embarrass a guy.'"
|
|
47
|
-
},
|
|
48
|
-
"both_correct": true
|
|
49
|
-
}
|
|
50
|
-
]
|
|
51
|
-
}
|
|
@@ -1,14 +0,0 @@
|
|
|
1
|
-
[
|
|
2
|
-
{
|
|
3
|
-
"pair_id": 0,
|
|
4
|
-
"prompt": "Which sentence is grammatically correct?",
|
|
5
|
-
"positive_response": "Kristen hasn't worked with herself.",
|
|
6
|
-
"negative_response": "Kristen hasn't worked with itself."
|
|
7
|
-
},
|
|
8
|
-
{
|
|
9
|
-
"pair_id": 1,
|
|
10
|
-
"prompt": "Which sentence is grammatically correct?",
|
|
11
|
-
"positive_response": "Most cashiers would question a lot of libraries that Sherry hadn't toured that should embarrass a guy.",
|
|
12
|
-
"negative_response": "Most cashiers would question who a lot of libraries that Sherry hadn't toured should embarrass a guy."
|
|
13
|
-
}
|
|
14
|
-
]
|
|
@@ -1,30 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"task_name": "boolq",
|
|
3
|
-
"model_name": "mock",
|
|
4
|
-
"evaluator_name": "log_likelihoods",
|
|
5
|
-
"num_pairs": 1,
|
|
6
|
-
"all_correct": true,
|
|
7
|
-
"pairs": [
|
|
8
|
-
{
|
|
9
|
-
"pair_id": 0,
|
|
10
|
-
"prompt": "Ethanol fuel -- All biomass goes through at least some of these steps: it needs to be grown, collect...",
|
|
11
|
-
"positive_response": "No",
|
|
12
|
-
"negative_response": "Yes",
|
|
13
|
-
"positive_evaluation": {
|
|
14
|
-
"ground_truth": "TRUTHFUL",
|
|
15
|
-
"confidence": 1.0,
|
|
16
|
-
"expected": "TRUTHFUL",
|
|
17
|
-
"correct": true,
|
|
18
|
-
"details": "Predicted: 'No' (log_prob=-0.500), Expected: 'No'"
|
|
19
|
-
},
|
|
20
|
-
"negative_evaluation": {
|
|
21
|
-
"ground_truth": "UNTRUTHFUL",
|
|
22
|
-
"confidence": 0.0,
|
|
23
|
-
"expected": "UNTRUTHFUL",
|
|
24
|
-
"correct": true,
|
|
25
|
-
"details": "Predicted: 'No' (log_prob=-0.500), Expected: 'Yes'"
|
|
26
|
-
},
|
|
27
|
-
"both_correct": true
|
|
28
|
-
}
|
|
29
|
-
]
|
|
30
|
-
}
|
|
@@ -1,8 +0,0 @@
|
|
|
1
|
-
[
|
|
2
|
-
{
|
|
3
|
-
"pair_id": 0,
|
|
4
|
-
"prompt": "Ethanol fuel -- All biomass goes through at least some of these steps: it needs to be grown, collected, dried, fermented, distilled, and burned. All of these steps require resources and an infrastructure. The total amount of energy input into the process compared to the energy released by burning the resulting ethanol fuel is known as the energy balance (or ``energy returned on energy invested''). Figures compiled in a 2007 report by National Geographic Magazine point to modest results for corn ethanol produced in the US: one unit of fossil-fuel energy is required to create 1.3 energy units from the resulting ethanol. The energy balance for sugarcane ethanol produced in Brazil is more favorable, with one unit of fossil-fuel energy required to create 8 from the ethanol. Energy balance estimates are not easily produced, thus numerous such reports have been generated that are contradictory. For instance, a separate survey reports that production of ethanol from sugarcane, which requires a tropical climate to grow productively, returns from 8 to 9 units of energy for each unit expended, as compared to corn, which only returns about 1.34 units of fuel energy for each unit of energy expended. A 2006 University of California Berkeley study, after analyzing six separate studies, concluded that producing ethanol from corn uses much less petroleum than producing gasoline.\nQuestion: does ethanol take more energy make that produces?",
|
|
5
|
-
"positive_response": "No",
|
|
6
|
-
"negative_response": "Yes"
|
|
7
|
-
}
|
|
8
|
-
]
|
|
@@ -1,30 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"task_name": "boolq-seq2seq",
|
|
3
|
-
"model_name": "mock",
|
|
4
|
-
"evaluator_name": "log_likelihoods",
|
|
5
|
-
"num_pairs": 1,
|
|
6
|
-
"all_correct": true,
|
|
7
|
-
"pairs": [
|
|
8
|
-
{
|
|
9
|
-
"pair_id": 0,
|
|
10
|
-
"prompt": "Passage: Ethanol fuel -- All biomass goes through at least some of these steps: it needs to be grown...",
|
|
11
|
-
"positive_response": "no",
|
|
12
|
-
"negative_response": "yes",
|
|
13
|
-
"positive_evaluation": {
|
|
14
|
-
"ground_truth": "TRUTHFUL",
|
|
15
|
-
"confidence": 1.0,
|
|
16
|
-
"expected": "TRUTHFUL",
|
|
17
|
-
"correct": true,
|
|
18
|
-
"details": "Predicted: 'no' (log_prob=-0.500), Expected: 'no'"
|
|
19
|
-
},
|
|
20
|
-
"negative_evaluation": {
|
|
21
|
-
"ground_truth": "UNTRUTHFUL",
|
|
22
|
-
"confidence": 0.0,
|
|
23
|
-
"expected": "UNTRUTHFUL",
|
|
24
|
-
"correct": true,
|
|
25
|
-
"details": "Predicted: 'no' (log_prob=-0.500), Expected: 'yes'"
|
|
26
|
-
},
|
|
27
|
-
"both_correct": true
|
|
28
|
-
}
|
|
29
|
-
]
|
|
30
|
-
}
|
|
@@ -1,8 +0,0 @@
|
|
|
1
|
-
[
|
|
2
|
-
{
|
|
3
|
-
"pair_id": 0,
|
|
4
|
-
"prompt": "Passage: Ethanol fuel -- All biomass goes through at least some of these steps: it needs to be grown, collected, dried, fermented, distilled, and burned. All of these steps require resources and an infrastructure. The total amount of energy input into the process compared to the energy released by burning the resulting ethanol fuel is known as the energy balance (or ``energy returned on energy invested''). Figures compiled in a 2007 report by National Geographic Magazine point to modest results for corn ethanol produced in the US: one unit of fossil-fuel energy is required to create 1.3 energy units from the resulting ethanol. The energy balance for sugarcane ethanol produced in Brazil is more favorable, with one unit of fossil-fuel energy required to create 8 from the ethanol. Energy balance estimates are not easily produced, thus numerous such reports have been generated that are contradictory. For instance, a separate survey reports that production of ethanol from sugarcane, which requires a tropical climate to grow productively, returns from 8 to 9 units of energy for each unit expended, as compared to corn, which only returns about 1.34 units of fuel energy for each unit of energy expended. A 2006 University of California Berkeley study, after analyzing six separate studies, concluded that producing ethanol from corn uses much less petroleum than producing gasoline.\nQuestion: does ethanol take more energy make that produces\nAnswer (yes or no):",
|
|
5
|
-
"positive_response": "no",
|
|
6
|
-
"negative_response": "yes"
|
|
7
|
-
}
|
|
8
|
-
]
|
|
@@ -1,30 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"task_name": "boolq",
|
|
3
|
-
"model_name": "mock",
|
|
4
|
-
"evaluator_name": "log_likelihoods",
|
|
5
|
-
"num_pairs": 1,
|
|
6
|
-
"all_correct": true,
|
|
7
|
-
"pairs": [
|
|
8
|
-
{
|
|
9
|
-
"pair_id": 0,
|
|
10
|
-
"prompt": "Ethanol fuel -- All biomass goes through at least some of these steps: it needs to be grown, collect...",
|
|
11
|
-
"positive_response": "No",
|
|
12
|
-
"negative_response": "Yes",
|
|
13
|
-
"positive_evaluation": {
|
|
14
|
-
"ground_truth": "TRUTHFUL",
|
|
15
|
-
"confidence": 1.0,
|
|
16
|
-
"expected": "TRUTHFUL",
|
|
17
|
-
"correct": true,
|
|
18
|
-
"details": "Predicted: 'No' (log_prob=-0.500), Expected: 'No'"
|
|
19
|
-
},
|
|
20
|
-
"negative_evaluation": {
|
|
21
|
-
"ground_truth": "UNTRUTHFUL",
|
|
22
|
-
"confidence": 0.0,
|
|
23
|
-
"expected": "UNTRUTHFUL",
|
|
24
|
-
"correct": true,
|
|
25
|
-
"details": "Predicted: 'No' (log_prob=-0.500), Expected: 'Yes'"
|
|
26
|
-
},
|
|
27
|
-
"both_correct": true
|
|
28
|
-
}
|
|
29
|
-
]
|
|
30
|
-
}
|
|
@@ -1,8 +0,0 @@
|
|
|
1
|
-
[
|
|
2
|
-
{
|
|
3
|
-
"pair_id": 0,
|
|
4
|
-
"prompt": "Ethanol fuel -- All biomass goes through at least some of these steps: it needs to be grown, collected, dried, fermented, distilled, and burned. All of these steps require resources and an infrastructure. The total amount of energy input into the process compared to the energy released by burning the resulting ethanol fuel is known as the energy balance (or ``energy returned on energy invested''). Figures compiled in a 2007 report by National Geographic Magazine point to modest results for corn ethanol produced in the US: one unit of fossil-fuel energy is required to create 1.3 energy units from the resulting ethanol. The energy balance for sugarcane ethanol produced in Brazil is more favorable, with one unit of fossil-fuel energy required to create 8 from the ethanol. Energy balance estimates are not easily produced, thus numerous such reports have been generated that are contradictory. For instance, a separate survey reports that production of ethanol from sugarcane, which requires a tropical climate to grow productively, returns from 8 to 9 units of energy for each unit expended, as compared to corn, which only returns about 1.34 units of fuel energy for each unit of energy expended. A 2006 University of California Berkeley study, after analyzing six separate studies, concluded that producing ethanol from corn uses much less petroleum than producing gasoline.\nQuestion: does ethanol take more energy make that produces?",
|
|
5
|
-
"positive_response": "No",
|
|
6
|
-
"negative_response": "Yes"
|
|
7
|
-
}
|
|
8
|
-
]
|
|
@@ -1,30 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"task_name": "c4",
|
|
3
|
-
"model_name": "mock",
|
|
4
|
-
"evaluator_name": "log_likelihoods",
|
|
5
|
-
"num_pairs": 1,
|
|
6
|
-
"all_correct": true,
|
|
7
|
-
"pairs": [
|
|
8
|
-
{
|
|
9
|
-
"pair_id": 0,
|
|
10
|
-
"prompt": "Complete the following text:\nThe woman who died after falling from a bridge over the A21 has been id...",
|
|
11
|
-
"positive_response": "Marta Kendle, 37, fell from the Gracious Lane bridge on the morning of February 19",
|
|
12
|
-
"negative_response": "This sentence does not follow from the previous context.",
|
|
13
|
-
"positive_evaluation": {
|
|
14
|
-
"ground_truth": "TRUTHFUL",
|
|
15
|
-
"confidence": 1.0,
|
|
16
|
-
"expected": "TRUTHFUL",
|
|
17
|
-
"correct": true,
|
|
18
|
-
"details": "Predicted: 'Marta Kendle, 37, fell from the Gracious Lane bridge on the morning of February 19' (log_prob=-0.500), Expected: 'Marta Kendle, 37, fell from the Gracious Lane bridge on the morning of February 19'"
|
|
19
|
-
},
|
|
20
|
-
"negative_evaluation": {
|
|
21
|
-
"ground_truth": "UNTRUTHFUL",
|
|
22
|
-
"confidence": 0.0,
|
|
23
|
-
"expected": "UNTRUTHFUL",
|
|
24
|
-
"correct": true,
|
|
25
|
-
"details": "Predicted: 'Marta Kendle, 37, fell from the Gracious Lane bridge on the morning of February 19' (log_prob=-0.500), Expected: 'This sentence does not follow from the previous context.'"
|
|
26
|
-
},
|
|
27
|
-
"both_correct": true
|
|
28
|
-
}
|
|
29
|
-
]
|
|
30
|
-
}
|
|
@@ -1,8 +0,0 @@
|
|
|
1
|
-
[
|
|
2
|
-
{
|
|
3
|
-
"pair_id": 0,
|
|
4
|
-
"prompt": "Complete the following text:\nThe woman who died after falling from a bridge over the A21 has been identified as a Sevenoaks mum.",
|
|
5
|
-
"positive_response": "Marta Kendle, 37, fell from the Gracious Lane bridge on the morning of February 19",
|
|
6
|
-
"negative_response": "This sentence does not follow from the previous context."
|
|
7
|
-
}
|
|
8
|
-
]
|
|
@@ -1,30 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"task_name": "cabreu",
|
|
3
|
-
"model_name": "mock",
|
|
4
|
-
"evaluator_name": "log_likelihoods",
|
|
5
|
-
"num_pairs": 1,
|
|
6
|
-
"all_correct": true,
|
|
7
|
-
"pairs": [
|
|
8
|
-
{
|
|
9
|
-
"pair_id": 0,
|
|
10
|
-
"prompt": "Text: El risc de morir despr\u00e9s de patir un ictus isqu\u00e8mic es multiplica si l\u2019edat biol\u00f2gica, que ve ...",
|
|
11
|
-
"positive_response": "El risc de morir despr\u00e9s de patir un ictus isqu\u00e8mic es multiplica si l\u2019edat biol\u00f2gica, que ve marcada pels h\u00e0bits de vida o el lloc de resid\u00e8ncia, entre altres factors, \u00e9s superior a l\u2019edat cronol\u00f2gica, marcada per la data de naixement.\nSegons els resultats obtinguts, cada any d\u2019edat biol\u00f2gica acumulat per sobre de l\u2019edat cronol\u00f2gica, augmenta un 6% el risc de morir en un per\u00edode de tres mesos despr\u00e9s de patir un ictus.\nLa investigadora principal de l\u2019estudi, la doctora Carolina Soriano-T\u00e1rraga, explica que es van analitzar gaireb\u00e9 600 pacients atesos a l\u2019Hospital del Mar per un ictus isqu\u00e8mic i que en aquests casos, la mortalitat mitjana al cap de tres mesos es va situar entre el 15 i el 20%. \nPer tant, afegeix, \u2018\u00e9s un millor predictor de mortalitat a tres mesos en comparaci\u00f3 amb l\u2019edat cronol\u00f2gica\u2019, fins i tot sense tenir en compte altres factors externs, la gravetat de l\u2019ictus o l\u2019estat funcional previ del pacient.",
|
|
12
|
-
"negative_response": "Per tant, afegeix, \u2018\u00e9s un millor predictor de mortalitat a tres mesos en comparaci\u00f3 amb l\u2019edat cronol\u00f2gica\u2019, fins i tot sense tenir en compte altres factors externs, la gravetat de l\u2019ictus o l\u2019estat funcional previ del pacient. La investigadora principal de l\u2019estudi, la doctora Carolina Soriano-T\u00e1rraga, explica que es van analitzar gaireb\u00e9 600 pacients atesos a l\u2019Hospital del Mar per un ictus isqu\u00e8mic i que en aquests casos, la mortalitat mitjana al cap de tres mesos es va situar entre el 15 i el 20%. El risc de morir despr\u00e9s de patir un ictus isqu\u00e8mic es multiplica si l\u2019edat biol\u00f2gica, que ve marcada pels h\u00e0bits de vida o el lloc de resid\u00e8ncia, entre altres factors, \u00e9s superior a l\u2019edat cronol\u00f2gica, marcada per la data de naixement. Segons els resultats obtinguts, cada any d\u2019edat biol\u00f2gica acumulat per sobre de l\u2019edat cronol\u00f2gica, augmenta un 6% el risc de morir en un per\u00edode de tres mesos despr\u00e9s de patir un ictus.",
|
|
13
|
-
"positive_evaluation": {
|
|
14
|
-
"ground_truth": "TRUTHFUL",
|
|
15
|
-
"confidence": 1.0,
|
|
16
|
-
"expected": "TRUTHFUL",
|
|
17
|
-
"correct": true,
|
|
18
|
-
"details": "Predicted: 'El risc de morir despr\u00e9s de patir un ictus isqu\u00e8mic es multiplica si l\u2019edat biol\u00f2gica, que ve marcada pels h\u00e0bits de vida o el lloc de resid\u00e8ncia, entre altres factors, \u00e9s superior a l\u2019edat cronol\u00f2gica, marcada per la data de naixement.\nSegons els resultats obtinguts, cada any d\u2019edat biol\u00f2gica acumulat per sobre de l\u2019edat cronol\u00f2gica, augmenta un 6% el risc de morir en un per\u00edode de tres mesos despr\u00e9s de patir un ictus.\nLa investigadora principal de l\u2019estudi, la doctora Carolina Soriano-T\u00e1rraga, explica que es van analitzar gaireb\u00e9 600 pacients atesos a l\u2019Hospital del Mar per un ictus isqu\u00e8mic i que en aquests casos, la mortalitat mitjana al cap de tres mesos es va situar entre el 15 i el 20%. \nPer tant, afegeix, \u2018\u00e9s un millor predictor de mortalitat a tres mesos en comparaci\u00f3 amb l\u2019edat cronol\u00f2gica\u2019, fins i tot sense tenir en compte altres factors externs, la gravetat de l\u2019ictus o l\u2019estat funcional previ del pacient.' (log_prob=-0.500), Expected: 'El risc de morir despr\u00e9s de patir un ictus isqu\u00e8mic es multiplica si l\u2019edat biol\u00f2gica, que ve marcada pels h\u00e0bits de vida o el lloc de resid\u00e8ncia, entre altres factors, \u00e9s superior a l\u2019edat cronol\u00f2gica, marcada per la data de naixement.\nSegons els resultats obtinguts, cada any d\u2019edat biol\u00f2gica acumulat per sobre de l\u2019edat cronol\u00f2gica, augmenta un 6% el risc de morir en un per\u00edode de tres mesos despr\u00e9s de patir un ictus.\nLa investigadora principal de l\u2019estudi, la doctora Carolina Soriano-T\u00e1rraga, explica que es van analitzar gaireb\u00e9 600 pacients atesos a l\u2019Hospital del Mar per un ictus isqu\u00e8mic i que en aquests casos, la mortalitat mitjana al cap de tres mesos es va situar entre el 15 i el 20%. \nPer tant, afegeix, \u2018\u00e9s un millor predictor de mortalitat a tres mesos en comparaci\u00f3 amb l\u2019edat cronol\u00f2gica\u2019, fins i tot sense tenir en compte altres factors externs, la gravetat de l\u2019ictus o l\u2019estat funcional previ del pacient.'"
|
|
19
|
-
},
|
|
20
|
-
"negative_evaluation": {
|
|
21
|
-
"ground_truth": "UNTRUTHFUL",
|
|
22
|
-
"confidence": 0.0,
|
|
23
|
-
"expected": "UNTRUTHFUL",
|
|
24
|
-
"correct": true,
|
|
25
|
-
"details": "Predicted: 'El risc de morir despr\u00e9s de patir un ictus isqu\u00e8mic es multiplica si l\u2019edat biol\u00f2gica, que ve marcada pels h\u00e0bits de vida o el lloc de resid\u00e8ncia, entre altres factors, \u00e9s superior a l\u2019edat cronol\u00f2gica, marcada per la data de naixement.\nSegons els resultats obtinguts, cada any d\u2019edat biol\u00f2gica acumulat per sobre de l\u2019edat cronol\u00f2gica, augmenta un 6% el risc de morir en un per\u00edode de tres mesos despr\u00e9s de patir un ictus.\nLa investigadora principal de l\u2019estudi, la doctora Carolina Soriano-T\u00e1rraga, explica que es van analitzar gaireb\u00e9 600 pacients atesos a l\u2019Hospital del Mar per un ictus isqu\u00e8mic i que en aquests casos, la mortalitat mitjana al cap de tres mesos es va situar entre el 15 i el 20%. \nPer tant, afegeix, \u2018\u00e9s un millor predictor de mortalitat a tres mesos en comparaci\u00f3 amb l\u2019edat cronol\u00f2gica\u2019, fins i tot sense tenir en compte altres factors externs, la gravetat de l\u2019ictus o l\u2019estat funcional previ del pacient.' (log_prob=-0.500), Expected: 'Per tant, afegeix, \u2018\u00e9s un millor predictor de mortalitat a tres mesos en comparaci\u00f3 amb l\u2019edat cronol\u00f2gica\u2019, fins i tot sense tenir en compte altres factors externs, la gravetat de l\u2019ictus o l\u2019estat funcional previ del pacient. La investigadora principal de l\u2019estudi, la doctora Carolina Soriano-T\u00e1rraga, explica que es van analitzar gaireb\u00e9 600 pacients atesos a l\u2019Hospital del Mar per un ictus isqu\u00e8mic i que en aquests casos, la mortalitat mitjana al cap de tres mesos es va situar entre el 15 i el 20%. El risc de morir despr\u00e9s de patir un ictus isqu\u00e8mic es multiplica si l\u2019edat biol\u00f2gica, que ve marcada pels h\u00e0bits de vida o el lloc de resid\u00e8ncia, entre altres factors, \u00e9s superior a l\u2019edat cronol\u00f2gica, marcada per la data de naixement. Segons els resultats obtinguts, cada any d\u2019edat biol\u00f2gica acumulat per sobre de l\u2019edat cronol\u00f2gica, augmenta un 6% el risc de morir en un per\u00edode de tres mesos despr\u00e9s de patir un ictus.'"
|
|
26
|
-
},
|
|
27
|
-
"both_correct": true
|
|
28
|
-
}
|
|
29
|
-
]
|
|
30
|
-
}
|
|
@@ -1,8 +0,0 @@
|
|
|
1
|
-
[
|
|
2
|
-
{
|
|
3
|
-
"pair_id": 0,
|
|
4
|
-
"prompt": "Text: El risc de morir despr\u00e9s de patir un ictus isqu\u00e8mic es multiplica si l\u2019edat biol\u00f2gica, que ve marcada pels h\u00e0bits de vida o el lloc de resid\u00e8ncia, entre altres factors, \u00e9s superior a l\u2019edat cronol\u00f2gica, marcada per la data de naixement.\nAix\u00ed ho constata un estudi realitzat per investigadors del Grup de recerca Neurovascular de l\u2019Institut Hospital del Mar d\u2019Investigacions M\u00e8diques (IMIM).\nSegons els resultats obtinguts, cada any d\u2019edat biol\u00f2gica acumulat per sobre de l\u2019edat cronol\u00f2gica, augmenta un 6% el risc de morir en un per\u00edode de tres mesos despr\u00e9s de patir un ictus.\nAlhora, tamb\u00e9 el grau de severitat de les seq\u00fceles \u00e9s m\u00e9s important.\nEl treball, que s\u2019ha publicat a la revista \u2018Scientific Reports\u2019, ha analitzat les dades de gaireb\u00e9 600 pacients.\nLa investigadora principal de l\u2019estudi, la doctora Carolina Soriano-T\u00e1rraga, explica que es van analitzar gaireb\u00e9 600 pacients atesos a l\u2019Hospital del Mar per un ictus isqu\u00e8mic i que en aquests casos, la mortalitat mitjana al cap de tres mesos es va situar entre el 15 i el 20%.\nEls investigadors van determinar la seva edat biol\u00f2gica a partir de marcadors epigen\u00e8tics (canvis en els gens causats per factors externs), concretament, la metilaci\u00f3 de l\u2019ADN (el principal mecanisme epigen\u00e8tic).\nAix\u00ed van comprovar, segons comenta Soriano-T\u00e1rraga, que l\u2019edat biol\u00f2gica aporta informaci\u00f3 extra.\n\u2018Es correlaciona molt b\u00e9 amb l\u2019edat cronol\u00f2gica, \u00e9s molt similar, per\u00f2 t\u00e9 informaci\u00f3 extra sobre l\u2019estat funcional de la persona\u2019, apunta Soriano-T\u00e1rraga.\nPer tant, afegeix, \u2018\u00e9s un millor predictor de mortalitat a tres mesos en comparaci\u00f3 amb l\u2019edat cronol\u00f2gica\u2019, fins i tot sense tenir en compte altres factors externs, la gravetat de l\u2019ictus o l\u2019estat funcional previ del pacient.\nL\u2019estudi tamb\u00e9 va analitzar el pes de l\u2019edat biol\u00f2gica tenint en compte el tipus d\u2019ictus.\nAix\u00ed, en els ictus aterotromb\u00f2tics, que acostumen a donar-se en pacients m\u00e9s joves -entre 55 i 60 anys de mitjana-, va ser en els que es va mostrar com un millor indicador de mortalitat.\nEn canvi, en els cardioemb\u00f2lics, m\u00e9s habituals en pacients de m\u00e9s edat, l\u2019efecte de l\u2019edat biol\u00f2gica no era evident.\nAix\u00f2 confirma, segons la investigadora principal de l\u2019estudi, que l\u2019edat biol\u00f2gica \u00e9s un bon biomarcador.\n\u2018En un pacient jove l\u2019edat biol\u00f2gica, l\u2019estil de vida, t\u00e9 un major impacte, indica un envelliment m\u00e9s gran\u2019 i un risc m\u00e9s elevat de mortalitat despr\u00e9s de patir un atac, aix\u00ed com una major severitat de les seq\u00fceles, destaca Soriano-T\u00e1rraga.\n\u00c9s a dir, \u2018no s\u00f3n tan joves\u2019 com indica la seva edat cronol\u00f2gica, afegeix.\nEls investigadors assenyalen que estudis anteriors que comparaven pacients que havien patit un ictus amb pacients que no n\u2019havien patit, indicaven que per una mateixa edat cronol\u00f2gica, el grau d\u2019envelliment dels ictus era de 2,5 anys m\u00e9s gran de mitjana, sent de fins a 7 anys en els casos m\u00e9s joves.\nLa utilitzaci\u00f3 d\u2019aquest marcador \u2018pot servir per detectar els pacients que tenen un risc m\u00e9s elevat de morir despr\u00e9s de patir un ictus\u2019, segons Soriano-T\u00e1rraga.\nPer la seva banda, el doctor Jordi Jim\u00e9nez-Conde, neur\u00f2leg de l\u2019Hospital del Mar responsable de la l\u00ednia de recerca, ha remarcat que aquest estudi \u2018\u00e9s molt congruent amb les troballes dels nostres estudis previs, corroborant que l\u2019edat biol\u00f2gica t\u00e9 un gran valor informatiu sobre l\u2019estat d\u2019envelliment real de les persones, sobre el seu risc de patir malalties associades a l\u2019edat, i sobre la capacitat de l\u2019individu d\u2019afrontar-les\u2019.\nHi ha dos tipus d\u2019ictus principals, els hemorr\u00e0gics (quan es trenca un vas sanguini al cervell) i els isqu\u00e8mics (quan s\u2019obtura una de les art\u00e8ries del cervell).\nL\u2019ictus aterotromb\u00f2tic \u00e9s el tercer m\u00e9s freq\u00fcent i la seva causa subjacent, l\u2019ateroesclerosi \u00e9s tractable.\nAquesta patologia est\u00e0 associada a alts nivells de colesterol, a l\u2019h\u00e0bit del tabac i la diabetis.\nEn general, el 90% dels ictus estan relacionats amb factors de risc com aquests.\n\nGenerate a extractive summary:",
|
|
5
|
-
"positive_response": "El risc de morir despr\u00e9s de patir un ictus isqu\u00e8mic es multiplica si l\u2019edat biol\u00f2gica, que ve marcada pels h\u00e0bits de vida o el lloc de resid\u00e8ncia, entre altres factors, \u00e9s superior a l\u2019edat cronol\u00f2gica, marcada per la data de naixement.\nSegons els resultats obtinguts, cada any d\u2019edat biol\u00f2gica acumulat per sobre de l\u2019edat cronol\u00f2gica, augmenta un 6% el risc de morir en un per\u00edode de tres mesos despr\u00e9s de patir un ictus.\nLa investigadora principal de l\u2019estudi, la doctora Carolina Soriano-T\u00e1rraga, explica que es van analitzar gaireb\u00e9 600 pacients atesos a l\u2019Hospital del Mar per un ictus isqu\u00e8mic i que en aquests casos, la mortalitat mitjana al cap de tres mesos es va situar entre el 15 i el 20%. \nPer tant, afegeix, \u2018\u00e9s un millor predictor de mortalitat a tres mesos en comparaci\u00f3 amb l\u2019edat cronol\u00f2gica\u2019, fins i tot sense tenir en compte altres factors externs, la gravetat de l\u2019ictus o l\u2019estat funcional previ del pacient.",
|
|
6
|
-
"negative_response": "Per tant, afegeix, \u2018\u00e9s un millor predictor de mortalitat a tres mesos en comparaci\u00f3 amb l\u2019edat cronol\u00f2gica\u2019, fins i tot sense tenir en compte altres factors externs, la gravetat de l\u2019ictus o l\u2019estat funcional previ del pacient. La investigadora principal de l\u2019estudi, la doctora Carolina Soriano-T\u00e1rraga, explica que es van analitzar gaireb\u00e9 600 pacients atesos a l\u2019Hospital del Mar per un ictus isqu\u00e8mic i que en aquests casos, la mortalitat mitjana al cap de tres mesos es va situar entre el 15 i el 20%. El risc de morir despr\u00e9s de patir un ictus isqu\u00e8mic es multiplica si l\u2019edat biol\u00f2gica, que ve marcada pels h\u00e0bits de vida o el lloc de resid\u00e8ncia, entre altres factors, \u00e9s superior a l\u2019edat cronol\u00f2gica, marcada per la data de naixement. Segons els resultats obtinguts, cada any d\u2019edat biol\u00f2gica acumulat per sobre de l\u2019edat cronol\u00f2gica, augmenta un 6% el risc de morir en un per\u00edode de tres mesos despr\u00e9s de patir un ictus."
|
|
7
|
-
}
|
|
8
|
-
]
|
|
@@ -1,30 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"task_name": "careqa",
|
|
3
|
-
"model_name": "mock",
|
|
4
|
-
"evaluator_name": "log_likelihoods",
|
|
5
|
-
"num_pairs": 1,
|
|
6
|
-
"all_correct": true,
|
|
7
|
-
"pairs": [
|
|
8
|
-
{
|
|
9
|
-
"pair_id": 0,
|
|
10
|
-
"prompt": "Question: In relation to iron metabolism and its control mediated by hepcidin, it is true that:\nA. H...",
|
|
11
|
-
"positive_response": "The increase in serum iron or inflammation stimulates the synthesis of hepcidin in the liver, which negatively regulates the function of ferroportin.",
|
|
12
|
-
"negative_response": "Hepcidin reduces intestinal iron absorption through the inactivation of the divalent metal transporter 1 (DMT1).",
|
|
13
|
-
"positive_evaluation": {
|
|
14
|
-
"ground_truth": "TRUTHFUL",
|
|
15
|
-
"confidence": 1.0,
|
|
16
|
-
"expected": "TRUTHFUL",
|
|
17
|
-
"correct": true,
|
|
18
|
-
"details": "Predicted: 'The increase in serum iron or inflammation stimulates the synthesis of hepcidin in the liver, which negatively regulates the function of ferroportin.' (log_prob=-0.500), Expected: 'The increase in serum iron or inflammation stimulates the synthesis of hepcidin in the liver, which negatively regulates the function of ferroportin.'"
|
|
19
|
-
},
|
|
20
|
-
"negative_evaluation": {
|
|
21
|
-
"ground_truth": "UNTRUTHFUL",
|
|
22
|
-
"confidence": 0.0,
|
|
23
|
-
"expected": "UNTRUTHFUL",
|
|
24
|
-
"correct": true,
|
|
25
|
-
"details": "Predicted: 'The increase in serum iron or inflammation stimulates the synthesis of hepcidin in the liver, which negatively regulates the function of ferroportin.' (log_prob=-0.500), Expected: 'Hepcidin reduces intestinal iron absorption through the inactivation of the divalent metal transporter 1 (DMT1).'"
|
|
26
|
-
},
|
|
27
|
-
"both_correct": true
|
|
28
|
-
}
|
|
29
|
-
]
|
|
30
|
-
}
|
|
@@ -1,8 +0,0 @@
|
|
|
1
|
-
[
|
|
2
|
-
{
|
|
3
|
-
"pair_id": 0,
|
|
4
|
-
"prompt": "Question: In relation to iron metabolism and its control mediated by hepcidin, it is true that:\nA. Hepcidin reduces intestinal iron absorption through the inactivation of the divalent metal transporter 1 (DMT1).\nB. The increase in serum iron or inflammation stimulates the synthesis of hepcidin in the liver, which negatively regulates the function of ferroportin.",
|
|
5
|
-
"positive_response": "The increase in serum iron or inflammation stimulates the synthesis of hepcidin in the liver, which negatively regulates the function of ferroportin.",
|
|
6
|
-
"negative_response": "Hepcidin reduces intestinal iron absorption through the inactivation of the divalent metal transporter 1 (DMT1)."
|
|
7
|
-
}
|
|
8
|
-
]
|
|
@@ -1,51 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"task_name": "catalan_bench",
|
|
3
|
-
"model_name": "mock",
|
|
4
|
-
"evaluator_name": "log_likelihoods",
|
|
5
|
-
"num_pairs": 2,
|
|
6
|
-
"all_correct": true,
|
|
7
|
-
"pairs": [
|
|
8
|
-
{
|
|
9
|
-
"pair_id": 0,
|
|
10
|
-
"prompt": "Translate to Valencian: Aquesta \u00e9s la llista les seves obres-...",
|
|
11
|
-
"positive_response": "Esta \u00e9s la llista les seues obres-",
|
|
12
|
-
"negative_response": "incorrect translation",
|
|
13
|
-
"positive_evaluation": {
|
|
14
|
-
"ground_truth": "TRUTHFUL",
|
|
15
|
-
"confidence": 1.0,
|
|
16
|
-
"expected": "TRUTHFUL",
|
|
17
|
-
"correct": true,
|
|
18
|
-
"details": "Predicted: 'Esta \u00e9s la llista les seues obres-' (log_prob=-0.500), Expected: 'Esta \u00e9s la llista les seues obres-'"
|
|
19
|
-
},
|
|
20
|
-
"negative_evaluation": {
|
|
21
|
-
"ground_truth": "UNTRUTHFUL",
|
|
22
|
-
"confidence": 0.0,
|
|
23
|
-
"expected": "UNTRUTHFUL",
|
|
24
|
-
"correct": true,
|
|
25
|
-
"details": "Predicted: 'Esta \u00e9s la llista les seues obres-' (log_prob=-0.500), Expected: 'incorrect translation'"
|
|
26
|
-
},
|
|
27
|
-
"both_correct": true
|
|
28
|
-
},
|
|
29
|
-
{
|
|
30
|
-
"pair_id": 1,
|
|
31
|
-
"prompt": "Story: Coralina Colom i Canillas (i no Carolina Colom com consta a la premsa en m\u00e9s d'una ocasi\u00f3), n...",
|
|
32
|
-
"positive_response": "L'any 1973.",
|
|
33
|
-
"negative_response": "La mosqueta, de Ruzzante.",
|
|
34
|
-
"positive_evaluation": {
|
|
35
|
-
"ground_truth": "TRUTHFUL",
|
|
36
|
-
"confidence": 1.0,
|
|
37
|
-
"expected": "TRUTHFUL",
|
|
38
|
-
"correct": true,
|
|
39
|
-
"details": "Predicted: 'L'any 1973.' (log_prob=-0.500), Expected: 'L'any 1973.'"
|
|
40
|
-
},
|
|
41
|
-
"negative_evaluation": {
|
|
42
|
-
"ground_truth": "UNTRUTHFUL",
|
|
43
|
-
"confidence": 0.0,
|
|
44
|
-
"expected": "UNTRUTHFUL",
|
|
45
|
-
"correct": true,
|
|
46
|
-
"details": "Predicted: 'L'any 1973.' (log_prob=-0.500), Expected: 'La mosqueta, de Ruzzante.'"
|
|
47
|
-
},
|
|
48
|
-
"both_correct": true
|
|
49
|
-
}
|
|
50
|
-
]
|
|
51
|
-
}
|
|
@@ -1,14 +0,0 @@
|
|
|
1
|
-
[
|
|
2
|
-
{
|
|
3
|
-
"pair_id": 0,
|
|
4
|
-
"prompt": "Translate to Valencian: Aquesta \u00e9s la llista les seves obres-",
|
|
5
|
-
"positive_response": "Esta \u00e9s la llista les seues obres-",
|
|
6
|
-
"negative_response": "incorrect translation"
|
|
7
|
-
},
|
|
8
|
-
{
|
|
9
|
-
"pair_id": 1,
|
|
10
|
-
"prompt": "Story: Coralina Colom i Canillas (i no Carolina Colom com consta a la premsa en m\u00e9s d'una ocasi\u00f3), nascuda a Barcelona el 26 de gener de 1936, va ser una actriu i, posteriorment, una pedagoga i investigadora de la veu parlada enfocada als actors.\nHa estat, durant m\u00e9s de 30\u00a0anys, tota una instituci\u00f3 al departament de veu de l'Institut del Teatre, on ha impartit classes de veu. Moltes generacions d'actors catalans han assistit a les seves aules i l'esmenten en els seus curr\u00edculums professionals. Anteriorment a la tasca docent, havia estat primera actriu de teatre, a les d\u00e8cades dels anys 50 i 60, a les companyies de Mario Cabr\u00e9, Alejandro Ulloa i Joan Capri. L'estiu de 1956 form\u00e0 part de la Companyia de Teatre Ciutat Comtal, que actu\u00e0 dins la programaci\u00f3 del Teatre Grec de Montju\u00efc i sota la direcci\u00f3 de Dolly Latz. La seva darrera actuaci\u00f3 va ser l'any 1973, encarnant la protagonista femenina de l'obra La mosqueta, de Ruzzante, al costat de l'actor Ramon Teixidor. Tamb\u00e9 va fer cinema, espor\u00e0dicament.\nAl 1965 va rebre el Premi Internacional d'Interpretaci\u00f3 del VIII Cicle de Teatre Llat\u00ed per Flor de un d\u00eda.\nLes seves investigacions com a pedagoga queden recollides a Exercicis per al laboratori de veu i de dicci\u00f3, escrit en col\u00b7laboraci\u00f3 amb Rosa-Vict\u00f2ria Gras (1998).\nteatre\ncinema.\ndiscografia\nllibres\n\nQuestion: Quan va ser l\u2019\u00faltim cop que va pujar a un escenari?",
|
|
11
|
-
"positive_response": "L'any 1973.",
|
|
12
|
-
"negative_response": "La mosqueta, de Ruzzante."
|
|
13
|
-
}
|
|
14
|
-
]
|