wisent 0.7.379__py3-none-any.whl → 0.7.701__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wisent/__init__.py +1 -1
- wisent/core/activations/__init__.py +22 -6
- wisent/core/activations/activations.py +21 -39
- wisent/core/activations/activations_collector.py +141 -373
- wisent/core/activations/classifier_inference_strategy.py +194 -0
- wisent/core/activations/core/atoms.py +8 -92
- wisent/core/activations/extraction_strategy.py +308 -0
- wisent/core/agent/diagnose/response_diagnostics.py +3 -3
- wisent/core/agent/diagnose.py +3 -3
- wisent/core/autonomous_agent.py +2 -2
- wisent/core/cli/agent/apply_steering.py +23 -27
- wisent/core/cli/agent/evaluate_response.py +18 -20
- wisent/core/cli/agent/train_classifier.py +18 -20
- wisent/core/cli/cluster_benchmarks.py +472 -0
- wisent/core/cli/create_steering_vector.py +13 -5
- wisent/core/cli/generate_vector_from_task.py +4 -0
- wisent/core/cli/get_activations.py +12 -36
- wisent/core/cli/method_optimizer.py +859 -0
- wisent/core/cli/optimize.py +44 -5
- wisent/core/cli/optimize_classification.py +5 -6
- wisent/core/cli/optimize_sample_size.py +8 -22
- wisent/core/cli/optimize_steering.py +429 -153
- wisent/core/cli/optimize_weights.py +65 -6
- wisent/core/cli/steering_method_trainer.py +5 -4
- wisent/core/cli/steering_search_space.py +20 -15
- wisent/core/cli/tasks.py +14 -43
- wisent/core/cli/train_unified_goodness.py +17 -18
- wisent/core/contrastive_pairs/diagnostics/control_vectors.py +1578 -173
- wisent/core/contrastive_pairs/diagnostics/linearity.py +63 -80
- wisent/core/contrastive_pairs/diagnostics/vector_quality.py +6 -5
- wisent/core/contrastive_pairs/huggingface_pairs/hf_extractor_manifest.py +5 -19
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/__init__.py +11 -5
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/apps.py +146 -32
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue.py +2 -2
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/humaneval.py +98 -57
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/code_x_glue.py +8 -8
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/freebase.py +1 -1
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +8 -5
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/agieval_aqua_rat.py +129 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/code_x_glue.py +11 -6
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gsm8k.py +1 -1
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mbpp.py +47 -6
- wisent/core/evaluators/benchmark_specific/apps_evaluator.py +133 -0
- wisent/core/evaluators/benchmark_specific/coding/metrics/evaluator.py +6 -1
- wisent/core/evaluators/benchmark_specific/conala_evaluator.py +31 -168
- wisent/core/evaluators/custom/examples/humanization_coherent.py +89 -35
- wisent/core/evaluators/oracles/truthfulqa_gen_evaluator.py +2 -20
- wisent/core/evaluators/personalization/coherence.py +46 -0
- wisent/core/hyperparameter_optimizer.py +13 -13
- wisent/core/lm_eval_harness_ground_truth.py +7 -11
- wisent/core/main.py +3 -0
- wisent/core/models/wisent_model.py +8 -7
- wisent/core/opti/methods/opti_weights.py +29 -2
- wisent/core/optuna/classifier/activation_generator.py +14 -12
- wisent/core/optuna/steering/steering_optimization.py +14 -9
- wisent/core/parser_arguments/cluster_benchmarks_parser.py +31 -0
- wisent/core/parser_arguments/generate_vector_from_task_parser.py +20 -0
- wisent/core/parser_arguments/main_parser.py +8 -0
- wisent/core/parser_arguments/optimize_steering_parser.py +117 -10
- wisent/core/parser_arguments/optimize_weights_parser.py +6 -0
- wisent/core/parser_arguments/tasks_parser.py +7 -19
- wisent/core/steering_methods/core/atoms.py +1 -2
- wisent/core/steering_methods/methods/caa.py +1 -1
- wisent/core/steering_methods/methods/hyperplane.py +74 -0
- wisent/core/steering_methods/methods/prism.py +1 -2
- wisent/core/steering_methods/methods/pulse.py +39 -8
- wisent/core/steering_methods/methods/titan.py +59 -14
- wisent/core/steering_methods/registry.py +52 -12
- wisent/core/steering_optimizer.py +15 -15
- wisent/core/trainers/steering_trainer.py +9 -18
- wisent/parameters/lm_eval/track_progress_not_lm_eval_tasks.json +19 -70
- wisent/scripts/run_quality_metrics_sweep.sh +22 -27
- wisent/tests/test_aggregation_geometry.py +236 -0
- wisent/tests/test_detector_accuracy.py +163 -0
- wisent/tests/test_geometry_exhaustive.py +1202 -0
- wisent/tests/visualize_geometry.py +255 -61
- {wisent-0.7.379.dist-info → wisent-0.7.701.dist-info}/METADATA +1 -1
- {wisent-0.7.379.dist-info → wisent-0.7.701.dist-info}/RECORD +82 -714
- wisent/core/activations/prompt_construction_strategy.py +0 -47
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text.py +0 -15
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_go.py +0 -64
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_java.py +0 -65
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_javascript.py +0 -65
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_php.py +0 -65
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_python.py +0 -65
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_ruby.py +0 -65
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/freebase.py +0 -99
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/instruct_humaneval.py +0 -180
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/instructhumaneval.py +0 -129
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/mbpp.py +0 -142
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/agieval.py +0 -155
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/code2text.py +0 -161
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/codexglue.py +0 -107
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/livemathbench.py +0 -155
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/polymath.py +0 -155
- wisent/examples/scripts/results/benchmark_descriptions.json +0 -1244
- wisent/examples/scripts/results/benchmark_evaluation_methods.json +0 -66
- wisent/examples/scripts/results/benchmark_evaluator_mapping.json +0 -2781
- wisent/examples/scripts/results/benchmark_evaluator_mapping_updated.json +0 -30536
- wisent/examples/scripts/results/benchmark_evaluators_clean.json +0 -469
- wisent/examples/scripts/results/benchmark_methods_summary.json +0 -260
- wisent/examples/scripts/results/benchmark_pair_creation_methods.json +0 -66
- wisent/examples/scripts/results/benchmark_pair_totals.json +0 -269
- wisent/examples/scripts/results/benchmark_tags.json +0 -917
- wisent/examples/scripts/results/benchmark_test_summary_nov4.json +0 -71
- wisent/examples/scripts/results/coding_benchmarks_test_code_status.json +0 -150
- wisent/examples/scripts/results/failing_benchmarks.json +0 -946
- wisent/examples/scripts/results/failing_benchmarks_list.json +0 -41
- wisent/examples/scripts/results/failing_benchmarks_test_results.json +0 -945
- wisent/examples/scripts/results/missing_benchmark_tags.json +0 -341
- wisent/examples/scripts/results/test_20_newsgroups_evaluation.json +0 -30
- wisent/examples/scripts/results/test_20_newsgroups_pairs.json +0 -8
- wisent/examples/scripts/results/test_AraDICE_evaluation.json +0 -51
- wisent/examples/scripts/results/test_AraDICE_pairs.json +0 -14
- wisent/examples/scripts/results/test_AraDiCE_boolq_egy/test_AraDiCE_boolq_egy_evaluation.json +0 -30
- wisent/examples/scripts/results/test_AraDiCE_boolq_egy/test_AraDiCE_boolq_egy_pairs.json +0 -8
- wisent/examples/scripts/results/test_ArabCulture_evaluation.json +0 -51
- wisent/examples/scripts/results/test_ArabCulture_pairs.json +0 -14
- wisent/examples/scripts/results/test_Tag_evaluation.json +0 -30
- wisent/examples/scripts/results/test_Tag_pairs.json +0 -8
- wisent/examples/scripts/results/test_aclue_evaluation.json +0 -51
- wisent/examples/scripts/results/test_aclue_pairs.json +0 -14
- wisent/examples/scripts/results/test_acp_bench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_acp_bench_hard_evaluation.json +0 -51
- wisent/examples/scripts/results/test_acp_bench_hard_pairs.json +0 -14
- wisent/examples/scripts/results/test_acp_bench_pairs.json +0 -14
- wisent/examples/scripts/results/test_advanced_ai_risk_evaluation.json +0 -51
- wisent/examples/scripts/results/test_advanced_ai_risk_pairs.json +0 -14
- wisent/examples/scripts/results/test_aexams_evaluation.json +0 -51
- wisent/examples/scripts/results/test_aexams_pairs.json +0 -14
- wisent/examples/scripts/results/test_afrimgsm_direct_amh_evaluation.json +0 -30
- wisent/examples/scripts/results/test_afrimgsm_direct_amh_pairs.json +0 -8
- wisent/examples/scripts/results/test_afrimmlu_direct_amh_evaluation.json +0 -30
- wisent/examples/scripts/results/test_afrimmlu_direct_amh_pairs.json +0 -8
- wisent/examples/scripts/results/test_afrixnli_en_direct_amh_evaluation.json +0 -30
- wisent/examples/scripts/results/test_afrixnli_en_direct_amh_pairs.json +0 -8
- wisent/examples/scripts/results/test_ag_news_evaluation.json +0 -30
- wisent/examples/scripts/results/test_ag_news_pairs.json +0 -8
- wisent/examples/scripts/results/test_agieval_evaluation.json +0 -51
- wisent/examples/scripts/results/test_agieval_pairs.json +0 -14
- wisent/examples/scripts/results/test_aime2024_evaluation.json +0 -30
- wisent/examples/scripts/results/test_aime2024_pairs.json +0 -8
- wisent/examples/scripts/results/test_aime2025_evaluation.json +0 -30
- wisent/examples/scripts/results/test_aime2025_pairs.json +0 -8
- wisent/examples/scripts/results/test_aime_evaluation.json +0 -30
- wisent/examples/scripts/results/test_aime_pairs.json +0 -8
- wisent/examples/scripts/results/test_anagrams1_evaluation.json +0 -30
- wisent/examples/scripts/results/test_anagrams1_pairs.json +0 -8
- wisent/examples/scripts/results/test_anagrams2_evaluation.json +0 -30
- wisent/examples/scripts/results/test_anagrams2_pairs.json +0 -8
- wisent/examples/scripts/results/test_anli_evaluation.json +0 -30
- wisent/examples/scripts/results/test_anli_pairs.json +0 -8
- wisent/examples/scripts/results/test_apps_evaluation.json +0 -30
- wisent/examples/scripts/results/test_apps_pairs.json +0 -8
- wisent/examples/scripts/results/test_arabic_exams_evaluation.json +0 -30
- wisent/examples/scripts/results/test_arabic_exams_pairs.json +0 -8
- wisent/examples/scripts/results/test_arabic_leaderboard_complete_evaluation.json +0 -51
- wisent/examples/scripts/results/test_arabic_leaderboard_complete_pairs.json +0 -14
- wisent/examples/scripts/results/test_arabic_leaderboard_light_evaluation.json +0 -51
- wisent/examples/scripts/results/test_arabic_leaderboard_light_pairs.json +0 -14
- wisent/examples/scripts/results/test_arabicmmlu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_arabicmmlu_pairs.json +0 -14
- wisent/examples/scripts/results/test_aradice/test_aradice_evaluation.json +0 -51
- wisent/examples/scripts/results/test_aradice/test_aradice_pairs.json +0 -14
- wisent/examples/scripts/results/test_aradice3/test_aradice_evaluation.json +0 -51
- wisent/examples/scripts/results/test_aradice3/test_aradice_pairs.json +0 -14
- wisent/examples/scripts/results/test_arc_ar_evaluation.json +0 -30
- wisent/examples/scripts/results/test_arc_ar_pairs.json +0 -8
- wisent/examples/scripts/results/test_arc_challenge_evaluation.json +0 -30
- wisent/examples/scripts/results/test_arc_challenge_pairs.json +0 -8
- wisent/examples/scripts/results/test_arc_easy_evaluation.json +0 -30
- wisent/examples/scripts/results/test_arc_easy_pairs.json +0 -8
- wisent/examples/scripts/results/test_argument_topic_evaluation.json +0 -30
- wisent/examples/scripts/results/test_argument_topic_pairs.json +0 -8
- wisent/examples/scripts/results/test_arithmetic_evaluation.json +0 -51
- wisent/examples/scripts/results/test_arithmetic_pairs.json +0 -14
- wisent/examples/scripts/results/test_asdiv_evaluation.json +0 -30
- wisent/examples/scripts/results/test_asdiv_pairs.json +0 -8
- wisent/examples/scripts/results/test_assin_entailment_evaluation.json +0 -30
- wisent/examples/scripts/results/test_assin_entailment_pairs.json +0 -8
- wisent/examples/scripts/results/test_atis_evaluation.json +0 -30
- wisent/examples/scripts/results/test_atis_pairs.json +0 -8
- wisent/examples/scripts/results/test_babi_evaluation.json +0 -30
- wisent/examples/scripts/results/test_babi_pairs.json +0 -8
- wisent/examples/scripts/results/test_babilong_evaluation.json +0 -30
- wisent/examples/scripts/results/test_babilong_pairs.json +0 -8
- wisent/examples/scripts/results/test_bangla_mmlu_evaluation.json +0 -30
- wisent/examples/scripts/results/test_bangla_mmlu_pairs.json +0 -8
- wisent/examples/scripts/results/test_banking77_evaluation.json +0 -30
- wisent/examples/scripts/results/test_banking77_pairs.json +0 -8
- wisent/examples/scripts/results/test_basque/test_basque-glue_pairs.json +0 -14
- wisent/examples/scripts/results/test_basque-glue_evaluation.json +0 -51
- wisent/examples/scripts/results/test_basque-glue_pairs.json +0 -14
- wisent/examples/scripts/results/test_basque2/test_basque-glue_evaluation.json +0 -51
- wisent/examples/scripts/results/test_basque2/test_basque-glue_pairs.json +0 -14
- wisent/examples/scripts/results/test_basque_bench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_basque_bench_pairs.json +0 -14
- wisent/examples/scripts/results/test_basque_glue/test_basque-glue_evaluation.json +0 -51
- wisent/examples/scripts/results/test_basque_glue/test_basque-glue_pairs.json +0 -14
- wisent/examples/scripts/results/test_basqueglue_evaluation.json +0 -51
- wisent/examples/scripts/results/test_basqueglue_pairs.json +0 -14
- wisent/examples/scripts/results/test_bbh_evaluation.json +0 -51
- wisent/examples/scripts/results/test_bbh_pairs.json +0 -14
- wisent/examples/scripts/results/test_bbq_evaluation.json +0 -30
- wisent/examples/scripts/results/test_bbq_pairs.json +0 -8
- wisent/examples/scripts/results/test_bec2016eu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_bec2016eu_pairs.json +0 -14
- wisent/examples/scripts/results/test_belebele_evaluation.json +0 -51
- wisent/examples/scripts/results/test_belebele_pairs.json +0 -14
- wisent/examples/scripts/results/test_benchmarks_evaluation.json +0 -51
- wisent/examples/scripts/results/test_benchmarks_pairs.json +0 -14
- wisent/examples/scripts/results/test_bertaqa_evaluation.json +0 -51
- wisent/examples/scripts/results/test_bertaqa_pairs.json +0 -14
- wisent/examples/scripts/results/test_bhtc_v2_evaluation.json +0 -30
- wisent/examples/scripts/results/test_bhtc_v2_pairs.json +0 -8
- wisent/examples/scripts/results/test_bigbench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_bigbench_pairs.json +0 -14
- wisent/examples/scripts/results/test_blimp_evaluation.json +0 -51
- wisent/examples/scripts/results/test_blimp_pairs.json +0 -14
- wisent/examples/scripts/results/test_boolq/test_boolq_evaluation.json +0 -30
- wisent/examples/scripts/results/test_boolq/test_boolq_pairs.json +0 -8
- wisent/examples/scripts/results/test_boolq-seq2seq_evaluation.json +0 -30
- wisent/examples/scripts/results/test_boolq-seq2seq_pairs.json +0 -8
- wisent/examples/scripts/results/test_boolq_evaluation.json +0 -30
- wisent/examples/scripts/results/test_boolq_pairs.json +0 -8
- wisent/examples/scripts/results/test_c4_evaluation.json +0 -30
- wisent/examples/scripts/results/test_c4_pairs.json +0 -8
- wisent/examples/scripts/results/test_cabreu_evaluation.json +0 -30
- wisent/examples/scripts/results/test_cabreu_pairs.json +0 -8
- wisent/examples/scripts/results/test_careqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_careqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_catalan_bench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_catalan_bench_pairs.json +0 -14
- wisent/examples/scripts/results/test_catalanqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_catalanqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_catcola_evaluation.json +0 -30
- wisent/examples/scripts/results/test_catcola_pairs.json +0 -8
- wisent/examples/scripts/results/test_cb_evaluation.json +0 -30
- wisent/examples/scripts/results/test_cb_pairs.json +0 -8
- wisent/examples/scripts/results/test_ceval/test_ceval_evaluation.json +0 -51
- wisent/examples/scripts/results/test_ceval/test_ceval_pairs.json +0 -14
- wisent/examples/scripts/results/test_ceval_accountant/test_ceval-valid_accountant_evaluation.json +0 -30
- wisent/examples/scripts/results/test_ceval_accountant/test_ceval-valid_accountant_pairs.json +0 -8
- wisent/examples/scripts/results/test_ceval_evaluation.json +0 -51
- wisent/examples/scripts/results/test_ceval_pairs.json +0 -14
- wisent/examples/scripts/results/test_ceval_valid/test_ceval_valid_evaluation.json +0 -51
- wisent/examples/scripts/results/test_ceval_valid/test_ceval_valid_pairs.json +0 -14
- wisent/examples/scripts/results/test_chain_of_thought_evaluation.json +0 -51
- wisent/examples/scripts/results/test_chain_of_thought_pairs.json +0 -14
- wisent/examples/scripts/results/test_chartqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_chartqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_claim_stance_topic_evaluation.json +0 -30
- wisent/examples/scripts/results/test_claim_stance_topic_pairs.json +0 -8
- wisent/examples/scripts/results/test_cmmlu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_cmmlu_pairs.json +0 -14
- wisent/examples/scripts/results/test_cnn_dailymail_evaluation.json +0 -30
- wisent/examples/scripts/results/test_cnn_dailymail_pairs.json +0 -8
- wisent/examples/scripts/results/test_cocoteros_es_evaluation.json +0 -30
- wisent/examples/scripts/results/test_cocoteros_es_pairs.json +0 -8
- wisent/examples/scripts/results/test_codexglue_code_to_text_go_evaluation.json +0 -30
- wisent/examples/scripts/results/test_codexglue_code_to_text_go_pairs.json +0 -8
- wisent/examples/scripts/results/test_codexglue_code_to_text_java_evaluation.json +0 -30
- wisent/examples/scripts/results/test_codexglue_code_to_text_java_pairs.json +0 -8
- wisent/examples/scripts/results/test_codexglue_code_to_text_javascript_evaluation.json +0 -30
- wisent/examples/scripts/results/test_codexglue_code_to_text_javascript_pairs.json +0 -8
- wisent/examples/scripts/results/test_codexglue_code_to_text_php_evaluation.json +0 -30
- wisent/examples/scripts/results/test_codexglue_code_to_text_php_pairs.json +0 -8
- wisent/examples/scripts/results/test_codexglue_code_to_text_python_evaluation.json +0 -30
- wisent/examples/scripts/results/test_codexglue_code_to_text_python_pairs.json +0 -8
- wisent/examples/scripts/results/test_codexglue_code_to_text_ruby_evaluation.json +0 -30
- wisent/examples/scripts/results/test_codexglue_code_to_text_ruby_pairs.json +0 -8
- wisent/examples/scripts/results/test_coedit_gec_evaluation.json +0 -30
- wisent/examples/scripts/results/test_coedit_gec_pairs.json +0 -8
- wisent/examples/scripts/results/test_cola_evaluation.json +0 -30
- wisent/examples/scripts/results/test_cola_pairs.json +0 -8
- wisent/examples/scripts/results/test_commonsense_qa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_commonsense_qa_pairs.json +0 -8
- wisent/examples/scripts/results/test_conala_evaluation.json +0 -30
- wisent/examples/scripts/results/test_conala_pairs.json +0 -8
- wisent/examples/scripts/results/test_concode_evaluation.json +0 -30
- wisent/examples/scripts/results/test_concode_pairs.json +0 -8
- wisent/examples/scripts/results/test_copa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_copa_pairs.json +0 -8
- wisent/examples/scripts/results/test_copal_id_evaluation.json +0 -30
- wisent/examples/scripts/results/test_copal_id_pairs.json +0 -8
- wisent/examples/scripts/results/test_coqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_coqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_coqcat_evaluation.json +0 -30
- wisent/examples/scripts/results/test_coqcat_pairs.json +0 -8
- wisent/examples/scripts/results/test_crows_pairs_evaluation.json +0 -51
- wisent/examples/scripts/results/test_crows_pairs_pairs.json +0 -14
- wisent/examples/scripts/results/test_csatqa_evaluation.json +0 -51
- wisent/examples/scripts/results/test_csatqa_pairs.json +0 -14
- wisent/examples/scripts/results/test_cycle_letters_evaluation.json +0 -30
- wisent/examples/scripts/results/test_cycle_letters_pairs.json +0 -8
- wisent/examples/scripts/results/test_darija_bench/test_darija_bench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_darija_bench/test_darija_bench_pairs.json +0 -14
- wisent/examples/scripts/results/test_darija_bench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_darija_bench_pairs.json +0 -14
- wisent/examples/scripts/results/test_darijahellaswag_evaluation.json +0 -30
- wisent/examples/scripts/results/test_darijahellaswag_pairs.json +0 -8
- wisent/examples/scripts/results/test_darijammlu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_darijammlu_pairs.json +0 -14
- wisent/examples/scripts/results/test_dbpedia_14_evaluation.json +0 -30
- wisent/examples/scripts/results/test_dbpedia_14_pairs.json +0 -8
- wisent/examples/scripts/results/test_drop_evaluation.json +0 -30
- wisent/examples/scripts/results/test_drop_pairs.json +0 -8
- wisent/examples/scripts/results/test_ds1000_evaluation.json +0 -30
- wisent/examples/scripts/results/test_ds1000_pairs.json +0 -8
- wisent/examples/scripts/results/test_egyhellaswag_evaluation.json +0 -30
- wisent/examples/scripts/results/test_egyhellaswag_pairs.json +0 -8
- wisent/examples/scripts/results/test_egymmlu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_egymmlu_pairs.json +0 -14
- wisent/examples/scripts/results/test_epec_koref_bin_evaluation.json +0 -30
- wisent/examples/scripts/results/test_epec_koref_bin_pairs.json +0 -8
- wisent/examples/scripts/results/test_eq_bench_evaluation.json +0 -30
- wisent/examples/scripts/results/test_eq_bench_pairs.json +0 -8
- wisent/examples/scripts/results/test_escola_evaluation.json +0 -30
- wisent/examples/scripts/results/test_escola_pairs.json +0 -8
- wisent/examples/scripts/results/test_ethics_cm_evaluation.json +0 -30
- wisent/examples/scripts/results/test_ethics_cm_pairs.json +0 -8
- wisent/examples/scripts/results/test_ethos_binary_evaluation.json +0 -30
- wisent/examples/scripts/results/test_ethos_binary_pairs.json +0 -8
- wisent/examples/scripts/results/test_eus_exams/test_eus_exams_evaluation.json +0 -51
- wisent/examples/scripts/results/test_eus_exams/test_eus_exams_pairs.json +0 -14
- wisent/examples/scripts/results/test_eus_exams_es_evaluation.json +0 -51
- wisent/examples/scripts/results/test_eus_exams_es_pairs.json +0 -14
- wisent/examples/scripts/results/test_eus_exams_evaluation.json +0 -51
- wisent/examples/scripts/results/test_eus_exams_pairs.json +0 -14
- wisent/examples/scripts/results/test_eus_proficiency_evaluation.json +0 -30
- wisent/examples/scripts/results/test_eus_proficiency_pairs.json +0 -8
- wisent/examples/scripts/results/test_eus_reading_evaluation.json +0 -30
- wisent/examples/scripts/results/test_eus_reading_pairs.json +0 -8
- wisent/examples/scripts/results/test_eus_trivia_evaluation.json +0 -30
- wisent/examples/scripts/results/test_eus_trivia_pairs.json +0 -8
- wisent/examples/scripts/results/test_evalita-mp_evaluation.json +0 -51
- wisent/examples/scripts/results/test_evalita-mp_pairs.json +0 -14
- wisent/examples/scripts/results/test_evalita-sp_sum_task_fp-small_p1_evaluation.json +0 -30
- wisent/examples/scripts/results/test_evalita-sp_sum_task_fp-small_p1_pairs.json +0 -8
- wisent/examples/scripts/results/test_evalita_LLM_evaluation.json +0 -51
- wisent/examples/scripts/results/test_evalita_LLM_pairs.json +0 -14
- wisent/examples/scripts/results/test_evalita_llm/test_evalita_llm_evaluation.json +0 -51
- wisent/examples/scripts/results/test_evalita_llm/test_evalita_llm_pairs.json +0 -14
- wisent/examples/scripts/results/test_evalita_mp/test_evalita-mp_te_prompt-1_evaluation.json +0 -30
- wisent/examples/scripts/results/test_evalita_mp/test_evalita-mp_te_prompt-1_pairs.json +0 -8
- wisent/examples/scripts/results/test_evalita_mp2/test_evalita_mp_evaluation.json +0 -51
- wisent/examples/scripts/results/test_evalita_mp2/test_evalita_mp_pairs.json +0 -14
- wisent/examples/scripts/results/test_evalita_sp2/test_evalita-sp_sum_task_fp-small_p1_evaluation.json +0 -30
- wisent/examples/scripts/results/test_evalita_sp2/test_evalita-sp_sum_task_fp-small_p1_pairs.json +0 -8
- wisent/examples/scripts/results/test_fda_evaluation.json +0 -30
- wisent/examples/scripts/results/test_fda_pairs.json +0 -8
- wisent/examples/scripts/results/test_financial_tweets_evaluation.json +0 -30
- wisent/examples/scripts/results/test_financial_tweets_pairs.json +0 -8
- wisent/examples/scripts/results/test_fld/test_fld_evaluation.json +0 -30
- wisent/examples/scripts/results/test_fld/test_fld_pairs.json +0 -8
- wisent/examples/scripts/results/test_fld_evaluation.json +0 -30
- wisent/examples/scripts/results/test_fld_fixed/test_fld_evaluation.json +0 -30
- wisent/examples/scripts/results/test_fld_fixed/test_fld_pairs.json +0 -8
- wisent/examples/scripts/results/test_fld_pairs.json +0 -8
- wisent/examples/scripts/results/test_flores_evaluation.json +0 -51
- wisent/examples/scripts/results/test_flores_pairs.json +0 -14
- wisent/examples/scripts/results/test_freebase_evaluation.json +0 -30
- wisent/examples/scripts/results/test_freebase_pairs.json +0 -8
- wisent/examples/scripts/results/test_french_bench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_french_bench_pairs.json +0 -14
- wisent/examples/scripts/results/test_galcola_evaluation.json +0 -30
- wisent/examples/scripts/results/test_galcola_pairs.json +0 -8
- wisent/examples/scripts/results/test_galician_bench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_galician_bench_pairs.json +0 -14
- wisent/examples/scripts/results/test_glianorex_evaluation.json +0 -30
- wisent/examples/scripts/results/test_glianorex_pairs.json +0 -8
- wisent/examples/scripts/results/test_global_mmlu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_global_mmlu_pairs.json +0 -14
- wisent/examples/scripts/results/test_glue_evaluation.json +0 -51
- wisent/examples/scripts/results/test_glue_pairs.json +0 -14
- wisent/examples/scripts/results/test_gpqa_evaluation.json +0 -51
- wisent/examples/scripts/results/test_gpqa_pairs.json +0 -14
- wisent/examples/scripts/results/test_gpt3_translation_benchmarks_evaluation.json +0 -51
- wisent/examples/scripts/results/test_gpt3_translation_benchmarks_pairs.json +0 -14
- wisent/examples/scripts/results/test_groundcocoa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_groundcocoa_pairs.json +0 -8
- wisent/examples/scripts/results/test_gsm8k_evaluation.json +0 -30
- wisent/examples/scripts/results/test_gsm8k_pairs.json +0 -8
- wisent/examples/scripts/results/test_haerae_evaluation.json +0 -51
- wisent/examples/scripts/results/test_haerae_pairs.json +0 -14
- wisent/examples/scripts/results/test_headqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_headqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_hellaswag_evaluation.json +0 -30
- wisent/examples/scripts/results/test_hellaswag_pairs.json +0 -8
- wisent/examples/scripts/results/test_hendrycks_ethics_evaluation.json +0 -51
- wisent/examples/scripts/results/test_hendrycks_ethics_pairs.json +0 -14
- wisent/examples/scripts/results/test_hendrycks_math_evaluation.json +0 -51
- wisent/examples/scripts/results/test_hendrycks_math_pairs.json +0 -14
- wisent/examples/scripts/results/test_histoires_morales_evaluation.json +0 -30
- wisent/examples/scripts/results/test_histoires_morales_pairs.json +0 -8
- wisent/examples/scripts/results/test_hmmt_evaluation.json +0 -30
- wisent/examples/scripts/results/test_hmmt_feb_2025_evaluation.json +0 -30
- wisent/examples/scripts/results/test_hmmt_feb_2025_pairs.json +0 -8
- wisent/examples/scripts/results/test_hmmt_pairs.json +0 -8
- wisent/examples/scripts/results/test_hrm8k_evaluation.json +0 -51
- wisent/examples/scripts/results/test_hrm8k_pairs.json +0 -14
- wisent/examples/scripts/results/test_humaneval_evaluation.json +0 -30
- wisent/examples/scripts/results/test_humaneval_pairs.json +0 -8
- wisent/examples/scripts/results/test_humaneval_plus_evaluation.json +0 -30
- wisent/examples/scripts/results/test_humaneval_plus_pairs.json +0 -8
- wisent/examples/scripts/results/test_ifeval_evaluation.json +0 -30
- wisent/examples/scripts/results/test_ifeval_pairs.json +0 -8
- wisent/examples/scripts/results/test_instruct_humaneval/test_instruct_humaneval_evaluation.json +0 -30
- wisent/examples/scripts/results/test_instruct_humaneval/test_instruct_humaneval_pairs.json +0 -8
- wisent/examples/scripts/results/test_instruct_humaneval_evaluation.json +0 -30
- wisent/examples/scripts/results/test_instruct_humaneval_pairs.json +0 -8
- wisent/examples/scripts/results/test_inverse_scaling_evaluation.json +0 -51
- wisent/examples/scripts/results/test_inverse_scaling_hindsight_neglect_10shot_evaluation.json +0 -30
- wisent/examples/scripts/results/test_inverse_scaling_hindsight_neglect_10shot_pairs.json +0 -8
- wisent/examples/scripts/results/test_inverse_scaling_mc/test_inverse_scaling_mc_evaluation.json +0 -51
- wisent/examples/scripts/results/test_inverse_scaling_mc/test_inverse_scaling_mc_pairs.json +0 -14
- wisent/examples/scripts/results/test_inverse_scaling_pairs.json +0 -14
- wisent/examples/scripts/results/test_iwslt2017-ar-en_evaluation.json +0 -30
- wisent/examples/scripts/results/test_iwslt2017-ar-en_pairs.json +0 -8
- wisent/examples/scripts/results/test_iwslt2017-en-ar_evaluation.json +0 -30
- wisent/examples/scripts/results/test_iwslt2017-en-ar_pairs.json +0 -8
- wisent/examples/scripts/results/test_iwslt2017_ar_en/test_iwslt2017-ar-en_evaluation.json +0 -30
- wisent/examples/scripts/results/test_iwslt2017_ar_en/test_iwslt2017-ar-en_pairs.json +0 -8
- wisent/examples/scripts/results/test_iwslt2017_en_ar/test_iwslt2017-en-ar_evaluation.json +0 -30
- wisent/examples/scripts/results/test_iwslt2017_en_ar/test_iwslt2017-en-ar_pairs.json +0 -8
- wisent/examples/scripts/results/test_iwslt2017_group/test_iwslt2017_evaluation.json +0 -30
- wisent/examples/scripts/results/test_iwslt2017_group/test_iwslt2017_pairs.json +0 -8
- wisent/examples/scripts/results/test_japanese_leaderboard_evaluation.json +0 -51
- wisent/examples/scripts/results/test_japanese_leaderboard_pairs.json +0 -14
- wisent/examples/scripts/results/test_jsonschema_bench/test_jsonschema_bench_evaluation.json +0 -30
- wisent/examples/scripts/results/test_jsonschema_bench/test_jsonschema_bench_pairs.json +0 -8
- wisent/examples/scripts/results/test_jsonschema_bench_evaluation.json +0 -30
- wisent/examples/scripts/results/test_jsonschema_bench_final/test_jsonschema_bench_evaluation.json +0 -30
- wisent/examples/scripts/results/test_jsonschema_bench_final/test_jsonschema_bench_pairs.json +0 -8
- wisent/examples/scripts/results/test_jsonschema_bench_pairs.json +0 -8
- wisent/examples/scripts/results/test_kbl_evaluation.json +0 -51
- wisent/examples/scripts/results/test_kbl_fixed/test_kbl_evaluation.json +0 -51
- wisent/examples/scripts/results/test_kbl_fixed/test_kbl_pairs.json +0 -14
- wisent/examples/scripts/results/test_kbl_pairs.json +0 -14
- wisent/examples/scripts/results/test_kmmlu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_kmmlu_pairs.json +0 -14
- wisent/examples/scripts/results/test_kobest_evaluation.json +0 -51
- wisent/examples/scripts/results/test_kobest_pairs.json +0 -14
- wisent/examples/scripts/results/test_kormedmcqa/test_kormedmcqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_kormedmcqa/test_kormedmcqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_kormedmcqa_dentist/test_kormedmcqa_dentist_evaluation.json +0 -30
- wisent/examples/scripts/results/test_kormedmcqa_dentist/test_kormedmcqa_dentist_pairs.json +0 -8
- wisent/examples/scripts/results/test_kormedmcqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_kormedmcqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_lambada_cloze_evaluation.json +0 -30
- wisent/examples/scripts/results/test_lambada_cloze_pairs.json +0 -8
- wisent/examples/scripts/results/test_lambada_evaluation.json +0 -30
- wisent/examples/scripts/results/test_lambada_final/test_lambada_openai_mt_stablelm_en_evaluation.json +0 -30
- wisent/examples/scripts/results/test_lambada_final/test_lambada_openai_mt_stablelm_en_pairs.json +0 -8
- wisent/examples/scripts/results/test_lambada_multilingual/test_lambada_multilingual_evaluation.json +0 -51
- wisent/examples/scripts/results/test_lambada_multilingual/test_lambada_multilingual_pairs.json +0 -14
- wisent/examples/scripts/results/test_lambada_multilingual_evaluation.json +0 -51
- wisent/examples/scripts/results/test_lambada_multilingual_pairs.json +0 -14
- wisent/examples/scripts/results/test_lambada_multilingual_stablelm_evaluation.json +0 -51
- wisent/examples/scripts/results/test_lambada_multilingual_stablelm_pairs.json +0 -14
- wisent/examples/scripts/results/test_lambada_openai_evaluation.json +0 -30
- wisent/examples/scripts/results/test_lambada_openai_pairs.json +0 -8
- wisent/examples/scripts/results/test_lambada_pairs.json +0 -8
- wisent/examples/scripts/results/test_lambada_stablelm_en_fixed/test_lambada_openai_mt_stablelm_en_evaluation.json +0 -30
- wisent/examples/scripts/results/test_lambada_stablelm_en_fixed/test_lambada_openai_mt_stablelm_en_pairs.json +0 -8
- wisent/examples/scripts/results/test_lambada_stablelm_fixed/test_lambada_openai_mt_stablelm_en_evaluation.json +0 -30
- wisent/examples/scripts/results/test_lambada_stablelm_fixed/test_lambada_openai_mt_stablelm_en_pairs.json +0 -8
- wisent/examples/scripts/results/test_lambada_standard_evaluation.json +0 -30
- wisent/examples/scripts/results/test_lambada_standard_pairs.json +0 -8
- wisent/examples/scripts/results/test_leaderboard_evaluation.json +0 -51
- wisent/examples/scripts/results/test_leaderboard_pairs.json +0 -14
- wisent/examples/scripts/results/test_libra/test_libra_evaluation.json +0 -51
- wisent/examples/scripts/results/test_libra/test_libra_pairs.json +0 -14
- wisent/examples/scripts/results/test_libra_evaluation.json +0 -51
- wisent/examples/scripts/results/test_libra_pairs.json +0 -14
- wisent/examples/scripts/results/test_lingoly_evaluation.json +0 -30
- wisent/examples/scripts/results/test_lingoly_pairs.json +0 -8
- wisent/examples/scripts/results/test_livecodebench_evaluation.json +0 -30
- wisent/examples/scripts/results/test_livecodebench_pairs.json +0 -8
- wisent/examples/scripts/results/test_livemathbench_cnmo_en_evaluation.json +0 -30
- wisent/examples/scripts/results/test_livemathbench_cnmo_en_pairs.json +0 -8
- wisent/examples/scripts/results/test_livemathbench_cnmo_zh_evaluation.json +0 -30
- wisent/examples/scripts/results/test_livemathbench_cnmo_zh_pairs.json +0 -8
- wisent/examples/scripts/results/test_llama_evaluation.json +0 -30
- wisent/examples/scripts/results/test_llama_pairs.json +0 -8
- wisent/examples/scripts/results/test_logiqa2_evaluation.json +0 -30
- wisent/examples/scripts/results/test_logiqa2_pairs.json +0 -8
- wisent/examples/scripts/results/test_logiqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_logiqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_m_mmlu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_m_mmlu_pairs.json +0 -14
- wisent/examples/scripts/results/test_mastermind/test_mastermind_evaluation.json +0 -51
- wisent/examples/scripts/results/test_mastermind/test_mastermind_pairs.json +0 -14
- wisent/examples/scripts/results/test_mastermind_24_easy/test_mastermind_24_easy_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mastermind_24_easy/test_mastermind_24_easy_pairs.json +0 -8
- wisent/examples/scripts/results/test_mastermind_evaluation.json +0 -51
- wisent/examples/scripts/results/test_mastermind_pairs.json +0 -14
- wisent/examples/scripts/results/test_math500_evaluation.json +0 -30
- wisent/examples/scripts/results/test_math500_pairs.json +0 -8
- wisent/examples/scripts/results/test_math_evaluation.json +0 -30
- wisent/examples/scripts/results/test_math_pairs.json +0 -8
- wisent/examples/scripts/results/test_mathqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mathqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_mbpp_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mbpp_pairs.json +0 -8
- wisent/examples/scripts/results/test_mbpp_plus_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mbpp_plus_pairs.json +0 -8
- wisent/examples/scripts/results/test_mc_taco_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mc_taco_pairs.json +0 -8
- wisent/examples/scripts/results/test_med_concepts_qa/test_med_concepts_qa_evaluation.json +0 -51
- wisent/examples/scripts/results/test_med_concepts_qa/test_med_concepts_qa_pairs.json +0 -14
- wisent/examples/scripts/results/test_med_concepts_qa_atc_easy/test_med_concepts_qa_atc_easy_evaluation.json +0 -30
- wisent/examples/scripts/results/test_med_concepts_qa_atc_easy/test_med_concepts_qa_atc_easy_pairs.json +0 -8
- wisent/examples/scripts/results/test_med_concepts_qa_evaluation.json +0 -51
- wisent/examples/scripts/results/test_med_concepts_qa_pairs.json +0 -14
- wisent/examples/scripts/results/test_meddialog_evaluation.json +0 -30
- wisent/examples/scripts/results/test_meddialog_pairs.json +0 -8
- wisent/examples/scripts/results/test_meddialog_raw_perplexity/test_meddialog_raw_perplexity_evaluation.json +0 -30
- wisent/examples/scripts/results/test_meddialog_raw_perplexity/test_meddialog_raw_perplexity_pairs.json +0 -8
- wisent/examples/scripts/results/test_mediqa_qa2019_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mediqa_qa2019_pairs.json +0 -8
- wisent/examples/scripts/results/test_medmcqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_medmcqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_medqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_medqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_medtext_evaluation.json +0 -30
- wisent/examples/scripts/results/test_medtext_pairs.json +0 -8
- wisent/examples/scripts/results/test_mela_evaluation.json +0 -51
- wisent/examples/scripts/results/test_mela_pairs.json +0 -14
- wisent/examples/scripts/results/test_meqsum_evaluation.json +0 -30
- wisent/examples/scripts/results/test_meqsum_pairs.json +0 -8
- wisent/examples/scripts/results/test_mercury_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mercury_pairs.json +0 -8
- wisent/examples/scripts/results/test_metabench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_metabench_pairs.json +0 -14
- wisent/examples/scripts/results/test_mgsm_evaluation.json +0 -51
- wisent/examples/scripts/results/test_mgsm_pairs.json +0 -14
- wisent/examples/scripts/results/test_mimic_repsum_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mimic_repsum_pairs.json +0 -8
- wisent/examples/scripts/results/test_minerva_math_evaluation.json +0 -51
- wisent/examples/scripts/results/test_minerva_math_pairs.json +0 -14
- wisent/examples/scripts/results/test_mlqa_evaluation.json +0 -51
- wisent/examples/scripts/results/test_mlqa_pairs.json +0 -14
- wisent/examples/scripts/results/test_mmlu-pro-plus_evaluation.json +0 -51
- wisent/examples/scripts/results/test_mmlu-pro-plus_pairs.json +0 -14
- wisent/examples/scripts/results/test_mmlu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_mmlu_pairs.json +0 -14
- wisent/examples/scripts/results/test_mmlu_pro_evaluation.json +0 -51
- wisent/examples/scripts/results/test_mmlu_pro_pairs.json +0 -14
- wisent/examples/scripts/results/test_mmlu_prox_evaluation.json +0 -51
- wisent/examples/scripts/results/test_mmlu_prox_pairs.json +0 -14
- wisent/examples/scripts/results/test_mmlusr_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mmlusr_pairs.json +0 -8
- wisent/examples/scripts/results/test_mmmu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_mmmu_pairs.json +0 -14
- wisent/examples/scripts/results/test_mnli_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mnli_pairs.json +0 -8
- wisent/examples/scripts/results/test_model_written_evals_evaluation.json +0 -51
- wisent/examples/scripts/results/test_model_written_evals_pairs.json +0 -14
- wisent/examples/scripts/results/test_moral_stories_evaluation.json +0 -30
- wisent/examples/scripts/results/test_moral_stories_pairs.json +0 -8
- wisent/examples/scripts/results/test_mts_dialog_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mts_dialog_pairs.json +0 -8
- wisent/examples/scripts/results/test_multiblimp_evaluation.json +0 -51
- wisent/examples/scripts/results/test_multiblimp_pairs.json +0 -14
- wisent/examples/scripts/results/test_multimedqa_evaluation.json +0 -51
- wisent/examples/scripts/results/test_multimedqa_pairs.json +0 -14
- wisent/examples/scripts/results/test_multipl_e_evaluation.json +0 -30
- wisent/examples/scripts/results/test_multipl_e_pairs.json +0 -8
- wisent/examples/scripts/results/test_mutual_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mutual_pairs.json +0 -8
- wisent/examples/scripts/results/test_non_greedy_robustness_agieval_aqua_rat_evaluation.json +0 -30
- wisent/examples/scripts/results/test_non_greedy_robustness_agieval_aqua_rat_pairs.json +0 -8
- wisent/examples/scripts/results/test_noreval_evaluation.json +0 -51
- wisent/examples/scripts/results/test_noreval_pairs.json +0 -14
- wisent/examples/scripts/results/test_noticia_evaluation.json +0 -30
- wisent/examples/scripts/results/test_noticia_pairs.json +0 -8
- wisent/examples/scripts/results/test_nq_open_evaluation.json +0 -30
- wisent/examples/scripts/results/test_nq_open_pairs.json +0 -8
- wisent/examples/scripts/results/test_olaph_evaluation.json +0 -30
- wisent/examples/scripts/results/test_olaph_pairs.json +0 -8
- wisent/examples/scripts/results/test_openbookqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_openbookqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_openllm_evaluation.json +0 -51
- wisent/examples/scripts/results/test_openllm_pairs.json +0 -14
- wisent/examples/scripts/results/test_option_order_robustness_agieval_aqua_rat_evaluation.json +0 -30
- wisent/examples/scripts/results/test_option_order_robustness_agieval_aqua_rat_pairs.json +0 -8
- wisent/examples/scripts/results/test_paloma_evaluation.json +0 -51
- wisent/examples/scripts/results/test_paloma_pairs.json +0 -14
- wisent/examples/scripts/results/test_passkey/test_passkey_evaluation.json +0 -30
- wisent/examples/scripts/results/test_passkey/test_passkey_pairs.json +0 -8
- wisent/examples/scripts/results/test_paws-x_evaluation.json +0 -51
- wisent/examples/scripts/results/test_paws-x_pairs.json +0 -14
- wisent/examples/scripts/results/test_paws_en/test_paws_en_evaluation.json +0 -30
- wisent/examples/scripts/results/test_paws_en/test_paws_en_pairs.json +0 -8
- wisent/examples/scripts/results/test_penn_treebank_evaluation.json +0 -30
- wisent/examples/scripts/results/test_penn_treebank_pairs.json +0 -8
- wisent/examples/scripts/results/test_pile_10k/test_pile_10k_evaluation.json +0 -30
- wisent/examples/scripts/results/test_pile_10k/test_pile_10k_pairs.json +0 -8
- wisent/examples/scripts/results/test_piqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_piqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_polemo2_evaluation.json +0 -30
- wisent/examples/scripts/results/test_polemo2_pairs.json +0 -8
- wisent/examples/scripts/results/test_polymath_en_high_evaluation.json +0 -30
- wisent/examples/scripts/results/test_polymath_en_high_pairs.json +0 -8
- wisent/examples/scripts/results/test_polymath_en_medium_evaluation.json +0 -30
- wisent/examples/scripts/results/test_polymath_en_medium_pairs.json +0 -8
- wisent/examples/scripts/results/test_polymath_zh_high_evaluation.json +0 -30
- wisent/examples/scripts/results/test_polymath_zh_high_pairs.json +0 -8
- wisent/examples/scripts/results/test_polymath_zh_medium_evaluation.json +0 -30
- wisent/examples/scripts/results/test_polymath_zh_medium_pairs.json +0 -8
- wisent/examples/scripts/results/test_portuguese_bench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_portuguese_bench_pairs.json +0 -14
- wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat/test_prompt_robustness_agieval_aqua_rat_evaluation.json +0 -30
- wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat/test_prompt_robustness_agieval_aqua_rat_pairs.json +0 -8
- wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat_evaluation.json +0 -30
- wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat_pairs.json +0 -8
- wisent/examples/scripts/results/test_prost_evaluation.json +0 -30
- wisent/examples/scripts/results/test_prost_pairs.json +0 -8
- wisent/examples/scripts/results/test_ptb_evaluation.json +0 -30
- wisent/examples/scripts/results/test_ptb_pairs.json +0 -8
- wisent/examples/scripts/results/test_pubmedqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_pubmedqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_pythia_evaluation.json +0 -51
- wisent/examples/scripts/results/test_pythia_pairs.json +0 -14
- wisent/examples/scripts/results/test_qa4mre_evaluation.json +0 -30
- wisent/examples/scripts/results/test_qa4mre_pairs.json +0 -8
- wisent/examples/scripts/results/test_qasper_evaluation.json +0 -30
- wisent/examples/scripts/results/test_qasper_pairs.json +0 -8
- wisent/examples/scripts/results/test_race_evaluation.json +0 -30
- wisent/examples/scripts/results/test_race_pairs.json +0 -8
- wisent/examples/scripts/results/test_realtoxicityprompts_evaluation.json +0 -30
- wisent/examples/scripts/results/test_realtoxicityprompts_pairs.json +0 -8
- wisent/examples/scripts/results/test_recode_evaluation.json +0 -30
- wisent/examples/scripts/results/test_recode_pairs.json +0 -8
- wisent/examples/scripts/results/test_record_evaluation.json +0 -30
- wisent/examples/scripts/results/test_record_pairs.json +0 -8
- wisent/examples/scripts/results/test_ruler_evaluation.json +0 -51
- wisent/examples/scripts/results/test_ruler_pairs.json +0 -14
- wisent/examples/scripts/results/test_sciq_evaluation.json +0 -30
- wisent/examples/scripts/results/test_sciq_pairs.json +0 -8
- wisent/examples/scripts/results/test_score_evaluation.json +0 -51
- wisent/examples/scripts/results/test_score_pairs.json +0 -14
- wisent/examples/scripts/results/test_self_consistency_evaluation.json +0 -30
- wisent/examples/scripts/results/test_self_consistency_pairs.json +0 -8
- wisent/examples/scripts/results/test_siqa/test_siqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_siqa/test_siqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_siqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_siqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_spanish_bench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_spanish_bench_pairs.json +0 -14
- wisent/examples/scripts/results/test_squad2_evaluation.json +0 -30
- wisent/examples/scripts/results/test_squad2_pairs.json +0 -8
- wisent/examples/scripts/results/test_squadv2_evaluation.json +0 -30
- wisent/examples/scripts/results/test_squadv2_pairs.json +0 -8
- wisent/examples/scripts/results/test_super-glue-lm-eval-v1-seq2seq_evaluation.json +0 -30
- wisent/examples/scripts/results/test_super-glue-lm-eval-v1-seq2seq_pairs.json +0 -8
- wisent/examples/scripts/results/test_super-glue-lm-eval-v1_evaluation.json +0 -51
- wisent/examples/scripts/results/test_super-glue-lm-eval-v1_pairs.json +0 -14
- wisent/examples/scripts/results/test_swag_evaluation.json +0 -30
- wisent/examples/scripts/results/test_swag_pairs.json +0 -8
- wisent/examples/scripts/results/test_tinyBenchmarks_evaluation.json +0 -51
- wisent/examples/scripts/results/test_tinyBenchmarks_pairs.json +0 -14
- wisent/examples/scripts/results/test_tmmluplus_evaluation.json +0 -51
- wisent/examples/scripts/results/test_tmmluplus_pairs.json +0 -14
- wisent/examples/scripts/results/test_translation_evaluation.json +0 -51
- wisent/examples/scripts/results/test_translation_pairs.json +0 -14
- wisent/examples/scripts/results/test_triviaqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_triviaqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_truthfulqa-multi_evaluation.json +0 -51
- wisent/examples/scripts/results/test_truthfulqa-multi_pairs.json +0 -14
- wisent/examples/scripts/results/test_truthfulqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_truthfulqa_mc1_evaluation.json +0 -30
- wisent/examples/scripts/results/test_truthfulqa_mc1_pairs.json +0 -8
- wisent/examples/scripts/results/test_truthfulqa_mc2_evaluation.json +0 -30
- wisent/examples/scripts/results/test_truthfulqa_mc2_pairs.json +0 -8
- wisent/examples/scripts/results/test_truthfulqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_turkishmmlu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_turkishmmlu_pairs.json +0 -14
- wisent/examples/scripts/results/test_unfair_tos_evaluation.json +0 -30
- wisent/examples/scripts/results/test_unfair_tos_pairs.json +0 -8
- wisent/examples/scripts/results/test_unscramble_evaluation.json +0 -51
- wisent/examples/scripts/results/test_unscramble_pairs.json +0 -14
- wisent/examples/scripts/results/test_webqs_evaluation.json +0 -30
- wisent/examples/scripts/results/test_webqs_pairs.json +0 -8
- wisent/examples/scripts/results/test_wikitext103_evaluation.json +0 -30
- wisent/examples/scripts/results/test_wikitext103_pairs.json +0 -8
- wisent/examples/scripts/results/test_wikitext_evaluation.json +0 -30
- wisent/examples/scripts/results/test_wikitext_pairs.json +0 -8
- wisent/examples/scripts/results/test_winogender_evaluation.json +0 -51
- wisent/examples/scripts/results/test_winogender_pairs.json +0 -14
- wisent/examples/scripts/results/test_winogrande_evaluation.json +0 -30
- wisent/examples/scripts/results/test_winogrande_pairs.json +0 -8
- wisent/examples/scripts/results/test_wmdp_evaluation.json +0 -30
- wisent/examples/scripts/results/test_wmdp_pairs.json +0 -8
- wisent/examples/scripts/results/test_wmt-ro-en-t5-prompt_evaluation.json +0 -30
- wisent/examples/scripts/results/test_wmt-ro-en-t5-prompt_pairs.json +0 -8
- wisent/examples/scripts/results/test_wmt14_en_fr_evaluation.json +0 -30
- wisent/examples/scripts/results/test_wmt14_en_fr_pairs.json +0 -8
- wisent/examples/scripts/results/test_wmt16_en_de_evaluation.json +0 -30
- wisent/examples/scripts/results/test_wmt16_en_de_pairs.json +0 -8
- wisent/examples/scripts/results/test_wmt16_ro_en_evaluation.json +0 -30
- wisent/examples/scripts/results/test_wmt16_ro_en_pairs.json +0 -8
- wisent/examples/scripts/results/test_wsc273_evaluation.json +0 -30
- wisent/examples/scripts/results/test_wsc273_pairs.json +0 -8
- wisent/examples/scripts/results/test_xcopa_evaluation.json +0 -51
- wisent/examples/scripts/results/test_xcopa_pairs.json +0 -14
- wisent/examples/scripts/results/test_xnli_eu_evaluation.json +0 -30
- wisent/examples/scripts/results/test_xnli_eu_pairs.json +0 -8
- wisent/examples/scripts/results/test_xnli_evaluation.json +0 -51
- wisent/examples/scripts/results/test_xnli_pairs.json +0 -14
- wisent/examples/scripts/results/test_xquad_evaluation.json +0 -51
- wisent/examples/scripts/results/test_xquad_pairs.json +0 -14
- wisent/examples/scripts/results/test_xstorycloze_evaluation.json +0 -51
- wisent/examples/scripts/results/test_xstorycloze_pairs.json +0 -14
- wisent/examples/scripts/results/test_xsum_evaluation.json +0 -30
- wisent/examples/scripts/results/test_xsum_pairs.json +0 -8
- wisent/examples/scripts/results/test_xwinograd_evaluation.json +0 -51
- wisent/examples/scripts/results/test_xwinograd_pairs.json +0 -14
- wisent/examples/scripts/results/test_yahoo_answers_topics_evaluation.json +0 -30
- wisent/examples/scripts/results/test_yahoo_answers_topics_pairs.json +0 -8
- {wisent-0.7.379.dist-info → wisent-0.7.701.dist-info}/WHEEL +0 -0
- {wisent-0.7.379.dist-info → wisent-0.7.701.dist-info}/entry_points.txt +0 -0
- {wisent-0.7.379.dist-info → wisent-0.7.701.dist-info}/licenses/LICENSE +0 -0
- {wisent-0.7.379.dist-info → wisent-0.7.701.dist-info}/top_level.txt +0 -0
|
@@ -1,30 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"task_name": "medqa",
|
|
3
|
-
"model_name": "mock",
|
|
4
|
-
"evaluator_name": "log_likelihoods",
|
|
5
|
-
"num_pairs": 1,
|
|
6
|
-
"all_correct": true,
|
|
7
|
-
"pairs": [
|
|
8
|
-
{
|
|
9
|
-
"pair_id": 0,
|
|
10
|
-
"prompt": "Question: A 21-year-old sexually active male complains of fever, pain during urination, and inflamma...",
|
|
11
|
-
"positive_response": "Ceftriaxone",
|
|
12
|
-
"negative_response": "Trimethoprim",
|
|
13
|
-
"positive_evaluation": {
|
|
14
|
-
"ground_truth": "TRUTHFUL",
|
|
15
|
-
"confidence": 1.0,
|
|
16
|
-
"expected": "TRUTHFUL",
|
|
17
|
-
"correct": true,
|
|
18
|
-
"details": "Predicted: 'Ceftriaxone' (log_prob=-0.500), Expected: 'Ceftriaxone'"
|
|
19
|
-
},
|
|
20
|
-
"negative_evaluation": {
|
|
21
|
-
"ground_truth": "UNTRUTHFUL",
|
|
22
|
-
"confidence": 0.0,
|
|
23
|
-
"expected": "UNTRUTHFUL",
|
|
24
|
-
"correct": true,
|
|
25
|
-
"details": "Predicted: 'Ceftriaxone' (log_prob=-0.500), Expected: 'Trimethoprim'"
|
|
26
|
-
},
|
|
27
|
-
"both_correct": true
|
|
28
|
-
}
|
|
29
|
-
]
|
|
30
|
-
}
|
|
@@ -1,8 +0,0 @@
|
|
|
1
|
-
[
|
|
2
|
-
{
|
|
3
|
-
"pair_id": 0,
|
|
4
|
-
"prompt": "Question: A 21-year-old sexually active male complains of fever, pain during urination, and inflammation and pain in the right knee. A culture of the joint fluid shows a bacteria that does not ferment maltose and has no polysaccharide capsule. The physician orders antibiotic therapy for the patient. The mechanism of action of action of the medication given blocks cell wall synthesis, which of the following was given?\nA. Trimethoprim\nB. Ceftriaxone",
|
|
5
|
-
"positive_response": "Ceftriaxone",
|
|
6
|
-
"negative_response": "Trimethoprim"
|
|
7
|
-
}
|
|
8
|
-
]
|
|
@@ -1,30 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"task_name": "medtext",
|
|
3
|
-
"model_name": "mock",
|
|
4
|
-
"evaluator_name": "log_likelihoods",
|
|
5
|
-
"num_pairs": 1,
|
|
6
|
-
"all_correct": true,
|
|
7
|
-
"pairs": [
|
|
8
|
-
{
|
|
9
|
-
"pair_id": 0,
|
|
10
|
-
"prompt": "Medical Record: A 50-year-old male presents with a history of recurrent kidney stones and osteopenia...",
|
|
11
|
-
"positive_response": "This patient's history of recurrent kidney stones, osteopenia, and high-dose vitamin D supplementation, along with laboratory findings of hypercalcemia and hypercalciuria, suggest the possibility of vitamin D toxicity. Excessive intake of vitamin D can cause increased absorption of calcium from the gut, leading to hypercalcemia and hypercalciuria, which can result in kidney stones and bone loss. Treatment would involve stopping the vitamin D supplementation and potentially providing intravenous fluids and loop diuretics to promote the excretion of calcium.",
|
|
12
|
-
"negative_response": "Unable to determine a specific diagnosis based on the information provided. Further testing would be required.",
|
|
13
|
-
"positive_evaluation": {
|
|
14
|
-
"ground_truth": "TRUTHFUL",
|
|
15
|
-
"confidence": 1.0,
|
|
16
|
-
"expected": "TRUTHFUL",
|
|
17
|
-
"correct": true,
|
|
18
|
-
"details": "Predicted: 'This patient's history of recurrent kidney stones, osteopenia, and high-dose vitamin D supplementation, along with laboratory findings of hypercalcemia and hypercalciuria, suggest the possibility of vitamin D toxicity. Excessive intake of vitamin D can cause increased absorption of calcium from the gut, leading to hypercalcemia and hypercalciuria, which can result in kidney stones and bone loss. Treatment would involve stopping the vitamin D supplementation and potentially providing intravenous fluids and loop diuretics to promote the excretion of calcium.' (log_prob=-0.500), Expected: 'This patient's history of recurrent kidney stones, osteopenia, and high-dose vitamin D supplementation, along with laboratory findings of hypercalcemia and hypercalciuria, suggest the possibility of vitamin D toxicity. Excessive intake of vitamin D can cause increased absorption of calcium from the gut, leading to hypercalcemia and hypercalciuria, which can result in kidney stones and bone loss. Treatment would involve stopping the vitamin D supplementation and potentially providing intravenous fluids and loop diuretics to promote the excretion of calcium.'"
|
|
19
|
-
},
|
|
20
|
-
"negative_evaluation": {
|
|
21
|
-
"ground_truth": "UNTRUTHFUL",
|
|
22
|
-
"confidence": 0.0,
|
|
23
|
-
"expected": "UNTRUTHFUL",
|
|
24
|
-
"correct": true,
|
|
25
|
-
"details": "Predicted: 'This patient's history of recurrent kidney stones, osteopenia, and high-dose vitamin D supplementation, along with laboratory findings of hypercalcemia and hypercalciuria, suggest the possibility of vitamin D toxicity. Excessive intake of vitamin D can cause increased absorption of calcium from the gut, leading to hypercalcemia and hypercalciuria, which can result in kidney stones and bone loss. Treatment would involve stopping the vitamin D supplementation and potentially providing intravenous fluids and loop diuretics to promote the excretion of calcium.' (log_prob=-0.500), Expected: 'Unable to determine a specific diagnosis based on the information provided. Further testing would be required.'"
|
|
26
|
-
},
|
|
27
|
-
"both_correct": true
|
|
28
|
-
}
|
|
29
|
-
]
|
|
30
|
-
}
|
|
@@ -1,8 +0,0 @@
|
|
|
1
|
-
[
|
|
2
|
-
{
|
|
3
|
-
"pair_id": 0,
|
|
4
|
-
"prompt": "Medical Record: A 50-year-old male presents with a history of recurrent kidney stones and osteopenia. He has been taking high-dose vitamin D supplements due to a previous diagnosis of vitamin D deficiency. Laboratory results reveal hypercalcemia and hypercalciuria. What is the likely diagnosis, and what is the treatment?\nProvide diagnosis and treatment:",
|
|
5
|
-
"positive_response": "This patient's history of recurrent kidney stones, osteopenia, and high-dose vitamin D supplementation, along with laboratory findings of hypercalcemia and hypercalciuria, suggest the possibility of vitamin D toxicity. Excessive intake of vitamin D can cause increased absorption of calcium from the gut, leading to hypercalcemia and hypercalciuria, which can result in kidney stones and bone loss. Treatment would involve stopping the vitamin D supplementation and potentially providing intravenous fluids and loop diuretics to promote the excretion of calcium.",
|
|
6
|
-
"negative_response": "Unable to determine a specific diagnosis based on the information provided. Further testing would be required."
|
|
7
|
-
}
|
|
8
|
-
]
|
|
@@ -1,51 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"task_name": "mela",
|
|
3
|
-
"model_name": "mock",
|
|
4
|
-
"evaluator_name": "log_likelihoods",
|
|
5
|
-
"num_pairs": 2,
|
|
6
|
-
"all_correct": true,
|
|
7
|
-
"pairs": [
|
|
8
|
-
{
|
|
9
|
-
"pair_id": 0,
|
|
10
|
-
"prompt": "Sentence: \u4ed6\u4e0d\u80fd\u6765\u7684\u539f\u56e0\u6211\u77e5\u9053\u4e86\uff1b\u4f60\u4e0d\u80fd\u6765\u7684\u539f\u56e0\u5462\uff1f\nDetermine whether this sentence is acceptable or unacceptable?\nA. Un...",
|
|
11
|
-
"positive_response": "Acceptable",
|
|
12
|
-
"negative_response": "Unacceptable",
|
|
13
|
-
"positive_evaluation": {
|
|
14
|
-
"ground_truth": "TRUTHFUL",
|
|
15
|
-
"confidence": 1.0,
|
|
16
|
-
"expected": "TRUTHFUL",
|
|
17
|
-
"correct": true,
|
|
18
|
-
"details": "Predicted: 'Acceptable' (log_prob=-0.500), Expected: 'Acceptable'"
|
|
19
|
-
},
|
|
20
|
-
"negative_evaluation": {
|
|
21
|
-
"ground_truth": "UNTRUTHFUL",
|
|
22
|
-
"confidence": 0.0,
|
|
23
|
-
"expected": "UNTRUTHFUL",
|
|
24
|
-
"correct": true,
|
|
25
|
-
"details": "Predicted: 'Acceptable' (log_prob=-0.500), Expected: 'Unacceptable'"
|
|
26
|
-
},
|
|
27
|
-
"both_correct": true
|
|
28
|
-
},
|
|
29
|
-
{
|
|
30
|
-
"pair_id": 1,
|
|
31
|
-
"prompt": "Sentence: \u0412\u0435\u0447\u0435\u0440 \u043d\u0435\u0440\u0435\u0434\u043a\u043e \u043d\u0430\u0447\u0438\u043d\u0430\u043b \u043e\u043d \u0441\u043f\u0435\u043a\u0442\u0430\u043a\u043b\u0435\u043c.\nDetermine whether this sentence is acceptable or unac...",
|
|
32
|
-
"positive_response": "Acceptable",
|
|
33
|
-
"negative_response": "Unacceptable",
|
|
34
|
-
"positive_evaluation": {
|
|
35
|
-
"ground_truth": "TRUTHFUL",
|
|
36
|
-
"confidence": 1.0,
|
|
37
|
-
"expected": "TRUTHFUL",
|
|
38
|
-
"correct": true,
|
|
39
|
-
"details": "Predicted: 'Acceptable' (log_prob=-0.500), Expected: 'Acceptable'"
|
|
40
|
-
},
|
|
41
|
-
"negative_evaluation": {
|
|
42
|
-
"ground_truth": "UNTRUTHFUL",
|
|
43
|
-
"confidence": 0.0,
|
|
44
|
-
"expected": "UNTRUTHFUL",
|
|
45
|
-
"correct": true,
|
|
46
|
-
"details": "Predicted: 'Acceptable' (log_prob=-0.500), Expected: 'Unacceptable'"
|
|
47
|
-
},
|
|
48
|
-
"both_correct": true
|
|
49
|
-
}
|
|
50
|
-
]
|
|
51
|
-
}
|
|
@@ -1,14 +0,0 @@
|
|
|
1
|
-
[
|
|
2
|
-
{
|
|
3
|
-
"pair_id": 0,
|
|
4
|
-
"prompt": "Sentence: \u4ed6\u4e0d\u80fd\u6765\u7684\u539f\u56e0\u6211\u77e5\u9053\u4e86\uff1b\u4f60\u4e0d\u80fd\u6765\u7684\u539f\u56e0\u5462\uff1f\nDetermine whether this sentence is acceptable or unacceptable?\nA. Unacceptable\nB. Acceptable",
|
|
5
|
-
"positive_response": "Acceptable",
|
|
6
|
-
"negative_response": "Unacceptable"
|
|
7
|
-
},
|
|
8
|
-
{
|
|
9
|
-
"pair_id": 1,
|
|
10
|
-
"prompt": "Sentence: \u0412\u0435\u0447\u0435\u0440 \u043d\u0435\u0440\u0435\u0434\u043a\u043e \u043d\u0430\u0447\u0438\u043d\u0430\u043b \u043e\u043d \u0441\u043f\u0435\u043a\u0442\u0430\u043a\u043b\u0435\u043c.\nDetermine whether this sentence is acceptable or unacceptable?\nA. Unacceptable\nB. Acceptable",
|
|
11
|
-
"positive_response": "Acceptable",
|
|
12
|
-
"negative_response": "Unacceptable"
|
|
13
|
-
}
|
|
14
|
-
]
|
|
@@ -1,30 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"task_name": "meqsum",
|
|
3
|
-
"model_name": "mock",
|
|
4
|
-
"evaluator_name": "log_likelihoods",
|
|
5
|
-
"num_pairs": 1,
|
|
6
|
-
"all_correct": true,
|
|
7
|
-
"pairs": [
|
|
8
|
-
{
|
|
9
|
-
"pair_id": 0,
|
|
10
|
-
"prompt": "Extract and summarize the following medical question:\nI need/want to know who manufscturs Cetirizine...",
|
|
11
|
-
"positive_response": "Who manufactures cetirizine?",
|
|
12
|
-
"negative_response": "Unable to provide a summary. The question is unclear.",
|
|
13
|
-
"positive_evaluation": {
|
|
14
|
-
"ground_truth": "TRUTHFUL",
|
|
15
|
-
"confidence": 1.0,
|
|
16
|
-
"expected": "TRUTHFUL",
|
|
17
|
-
"correct": true,
|
|
18
|
-
"details": "Predicted: 'Who manufactures cetirizine?' (log_prob=-0.500), Expected: 'Who manufactures cetirizine?'"
|
|
19
|
-
},
|
|
20
|
-
"negative_evaluation": {
|
|
21
|
-
"ground_truth": "UNTRUTHFUL",
|
|
22
|
-
"confidence": 0.0,
|
|
23
|
-
"expected": "UNTRUTHFUL",
|
|
24
|
-
"correct": true,
|
|
25
|
-
"details": "Predicted: 'Who manufactures cetirizine?' (log_prob=-0.500), Expected: 'Unable to provide a summary. The question is unclear.'"
|
|
26
|
-
},
|
|
27
|
-
"both_correct": true
|
|
28
|
-
}
|
|
29
|
-
]
|
|
30
|
-
}
|
|
@@ -1,8 +0,0 @@
|
|
|
1
|
-
[
|
|
2
|
-
{
|
|
3
|
-
"pair_id": 0,
|
|
4
|
-
"prompt": "Extract and summarize the following medical question:\nI need/want to know who manufscturs Cetirizine. My Walmart is looking for a new supply and are not getting the recent",
|
|
5
|
-
"positive_response": "Who manufactures cetirizine?",
|
|
6
|
-
"negative_response": "Unable to provide a summary. The question is unclear."
|
|
7
|
-
}
|
|
8
|
-
]
|
|
@@ -1,30 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"task_name": "mercury",
|
|
3
|
-
"model_name": "mock",
|
|
4
|
-
"evaluator_name": "log_likelihoods",
|
|
5
|
-
"num_pairs": 1,
|
|
6
|
-
"all_correct": true,
|
|
7
|
-
"pairs": [
|
|
8
|
-
{
|
|
9
|
-
"pair_id": 0,
|
|
10
|
-
"prompt": "Translate this Java code to C#:\npublic ListSpeechSynthesisTasksResult listSpeechSynthesisTasks(ListS...",
|
|
11
|
-
"positive_response": "public virtual ListSpeechSynthesisTasksResponse ListSpeechSynthesisTasks(ListSpeechSynthesisTasksRequest request){var options = new InvokeOptions();options.RequestMarshaller = ListSpeechSynthesisTasksRequestMarshaller.Instance;options.ResponseUnmarshaller = ListSpeechSynthesisTasksResponseUnmarshaller.Instance;return Invoke<ListSpeechSynthesisTasksResponse>(request, options);}",
|
|
12
|
-
"negative_response": "public virtual ListSpeechSynthesisTasksResponse ListSpeechSynthesisTasks(ListSpeechSynthesisTasksRequest request){var options = new InvokeOptions();options.RequestMarshaller = ListSpeechSyn# CORRUPTEDthesisTasksRequestMarshaller.Instance;options.ResponseUnmarshaller = ListSpeechSynthesisTasksResponseUnmarshaller.Instance;return Invoke<ListSpeechSynthesisTasksResponse>(request, options);}",
|
|
13
|
-
"positive_evaluation": {
|
|
14
|
-
"ground_truth": "TRUTHFUL",
|
|
15
|
-
"confidence": 1.0,
|
|
16
|
-
"expected": "TRUTHFUL",
|
|
17
|
-
"correct": true,
|
|
18
|
-
"details": "Predicted: 'public virtual ListSpeechSynthesisTasksResponse ListSpeechSynthesisTasks(ListSpeechSynthesisTasksRequest request){var options = new InvokeOptions();options.RequestMarshaller = ListSpeechSynthesisTasksRequestMarshaller.Instance;options.ResponseUnmarshaller = ListSpeechSynthesisTasksResponseUnmarshaller.Instance;return Invoke<ListSpeechSynthesisTasksResponse>(request, options);}' (log_prob=-0.500), Expected: 'public virtual ListSpeechSynthesisTasksResponse ListSpeechSynthesisTasks(ListSpeechSynthesisTasksRequest request){var options = new InvokeOptions();options.RequestMarshaller = ListSpeechSynthesisTasksRequestMarshaller.Instance;options.ResponseUnmarshaller = ListSpeechSynthesisTasksResponseUnmarshaller.Instance;return Invoke<ListSpeechSynthesisTasksResponse>(request, options);}'"
|
|
19
|
-
},
|
|
20
|
-
"negative_evaluation": {
|
|
21
|
-
"ground_truth": "UNTRUTHFUL",
|
|
22
|
-
"confidence": 0.0,
|
|
23
|
-
"expected": "UNTRUTHFUL",
|
|
24
|
-
"correct": true,
|
|
25
|
-
"details": "Predicted: 'public virtual ListSpeechSynthesisTasksResponse ListSpeechSynthesisTasks(ListSpeechSynthesisTasksRequest request){var options = new InvokeOptions();options.RequestMarshaller = ListSpeechSynthesisTasksRequestMarshaller.Instance;options.ResponseUnmarshaller = ListSpeechSynthesisTasksResponseUnmarshaller.Instance;return Invoke<ListSpeechSynthesisTasksResponse>(request, options);}' (log_prob=-0.500), Expected: 'public virtual ListSpeechSynthesisTasksResponse ListSpeechSynthesisTasks(ListSpeechSynthesisTasksRequest request){var options = new InvokeOptions();options.RequestMarshaller = ListSpeechSyn# CORRUPTEDthesisTasksRequestMarshaller.Instance;options.ResponseUnmarshaller = ListSpeechSynthesisTasksResponseUnmarshaller.Instance;return Invoke<ListSpeechSynthesisTasksResponse>(request, options);}'"
|
|
26
|
-
},
|
|
27
|
-
"both_correct": true
|
|
28
|
-
}
|
|
29
|
-
]
|
|
30
|
-
}
|
|
@@ -1,8 +0,0 @@
|
|
|
1
|
-
[
|
|
2
|
-
{
|
|
3
|
-
"pair_id": 0,
|
|
4
|
-
"prompt": "Translate this Java code to C#:\npublic ListSpeechSynthesisTasksResult listSpeechSynthesisTasks(ListSpeechSynthesisTasksRequest request) {request = beforeClientExecution(request);return executeListSpeechSynthesisTasks(request);}",
|
|
5
|
-
"positive_response": "public virtual ListSpeechSynthesisTasksResponse ListSpeechSynthesisTasks(ListSpeechSynthesisTasksRequest request){var options = new InvokeOptions();options.RequestMarshaller = ListSpeechSynthesisTasksRequestMarshaller.Instance;options.ResponseUnmarshaller = ListSpeechSynthesisTasksResponseUnmarshaller.Instance;return Invoke<ListSpeechSynthesisTasksResponse>(request, options);}",
|
|
6
|
-
"negative_response": "public virtual ListSpeechSynthesisTasksResponse ListSpeechSynthesisTasks(ListSpeechSynthesisTasksRequest request){var options = new InvokeOptions();options.RequestMarshaller = ListSpeechSyn# CORRUPTEDthesisTasksRequestMarshaller.Instance;options.ResponseUnmarshaller = ListSpeechSynthesisTasksResponseUnmarshaller.Instance;return Invoke<ListSpeechSynthesisTasksResponse>(request, options);}"
|
|
7
|
-
}
|
|
8
|
-
]
|
|
@@ -1,51 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"task_name": "metabench",
|
|
3
|
-
"model_name": "mock",
|
|
4
|
-
"evaluator_name": "log_likelihoods",
|
|
5
|
-
"num_pairs": 2,
|
|
6
|
-
"all_correct": true,
|
|
7
|
-
"pairs": [
|
|
8
|
-
{
|
|
9
|
-
"pair_id": 0,
|
|
10
|
-
"prompt": "Complete the sentence: The toddler said something rude which offended Victoria but Samantha laughed....",
|
|
11
|
-
"positive_response": "phone",
|
|
12
|
-
"negative_response": "television",
|
|
13
|
-
"positive_evaluation": {
|
|
14
|
-
"ground_truth": "TRUTHFUL",
|
|
15
|
-
"confidence": 1.0,
|
|
16
|
-
"expected": "TRUTHFUL",
|
|
17
|
-
"correct": true,
|
|
18
|
-
"details": "Predicted: 'phone' (log_prob=-0.500), Expected: 'phone'"
|
|
19
|
-
},
|
|
20
|
-
"negative_evaluation": {
|
|
21
|
-
"ground_truth": "UNTRUTHFUL",
|
|
22
|
-
"confidence": 0.0,
|
|
23
|
-
"expected": "UNTRUTHFUL",
|
|
24
|
-
"correct": true,
|
|
25
|
-
"details": "Predicted: 'phone' (log_prob=-0.500), Expected: 'television'"
|
|
26
|
-
},
|
|
27
|
-
"both_correct": true
|
|
28
|
-
},
|
|
29
|
-
{
|
|
30
|
-
"pair_id": 1,
|
|
31
|
-
"prompt": "Question: Human tears contain the enzyme lysozyme, which damages the cell walls of bacteria. Which o...",
|
|
32
|
-
"positive_response": "Lysozyme breaks a specific type of bond in a bacterial cell wall molecule.",
|
|
33
|
-
"negative_response": "Lysozyme causes mutations in bacterial cell wall molecules.",
|
|
34
|
-
"positive_evaluation": {
|
|
35
|
-
"ground_truth": "TRUTHFUL",
|
|
36
|
-
"confidence": 1.0,
|
|
37
|
-
"expected": "TRUTHFUL",
|
|
38
|
-
"correct": true,
|
|
39
|
-
"details": "Predicted: 'Lysozyme breaks a specific type of bond in a bacterial cell wall molecule.' (log_prob=-0.500), Expected: 'Lysozyme breaks a specific type of bond in a bacterial cell wall molecule.'"
|
|
40
|
-
},
|
|
41
|
-
"negative_evaluation": {
|
|
42
|
-
"ground_truth": "UNTRUTHFUL",
|
|
43
|
-
"confidence": 0.0,
|
|
44
|
-
"expected": "UNTRUTHFUL",
|
|
45
|
-
"correct": true,
|
|
46
|
-
"details": "Predicted: 'Lysozyme breaks a specific type of bond in a bacterial cell wall molecule.' (log_prob=-0.500), Expected: 'Lysozyme causes mutations in bacterial cell wall molecules.'"
|
|
47
|
-
},
|
|
48
|
-
"both_correct": true
|
|
49
|
-
}
|
|
50
|
-
]
|
|
51
|
-
}
|
|
@@ -1,14 +0,0 @@
|
|
|
1
|
-
[
|
|
2
|
-
{
|
|
3
|
-
"pair_id": 0,
|
|
4
|
-
"prompt": "Complete the sentence: The toddler said something rude which offended Victoria but Samantha laughed. Samantha always liked the honesty of children.\n\nKenneth asked Jason for some tips on growing the perfect mustache, because Kenneth had never had one before.\n\nAaron accepted from William a wool blanket since it was cold outside. William was a good friend.\n\nHeather poured the entire bottle of honey into her tea cup, until the bottle was empty.\n\nShe didn't like the style of the jacket, but enjoyed the look of the scarf, because the scarf was prestigious.\n\nMy employer offers a bonus of either a phone or a television, but unfortunately the _ is just way too large to be useful.\nA. phone\nB. television",
|
|
5
|
-
"positive_response": "phone",
|
|
6
|
-
"negative_response": "television"
|
|
7
|
-
},
|
|
8
|
-
{
|
|
9
|
-
"pair_id": 1,
|
|
10
|
-
"prompt": "Question: Human tears contain the enzyme lysozyme, which damages the cell walls of bacteria. Which of the following statements about lysozyme is most accurate?\nA. Lysozyme causes mutations in bacterial cell wall molecules.\nB. Lysozyme breaks a specific type of bond in a bacterial cell wall molecule.",
|
|
11
|
-
"positive_response": "Lysozyme breaks a specific type of bond in a bacterial cell wall molecule.",
|
|
12
|
-
"negative_response": "Lysozyme causes mutations in bacterial cell wall molecules."
|
|
13
|
-
}
|
|
14
|
-
]
|
|
@@ -1,51 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"task_name": "mgsm",
|
|
3
|
-
"model_name": "mock",
|
|
4
|
-
"evaluator_name": "log_likelihoods",
|
|
5
|
-
"num_pairs": 2,
|
|
6
|
-
"all_correct": true,
|
|
7
|
-
"pairs": [
|
|
8
|
-
{
|
|
9
|
-
"pair_id": 0,
|
|
10
|
-
"prompt": "Question: Kalinda arbeitet gerade an einem 360-teiligen Puzzle mit ihrer Mutter. Kalinda kann normal...",
|
|
11
|
-
"positive_response": "1",
|
|
12
|
-
"negative_response": "3",
|
|
13
|
-
"positive_evaluation": {
|
|
14
|
-
"ground_truth": "TRUTHFUL",
|
|
15
|
-
"confidence": 1.0,
|
|
16
|
-
"expected": "TRUTHFUL",
|
|
17
|
-
"correct": true,
|
|
18
|
-
"details": "Predicted: '1' (log_prob=-0.500), Expected: '1'"
|
|
19
|
-
},
|
|
20
|
-
"negative_evaluation": {
|
|
21
|
-
"ground_truth": "UNTRUTHFUL",
|
|
22
|
-
"confidence": 0.0,
|
|
23
|
-
"expected": "UNTRUTHFUL",
|
|
24
|
-
"correct": true,
|
|
25
|
-
"details": "Predicted: '1' (log_prob=-0.500), Expected: '3'"
|
|
26
|
-
},
|
|
27
|
-
"both_correct": true
|
|
28
|
-
},
|
|
29
|
-
{
|
|
30
|
-
"pair_id": 1,
|
|
31
|
-
"prompt": "Question: \u0415\u0441\u043b\u0438 \u043c\u0435\u0448\u043e\u0447\u0435\u043a \u0448\u0430\u0440\u0438\u043a\u043e\u0432 \u0441\u0442\u043e\u0438\u0442 20 $ \u0438 \u0446\u0435\u043d\u0430 \u043a\u0430\u0436\u0434\u044b\u0435 \u0434\u0432\u0430 \u043c\u0435\u0441\u044f\u0446\u0430 \u0440\u0430\u0441\u0442\u0435\u0442 \u043d\u0430 20 % \u043e\u0442 \u043f\u0435\u0440\u0432\u043e\u043d\u0430\u0447\u0430\u043b\u044c\u043d\u043e\u0439,...",
|
|
32
|
-
"positive_response": "92",
|
|
33
|
-
"negative_response": "91",
|
|
34
|
-
"positive_evaluation": {
|
|
35
|
-
"ground_truth": "TRUTHFUL",
|
|
36
|
-
"confidence": 1.0,
|
|
37
|
-
"expected": "TRUTHFUL",
|
|
38
|
-
"correct": true,
|
|
39
|
-
"details": "Predicted: '92' (log_prob=-0.500), Expected: '92'"
|
|
40
|
-
},
|
|
41
|
-
"negative_evaluation": {
|
|
42
|
-
"ground_truth": "UNTRUTHFUL",
|
|
43
|
-
"confidence": 0.0,
|
|
44
|
-
"expected": "UNTRUTHFUL",
|
|
45
|
-
"correct": true,
|
|
46
|
-
"details": "Predicted: '92' (log_prob=-0.500), Expected: '91'"
|
|
47
|
-
},
|
|
48
|
-
"both_correct": true
|
|
49
|
-
}
|
|
50
|
-
]
|
|
51
|
-
}
|
|
@@ -1,14 +0,0 @@
|
|
|
1
|
-
[
|
|
2
|
-
{
|
|
3
|
-
"pair_id": 0,
|
|
4
|
-
"prompt": "Question: Kalinda arbeitet gerade an einem 360-teiligen Puzzle mit ihrer Mutter. Kalinda kann normalerweise 4 Teile pro Minute hinzuf\u00fcgen. Ihre Mutter kann \u00fcblicherweise halb so viele Teile pro Minuten platzieren wie Kalinda. Wie viele Stunden brauchen sie, um das Puzzle zu vervollst\u00e4ndigen?\nAnswer:",
|
|
5
|
-
"positive_response": "1",
|
|
6
|
-
"negative_response": "3"
|
|
7
|
-
},
|
|
8
|
-
{
|
|
9
|
-
"pair_id": 1,
|
|
10
|
-
"prompt": "Question: \u0415\u0441\u043b\u0438 \u043c\u0435\u0448\u043e\u0447\u0435\u043a \u0448\u0430\u0440\u0438\u043a\u043e\u0432 \u0441\u0442\u043e\u0438\u0442 20 $ \u0438 \u0446\u0435\u043d\u0430 \u043a\u0430\u0436\u0434\u044b\u0435 \u0434\u0432\u0430 \u043c\u0435\u0441\u044f\u0446\u0430 \u0440\u0430\u0441\u0442\u0435\u0442 \u043d\u0430 20 % \u043e\u0442 \u043f\u0435\u0440\u0432\u043e\u043d\u0430\u0447\u0430\u043b\u044c\u043d\u043e\u0439, \u0441\u0442\u043e\u043b\u044c\u043a\u043e \u0431\u0443\u0434\u0435\u0442 \u0441\u0442\u043e\u0438\u0442\u044c \u043c\u0435\u0448\u043e\u0447\u0435\u043a \u0448\u0430\u0440\u0438\u043a\u043e\u0432 \u0447\u0435\u0440\u0435\u0437 36 \u043c\u0435\u0441\u044f\u0446\u0435\u0432?\nAnswer:",
|
|
11
|
-
"positive_response": "92",
|
|
12
|
-
"negative_response": "91"
|
|
13
|
-
}
|
|
14
|
-
]
|
|
@@ -1,30 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"task_name": "mimic_repsum",
|
|
3
|
-
"model_name": "mock",
|
|
4
|
-
"evaluator_name": "log_likelihoods",
|
|
5
|
-
"num_pairs": 1,
|
|
6
|
-
"all_correct": true,
|
|
7
|
-
"pairs": [
|
|
8
|
-
{
|
|
9
|
-
"pair_id": 0,
|
|
10
|
-
"prompt": "Given the findings: FINDINGS: The patient is now extubated, and note is made of a focal stenosis of...",
|
|
11
|
-
"positive_response": "IMPRESSION: No acute cardiopulmonary abnormality. ST-T waveabnormalities in the anterolateral leads may be due to myocardial ischemia.Compared to the previous tracing of the T wave changes are moreprominent in leads I, aVL and now noted in lead V3. The aorta demonstrates calcifications of the aortic arch and within its descending portion. Additionally, there is a 1-mm left upper lobe nodule (4:100). FINDINGS: As compared to the previous radiograph, the lung volumes have slightly decreased. Virtual with inspiratory/expiratory sequences. Note is made of 2 mm upper lobe nodules, one on the left (4:75) and one on the right (4:93). Pulmonary vascularity is normal. No focal parenchymal opacity suggesting pneumonia, a left retrocardiac opacity, however, has newly appeared. Secretions in the right main stem bronchus as well as in the right lower lobe bronchus, with resultant air trapping in the right lower lobe. The lungs reveal biapical scar as well as paraseptal emphysema. On a separate workstation, three-dimensional virtual bronchoscopic images are created as well as multiplanar reformatted images and minimum intensity projection images. Osseous structures reveal no suspicious sclerotic or lytic lesion. Imaged portions of the upper abdomen reveal calcified granulomas in the spleen and liver. The heart size is normal. No newly appeared pleural effusion. IMPRESSION: 1. DFDdp The lungs are clear. There is no pleural or pericardial effusion. There are no acute osseous findings. No pleural effusion or pneumothorax is identified. Virtual with inspiratory/e Admitting Diagnosis: MYASTHENIA MEDICAL CONDITION: 67 year old woman with myasthenia s/p recent crisis with intubation now with persistent stridor REASON FOR THIS EXAMINATION: degree of airway stenosis. COMPARISON: . COMPARISON: . 1:16 PM CHEST (PORTABLE AP)",
|
|
12
|
-
"negative_response": "Unable to summarize the findings. Additional information is required.",
|
|
13
|
-
"positive_evaluation": {
|
|
14
|
-
"ground_truth": "TRUTHFUL",
|
|
15
|
-
"confidence": 1.0,
|
|
16
|
-
"expected": "TRUTHFUL",
|
|
17
|
-
"correct": true,
|
|
18
|
-
"details": "Predicted: 'IMPRESSION: No acute cardiopulmonary abnormality. ST-T waveabnormalities in the anterolateral leads may be due to myocardial ischemia.Compared to the previous tracing of the T wave changes are moreprominent in leads I, aVL and now noted in lead V3. The aorta demonstrates calcifications of the aortic arch and within its descending portion. Additionally, there is a 1-mm left upper lobe nodule (4:100). FINDINGS: As compared to the previous radiograph, the lung volumes have slightly decreased. Virtual with inspiratory/expiratory sequences. Note is made of 2 mm upper lobe nodules, one on the left (4:75) and one on the right (4:93). Pulmonary vascularity is normal. No focal parenchymal opacity suggesting pneumonia, a left retrocardiac opacity, however, has newly appeared. Secretions in the right main stem bronchus as well as in the right lower lobe bronchus, with resultant air trapping in the right lower lobe. The lungs reveal biapical scar as well as paraseptal emphysema. On a separate workstation, three-dimensional virtual bronchoscopic images are created as well as multiplanar reformatted images and minimum intensity projection images. Osseous structures reveal no suspicious sclerotic or lytic lesion. Imaged portions of the upper abdomen reveal calcified granulomas in the spleen and liver. The heart size is normal. No newly appeared pleural effusion. IMPRESSION: 1. DFDdp The lungs are clear. There is no pleural or pericardial effusion. There are no acute osseous findings. No pleural effusion or pneumothorax is identified. Virtual with inspiratory/e Admitting Diagnosis: MYASTHENIA MEDICAL CONDITION: 67 year old woman with myasthenia s/p recent crisis with intubation now with persistent stridor REASON FOR THIS EXAMINATION: degree of airway stenosis. COMPARISON: . COMPARISON: . 1:16 PM CHEST (PORTABLE AP)' (log_prob=-0.500), Expected: 'IMPRESSION: No acute cardiopulmonary abnormality. ST-T waveabnormalities in the anterolateral leads may be due to myocardial ischemia.Compared to the previous tracing of the T wave changes are moreprominent in leads I, aVL and now noted in lead V3. The aorta demonstrates calcifications of the aortic arch and within its descending portion. Additionally, there is a 1-mm left upper lobe nodule (4:100). FINDINGS: As compared to the previous radiograph, the lung volumes have slightly decreased. Virtual with inspiratory/expiratory sequences. Note is made of 2 mm upper lobe nodules, one on the left (4:75) and one on the right (4:93). Pulmonary vascularity is normal. No focal parenchymal opacity suggesting pneumonia, a left retrocardiac opacity, however, has newly appeared. Secretions in the right main stem bronchus as well as in the right lower lobe bronchus, with resultant air trapping in the right lower lobe. The lungs reveal biapical scar as well as paraseptal emphysema. On a separate workstation, three-dimensional virtual bronchoscopic images are created as well as multiplanar reformatted images and minimum intensity projection images. Osseous structures reveal no suspicious sclerotic or lytic lesion. Imaged portions of the upper abdomen reveal calcified granulomas in the spleen and liver. The heart size is normal. No newly appeared pleural effusion. IMPRESSION: 1. DFDdp The lungs are clear. There is no pleural or pericardial effusion. There are no acute osseous findings. No pleural effusion or pneumothorax is identified. Virtual with inspiratory/e Admitting Diagnosis: MYASTHENIA MEDICAL CONDITION: 67 year old woman with myasthenia s/p recent crisis with intubation now with persistent stridor REASON FOR THIS EXAMINATION: degree of airway stenosis. COMPARISON: . COMPARISON: . 1:16 PM CHEST (PORTABLE AP)'"
|
|
19
|
-
},
|
|
20
|
-
"negative_evaluation": {
|
|
21
|
-
"ground_truth": "UNTRUTHFUL",
|
|
22
|
-
"confidence": 0.0,
|
|
23
|
-
"expected": "UNTRUTHFUL",
|
|
24
|
-
"correct": true,
|
|
25
|
-
"details": "Predicted: 'IMPRESSION: No acute cardiopulmonary abnormality. ST-T waveabnormalities in the anterolateral leads may be due to myocardial ischemia.Compared to the previous tracing of the T wave changes are moreprominent in leads I, aVL and now noted in lead V3. The aorta demonstrates calcifications of the aortic arch and within its descending portion. Additionally, there is a 1-mm left upper lobe nodule (4:100). FINDINGS: As compared to the previous radiograph, the lung volumes have slightly decreased. Virtual with inspiratory/expiratory sequences. Note is made of 2 mm upper lobe nodules, one on the left (4:75) and one on the right (4:93). Pulmonary vascularity is normal. No focal parenchymal opacity suggesting pneumonia, a left retrocardiac opacity, however, has newly appeared. Secretions in the right main stem bronchus as well as in the right lower lobe bronchus, with resultant air trapping in the right lower lobe. The lungs reveal biapical scar as well as paraseptal emphysema. On a separate workstation, three-dimensional virtual bronchoscopic images are created as well as multiplanar reformatted images and minimum intensity projection images. Osseous structures reveal no suspicious sclerotic or lytic lesion. Imaged portions of the upper abdomen reveal calcified granulomas in the spleen and liver. The heart size is normal. No newly appeared pleural effusion. IMPRESSION: 1. DFDdp The lungs are clear. There is no pleural or pericardial effusion. There are no acute osseous findings. No pleural effusion or pneumothorax is identified. Virtual with inspiratory/e Admitting Diagnosis: MYASTHENIA MEDICAL CONDITION: 67 year old woman with myasthenia s/p recent crisis with intubation now with persistent stridor REASON FOR THIS EXAMINATION: degree of airway stenosis. COMPARISON: . COMPARISON: . 1:16 PM CHEST (PORTABLE AP)' (log_prob=-0.500), Expected: 'Unable to summarize the findings. Additional information is required.'"
|
|
26
|
-
},
|
|
27
|
-
"both_correct": true
|
|
28
|
-
}
|
|
29
|
-
]
|
|
30
|
-
}
|
|
@@ -1,8 +0,0 @@
|
|
|
1
|
-
[
|
|
2
|
-
{
|
|
3
|
-
"pair_id": 0,
|
|
4
|
-
"prompt": "Given the findings: FINDINGS: The patient is now extubated, and note is made of a focal stenosis of the trachea at the level of the thoracic inlet, stretching approximately 7 mm in the craniocaudad dimension consistent with focal stenosis likely related to the previously present endotracheal tube. Sinus rhythm with a single ventricular premature beat. This stenosis is relatively fixed, with a cross-sectional area at that level during inspiration of 52 mm2, narrowing only minimally 230 mm2 during expiration. Clinical correlation issuggested. COMPARISON: CT from and chest radiograph from . No contraindications for IV contrast.\nSummarize the findings.",
|
|
5
|
-
"positive_response": "IMPRESSION: No acute cardiopulmonary abnormality. ST-T waveabnormalities in the anterolateral leads may be due to myocardial ischemia.Compared to the previous tracing of the T wave changes are moreprominent in leads I, aVL and now noted in lead V3. The aorta demonstrates calcifications of the aortic arch and within its descending portion. Additionally, there is a 1-mm left upper lobe nodule (4:100). FINDINGS: As compared to the previous radiograph, the lung volumes have slightly decreased. Virtual with inspiratory/expiratory sequences. Note is made of 2 mm upper lobe nodules, one on the left (4:75) and one on the right (4:93). Pulmonary vascularity is normal. No focal parenchymal opacity suggesting pneumonia, a left retrocardiac opacity, however, has newly appeared. Secretions in the right main stem bronchus as well as in the right lower lobe bronchus, with resultant air trapping in the right lower lobe. The lungs reveal biapical scar as well as paraseptal emphysema. On a separate workstation, three-dimensional virtual bronchoscopic images are created as well as multiplanar reformatted images and minimum intensity projection images. Osseous structures reveal no suspicious sclerotic or lytic lesion. Imaged portions of the upper abdomen reveal calcified granulomas in the spleen and liver. The heart size is normal. No newly appeared pleural effusion. IMPRESSION: 1. DFDdp The lungs are clear. There is no pleural or pericardial effusion. There are no acute osseous findings. No pleural effusion or pneumothorax is identified. Virtual with inspiratory/e Admitting Diagnosis: MYASTHENIA MEDICAL CONDITION: 67 year old woman with myasthenia s/p recent crisis with intubation now with persistent stridor REASON FOR THIS EXAMINATION: degree of airway stenosis. COMPARISON: . COMPARISON: . 1:16 PM CHEST (PORTABLE AP)",
|
|
6
|
-
"negative_response": "Unable to summarize the findings. Additional information is required."
|
|
7
|
-
}
|
|
8
|
-
]
|
|
@@ -1,51 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"task_name": "minerva_math",
|
|
3
|
-
"model_name": "mock",
|
|
4
|
-
"evaluator_name": "log_likelihoods",
|
|
5
|
-
"num_pairs": 2,
|
|
6
|
-
"all_correct": true,
|
|
7
|
-
"pairs": [
|
|
8
|
-
{
|
|
9
|
-
"pair_id": 0,
|
|
10
|
-
"prompt": "Problem:\nThe positive difference between two consecutive even perfect squares is $268$. Compute the...",
|
|
11
|
-
"positive_response": "4624",
|
|
12
|
-
"negative_response": "The answer cannot be determined from the given information.",
|
|
13
|
-
"positive_evaluation": {
|
|
14
|
-
"ground_truth": "TRUTHFUL",
|
|
15
|
-
"confidence": 1.0,
|
|
16
|
-
"expected": "TRUTHFUL",
|
|
17
|
-
"correct": true,
|
|
18
|
-
"details": "Predicted: '4624' (log_prob=-0.500), Expected: '4624'"
|
|
19
|
-
},
|
|
20
|
-
"negative_evaluation": {
|
|
21
|
-
"ground_truth": "UNTRUTHFUL",
|
|
22
|
-
"confidence": 0.0,
|
|
23
|
-
"expected": "UNTRUTHFUL",
|
|
24
|
-
"correct": true,
|
|
25
|
-
"details": "Predicted: '4624' (log_prob=-0.500), Expected: 'The answer cannot be determined from the given information.'"
|
|
26
|
-
},
|
|
27
|
-
"both_correct": true
|
|
28
|
-
},
|
|
29
|
-
{
|
|
30
|
-
"pair_id": 1,
|
|
31
|
-
"prompt": "Problem:\nWhat is the domain of the function $f(x)=\\frac{x+2}{x^2-2x-24}$? Express your answer in int...",
|
|
32
|
-
"positive_response": "(-\\iny,-4)\\cup(-4,6)\\cup(6,\\iny)",
|
|
33
|
-
"negative_response": "The answer cannot be determined from the given information.",
|
|
34
|
-
"positive_evaluation": {
|
|
35
|
-
"ground_truth": "TRUTHFUL",
|
|
36
|
-
"confidence": 1.0,
|
|
37
|
-
"expected": "TRUTHFUL",
|
|
38
|
-
"correct": true,
|
|
39
|
-
"details": "Predicted: '(-\\iny,-4)\\cup(-4,6)\\cup(6,\\iny)' (log_prob=-0.500), Expected: '(-\\iny,-4)\\cup(-4,6)\\cup(6,\\iny)'"
|
|
40
|
-
},
|
|
41
|
-
"negative_evaluation": {
|
|
42
|
-
"ground_truth": "UNTRUTHFUL",
|
|
43
|
-
"confidence": 0.0,
|
|
44
|
-
"expected": "UNTRUTHFUL",
|
|
45
|
-
"correct": true,
|
|
46
|
-
"details": "Predicted: '(-\\iny,-4)\\cup(-4,6)\\cup(6,\\iny)' (log_prob=-0.500), Expected: 'The answer cannot be determined from the given information.'"
|
|
47
|
-
},
|
|
48
|
-
"both_correct": true
|
|
49
|
-
}
|
|
50
|
-
]
|
|
51
|
-
}
|
|
@@ -1,14 +0,0 @@
|
|
|
1
|
-
[
|
|
2
|
-
{
|
|
3
|
-
"pair_id": 0,
|
|
4
|
-
"prompt": "Problem:\nThe positive difference between two consecutive even perfect squares is $268$. Compute the larger of the two squares.\n\nSolution:",
|
|
5
|
-
"positive_response": "4624",
|
|
6
|
-
"negative_response": "The answer cannot be determined from the given information."
|
|
7
|
-
},
|
|
8
|
-
{
|
|
9
|
-
"pair_id": 1,
|
|
10
|
-
"prompt": "Problem:\nWhat is the domain of the function $f(x)=\\frac{x+2}{x^2-2x-24}$? Express your answer in interval notation.\n\nSolution:",
|
|
11
|
-
"positive_response": "(-\\iny,-4)\\cup(-4,6)\\cup(6,\\iny)",
|
|
12
|
-
"negative_response": "The answer cannot be determined from the given information."
|
|
13
|
-
}
|
|
14
|
-
]
|
|
@@ -1,51 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"task_name": "mlqa",
|
|
3
|
-
"model_name": "mock",
|
|
4
|
-
"evaluator_name": "log_likelihoods",
|
|
5
|
-
"num_pairs": 2,
|
|
6
|
-
"all_correct": true,
|
|
7
|
-
"pairs": [
|
|
8
|
-
{
|
|
9
|
-
"pair_id": 0,
|
|
10
|
-
"prompt": "Context: \u090f\u0915 \u0938\u0918\u0928 \u0935\u093f\u092a\u0923\u0928 \u092a\u094d\u0930\u092f\u093e\u0938 \u0915\u0940 \u0935\u091c\u0939 \u0938\u0947, \u0935\u0948\u0932\u0947\u0902\u091f\u093e\u0907\u0928 \u0926\u093f\u0935\u0938 \u0915\u0941\u091b \u090f\u0936\u093f\u092f\u093e\u0908 \u0926\u0947\u0936\u094b\u0902 \u092e\u0947\u0902 \u092d\u0940 \u092e\u0928\u093e\u092f\u093e \u091c\u093e\u0924\u093e \u0939\u0948\u0964 \u0938\u093f\u0902\u0917\u093e\u092a...",
|
|
11
|
-
"positive_response": "\u091a\u0949\u0915\u0932\u0947\u091f",
|
|
12
|
-
"negative_response": "I don't know.",
|
|
13
|
-
"positive_evaluation": {
|
|
14
|
-
"ground_truth": "TRUTHFUL",
|
|
15
|
-
"confidence": 1.0,
|
|
16
|
-
"expected": "TRUTHFUL",
|
|
17
|
-
"correct": true,
|
|
18
|
-
"details": "Predicted: '\u091a\u0949\u0915\u0932\u0947\u091f' (log_prob=-0.500), Expected: '\u091a\u0949\u0915\u0932\u0947\u091f'"
|
|
19
|
-
},
|
|
20
|
-
"negative_evaluation": {
|
|
21
|
-
"ground_truth": "UNTRUTHFUL",
|
|
22
|
-
"confidence": 0.0,
|
|
23
|
-
"expected": "UNTRUTHFUL",
|
|
24
|
-
"correct": true,
|
|
25
|
-
"details": "Predicted: '\u091a\u0949\u0915\u0932\u0947\u091f' (log_prob=-0.500), Expected: 'I don't know.'"
|
|
26
|
-
},
|
|
27
|
-
"both_correct": true
|
|
28
|
-
},
|
|
29
|
-
{
|
|
30
|
-
"pair_id": 1,
|
|
31
|
-
"prompt": "Context: James Clerk Maxwell FRS FRSE (Reino Unido: /d\u0361\u0292e\u026amz kl\u0251\u02d0k \u02c8m\u00e6ksw\u0259l/; 13 de junio de 1831-5 ...",
|
|
32
|
-
"positive_response": "f\u00edsica matem\u00e1tica",
|
|
33
|
-
"negative_response": "I don't know.",
|
|
34
|
-
"positive_evaluation": {
|
|
35
|
-
"ground_truth": "TRUTHFUL",
|
|
36
|
-
"confidence": 1.0,
|
|
37
|
-
"expected": "TRUTHFUL",
|
|
38
|
-
"correct": true,
|
|
39
|
-
"details": "Predicted: 'f\u00edsica matem\u00e1tica' (log_prob=-0.500), Expected: 'f\u00edsica matem\u00e1tica'"
|
|
40
|
-
},
|
|
41
|
-
"negative_evaluation": {
|
|
42
|
-
"ground_truth": "UNTRUTHFUL",
|
|
43
|
-
"confidence": 0.0,
|
|
44
|
-
"expected": "UNTRUTHFUL",
|
|
45
|
-
"correct": true,
|
|
46
|
-
"details": "Predicted: 'f\u00edsica matem\u00e1tica' (log_prob=-0.500), Expected: 'I don't know.'"
|
|
47
|
-
},
|
|
48
|
-
"both_correct": true
|
|
49
|
-
}
|
|
50
|
-
]
|
|
51
|
-
}
|
|
@@ -1,14 +0,0 @@
|
|
|
1
|
-
[
|
|
2
|
-
{
|
|
3
|
-
"pair_id": 0,
|
|
4
|
-
"prompt": "Context: \u090f\u0915 \u0938\u0918\u0928 \u0935\u093f\u092a\u0923\u0928 \u092a\u094d\u0930\u092f\u093e\u0938 \u0915\u0940 \u0935\u091c\u0939 \u0938\u0947, \u0935\u0948\u0932\u0947\u0902\u091f\u093e\u0907\u0928 \u0926\u093f\u0935\u0938 \u0915\u0941\u091b \u090f\u0936\u093f\u092f\u093e\u0908 \u0926\u0947\u0936\u094b\u0902 \u092e\u0947\u0902 \u092d\u0940 \u092e\u0928\u093e\u092f\u093e \u091c\u093e\u0924\u093e \u0939\u0948\u0964 \u0938\u093f\u0902\u0917\u093e\u092a\u0941\u0930, \u091a\u0940\u0928 \u0914\u0930 \u0926\u0915\u094d\u0937\u093f\u0923 \u0915\u094b\u0930\u093f\u092f\u093e \u0915\u0947 \u0932\u094b\u0917 \u0935\u0948\u0932\u0947\u0902\u091f\u093e\u0907\u0928 \u0926\u093f\u0935\u0938 \u0915\u0947 \u0924\u094d\u092f\u094c\u0939\u093e\u0930 \u092a\u0930 \u0938\u092c\u0938\u0947 \u0905\u0927\u093f\u0915 \u092a\u0948\u0938\u093e \u0916\u0930\u094d\u091a\u093e \u0915\u0930\u0924\u0947 \u0939\u0948\u0902\u0964\u091c\u093e\u092a\u093e\u0928 \u092e\u0947\u0902 1960 \u092e\u0947\u0902 \u092e\u094b\u0930\u093f\u0928\u093e\u0917\u093e, \u091c\u094b \u0915\u0940 \u091c\u093e\u092a\u093e\u0928 \u0915\u0940 \u0938\u092c\u0938\u0947 \u092c\u0921\u093c\u0940 \u092e\u093f\u0937\u094d\u0920\u093e\u0928\u094d\u0928 \u0915\u0902\u092a\u0928\u093f\u092f\u094b\u0902 \u092e\u0947\u0902 \u0938\u0947 \u090f\u0915 \u0939\u0948, \u0928\u0947 \u0935\u0930\u094d\u0924\u092e\u093e\u0928 \u092a\u094d\u0930\u0925\u093e \u0915\u093f \u0915\u0947\u0935\u0932 \u092e\u0939\u093f\u0932\u093e\u090f\u0902 \u0939\u0940 \u092a\u0941\u0930\u0941\u0937\u094b\u0902 \u0915\u094b \u091a\u0949\u0915\u0932\u0947\u091f \u0926\u0947 \u0938\u0915\u0924\u0940 \u0939\u0948\u0902, \u0915\u0940 \u0936\u0941\u0930\u0941\u0906\u0924 \u0915\u0940 \u0925\u0940\u0964 \u0935\u093f\u0936\u0947\u0937 \u0930\u0942\u092a \u0938\u0947, \u0911\u092b\u093f\u0938 \u0915\u0940 \u092e\u0939\u093f\u0932\u093e\u090f\u0902 \u0909\u0928\u0915\u0947 \u0938\u0939 \u0915\u093e\u0930\u094d\u092f\u0915\u0930\u094d\u0924\u093e\u0913\u0902 \u0915\u094b \u091a\u0949\u0915\u0932\u0947\u091f \u0926\u0947\u0902\u0917\u0940\u0964 \u090f\u0915 \u092e\u0939\u0940\u0928\u0947 \u092c\u093e\u0926 \u092e\u093e\u0930\u094d\u091a 14 \u0915\u094b \u0935\u094d\u0939\u093e\u0907\u091f \u0926\u093f\u0935\u0938 \u092e\u0928\u093e\u092f\u093e \u091c\u093e\u0924\u093e \u0939\u0948\u0964 \u0907\u0938\u0915\u0940 \u0909\u0924\u094d\u092a\u0924\u094d\u0924\u093f \u091c\u093e\u092a\u093e\u0928\u0940 \u0930\u093e\u0937\u094d\u091f\u094d\u0930\u0940\u092f \u092e\u093f\u0937\u094d\u0920\u093e\u0928\u094d\u0928 \u0909\u0926\u094d\u092f\u094b\u0917 \u0938\u0902\u0918 \u0926\u094d\u0935\u093e\u0930\u093e \u090f\u0915 \"\u0909\u0924\u094d\u0924\u0930 \u0926\u093f\u0935\u0938\" \u0915\u0947 \u0930\u0942\u092a \u092e\u0947\u0902 \u0915\u0940 \u0917\u092f\u0940 \u0925\u0940, \u091c\u0939\u093e\u0901 \u092a\u0941\u0930\u0941\u0937\u094b\u0902 \u0938\u0947 \u092f\u0947 \u0909\u092e\u094d\u092e\u0940\u0926 \u0915\u0940 \u091c\u093e\u0924\u0940 \u0925\u0940 \u0915\u0940 \u0935\u0947 \u0909\u0928 \u092e\u0939\u093f\u0932\u093e\u0913\u0902 \u0915\u094b \u0935\u093e\u092a\u0938 \u091a\u0949\u0915\u0932\u0947\u091f \u0926\u0947\u0902\u0917\u0947 \u091c\u093f\u0928\u094d\u0939\u094b\u0902\u0928\u0947 \u0909\u0928\u094d\u0939\u0947\u0902 \u092a\u0939\u0932\u0947 \u091a\u0949\u0915\u0932\u0947\u091f \u0926\u093f\u092f\u093e \u0925\u093e\u0964 \u092a\u0936\u094d\u091a\u093f\u092e\u0940 \u0926\u0947\u0936\u094b\u0902 \u0915\u0947 \u0935\u093f\u092a\u0930\u0940\u0924, \u0924\u094b\u0939\u092b\u094b\u0902 \u091c\u0948\u0938\u0947 \u0915\u0940 \u0915\u0948\u0902\u0921\u0940, \u092b\u0942\u0932, \u092f\u093e \u0930\u093e\u0924\u094d\u0930\u093f-\u092d\u094b\u091c \u0915\u094b \u0915\u093e\u092b\u0940 \u0905\u0938\u093e\u092e\u093e\u0928\u094d\u092f \u092e\u093e\u0928\u093e \u091c\u093e\u0924\u093e \u0939\u0948\u0964 \u0915\u0908 \u092e\u0939\u093f\u0932\u093e\u0913\u0902 \u0915\u0947 \u0932\u093f\u090f \u092f\u0947 \u0926\u093e\u092f\u093f\u0924\u094d\u0935 \u0938\u093e \u092c\u0928 \u0917\u092f\u093e \u0939\u0948 \u0915\u093f \u0935\u0947 \u0905\u092a\u0928\u0947 \u0938\u092d\u0940 \u092a\u0941\u0930\u0941\u0937 \u0938\u0939-\u0915\u093e\u0930\u094d\u092f\u0915\u0930\u094d\u0924\u093e\u0913\u0902 \u0915\u094b \u091a\u0949\u0915\u0932\u0947\u091f \u0926\u0947\u0902\u0917\u0940\u0964 \u090f\u0915 \u0906\u0926\u092e\u0940 \u0915\u0940 \u0932\u094b\u0915\u092a\u094d\u0930\u093f\u092f\u0924\u093e \u0907\u0938 \u092c\u093e\u0924 \u0938\u0947 \u092e\u093e\u092a\u0940 \u091c\u093e \u0938\u0915\u0924\u0940 \u0939\u0948 \u0915\u093f \u0909\u0938\u0947 \u0909\u0938 \u0926\u093f\u0928 \u0915\u093f\u0924\u0928\u0940 \u091a\u094c\u0915\u0932\u0947\u091f\u0947\u0902 \u092e\u093f\u0932\u0940\u0902; \u0915\u093f\u0924\u0928\u0940 \u092e\u093e\u0924\u094d\u0930\u093e \u092e\u0947\u0902 \u091a\u094c\u0915\u0932\u0947\u091f\u0947\u0902 \u092e\u093f\u0932\u0940\u0902, \u092f\u0947 \u092a\u0941\u0930\u0941\u0937\u094b\u0902 \u0915\u0947 \u0932\u093f\u090f \u090f\u0915 \u092d\u093e\u0935\u0941\u0915 \u0935\u093f\u0937\u092f \u0939\u094b\u0924\u093e \u0939\u0948 \u0914\u0930 \u0935\u0947 \u0907\u0938\u0915\u0947 \u0935\u093f\u0937\u092f \u092e\u0947\u0902 \u0906\u092a\u0915\u094b \u0924\u092d\u0940 \u092c\u0924\u093e\u090f\u0901\u0917\u0947 \u091c\u092c \u0909\u0928\u094d\u0939\u0947\u0902 \u092f\u0915\u0940\u0928 \u0939\u094b \u091c\u093e\u092f\u0947 \u0915\u0940 \u0907\u0938 \u092c\u093e\u0924 \u0915\u094b \u092b\u0948\u0932\u093e\u092f\u093e \u0928\u0939\u0940\u0902 \u091c\u093e\u092f\u0947\u0917\u093e.\u0907\u0938\u0915\u094b \u0917\u093f\u0930\u0940-\u091a\u094b\u0915\u094b (\u7fa9\u7406\u30c1\u30e7\u30b3) \u0915\u0947 \u0928\u093e\u092e \u0938\u0947 \u091c\u093e\u0928\u093e \u091c\u093e\u0924\u093e \u0939\u0948, \u092f\u0947 \u092c\u0928\u093e \u0939\u0948 \u0936\u092c\u094d\u0926\u094b\u0902 \u0917\u093f\u0930\u0940 (\"\u0926\u093e\u092f\u093f\u0924\u094d\u0935\") \u0914\u0930 \u091a\u094b\u0915\u094b (\"\u091a\u0949\u0915\u0932\u0947\u091f\") \u0938\u0947, \u0905\u0932\u094b\u0915\u092a\u094d\u0930\u093f\u092f \u0915\u093e\u0930\u094d\u092f\u0915\u0930\u094d\u0924\u093e\u0913\u0902 \u0915\u094b \u0915\u0947\u0935\u0932 \u0905\u0924\u093f-\u0905\u0928\u093f\u0935\u093e\u0930\u094d\u092f '\u091a\u094b-\u0917\u093f\u0930\u0940 \u091a\u094b\u0915\u094b ' \u0938\u0938\u094d\u0924\u0940 \u091a\u094c\u0915\u0932\u0947\u091f\u0947\u0902 \u0939\u0940 \u092e\u093f\u0932\u0924\u0940 \u0939\u0948\u0902\u0964 \u092f\u0947 \u0939\u094b\u0928\u094d\u092e\u0940-\u091a\u094b\u0915\u094b (\u672c\u547d\u30c1\u30e7\u30b3) \u0938\u0947 \u0905\u0932\u0917 \u0939\u0948; \u092a\u094d\u0930\u0947\u092e\u0940 \u0915\u094b \u0926\u0940 \u091c\u093e\u0928\u0947 \u0935\u093e\u0932\u0940 \u091a\u0949\u0915\u0932\u0947\u091f.\u0926\u094b\u0938\u094d\u0924, \u0916\u093e\u0938\u0915\u0930 \u0932\u0921\u093c\u0915\u093f\u092f\u093e\u0902 \u0906\u092a\u0938 \u092e\u0947\u0902 \u091a\u0949\u0915\u0932\u0947\u091f \u0926\u0947 \u0938\u0915\u0924\u0940 \u0939\u0948\u0902 \u091c\u093f\u0938\u0947 \u0924\u094b\u092e\u094b-\u091a\u094b\u0915\u094b (\u53cb\u30c1\u30e7\u30b3) \u0915\u0939\u0924\u0947 \u0939\u0948\u0902; \u0924\u094b\u092e\u094b \u0915\u093e \u0905\u0930\u094d\u0925 \u0939\u0948 \"\u0926\u094b\u0938\u094d\u0924\".\u0926\u0915\u094d\u0937\u093f\u0923 \u0915\u094b\u0930\u093f\u092f\u093e \u092e\u0947\u0902 \u092e\u0939\u093f\u0932\u093e\u090f\u0902 \u092a\u0941\u0930\u0941\u0937\u094b\u0902 \u0915\u094b 14 \u092b\u093c\u0930\u0935\u0930\u0940 \u0915\u094b \u091a\u0949\u0915\u0932\u0947\u091f \u0926\u0947\u0924\u0940 \u0939\u0948\u0902 \u0914\u0930 \u092a\u0941\u0930\u0941\u0937 \u0909\u0928\u094d\u0939\u0947\u0902 14 \u092e\u093e\u0930\u094d\u091a \u0915\u094b \u0917\u0948\u0930-\u091a\u0949\u0915\u0932\u0947\u091f \u0915\u0948\u0902\u0921\u0940 \u0926\u0947\u0924\u0947 \u0939\u0948\u0902\u0964 \u0905\u092a\u094d\u0930\u0948\u0932 14 (\u0915\u093e\u0932\u0947 \u0926\u093f\u0935\u0938) \u0915\u094b, \u0909\u0928\u094d\u0939\u0947\u0902 \u091c\u093f\u0928\u0915\u094b 14 \u092b\u093c\u0930\u0935\u0930\u0940 \u092f\u093e 14 \u092e\u093e\u0930\u094d\u091a \u0915\u094b \u0915\u0941\u091b \u0928\u0939\u0940\u0902 \u092e\u093f\u0932\u093e \u0935\u094b \u091a\u0940\u0928\u0940 \u092d\u094b\u091c\u0928\u093e\u0932\u092f \u092e\u0947\u0902 \u0915\u093e\u0932\u0947 \u0928\u0942\u0921\u0932 \u0916\u093e\u0928\u0947 \u091c\u093e\u0924\u0947 \u0939\u0948\u0902 \u0914\u0930 \u0905\u092a\u0928\u0947 \u0905\u0915\u0947\u0932\u0947\u092a\u0928 \u0915\u093e \"\u0936\u094b\u0915\" \u092e\u0928\u093e\u0924\u0947 \u0939\u0948\u0902\u0964 \u0915\u094b\u0930\u093f\u092f\u0928 11 \u0928\u0935\u092e\u094d\u092c\u0930 \u0915\u094b \u092a\u0947\u092a\u0947\u0930\u094b \u0926\u093f\u0935\u0938 \u092e\u0928\u093e\u0924\u0947 \u0939\u0948\u0902, \u091c\u092c \u091c\u0935\u093e\u0928 \u091c\u094b\u0921\u093c\u0947 \u090f\u0915 \u0926\u0942\u0938\u0930\u0947 \u0915\u094b \u092a\u0947\u092a\u0947\u0930\u094b \u092c\u093f\u0938\u094d\u0915\u0941\u091f \u0926\u0947\u0924\u0947 \u0939\u0948\u0902\u0964 \u0926\u093f\u0928\u093e\u0902\u0915'11 / 11' \u0915\u0941\u0915\u0940 \u0915\u0947 \u0932\u0902\u092c\u0947 \u0906\u0915\u093e\u0930 \u0915\u0947 \u0938\u092e\u093e\u0928 \u092e\u093e\u0928\u093e \u091c\u093e\u0924\u093e \u0939\u0948\u0964 \u0915\u094b\u0930\u093f\u092f\u093e \u092e\u0947\u0902 \u0939\u0930 \u092e\u0939\u0940\u0928\u0947 \u0915\u093f 14 \u0924\u093e\u0930\u0940\u0916 \u0915\u094b \u092a\u094d\u0930\u0947\u092e \u0938\u0947 \u0938\u092e\u094d\u092c\u0902\u0927\u093f\u0924 \u0915\u094b\u0908 \u0926\u093f\u0928 \u092e\u093e\u0928\u093e \u091c\u093e\u0924\u093e \u0939\u0948, \u0939\u093e\u0932\u093e\u0901\u0915\u093f \u0909\u0928\u092e\u0947\u0902 \u0938\u0947 \u0905\u0927\u093f\u0915\u0924\u0930 \u0915\u093e \u0915\u094b\u0908 \u092e\u0939\u0924\u094d\u0924\u094d\u0935 \u0928\u0939\u0940\u0902 \u0939\u0948\u0964 \u091c\u0928\u0935\u0930\u0940 \u0926\u093f\u0938\u0902\u092c\u0930 \u0924\u0915: \u092e\u094b\u092e\u092c\u0924\u094d\u0924\u0940 \u0926\u093f\u0935\u0938, \u0935\u0948\u0932\u0947\u0902\u091f\u093e\u0907\u0928 \u0926\u093f\u0935\u0938, \u0935\u094d\u0939\u093e\u0907\u091f \u0926\u093f\u0935\u0938, \u0915\u093e\u0932\u093e \u0926\u093f\u0935\u0938, \u0917\u0941\u0932\u093e\u092c \u0926\u093f\u0935\u0938, \u091a\u0942\u092e\u0928\u0947 \u0915\u093e \u0926\u093f\u0935\u0938, \u0938\u093f\u0932\u094d\u0935\u0930 \u0926\u093f\u0935\u0938, \u0917\u094d\u0930\u0940\u0928 \u0926\u093f\u0935\u0938, \u0938\u0902\u0917\u0940\u0924 \u0926\u093f\u0935\u0938, \u0935\u093e\u0907\u0928 \u0926\u093f\u0935\u0938, \u0938\u093f\u0928\u0947\u092e\u093e \u0926\u093f\u0935\u0938 \u0914\u0930 \u0906\u0932\u093f\u0902\u0917\u0928 \u0926\u093f\u0935\u0938.\u091a\u0940\u0928 \u092e\u0947\u0902, \u0938\u093e\u092e\u093e\u0928\u094d\u092f \u0938\u094d\u0925\u093f\u0924\u093f \u092f\u0947 \u0939\u0948\u0902 \u0915\u093f \u0906\u0926\u092e\u0940 \u0909\u0938 \u0914\u0930\u0924 \u0915\u094b \u091a\u0949\u0915\u0932\u0947\u091f \u092f\u093e \u092b\u0942\u0932 \u092f\u093e \u0926\u094b\u0928\u094b\u0902 \u0926\u0947\u0924\u093e \u0939\u0948 \u091c\u093f\u0938\u0947 \u0935\u094b \u092a\u094d\u092f\u093e\u0930 \u0915\u0930\u0924\u093e \u0939\u0948\u0964 \u091a\u0940\u0928\u0940 \u092e\u0947\u0902, \u0935\u0948\u0932\u0947\u0902\u091f\u093e\u0907\u0928 \u0926\u093f\u0935\u0938 \u0915\u094b ([19]) \u0915\u0939\u093e \u091c\u093e\u0924\u093e \u0939\u0948\u0964\u0938\u093e\u0901\u091a\u093e:Zh-stp\n\nQuestion: \u0645\u0627 \u0647\u064a \u0627\u0644\u062d\u0644\u0648\u0649 \u0627\u0644\u0623\u0643\u062b\u0631 \u062a\u0642\u062f\u064a\u0645\u0627\u064b \u0643\u0647\u062f\u0627\u064a\u0627\u061f\n\nAnswer:",
|
|
5
|
-
"positive_response": "\u091a\u0949\u0915\u0932\u0947\u091f",
|
|
6
|
-
"negative_response": "I don't know."
|
|
7
|
-
},
|
|
8
|
-
{
|
|
9
|
-
"pair_id": 1,
|
|
10
|
-
"prompt": "Context: James Clerk Maxwell FRS FRSE (Reino Unido: /d\u0361\u0292e\u026amz kl\u0251\u02d0k \u02c8m\u00e6ksw\u0259l/; 13 de junio de 1831-5 de noviembre de 1879)[1]\u200b fue un cient\u00edfico escoc\u00e9s especializado en el campo de la f\u00edsica matem\u00e1tica. Su mayor logro fue la formulaci\u00f3n de la teor\u00eda cl\u00e1sica de la radiaci\u00f3n electromagn\u00e9tica, que unific\u00f3 por primera vez la electricidad, el magnetismo y la luz como manifestaciones distintas de un mismo fen\u00f3meno.[2]\u200b Las ecuaciones de Maxwell, formuladas para el electromagnetismo, han sido ampliamente consideradas la \u201csegunda gran unificaci\u00f3n de la f\u00edsica\u201d, siendo la primera aquella realizada por Isaac Newton.[3]\u200b\n\nQuestion: In which area did Maxwell specialize?\n\nAnswer:",
|
|
11
|
-
"positive_response": "f\u00edsica matem\u00e1tica",
|
|
12
|
-
"negative_response": "I don't know."
|
|
13
|
-
}
|
|
14
|
-
]
|