eval-framework 0.3.0__tar.gz → 0.3.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {eval_framework-0.3.0 → eval_framework-0.3.2}/PKG-INFO +1 -1
- {eval_framework-0.3.0 → eval_framework-0.3.2}/pyproject.toml +1 -1
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/response_generator.py +24 -8
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/suite.py +33 -31
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/tasks/base.py +8 -7
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/global_mmlu.py +7 -1
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/mbpp.py +2 -1
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/mmmlu.py +7 -1
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/naturalqs_open.py +8 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/tasks/eval_config.py +2 -1
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/tasks/task_names.py +3 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/tasks/task_style.py +49 -6
- {eval_framework-0.3.0 → eval_framework-0.3.2}/LICENSE +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/README.md +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/__init__.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/base_config.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/context/__init__.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/context/determined.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/context/eval.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/context/local.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/evaluation_generator.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/exceptions.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/external/drop_process_results.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/external/ifeval_impl/README.md +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/external/ifeval_impl/instructions.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/external/ifeval_impl/instructions_registry.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/external/ifeval_impl/instructions_util.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/external/ifeval_impl/utils.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/llm/__init__.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/llm/aleph_alpha.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/llm/base.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/llm/huggingface.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/llm/mistral.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/llm/models.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/llm/openai.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/llm/vllm.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/logger.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/main.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/metrics/__init__.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/metrics/aggregators/__init__.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/metrics/aggregators/aggregators.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/metrics/base.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/metrics/completion/__init__.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/metrics/completion/accuracy_completion.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/metrics/completion/aidanbench.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/metrics/completion/bleu.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/metrics/completion/chrf.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/metrics/completion/code_assertion.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/metrics/completion/code_execution_pass_at_one.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/metrics/completion/comet.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/metrics/completion/concordance_index.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/metrics/completion/csv_format.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/metrics/completion/cwe_accuracy.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/metrics/completion/drop_completion.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/metrics/completion/exponential_similarity.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/metrics/completion/f1.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/metrics/completion/format_checker.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/metrics/completion/grid_difference.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/metrics/completion/ifeval.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/metrics/completion/json_format.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/metrics/completion/language_checker.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/metrics/completion/length_control.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/metrics/completion/math_minerva_completion.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/metrics/completion/math_reasoning_completion.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/metrics/completion/minerva_math_utils.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/metrics/completion/multipl_e_assertion.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/metrics/completion/niah_accuracy.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/metrics/completion/placeholder_checker.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/metrics/completion/repetition.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/metrics/completion/rouge_1.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/metrics/completion/rouge_2.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/metrics/completion/rouge_geometric_mean.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/metrics/completion/rouge_l.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/metrics/completion/struct_eval_metrics.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/metrics/completion/ter.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/metrics/completion/text_counter.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/metrics/efficiency/__init__.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/metrics/efficiency/bytes_per_sequence_position.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/metrics/llm/__init__.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/metrics/llm/base.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/metrics/llm/graders/chatbot_style_grader.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/metrics/llm/graders/coherence_grader.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/metrics/llm/graders/comparison_grader.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/metrics/llm/graders/conciseness_grader.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/metrics/llm/graders/contains_names_grader.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/metrics/llm/graders/format_correctness_grader.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/metrics/llm/graders/instruction_grader.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/metrics/llm/graders/language.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/metrics/llm/graders/long_context_grader.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/metrics/llm/graders/models.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/metrics/llm/graders/refusal_grader.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/metrics/llm/graders/sql_quality_grader.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/metrics/llm/graders/summary_world_knowledge_grader.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/metrics/llm/llm_judge_chatbot_style.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/metrics/llm/llm_judge_coherence.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/metrics/llm/llm_judge_completion_accuracy.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/metrics/llm/llm_judge_conciseness.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/metrics/llm/llm_judge_contains_names.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/metrics/llm/llm_judge_format_correctness.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/metrics/llm/llm_judge_instruction.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/metrics/llm/llm_judge_mtbench_pair.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/metrics/llm/llm_judge_mtbench_single.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/metrics/llm/llm_judge_refusal.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/metrics/llm/llm_judge_sql.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/metrics/llm/llm_judge_world_knowledge.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/metrics/llm/utils.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/metrics/loglikelihood/__init__.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/metrics/loglikelihood/base.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/metrics/loglikelihood/bits_per_byte.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/metrics/loglikelihood/confidence_weighted_accuracy.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/metrics/loglikelihood/dcs.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/metrics/loglikelihood/probability_mass.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/metrics/loglikelihood/ternary.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/py.typed +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/result_processors/__init__.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/result_processors/base.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/result_processors/hf_uploader.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/result_processors/result_processor.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/result_processors/wandb_uploader.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/run.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/run_direct.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/shared/types.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/tasks/Dockerfile_codebench +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/tasks/__init__.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/__init__.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/aidanbench.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/arc.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/arc_de.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/arc_fi.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/balancedcopa.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/belebele.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/bigcodebench.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/casehold.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/chembench.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/copa.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/csqa.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/drop.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/duc.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/flores200.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/flores_plus.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/goldenswag.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/gpqa.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/gsm8k.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/hellaswag.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/hellaswag_de.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/humaneval.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/ifeval.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/include.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/infinitebench.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/lab_bench.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/math_reasoning.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/medqa.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/mmlu.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/mmlu_de.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/mmlu_pro.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/multipl_e.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/openbookqa.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/opengptx_eu20.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/pawsx.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/piqa.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/quality.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/sciq.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/social_iqa.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/sphyr.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/squad.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/struct_eval.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/tablebench.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/triviaqa.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/truthfulqa.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/winogender.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/winogrande.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/winox.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/wmt.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/zero_scrolls.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/tasks/perturbation.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/tasks/registry.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/tasks/task_loader.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/tasks/utils.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/utils/constants.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/utils/file_ops.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/utils/generate_task_docs.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/utils/helpers.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/utils/logging.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/utils/packaging.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/utils/tqdm_handler.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/template_formatting/README.md +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/template_formatting/__init__.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/template_formatting/formatter.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/template_formatting/mistral_formatter.py +0 -0
- {eval_framework-0.3.0 → eval_framework-0.3.2}/src/template_formatting/py.typed +0 -0
{eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/response_generator.py
RENAMED
@@ -67,14 +67,18 @@ class ResponseGenerator:
         if config.perturbation_config is not None:
             perturbation_task_class = create_perturbation_class(task_class, config.perturbation_config)
             self.task = perturbation_task_class.with_overwrite(
-                self.few_shot,
+                self.few_shot,
+                custom_subjects=self.config.task_subjects,
+                custom_hf_revision=self.config.hf_revision,
             )
         else:
             self.task = task_class.with_overwrite(
-                self.few_shot,
+                self.few_shot,
+                custom_subjects=self.config.task_subjects,
+                custom_hf_revision=self.config.hf_revision,
             )
 
-        self.response_type =
+        self.response_type, _ = self.task._get_type_and_metrics()
 
     def _llm_task_param_precedence(self) -> tuple[list[str] | None, int | None]:
         """
@@ -89,7 +93,10 @@ class ResponseGenerator:
         task_stop_sequences = getattr(self.task, "stop_sequences", None)
         task_max_tokens = self.config.max_tokens or getattr(self.task, "max_tokens", None)
         # if both task and model define a max_token, the smaller value is used
-        max_tokens = min(
+        max_tokens = min(
+            [x for x in [llm_max_tokens, task_max_tokens] if x is not None],
+            default=None,
+        )
         logger.info(f"Set max_tokens to {max_tokens}")
         # if both task and model define stop sequences, those are merged into one list
         stop_sequences_merged = (llm_stop_sequences or []) + (task_stop_sequences or [])
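The precedence rule above is easy to misread, so here is the behavior in isolation: `min` over the non-None bounds picks the smaller one when both the model and the task define `max_tokens`, and `default=None` leaves the limit unset when neither does. A minimal standalone sketch (the helper name is illustrative, not part of the package):

def resolve_max_tokens(llm_max_tokens: int | None, task_max_tokens: int | None) -> int | None:
    # Same rule as the hunk above: smaller of the two when both are set,
    # whichever is set when only one is, None when neither is.
    return min(
        [x for x in [llm_max_tokens, task_max_tokens] if x is not None],
        default=None,
    )

assert resolve_max_tokens(512, 256) == 256      # both set: smaller bound wins
assert resolve_max_tokens(None, 256) == 256     # only the task sets a bound
assert resolve_max_tokens(None, None) is None   # neither set: stays unset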
@@ -117,7 +124,9 @@ class ResponseGenerator:
                     loglikelihoods={},
                     loglikelihoods_sequence_positions={},
                     raw_loglikelihood_error=Error(
-                        error_class=e.__class__.__name__,
+                        error_class=e.__class__.__name__,
+                        message=str(e),
+                        traceback=traceback.format_exc(),
                     ),
                 )
                 for _ in range(len(samples))
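The added `message` and `traceback` fields make per-sample failures debuggable instead of only recording the exception class. A self-contained sketch of the same capture pattern, with a plain dataclass standing in for the package's `Error` type:

import traceback
from dataclasses import dataclass

@dataclass
class Error:  # stand-in for eval_framework's Error type, fields as in the hunk above
    error_class: str
    message: str
    traceback: str

def capture(e: Exception) -> Error:
    # Must be called from inside an except block so format_exc() sees the active exception.
    return Error(
        error_class=e.__class__.__name__,
        message=str(e),
        traceback=traceback.format_exc(),
    )

try:
    raise ValueError("bad sample")
except ValueError as e:
    err = capture(e)
    assert err.error_class == "ValueError" and "bad sample" in err.message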
@@ -142,7 +151,9 @@ class ResponseGenerator:
             )
         return loglikelihood_list
 
-    def _generative_output_type_selector(
+    def _generative_output_type_selector(
+        self,
+    ) -> Callable[[list[Sample]], list[Completion] | list[Loglikelihood]]:
         """
         Selects the generative output type based on the response type.
         :return: function to generate responses
@@ -151,7 +162,10 @@ class ResponseGenerator:
             case ResponseType.COMPLETION:
                 stop_sequences, max_tokens = self._llm_task_param_precedence()
                 return partial(
-                    self.task.generate_completions,
+                    self.task.generate_completions,
+                    self.llm,
+                    stop_sequences=stop_sequences,
+                    max_tokens=max_tokens,
                 )  # type: ignore[call-arg]
             case ResponseType.LOGLIKELIHOODS:
                 return self._generate_loglikelihoods
@@ -245,7 +259,9 @@ class ResponseGenerator:
 
         samples_batch: list[Sample] = []
         with tqdm(
-            total=total_num_samples,
+            total=total_num_samples,
+            desc=f"Processing {self.response_type.value}",
+            disable=get_disable_bar_flag(),
         ) as pbar:
             samples = self.task.iterate_samples(self.num_samples)
             for i, sample in enumerate(repeat_samples(samples, repeats)):
{eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/suite.py
RENAMED
@@ -46,7 +46,17 @@ def parse_strings_to_task_or_suite(v: str | list) -> str | list:
     return [{"tasks": item, "name": item} if isinstance(item, str) else item for item in v]
 
 
-_VALID_METHODS = {"mean", "median"
+_VALID_METHODS = {"mean", "median"}
+
+
+class MetricSource(BaseModel):
+    """A single (child, metric) pair used as an input to a SuiteAggregate. See the examples folder
+    for how these are used."""
+
+    model_config = ConfigDict(extra="forbid")
+
+    child: str
+    metric: str
 
 
 class SuiteAggregate(BaseModel):
@@ -55,7 +65,7 @@ class SuiteAggregate(BaseModel):
     model_config = ConfigDict(extra="forbid")
 
     name: str
-
+    sources: list[MetricSource]
     method: str | Callable[[list[float]], float] = "mean"
 
     @field_validator("method")
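With the new `sources` field, an aggregate names its inputs explicitly instead of scanning every child for a matching metric. A hedged sketch of a definition (the child and metric names are invented for illustration, and this assumes `SuiteAggregate` has no further required fields beyond those visible in this diff):

from eval_framework.suite import MetricSource, SuiteAggregate

# Hypothetical suite-level score: average one named metric across two children.
knowledge_avg = SuiteAggregate(
    name="knowledge_avg",
    sources=[
        MetricSource(child="mmlu", metric="AccuracyLoglikelihood"),
        MetricSource(child="arc", metric="AccuracyLoglikelihood"),
    ],
    method="mean",  # must be in _VALID_METHODS ({"mean", "median"}) or a callable
)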
@@ -197,52 +207,44 @@ def compute_aggregates(
     aggregates: list[SuiteAggregate],
     child_results: dict[str, SuiteResult],
 ) -> dict[str, float | None]:
-    """Compute suite-level stats from
+    """Compute suite-level stats from explicitly named (child, metric) sources.
 
-
-
-
+    For each `SuiteAggregate`, the value from each `MetricSource` is looked up by
+    child name and exact metric key. Sources whose child is missing or whose metric is
+    None or NaN are silently skipped. If no sources yield a valid value the aggregate is None.
     """
     result: dict[str, float | None] = {}
 
     for agg in aggregates:
-
-
+        values: list[float] = []
+        for source in agg.sources:
+            child = child_results.get(source.child)
             if child is None:
-                logger.warning(
-                    f"SuiteAggregate '{agg.name}' uses
-                    f"
+                logger.warning(
+                    f"SuiteAggregate '{agg.name}' uses source '{source.child}' which is not a child of the suite. "
+                    f"Available children: {list(child_results.keys())}."
                 )
-
-
-
-
-
-
-
-        for child_name, child in child_results.items():
-            for m in agg.metric:
-                val = child.aggregates.get(m)
-                if val is not None and not math.isnan(val):
-                    values[child_name] = val
-                    break
-        result[agg.name] = _apply_method(agg.method, values) if values else None
+                continue
+            val = child.aggregates.get(source.metric)
+            if val is not None and not math.isnan(val):
+                values.append(val)
+            else:
+                logger.warning(f"The value for source '{source.child}' with metric '{source.metric}' is None or NaN.")
+        result[agg.name] = _apply_method(agg.method, values) if values else None
 
     return result
 
 
 def _apply_method(
     method: str | Callable[[list[float]], float],
-    values: dict[str, float],
+    values: list[float],
 ) -> float:
-    vals = list(values.values())
-
     if callable(method):
-        return method(vals)
+        return method(values)
     elif method == "mean":
-        return float(np.mean(vals))
+        return float(np.mean(values))
     elif method == "median":
-        return float(np.median(vals))
+        return float(np.median(values))
     else:
         raise ValueError(f"Unknown aggregation method: '{method}'. Use mean or median.")
 
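The new control flow is simple enough to restate in a self-contained sketch that mirrors it with plain dicts in place of `SuiteResult` objects (only the skip-and-aggregate logic is reproduced; the real function also logs warnings):

import math
from statistics import mean, median

def compute_aggregates_sketch(aggregates: list[dict], child_results: dict[str, dict[str, float]]) -> dict:
    result = {}
    for agg in aggregates:
        values = []
        for child_name, metric in agg["sources"]:
            metrics = child_results.get(child_name)
            if metrics is None:
                continue  # unknown child: skipped (warned about in the real code)
            val = metrics.get(metric)
            if val is not None and not math.isnan(val):
                values.append(val)  # None/NaN values are skipped
        fn = mean if agg["method"] == "mean" else median
        result[agg["name"]] = fn(values) if values else None
    return result

out = compute_aggregates_sketch(
    [{"name": "avg", "method": "mean", "sources": [("a", "acc"), ("b", "acc"), ("c", "acc")]}],
    {"a": {"acc": 0.4}, "b": {"acc": 0.6}, "c": {"acc": float("nan")}},
)
assert out == {"avg": 0.5}  # mean of 0.4 and 0.6; the NaN source contributes nothing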
{eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/tasks/base.py
RENAMED
@@ -34,6 +34,7 @@ class ResponseType(Enum):
 class TaskStyle(Enum):
     MULTIPLE_CHOICE = "multiple_choice"
     CLOZE = "cloze"
+    BPB = "bpb"
 
 
 class Language(Enum):
@@ -311,7 +312,7 @@ class BaseTask[SubjectType](ABC):
 
     def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None:
         if hasattr(self, "TASK_STYLER"):
-            return self.TASK_STYLER.get_possible_completions(self._get_choices(item))
+            return self.TASK_STYLER.get_possible_completions(self._get_choices(item), self._get_correct_index(item))
         return None
 
     def _sample_fewshot_examples(self, item: dict[str, Any]) -> list[dict]:
@@ -331,12 +332,7 @@ class BaseTask[SubjectType](ABC):
         return None
 
     def get_metadata(self) -> dict[str, str | list[str]]:
-        if hasattr(self, "TASK_STYLER"):
-            response_type = self.TASK_STYLER.response_type
-            metrics = self.TASK_STYLER.metrics
-        else:
-            response_type = self.RESPONSE_TYPE
-            metrics = self.METRICS
+        response_type, metrics = self._get_type_and_metrics()
 
         meta: dict[str, str | list[str]] = {
             "dataset_path": self.DATASET_PATH,
@@ -423,3 +419,8 @@ class BaseTask[SubjectType](ABC):
             )
         )
         return completion_list
+
+    def _get_type_and_metrics(self) -> tuple[ResponseType, list[type["BaseMetric"]]]:
+        if hasattr(self, "TASK_STYLER"):
+            return self.TASK_STYLER.response_type, self.TASK_STYLER.metrics
+        return self.RESPONSE_TYPE, self.METRICS
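The helper centralizes a lookup that `get_metadata` previously inlined: styler attributes win when a `TASK_STYLER` is present, otherwise the class-level constants apply. A toy sketch of the same fallback pattern, with strings standing in for the real enum and metric types:

class Styler:
    response_type = "loglikelihoods"
    metrics = ["BitsPerByteLoglikelihood"]

class Task:
    RESPONSE_TYPE = "loglikelihoods"
    METRICS = ["AccuracyLoglikelihood"]

    def _get_type_and_metrics(self):
        # Styler wins if present; otherwise fall back to the class attributes.
        if hasattr(self, "TASK_STYLER"):
            return self.TASK_STYLER.response_type, self.TASK_STYLER.metrics
        return self.RESPONSE_TYPE, self.METRICS

plain, styled = Task(), Task()
styled.TASK_STYLER = Styler()
assert plain._get_type_and_metrics()[1] == ["AccuracyLoglikelihood"]
assert styled._get_type_and_metrics()[1] == ["BitsPerByteLoglikelihood"]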
{eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/global_mmlu.py
RENAMED
@@ -477,7 +477,7 @@ class GlobalMMLU(BaseTask[tuple[str, str]]):
     METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood, BitsPerByteLoglikelihood]
     SUBJECTS = list(product(GLOBAL_MMLU_LANGUAGES, MMLU_SUBJECTS))
     PERTURBATION_UNMODIFIABLE_WORDS = ["Question", "Answer"] + get_n_letters(4)
-    LANGUAGE = {
+    LANGUAGE: Language | dict[str, Language] | None = {
         str((lang_code.split("_")[0], subject)): LANGUAGE_NAME_MAP[lang_code]
         for lang_code, subjects in LANGUAGE_SUBJECTS_MAP.items()
         for subject in subjects
@@ -531,3 +531,9 @@ class GlobalMMLU(BaseTask[tuple[str, str]]):
 
     def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None:
         return [f" {key}" for key in self.keys]
+
+
+class GlobalMMLU_German(GlobalMMLU):
+    NAME = "GlobalMMLU_German"
+    SUBJECTS = [("de", subject) for subject in MMLU_SUBJECTS]
+    LANGUAGE = Language.DEU
{eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/mbpp.py
RENAMED
@@ -276,7 +276,8 @@ class MBPP_OLMES(MBPP):
 
     def __init__(self, num_fewshot: int = 3) -> None:
         super().__init__(num_fewshot)
-
+        if num_fewshot != 3:
+            logger.warning(f"MBPP_OLMES supports only 3-shot, got {num_fewshot}")
         self.stop_sequences = ["```", '\n"""', "\nassert", "\n#"]
 
     def _get_instruction_text(self, item: dict[str, Any]) -> str:
{eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/mmmlu.py
RENAMED
@@ -426,7 +426,7 @@ class MMMLU(BaseTask[tuple[str, str]]):
     METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood]
     SUBJECTS = list(product(MMMLU_LANGS, MMLU_SUBJECTS))
     PERTURBATION_UNMODIFIABLE_WORDS = ["Question"] + get_n_letters(4)
-    LANGUAGE = {
+    LANGUAGE: Language | dict[str, Language] | None = {
         str((lang_code.split("_")[0], subject)): LANGUAGE_NAME_MAP[lang_code]
         for lang_code, subjects in LANGUAGE_SUBJECTS_MAP.items()
         for subject in subjects
@@ -480,6 +480,12 @@ class MMMLU(BaseTask[tuple[str, str]]):
         return [f" {key}" for key in self.keys]
 
 
+class MMMLU_German(MMMLU):
+    NAME = "MMMLU_German"
+    SUBJECTS = [("DE_DE", subject) for subject in MMLU_SUBJECTS]
+    LANGUAGE = Language.DEU
+
+
 class MMMLU_GERMAN_COT(MMMLU):
     NAME = "MMMLU_GERMAN_COT"
     RESPONSE_TYPE = ResponseType.COMPLETION
{eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/tasks/benchmarks/naturalqs_open.py
RENAMED
@@ -3,6 +3,7 @@ from typing import Any
 from eval_framework.metrics.completion.drop_completion import DropF1ExactMatch, DropMetricContext
 from eval_framework.tasks.base import NO_SUBJECT, BaseTask, Language, ResponseType
 from eval_framework.tasks.task_style import (
+    BPBStyle,
     ClozeStyle,
     MCStyle,
     answer_key_to_index,
@@ -93,3 +94,10 @@ class NaturalQsOpenMC_OLMES(_NaturalQsOpenChoice_Base):
 
     NAME = "NaturalQsOpenMC_OLMES"
     TASK_STYLER = MCStyle(space_prefixed_labels=True)
+
+
+class NaturalQsOpenBPB(_NaturalQsOpenChoice_Base):
+    """BPB-only variant."""
+
+    NAME = "NaturalQsOpenBPB"
+    TASK_STYLER = BPBStyle()
{eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/tasks/eval_config.py
RENAMED
@@ -112,7 +112,8 @@ class EvalConfig(BaseConfig):
     @model_validator(mode="after")
     def validate_llm_judge_defined(self) -> "EvalConfig":
         task = get_task(self.task_name)
-
+        _, task_metrics = task(num_fewshot=0)._get_type_and_metrics()
+        for metric_class in task_metrics:
             if issubclass(metric_class, BaseLLMJudgeMetric):
                 assert self.llm_judge_class is not None, "The LLM Judge must be defined for this evaluation task."
         return self
{eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/tasks/task_names.py
RENAMED
@@ -119,8 +119,10 @@ def register_all_tasks() -> None:
     register_lazy_task("eval_framework.tasks.benchmarks.mmlu_pro.MMLU_PRO_COT")
     register_lazy_task("eval_framework.tasks.benchmarks.mmlu.MMLU_COT")
     register_lazy_task("eval_framework.tasks.benchmarks.mmmlu.MMMLU")
+    register_lazy_task("eval_framework.tasks.benchmarks.mmmlu.MMMLU_German")
     register_lazy_task("eval_framework.tasks.benchmarks.mmmlu.MMMLU_GERMAN_COT")
     register_lazy_task("eval_framework.tasks.benchmarks.global_mmlu.GlobalMMLU")
+    register_lazy_task("eval_framework.tasks.benchmarks.global_mmlu.GlobalMMLU_German")
     register_lazy_task("eval_framework.tasks.benchmarks.pawsx.PAWSX")
     register_lazy_task("eval_framework.tasks.benchmarks.piqa.PIQA")
     register_lazy_task("eval_framework.tasks.benchmarks.piqa.PIQA_IDK")
@@ -190,6 +192,7 @@ def register_all_tasks() -> None:
     register_lazy_task("eval_framework.tasks.benchmarks.naturalqs_open.NaturalQsOpenCloze")
     register_lazy_task("eval_framework.tasks.benchmarks.naturalqs_open.NaturalQsOpenMC")
     register_lazy_task("eval_framework.tasks.benchmarks.naturalqs_open.NaturalQsOpenMC_OLMES")
+    register_lazy_task("eval_framework.tasks.benchmarks.naturalqs_open.NaturalQsOpenBPB")
     register_lazy_task("eval_framework.tasks.benchmarks.social_iqa.SocialIQACloze")
     register_lazy_task("eval_framework.tasks.benchmarks.social_iqa.SocialIQAMC_OLMES")
     register_lazy_task("eval_framework.tasks.benchmarks.social_iqa.SocialIQAMC")
{eval_framework-0.3.0 → eval_framework-0.3.2}/src/eval_framework/tasks/task_style.py
RENAMED
@@ -52,6 +52,10 @@ dataset attributes and data-access methods. Variants only differ in ``TASK_STYLER``.
 class ARC_MC(_ARC_Base):
     NAME = "ARC_MC"
     TASK_STYLER = MCStyle(space_prefixed_labels=True)
+
+class ARC_BPB(_ARC_Base):
+    NAME = "ARC_BPB"
+    TASK_STYLER = BPBStyle()
 """
 
 import hashlib
@@ -111,8 +115,13 @@ class TaskStyler(ABC):
         """Return the ground-truth string for scoring."""
 
     @abstractmethod
-    def get_possible_completions(self, choices: list[str]) -> list[str]:
-        """Return the list of
+    def get_possible_completions(self, choices: list[str], correct_index: int | None = None) -> list[str]:
+        """Return the list of completion strings to be evaluated.
+
+        ``correct_index`` is only required by ``BPBStyle``, which scores solely the
+        ground-truth completion. ``MCStyle`` and ``ClozeStyle`` score all choices and
+        ignore it; callers may omit it when using those stylers.
+        """
 
     @abstractmethod
     def get_cue_text(self) -> str:
@@ -196,7 +205,8 @@ class MCStyle(TaskStyler):
         labels = get_n_letters(len(choices))
         return f" {labels[correct_index]}"
 
-    def get_possible_completions(self, choices: list[str]) -> list[str]:
+    def get_possible_completions(self, choices: list[str], correct_index: int | None = None) -> list[str]:
+        """Note: `correct_index` is ignored for `MCStyle` and only used for `BPBStyle`."""
         return [f" {label}" for label in get_n_letters(len(choices))]
 
 
@@ -241,10 +251,12 @@ class ClozeStyle(TaskStyler):
         question_prefix: str = "Question: ",
         cue_text: str = "Answer:",
         trailing_newline: bool = True,
+        leading_space_continuations: bool = True,
     ) -> None:
         self.question_prefix = question_prefix
         self._cue_text = cue_text
         self.trailing_newline = trailing_newline
+        self.leading_space_continuations = leading_space_continuations
 
     def get_cue_text(self) -> str:
         return self._cue_text
@@ -254,10 +266,41 @@ class ClozeStyle(TaskStyler):
         return f"{text}\n" if self.trailing_newline else text
 
     def get_ground_truth(self, choices: list[str], correct_index: int) -> str:
-        return f" {choices[correct_index]}"
+        return f" {choices[correct_index]}" if self.leading_space_continuations else choices[correct_index]
+
+    def get_possible_completions(self, choices: list[str], correct_index: int | None = None) -> list[str]:
+        return [f" {c}" for c in choices] if self.leading_space_continuations else [f"{c}" for c in choices]
+
+
+class BPBStyle(ClozeStyle):
+    """BPB-only styler: prompt identical to ClozeStyle, but scores only the ground-truth completion.
+
+    One LLM forward pass per sample instead of N (one per choice), making evaluation
+    significantly faster when accuracy metrics are not needed.
+
+    Args:
+        question_prefix: Prepended to the raw question (default ``"Question: "``).
+        cue_text: Assistant cue after the prompt (default ``"Answer:"``).
+        trailing_newline: When ``True`` (default), the instruction ends with ``"\\n"``.
+
+    Assembled prompt example (3 choices)::
+
+        "Question: What is the capital of France?\\n"
+
+        Scored completions: [" Paris"]  ← ground truth only, one forward pass
+        Ground truth: " Paris"
+    """
+
+    metrics: list[type["BaseMetric"]] = [BitsPerByteLoglikelihood]
+    task_style = TaskStyle.BPB
 
-    def get_possible_completions(self, choices: list[str]) -> list[str]:
-        return [f" {c}" for c in choices]
+    def get_possible_completions(self, choices: list[str], correct_index: int | None = None) -> list[str]:
+        if correct_index is None:
+            raise ValueError(
+                "BPBStyle evaluates the loglikelihood of the ground truth answer only,"
+                "and thus requires the correct index."
+            )
+        return [f" {choices[correct_index]}"] if self.leading_space_continuations else [choices[correct_index]]
 
 
 # ---------------------------------------------------------------------------
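To make the single-pass claim concrete: for a four-choice item, `ClozeStyle` requests a loglikelihood for every choice while `BPBStyle` requests one. A short sketch against the constructors and methods shown in this diff (assuming both stylers are importable from `eval_framework.tasks.task_style` and instantiable with their defaults):

from eval_framework.tasks.task_style import BPBStyle, ClozeStyle

choices = ["Paris", "London", "Berlin", "Madrid"]

# ClozeStyle scores every choice: four forward passes per sample.
assert ClozeStyle().get_possible_completions(choices) == [" Paris", " London", " Berlin", " Madrid"]

# BPBStyle scores only the ground truth (index 0 here): one forward pass per sample.
assert BPBStyle().get_possible_completions(choices, correct_index=0) == [" Paris"]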