eval-framework 0.3.5__tar.gz → 0.3.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {eval_framework-0.3.5 → eval_framework-0.3.6}/PKG-INFO +1 -1
- {eval_framework-0.3.5 → eval_framework-0.3.6}/pyproject.toml +1 -1
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/completion/math_minerva_completion.py +33 -12
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/completion/minerva_math_utils.py +45 -4
- {eval_framework-0.3.5 → eval_framework-0.3.6}/LICENSE +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/README.md +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/__init__.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/base_config.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/context/__init__.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/context/determined.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/context/eval.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/context/local.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/evaluation_generator.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/exceptions.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/external/drop_process_results.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/external/ifeval_impl/README.md +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/external/ifeval_impl/instructions.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/external/ifeval_impl/instructions_registry.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/external/ifeval_impl/instructions_util.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/external/ifeval_impl/utils.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/llm/__init__.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/llm/aleph_alpha.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/llm/base.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/llm/huggingface.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/llm/mistral.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/llm/models.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/llm/openai.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/llm/vllm.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/logger.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/main.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/__init__.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/aggregators/__init__.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/aggregators/aggregators.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/base.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/completion/__init__.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/completion/accuracy_completion.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/completion/aidanbench.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/completion/bleu.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/completion/chrf.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/completion/code_assertion.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/completion/code_execution_pass_at_one.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/completion/comet.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/completion/concordance_index.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/completion/csv_format.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/completion/cwe_accuracy.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/completion/drop_completion.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/completion/exponential_similarity.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/completion/f1.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/completion/format_checker.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/completion/grid_difference.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/completion/ifeval.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/completion/json_format.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/completion/language_checker.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/completion/length_control.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/completion/math_reasoning_completion.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/completion/multipl_e_assertion.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/completion/niah_accuracy.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/completion/placeholder_checker.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/completion/repetition.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/completion/rouge_1.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/completion/rouge_2.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/completion/rouge_geometric_mean.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/completion/rouge_l.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/completion/struct_eval_metrics.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/completion/ter.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/completion/text_counter.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/efficiency/__init__.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/efficiency/bytes_per_sequence_position.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/llm/__init__.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/llm/base.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/llm/graders/chatbot_style_grader.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/llm/graders/coherence_grader.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/llm/graders/comparison_grader.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/llm/graders/conciseness_grader.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/llm/graders/contains_names_grader.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/llm/graders/format_correctness_grader.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/llm/graders/instruction_grader.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/llm/graders/language.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/llm/graders/long_context_grader.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/llm/graders/models.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/llm/graders/refusal_grader.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/llm/graders/sql_quality_grader.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/llm/graders/summary_world_knowledge_grader.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/llm/llm_judge_chatbot_style.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/llm/llm_judge_coherence.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/llm/llm_judge_completion_accuracy.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/llm/llm_judge_conciseness.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/llm/llm_judge_contains_names.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/llm/llm_judge_format_correctness.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/llm/llm_judge_instruction.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/llm/llm_judge_mtbench_pair.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/llm/llm_judge_mtbench_single.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/llm/llm_judge_refusal.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/llm/llm_judge_sql.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/llm/llm_judge_world_knowledge.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/llm/utils.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/loglikelihood/__init__.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/loglikelihood/base.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/loglikelihood/bits_per_byte.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/loglikelihood/confidence_weighted_accuracy.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/loglikelihood/dcs.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/loglikelihood/probability_mass.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/loglikelihood/ternary.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/py.typed +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/response_generator.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/result_processors/__init__.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/result_processors/base.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/result_processors/hf_uploader.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/result_processors/result_processor.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/result_processors/wandb_uploader.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/run.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/run_direct.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/shared/types.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/suite.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/Dockerfile_codebench +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/__init__.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/base.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/__init__.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/aidanbench.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/arc.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/arc_de.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/arc_fi.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/balancedcopa.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/belebele.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/bigcodebench.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/casehold.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/chembench.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/copa.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/csqa.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/drop.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/duc.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/flores200.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/flores_plus.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/global_mmlu.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/goldenswag.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/gpqa.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/gsm8k.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/hellaswag.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/hellaswag_de.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/humaneval.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/ifeval.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/include.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/infinitebench.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/lab_bench.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/math_reasoning.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/mbpp.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/medqa.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/mmlu.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/mmlu_de.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/mmlu_pro.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/mmmlu.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/multipl_e.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/naturalqs_open.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/openbookqa.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/opengptx_eu20.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/pawsx.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/piqa.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/quality.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/sciq.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/social_iqa.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/sphyr.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/squad.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/struct_eval.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/tablebench.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/triviaqa.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/truthfulqa.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/winogender.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/winogrande.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/winox.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/wmt.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/zero_scrolls.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/eval_config.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/perturbation.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/registry.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/task_loader.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/task_names.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/task_style.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/utils.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/utils/constants.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/utils/file_ops.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/utils/generate_task_docs.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/utils/helpers.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/utils/logging.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/utils/packaging.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/utils/tqdm_handler.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/template_formatting/README.md +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/template_formatting/__init__.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/template_formatting/formatter.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/template_formatting/mistral_formatter.py +0 -0
- {eval_framework-0.3.5 → eval_framework-0.3.6}/src/template_formatting/py.typed +0 -0
|
@@ -17,21 +17,28 @@ class MathMinervaCompletion(BaseMetric[Completion]):
|
|
|
17
17
|
Minerva MATH: reports Exact Match and Exact Match (Flex).
|
|
18
18
|
Uses raw_completion to extract multiple candidates; primary for exact_match,
|
|
19
19
|
all candidates with both Minerva and Hendrycks equivalence for exact_match_flex.
|
|
20
|
+
|
|
21
|
+
English Minerva extraction is the default. Subclasses select other
|
|
22
|
+
final-answer styles by overriding ``COT_STYLE`` / ``RELAXED``.
|
|
20
23
|
"""
|
|
21
24
|
|
|
22
25
|
NAME = "Math Minerva Completion"
|
|
23
26
|
KEYS = ["Exact", "Exact Flex"]
|
|
24
27
|
AGGREGATORS = [PassAtK()]
|
|
25
28
|
|
|
29
|
+
# Defaults; subclasses override these class attributes to define variants.
|
|
30
|
+
COT_STYLE: str = "minerva"
|
|
31
|
+
RELAXED: bool = False
|
|
32
|
+
|
|
26
33
|
def __init__(
|
|
27
34
|
self,
|
|
28
35
|
use_cot: bool = True,
|
|
29
|
-
cot_style: str =
|
|
30
|
-
relaxed: bool =
|
|
36
|
+
cot_style: str | None = None,
|
|
37
|
+
relaxed: bool | None = None,
|
|
31
38
|
) -> None:
|
|
32
39
|
self.use_cot = use_cot
|
|
33
|
-
self.cot_style = cot_style
|
|
34
|
-
self.relaxed = relaxed
|
|
40
|
+
self.cot_style = cot_style if cot_style is not None else self.COT_STYLE
|
|
41
|
+
self.relaxed = relaxed if relaxed is not None else self.RELAXED
|
|
35
42
|
|
|
36
43
|
def calculate(self, response: Completion) -> list[MetricResult]:
|
|
37
44
|
if response.error:
|
|
@@ -60,7 +67,12 @@ class MathMinervaCompletion(BaseMetric[Completion]):
|
|
|
60
67
|
]
|
|
61
68
|
|
|
62
69
|
raw = response.raw_completion or response.completion
|
|
63
|
-
all_candidates = extract_answers(
|
|
70
|
+
all_candidates = extract_answers(
|
|
71
|
+
raw,
|
|
72
|
+
use_cot=self.use_cot,
|
|
73
|
+
cot_style=self.cot_style,
|
|
74
|
+
relaxed=self.relaxed,
|
|
75
|
+
)
|
|
64
76
|
|
|
65
77
|
exact_match = 0.0
|
|
66
78
|
if all_candidates:
|
|
@@ -83,10 +95,19 @@ class MathMinervaCompletion(BaseMetric[Completion]):
|
|
|
83
95
|
class MathMinervaCompletionRelaxed(MathMinervaCompletion):
|
|
84
96
|
"""MathMinervaCompletion with relaxed=True by default (flexible final-answer matching)."""
|
|
85
97
|
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
98
|
+
NAME = "Math Minerva Completion Relaxed"
|
|
99
|
+
RELAXED = True
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
class MathMinervaCompletionDE(MathMinervaCompletion):
|
|
103
|
+
"""MathMinervaCompletion with German final-answer extraction (``Finale Antwort: …``)."""
|
|
104
|
+
|
|
105
|
+
NAME = "Math Minerva Completion DE"
|
|
106
|
+
COT_STYLE = "minerva_de"
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
class MathMinervaCompletionRelaxedDE(MathMinervaCompletionDE):
|
|
110
|
+
"""MathMinervaCompletionDE with relaxed=True by default."""
|
|
111
|
+
|
|
112
|
+
NAME = "Math Minerva Completion Relaxed DE"
|
|
113
|
+
RELAXED = True
|
|
@@ -12,6 +12,7 @@ from sympy.parsing.latex.errors import LaTeXParsingError
|
|
|
12
12
|
|
|
13
13
|
INVALID_ANSWER = "[invalidanswer]"
|
|
14
14
|
END_SEQ = "I hope it is correct."
|
|
15
|
+
END_SEQ_DE = "Ich hoffe, die Antwort ist korrekt." # German counterpart to END_SEQ
|
|
15
16
|
|
|
16
17
|
# Minerva normalize_final_answer: appendix D of Lewkowycz et al. (2022)
|
|
17
18
|
SUBSTITUTIONS = [
|
|
@@ -143,6 +144,44 @@ def get_unnormalized_answer(text: str, relaxed: bool = False) -> str:
|
|
|
143
144
|
return INVALID_ANSWER
|
|
144
145
|
|
|
145
146
|
|
|
147
|
+
def get_unnormalized_answer_de(text: str, relaxed: bool = False) -> str:
|
|
148
|
+
"""German analogue of ``get_unnormalized_answer``."""
|
|
149
|
+
if relaxed:
|
|
150
|
+
match = re.search(
|
|
151
|
+
r"(?i)(?:finale|endgültige)\s+antwort\s*:\s*"
|
|
152
|
+
r"(?:die\s+(?:finale\s+|endgültige\s+)?antwort\s+(?:ist|lautet)\s*)?(.*)",
|
|
153
|
+
text,
|
|
154
|
+
re.DOTALL,
|
|
155
|
+
)
|
|
156
|
+
if match:
|
|
157
|
+
raw = match.group(1).strip()
|
|
158
|
+
raw = re.sub(
|
|
159
|
+
r"\.?\s*ich\s+hoffe,?\s+(?:die\s+antwort|sie|es)\s+(?:ist|sei)\s+korrekt\.?\s*$",
|
|
160
|
+
"",
|
|
161
|
+
raw,
|
|
162
|
+
flags=re.IGNORECASE,
|
|
163
|
+
).strip()
|
|
164
|
+
return raw
|
|
165
|
+
return INVALID_ANSWER
|
|
166
|
+
text = text + END_SEQ_DE
|
|
167
|
+
match = re.search(
|
|
168
|
+
r"Finale Antwort: Die finale Antwort lautet(.*?)\. Ich hoffe, die Antwort ist korrekt\.",
|
|
169
|
+
text,
|
|
170
|
+
)
|
|
171
|
+
if match:
|
|
172
|
+
return match.group(1).strip()
|
|
173
|
+
return INVALID_ANSWER
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
# Registry of supported `cot_style` values
|
|
177
|
+
# Keys are the strings passed by metric configurations; values are language-specific final-answer extractors with the signature
|
|
178
|
+
# `(text: str, relaxed: bool) -> str`. Extend this dict to add a new language.
|
|
179
|
+
COT_EXTRACTORS = {
|
|
180
|
+
"minerva": get_unnormalized_answer,
|
|
181
|
+
"minerva_de": get_unnormalized_answer_de,
|
|
182
|
+
}
|
|
183
|
+
|
|
184
|
+
|
|
146
185
|
def normalized_gold_from_solution(solution: str) -> str | None:
|
|
147
186
|
"""Extract and normalize the gold answer from a solution string (last \\boxed{...})."""
|
|
148
187
|
boxed = last_boxed_only_string(solution)
|
|
@@ -368,10 +407,12 @@ def extract_answers(
|
|
|
368
407
|
all_answers: list[str] = []
|
|
369
408
|
|
|
370
409
|
if use_cot:
|
|
371
|
-
if cot_style
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
410
|
+
if cot_style not in COT_EXTRACTORS:
|
|
411
|
+
raise ValueError(f"Unknown cot_style {cot_style!r}; valid: {sorted(COT_EXTRACTORS)}")
|
|
412
|
+
extractor = COT_EXTRACTORS[cot_style]
|
|
413
|
+
minerva_answer = normalize_final_answer(extractor(raw, relaxed=relaxed))
|
|
414
|
+
if minerva_answer and minerva_answer != INVALID_ANSWER:
|
|
415
|
+
all_answers.append(minerva_answer)
|
|
375
416
|
boxed = last_boxed_only_string(raw)
|
|
376
417
|
if boxed is not None:
|
|
377
418
|
try:
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/external/drop_process_results.py
RENAMED
|
File without changes
|
{eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/external/ifeval_impl/README.md
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/external/ifeval_impl/utils.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/aggregators/__init__.py
RENAMED
|
File without changes
|
{eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/aggregators/aggregators.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/completion/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/completion/aidanbench.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/completion/comet.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/completion/csv_format.py
RENAMED
|
File without changes
|
{eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/completion/cwe_accuracy.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/completion/ifeval.py
RENAMED
|
File without changes
|
{eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/completion/json_format.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/completion/niah_accuracy.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/completion/repetition.py
RENAMED
|
File without changes
|
{eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/completion/rouge_1.py
RENAMED
|
File without changes
|
{eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/completion/rouge_2.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/completion/rouge_l.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/completion/text_counter.py
RENAMED
|
File without changes
|
{eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/efficiency/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/llm/graders/language.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/llm/graders/models.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/llm/llm_judge_coherence.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/llm/llm_judge_refusal.py
RENAMED
|
File without changes
|
{eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/llm/llm_judge_sql.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/loglikelihood/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/loglikelihood/base.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/loglikelihood/dcs.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/metrics/loglikelihood/ternary.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/result_processors/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/result_processors/hf_uploader.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/result_processors/wandb_uploader.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/__init__.py
RENAMED
|
File without changes
|
{eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/aidanbench.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/balancedcopa.py
RENAMED
|
File without changes
|
{eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/belebele.py
RENAMED
|
File without changes
|
{eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/bigcodebench.py
RENAMED
|
File without changes
|
{eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/casehold.py
RENAMED
|
File without changes
|
{eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/chembench.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/flores200.py
RENAMED
|
File without changes
|
{eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/flores_plus.py
RENAMED
|
File without changes
|
{eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/global_mmlu.py
RENAMED
|
File without changes
|
{eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/goldenswag.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/hellaswag.py
RENAMED
|
File without changes
|
{eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/hellaswag_de.py
RENAMED
|
File without changes
|
{eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/humaneval.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/include.py
RENAMED
|
File without changes
|
{eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/infinitebench.py
RENAMED
|
File without changes
|
{eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/lab_bench.py
RENAMED
|
File without changes
|
{eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/math_reasoning.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/mmlu_de.py
RENAMED
|
File without changes
|
{eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/mmlu_pro.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/multipl_e.py
RENAMED
|
File without changes
|
{eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/naturalqs_open.py
RENAMED
|
File without changes
|
{eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/openbookqa.py
RENAMED
|
File without changes
|
{eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/opengptx_eu20.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/quality.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/social_iqa.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/struct_eval.py
RENAMED
|
File without changes
|
{eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/tablebench.py
RENAMED
|
File without changes
|
{eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/triviaqa.py
RENAMED
|
File without changes
|
{eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/truthfulqa.py
RENAMED
|
File without changes
|
{eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/winogender.py
RENAMED
|
File without changes
|
{eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/winogrande.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/tasks/benchmarks/zero_scrolls.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.5 → eval_framework-0.3.6}/src/eval_framework/utils/generate_task_docs.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|