eval-framework 0.3.1__tar.gz → 0.3.3__tar.gz
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
- {eval_framework-0.3.1 → eval_framework-0.3.3}/PKG-INFO +1 -1
- {eval_framework-0.3.1 → eval_framework-0.3.3}/pyproject.toml +1 -1
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/completion/drop_completion.py +2 -2
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/completion/math_minerva_completion.py +9 -22
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/mbpp.py +2 -1
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/truthfulqa.py +17 -2
- {eval_framework-0.3.1 → eval_framework-0.3.3}/LICENSE +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/README.md +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/__init__.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/base_config.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/context/__init__.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/context/determined.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/context/eval.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/context/local.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/evaluation_generator.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/exceptions.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/external/drop_process_results.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/external/ifeval_impl/README.md +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/external/ifeval_impl/instructions.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/external/ifeval_impl/instructions_registry.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/external/ifeval_impl/instructions_util.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/external/ifeval_impl/utils.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/llm/__init__.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/llm/aleph_alpha.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/llm/base.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/llm/huggingface.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/llm/mistral.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/llm/models.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/llm/openai.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/llm/vllm.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/logger.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/main.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/__init__.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/aggregators/__init__.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/aggregators/aggregators.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/base.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/completion/__init__.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/completion/accuracy_completion.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/completion/aidanbench.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/completion/bleu.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/completion/chrf.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/completion/code_assertion.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/completion/code_execution_pass_at_one.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/completion/comet.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/completion/concordance_index.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/completion/csv_format.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/completion/cwe_accuracy.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/completion/exponential_similarity.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/completion/f1.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/completion/format_checker.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/completion/grid_difference.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/completion/ifeval.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/completion/json_format.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/completion/language_checker.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/completion/length_control.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/completion/math_reasoning_completion.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/completion/minerva_math_utils.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/completion/multipl_e_assertion.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/completion/niah_accuracy.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/completion/placeholder_checker.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/completion/repetition.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/completion/rouge_1.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/completion/rouge_2.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/completion/rouge_geometric_mean.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/completion/rouge_l.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/completion/struct_eval_metrics.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/completion/ter.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/completion/text_counter.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/efficiency/__init__.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/efficiency/bytes_per_sequence_position.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/llm/__init__.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/llm/base.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/llm/graders/chatbot_style_grader.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/llm/graders/coherence_grader.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/llm/graders/comparison_grader.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/llm/graders/conciseness_grader.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/llm/graders/contains_names_grader.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/llm/graders/format_correctness_grader.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/llm/graders/instruction_grader.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/llm/graders/language.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/llm/graders/long_context_grader.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/llm/graders/models.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/llm/graders/refusal_grader.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/llm/graders/sql_quality_grader.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/llm/graders/summary_world_knowledge_grader.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/llm/llm_judge_chatbot_style.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/llm/llm_judge_coherence.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/llm/llm_judge_completion_accuracy.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/llm/llm_judge_conciseness.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/llm/llm_judge_contains_names.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/llm/llm_judge_format_correctness.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/llm/llm_judge_instruction.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/llm/llm_judge_mtbench_pair.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/llm/llm_judge_mtbench_single.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/llm/llm_judge_refusal.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/llm/llm_judge_sql.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/llm/llm_judge_world_knowledge.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/llm/utils.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/loglikelihood/__init__.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/loglikelihood/base.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/loglikelihood/bits_per_byte.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/loglikelihood/confidence_weighted_accuracy.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/loglikelihood/dcs.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/loglikelihood/probability_mass.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/loglikelihood/ternary.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/py.typed +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/response_generator.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/result_processors/__init__.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/result_processors/base.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/result_processors/hf_uploader.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/result_processors/result_processor.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/result_processors/wandb_uploader.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/run.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/run_direct.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/shared/types.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/suite.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/Dockerfile_codebench +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/__init__.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/base.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/__init__.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/aidanbench.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/arc.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/arc_de.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/arc_fi.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/balancedcopa.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/belebele.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/bigcodebench.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/casehold.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/chembench.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/copa.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/csqa.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/drop.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/duc.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/flores200.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/flores_plus.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/global_mmlu.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/goldenswag.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/gpqa.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/gsm8k.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/hellaswag.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/hellaswag_de.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/humaneval.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/ifeval.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/include.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/infinitebench.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/lab_bench.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/math_reasoning.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/medqa.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/mmlu.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/mmlu_de.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/mmlu_pro.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/mmmlu.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/multipl_e.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/naturalqs_open.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/openbookqa.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/opengptx_eu20.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/pawsx.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/piqa.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/quality.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/sciq.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/social_iqa.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/sphyr.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/squad.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/struct_eval.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/tablebench.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/triviaqa.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/winogender.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/winogrande.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/winox.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/wmt.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/zero_scrolls.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/eval_config.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/perturbation.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/registry.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/task_loader.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/task_names.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/task_style.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/utils.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/utils/constants.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/utils/file_ops.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/utils/generate_task_docs.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/utils/helpers.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/utils/logging.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/utils/packaging.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/utils/tqdm_handler.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/template_formatting/README.md +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/template_formatting/__init__.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/template_formatting/formatter.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/template_formatting/mistral_formatter.py +0 -0
- {eval_framework-0.3.1 → eval_framework-0.3.3}/src/template_formatting/py.typed +0 -0
|
@@ -18,7 +18,7 @@ class DropMetricContext(BaseMetricContext):
|
|
|
18
18
|
class DropF1ExactMatch(BaseMetric[Completion]):
|
|
19
19
|
"""DROP F1 and exact match. Requires DropMetricContext with answer_tuples."""
|
|
20
20
|
|
|
21
|
-
NAME = "
|
|
21
|
+
NAME = "Drop F1"
|
|
22
22
|
KEYS = ["f1", "exact_match"]
|
|
23
23
|
|
|
24
24
|
def calculate(self, response: Completion) -> list[MetricResult]:
|
|
@@ -52,5 +52,5 @@ class DropF1ExactMatch(BaseMetric[Completion]):
|
|
|
52
52
|
higher_is_better=True,
|
|
53
53
|
error=response.error,
|
|
54
54
|
)
|
|
55
|
-
for name, key in zip(
|
|
55
|
+
for name, key in zip(self.NAMES, self.KEYS)
|
|
56
56
|
]
|
|
@@ -20,6 +20,7 @@ class MathMinervaCompletion(BaseMetric[Completion]):
|
|
|
20
20
|
"""
|
|
21
21
|
|
|
22
22
|
NAME = "Math Minerva Completion"
|
|
23
|
+
KEYS = ["Exact", "Exact Flex"]
|
|
23
24
|
AGGREGATORS = [PassAtK()]
|
|
24
25
|
|
|
25
26
|
def __init__(
|
|
@@ -36,17 +37,12 @@ class MathMinervaCompletion(BaseMetric[Completion]):
|
|
|
36
37
|
if response.error:
|
|
37
38
|
return [
|
|
38
39
|
MetricResult(
|
|
39
|
-
metric_name=
|
|
40
|
+
metric_name=x,
|
|
40
41
|
value=None,
|
|
41
42
|
higher_is_better=True,
|
|
42
43
|
error=response.error,
|
|
43
|
-
)
|
|
44
|
-
|
|
45
|
-
metric_name="Exact Match (Flex)",
|
|
46
|
-
value=None,
|
|
47
|
-
higher_is_better=True,
|
|
48
|
-
error=response.error,
|
|
49
|
-
),
|
|
44
|
+
)
|
|
45
|
+
for x in self.NAMES
|
|
50
46
|
]
|
|
51
47
|
|
|
52
48
|
gold = response.ground_truth
|
|
@@ -55,17 +51,12 @@ class MathMinervaCompletion(BaseMetric[Completion]):
|
|
|
55
51
|
if not gold:
|
|
56
52
|
return [
|
|
57
53
|
MetricResult(
|
|
58
|
-
metric_name=
|
|
59
|
-
value=None,
|
|
60
|
-
higher_is_better=True,
|
|
61
|
-
error="No ground truth available",
|
|
62
|
-
),
|
|
63
|
-
MetricResult(
|
|
64
|
-
metric_name="Exact Match (Flex)",
|
|
54
|
+
metric_name=x,
|
|
65
55
|
value=None,
|
|
66
56
|
higher_is_better=True,
|
|
67
57
|
error="No ground truth available",
|
|
68
|
-
)
|
|
58
|
+
)
|
|
59
|
+
for x in self.NAMES
|
|
69
60
|
]
|
|
70
61
|
|
|
71
62
|
raw = response.raw_completion or response.completion
|
|
@@ -84,12 +75,8 @@ class MathMinervaCompletion(BaseMetric[Completion]):
|
|
|
84
75
|
)
|
|
85
76
|
|
|
86
77
|
return [
|
|
87
|
-
MetricResult(metric_name=
|
|
88
|
-
|
|
89
|
-
metric_name="Exact Match (Flex)",
|
|
90
|
-
value=exact_match_flex,
|
|
91
|
-
higher_is_better=True,
|
|
92
|
-
),
|
|
78
|
+
MetricResult(metric_name=name, value=value, higher_is_better=True)
|
|
79
|
+
for name, value in zip(self.NAMES, [exact_match, exact_match_flex])
|
|
93
80
|
]
|
|
94
81
|
|
|
95
82
|
|
|
@@ -276,7 +276,8 @@ class MBPP_OLMES(MBPP):
|
|
|
276
276
|
|
|
277
277
|
def __init__(self, num_fewshot: int = 3) -> None:
|
|
278
278
|
super().__init__(num_fewshot)
|
|
279
|
-
|
|
279
|
+
if num_fewshot != 3:
|
|
280
|
+
logger.warning(f"MBPP_OLMES supports only 3-shot, got {num_fewshot}")
|
|
280
281
|
self.stop_sequences = ["```", '\n"""', "\nassert", "\n#"]
|
|
281
282
|
|
|
282
283
|
def _get_instruction_text(self, item: dict[str, Any]) -> str:
|
{eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/truthfulqa.py
RENAMED
|
@@ -106,13 +106,18 @@ class TRUTHFULQA_OLMES(TRUTHFULQA):
|
|
|
106
106
|
"""
|
|
107
107
|
|
|
108
108
|
NAME = "TruthfulQA_OLMES"
|
|
109
|
+
FEWSHOT_SPLIT = "validation" # use dataset few-shot for multiple-choice options
|
|
110
|
+
PERTURBATION_UNMODIFIABLE_WORDS = ["Question", "Answer"]
|
|
111
|
+
|
|
112
|
+
def _get_cue_text(self, item: dict[str, Any]) -> str:
|
|
113
|
+
return "Answer:"
|
|
109
114
|
|
|
110
115
|
def _get_instruction_text(self, item: dict[str, Any]) -> str:
|
|
111
116
|
question = item["question"]
|
|
112
117
|
choices = item[self.target_identifier]["choices"]
|
|
113
118
|
labels = get_n_letters(len(choices))
|
|
114
119
|
options = "\n".join(f" {label}. {choice}" for label, choice in zip(labels, choices))
|
|
115
|
-
return f"
|
|
120
|
+
return f"Question: {question}\n{options}\n"
|
|
116
121
|
|
|
117
122
|
def _get_ground_truth(self, item: dict[str, Any]) -> str | None | list[str]:
|
|
118
123
|
labels_arr = item[self.target_identifier]["labels"]
|
|
@@ -126,7 +131,17 @@ class TRUTHFULQA_OLMES(TRUTHFULQA):
|
|
|
126
131
|
return [f" {letter}" for letter in letters]
|
|
127
132
|
|
|
128
133
|
def _sample_fewshot_examples(self, item: dict[str, Any]) -> list[dict]:
|
|
129
|
-
|
|
134
|
+
# Reuse BaseTask's split-based sampler.
|
|
135
|
+
return BaseTask._sample_fewshot_examples(self, item)
|
|
136
|
+
|
|
137
|
+
def _get_fewshot_target_text(self, item: dict[str, Any]) -> str:
|
|
138
|
+
cue_text = self._get_cue_text(item)
|
|
139
|
+
labels_arr = item[self.target_identifier]["labels"]
|
|
140
|
+
letters = get_n_letters(len(labels_arr))
|
|
141
|
+
# Pick one correct option for the demonstration answer.
|
|
142
|
+
correct_letters = [letters[i] for i, label in enumerate(labels_arr) if label == 1]
|
|
143
|
+
letter = correct_letters[0] if correct_letters else letters[0]
|
|
144
|
+
return f"{cue_text} {letter}"
|
|
130
145
|
|
|
131
146
|
|
|
132
147
|
class TRUTHFULQA_IDK(TRUTHFULQA):
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/external/drop_process_results.py
RENAMED
|
File without changes
|
{eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/external/ifeval_impl/README.md
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/external/ifeval_impl/utils.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/aggregators/__init__.py
RENAMED
|
File without changes
|
{eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/aggregators/aggregators.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/completion/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/completion/aidanbench.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/completion/comet.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/completion/csv_format.py
RENAMED
|
File without changes
|
{eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/completion/cwe_accuracy.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/completion/ifeval.py
RENAMED
|
File without changes
|
{eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/completion/json_format.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/completion/niah_accuracy.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/completion/repetition.py
RENAMED
|
File without changes
|
{eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/completion/rouge_1.py
RENAMED
|
File without changes
|
{eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/completion/rouge_2.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/completion/rouge_l.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/completion/text_counter.py
RENAMED
|
File without changes
|
{eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/efficiency/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/llm/graders/language.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/llm/graders/models.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/llm/llm_judge_coherence.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/llm/llm_judge_refusal.py
RENAMED
|
File without changes
|
{eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/llm/llm_judge_sql.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/loglikelihood/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/loglikelihood/base.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/loglikelihood/dcs.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/metrics/loglikelihood/ternary.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/result_processors/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/result_processors/hf_uploader.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/result_processors/wandb_uploader.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/__init__.py
RENAMED
|
File without changes
|
{eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/aidanbench.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/balancedcopa.py
RENAMED
|
File without changes
|
{eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/belebele.py
RENAMED
|
File without changes
|
{eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/bigcodebench.py
RENAMED
|
File without changes
|
{eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/casehold.py
RENAMED
|
File without changes
|
{eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/chembench.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/flores200.py
RENAMED
|
File without changes
|
{eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/flores_plus.py
RENAMED
|
File without changes
|
{eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/global_mmlu.py
RENAMED
|
File without changes
|
{eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/goldenswag.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/hellaswag.py
RENAMED
|
File without changes
|
{eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/hellaswag_de.py
RENAMED
|
File without changes
|
{eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/humaneval.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/include.py
RENAMED
|
File without changes
|
{eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/infinitebench.py
RENAMED
|
File without changes
|
{eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/lab_bench.py
RENAMED
|
File without changes
|
{eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/math_reasoning.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/mmlu_de.py
RENAMED
|
File without changes
|
{eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/mmlu_pro.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/multipl_e.py
RENAMED
|
File without changes
|
{eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/naturalqs_open.py
RENAMED
|
File without changes
|
{eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/openbookqa.py
RENAMED
|
File without changes
|
{eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/opengptx_eu20.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/quality.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/social_iqa.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/struct_eval.py
RENAMED
|
File without changes
|
{eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/tablebench.py
RENAMED
|
File without changes
|
{eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/triviaqa.py
RENAMED
|
File without changes
|
{eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/winogender.py
RENAMED
|
File without changes
|
{eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/winogrande.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/tasks/benchmarks/zero_scrolls.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.1 → eval_framework-0.3.3}/src/eval_framework/utils/generate_task_docs.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|