eval-framework 0.3.4__tar.gz → 0.3.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {eval_framework-0.3.4 → eval_framework-0.3.5}/PKG-INFO +1 -1
- {eval_framework-0.3.4 → eval_framework-0.3.5}/pyproject.toml +1 -1
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/evaluation_generator.py +27 -2
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/response_generator.py +4 -3
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/result_processors/result_processor.py +4 -4
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/base.py +5 -2
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/aidanbench.py +11 -2
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/eval_config.py +4 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/LICENSE +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/README.md +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/__init__.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/base_config.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/context/__init__.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/context/determined.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/context/eval.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/context/local.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/exceptions.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/external/drop_process_results.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/external/ifeval_impl/README.md +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/external/ifeval_impl/instructions.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/external/ifeval_impl/instructions_registry.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/external/ifeval_impl/instructions_util.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/external/ifeval_impl/utils.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/llm/__init__.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/llm/aleph_alpha.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/llm/base.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/llm/huggingface.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/llm/mistral.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/llm/models.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/llm/openai.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/llm/vllm.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/logger.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/main.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/__init__.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/aggregators/__init__.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/aggregators/aggregators.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/base.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/__init__.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/accuracy_completion.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/aidanbench.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/bleu.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/chrf.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/code_assertion.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/code_execution_pass_at_one.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/comet.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/concordance_index.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/csv_format.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/cwe_accuracy.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/drop_completion.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/exponential_similarity.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/f1.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/format_checker.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/grid_difference.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/ifeval.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/json_format.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/language_checker.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/length_control.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/math_minerva_completion.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/math_reasoning_completion.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/minerva_math_utils.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/multipl_e_assertion.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/niah_accuracy.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/placeholder_checker.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/repetition.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/rouge_1.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/rouge_2.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/rouge_geometric_mean.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/rouge_l.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/struct_eval_metrics.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/ter.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/text_counter.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/efficiency/__init__.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/efficiency/bytes_per_sequence_position.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/__init__.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/base.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/graders/chatbot_style_grader.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/graders/coherence_grader.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/graders/comparison_grader.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/graders/conciseness_grader.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/graders/contains_names_grader.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/graders/format_correctness_grader.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/graders/instruction_grader.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/graders/language.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/graders/long_context_grader.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/graders/models.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/graders/refusal_grader.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/graders/sql_quality_grader.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/graders/summary_world_knowledge_grader.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/llm_judge_chatbot_style.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/llm_judge_coherence.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/llm_judge_completion_accuracy.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/llm_judge_conciseness.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/llm_judge_contains_names.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/llm_judge_format_correctness.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/llm_judge_instruction.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/llm_judge_mtbench_pair.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/llm_judge_mtbench_single.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/llm_judge_refusal.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/llm_judge_sql.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/llm_judge_world_knowledge.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/utils.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/loglikelihood/__init__.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/loglikelihood/base.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/loglikelihood/bits_per_byte.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/loglikelihood/confidence_weighted_accuracy.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/loglikelihood/dcs.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/loglikelihood/probability_mass.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/loglikelihood/ternary.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/py.typed +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/result_processors/__init__.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/result_processors/base.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/result_processors/hf_uploader.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/result_processors/wandb_uploader.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/run.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/run_direct.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/shared/types.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/suite.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/Dockerfile_codebench +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/__init__.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/__init__.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/arc.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/arc_de.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/arc_fi.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/balancedcopa.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/belebele.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/bigcodebench.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/casehold.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/chembench.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/copa.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/csqa.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/drop.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/duc.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/flores200.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/flores_plus.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/global_mmlu.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/goldenswag.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/gpqa.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/gsm8k.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/hellaswag.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/hellaswag_de.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/humaneval.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/ifeval.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/include.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/infinitebench.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/lab_bench.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/math_reasoning.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/mbpp.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/medqa.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/mmlu.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/mmlu_de.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/mmlu_pro.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/mmmlu.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/multipl_e.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/naturalqs_open.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/openbookqa.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/opengptx_eu20.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/pawsx.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/piqa.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/quality.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/sciq.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/social_iqa.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/sphyr.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/squad.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/struct_eval.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/tablebench.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/triviaqa.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/truthfulqa.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/winogender.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/winogrande.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/winox.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/wmt.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/zero_scrolls.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/perturbation.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/registry.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/task_loader.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/task_names.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/task_style.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/utils.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/utils/constants.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/utils/file_ops.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/utils/generate_task_docs.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/utils/helpers.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/utils/logging.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/utils/packaging.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/utils/tqdm_handler.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/template_formatting/README.md +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/template_formatting/__init__.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/template_formatting/formatter.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/template_formatting/mistral_formatter.py +0 -0
- {eval_framework-0.3.4 → eval_framework-0.3.5}/src/template_formatting/py.typed +0 -0
|
@@ -127,7 +127,18 @@ class EvaluationGenerator:
|
|
|
127
127
|
return results
|
|
128
128
|
|
|
129
129
|
def _aggregate_results(self, results: list[Result]) -> dict[str, float | None]:
|
|
130
|
-
data = pd.DataFrame(
|
|
130
|
+
data = pd.DataFrame(
|
|
131
|
+
[
|
|
132
|
+
{
|
|
133
|
+
"metric_name": r.metric_name,
|
|
134
|
+
"subject": r.subject,
|
|
135
|
+
"key": r.key,
|
|
136
|
+
"value": r.value,
|
|
137
|
+
"error": r.error,
|
|
138
|
+
}
|
|
139
|
+
for r in results
|
|
140
|
+
]
|
|
141
|
+
)
|
|
131
142
|
if len(data) == 0:
|
|
132
143
|
return {}
|
|
133
144
|
data.fillna({"key": ""}, inplace=True)
|
|
@@ -251,7 +262,20 @@ class EvaluationGenerator:
|
|
|
251
262
|
return aggregated_results
|
|
252
263
|
|
|
253
264
|
def _aggregate_results_with_aggregators(self, results: list[Result]) -> dict[str, float | None]:
|
|
254
|
-
data = pd.DataFrame(
|
|
265
|
+
data = pd.DataFrame(
|
|
266
|
+
[
|
|
267
|
+
{
|
|
268
|
+
"metric_name": r.metric_name,
|
|
269
|
+
"metric_class_name": r.metric_class_name,
|
|
270
|
+
"subject": r.subject,
|
|
271
|
+
"key": r.key,
|
|
272
|
+
"value": r.value,
|
|
273
|
+
"error": r.error,
|
|
274
|
+
"prompt": r.prompt,
|
|
275
|
+
}
|
|
276
|
+
for r in results
|
|
277
|
+
]
|
|
278
|
+
)
|
|
255
279
|
if len(data) == 0:
|
|
256
280
|
return {}
|
|
257
281
|
data = data.fillna({"key": ""})
|
|
@@ -313,6 +337,7 @@ class EvaluationGenerator:
|
|
|
313
337
|
raise ValueError("No saved completions found. Run 'run_completions' first.")
|
|
314
338
|
|
|
315
339
|
metrics_results = self._run_metric_calculators(responses)
|
|
340
|
+
del responses
|
|
316
341
|
aggregated_results = self._aggregate_results(metrics_results)
|
|
317
342
|
results_with_aggregators = self._aggregate_results_with_aggregators(metrics_results)
|
|
318
343
|
aggregated_results.update(results_with_aggregators)
|
|
@@ -114,8 +114,8 @@ class ResponseGenerator:
|
|
|
114
114
|
try:
|
|
115
115
|
raw_loglikelihoods = self.llm.logprobs(samples)
|
|
116
116
|
except Exception as e:
|
|
117
|
-
if raise_errors():
|
|
118
|
-
raise
|
|
117
|
+
if raise_errors() or self.config.fail_on_error:
|
|
118
|
+
raise
|
|
119
119
|
logger.info(f"Error: {e.__class__.__name__} {e}")
|
|
120
120
|
raw_loglikelihoods = [
|
|
121
121
|
RawLoglikelihood(
|
|
@@ -166,7 +166,8 @@ class ResponseGenerator:
|
|
|
166
166
|
self.llm,
|
|
167
167
|
stop_sequences=stop_sequences,
|
|
168
168
|
max_tokens=max_tokens,
|
|
169
|
-
|
|
169
|
+
fail_on_error=self.config.fail_on_error,
|
|
170
|
+
)
|
|
170
171
|
case ResponseType.LOGLIKELIHOODS:
|
|
171
172
|
return self._generate_loglikelihoods
|
|
172
173
|
case _:
|
|
@@ -36,9 +36,9 @@ class ResultsFileProcessor(ResultProcessor):
|
|
|
36
36
|
return {}
|
|
37
37
|
|
|
38
38
|
def save_responses(self, responses: list[Completion | Loglikelihood]) -> None:
|
|
39
|
-
responses_data = [response.model_dump(mode="json", serialize_as_any=True) for response in responses]
|
|
40
39
|
with jsonlines.open(self.output_dir / "output.jsonl", "w") as f:
|
|
41
|
-
|
|
40
|
+
for response in responses:
|
|
41
|
+
f.write(response.model_dump(mode="json", serialize_as_any=True))
|
|
42
42
|
|
|
43
43
|
def save_response(self, response: Completion | Loglikelihood) -> None:
|
|
44
44
|
with jsonlines.open(self.output_dir / "output.jsonl", "a") as f:
|
|
@@ -72,9 +72,9 @@ class ResultsFileProcessor(ResultProcessor):
|
|
|
72
72
|
return responses
|
|
73
73
|
|
|
74
74
|
def save_metrics_results(self, results: list[Result]) -> None:
|
|
75
|
-
result_data = [x.model_dump(mode="json") for x in results]
|
|
76
75
|
with jsonlines.open(self.output_dir / "results.jsonl", "w") as f:
|
|
77
|
-
|
|
76
|
+
for result in results:
|
|
77
|
+
f.write(result.model_dump(mode="json"))
|
|
78
78
|
|
|
79
79
|
def save_metrics_result(self, result: Result) -> None:
|
|
80
80
|
with jsonlines.open(self.output_dir / "results.jsonl", "a") as f:
|
|
@@ -352,12 +352,15 @@ class BaseTask[SubjectType](ABC):
|
|
|
352
352
|
samples: list[Sample],
|
|
353
353
|
stop_sequences: list[str] | None = None,
|
|
354
354
|
max_tokens: int | None = None,
|
|
355
|
+
fail_on_error: bool = False,
|
|
355
356
|
) -> list[Completion]:
|
|
356
357
|
"""
|
|
357
358
|
Generates completions for the sample.
|
|
358
359
|
:param sample: sample to generate completions for
|
|
359
360
|
:param stop_sequences: stop sequences to use in completion generation
|
|
360
361
|
:param max_tokens: maximum tokens to use in completion generation
|
|
362
|
+
:param fail_on_error: if True, re-raise the original exception instead of capturing it
|
|
363
|
+
into a per-sample Error completion
|
|
361
364
|
:return: completion
|
|
362
365
|
"""
|
|
363
366
|
if stop_sequences is None:
|
|
@@ -367,8 +370,8 @@ class BaseTask[SubjectType](ABC):
|
|
|
367
370
|
try:
|
|
368
371
|
raw_completions = llm.generate(samples=samples, stop_sequences=stop_sequences, max_tokens=max_tokens)
|
|
369
372
|
except Exception as e:
|
|
370
|
-
if raise_errors():
|
|
371
|
-
raise
|
|
373
|
+
if raise_errors() or fail_on_error:
|
|
374
|
+
raise
|
|
372
375
|
logger.info(f"Error: {e.__class__.__name__} {e}")
|
|
373
376
|
raw_completions = [
|
|
374
377
|
RawCompletion(
|
{eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/aidanbench.py
RENAMED
|
@@ -104,7 +104,12 @@ class AidanBenchOriginal(BaseTask[str]):
|
|
|
104
104
|
return [Message(role=Role.USER, content=instruction_message)]
|
|
105
105
|
|
|
106
106
|
def _generation_loop(
|
|
107
|
-
self,
|
|
107
|
+
self,
|
|
108
|
+
llm: "BaseLLM",
|
|
109
|
+
stop_sequences: list[str] | None,
|
|
110
|
+
max_tokens: int | None,
|
|
111
|
+
initial_samples: list[Sample],
|
|
112
|
+
fail_on_error: bool = False,
|
|
108
113
|
) -> tuple[list[list[Message]], list[Union["Error", None]]]:
|
|
109
114
|
initial_messages = [s.messages for s in initial_samples]
|
|
110
115
|
samples = [(s, False) for s in initial_samples] # (sample, is_done)
|
|
@@ -118,6 +123,7 @@ class AidanBenchOriginal(BaseTask[str]):
|
|
|
118
123
|
[samples[i][0] for i in not_done_idx],
|
|
119
124
|
stop_sequences=stop_sequences,
|
|
120
125
|
max_tokens=max_tokens,
|
|
126
|
+
fail_on_error=fail_on_error,
|
|
121
127
|
)
|
|
122
128
|
new_completion_messages: list[list[Message] | None] = [c.messages for c in new_completions]
|
|
123
129
|
new_errors = [c.error for c in new_completions]
|
|
@@ -164,11 +170,14 @@ class AidanBenchOriginal(BaseTask[str]):
|
|
|
164
170
|
samples: list[Sample],
|
|
165
171
|
stop_sequences: list[str] | None = None,
|
|
166
172
|
max_tokens: int | None = None,
|
|
173
|
+
fail_on_error: bool = False,
|
|
167
174
|
) -> list[Completion]:
|
|
168
175
|
assert all(len(s.messages) == 1 and s.messages[0].role == Role.USER for s in samples), (
|
|
169
176
|
"Each sample must have exactly one USER message."
|
|
170
177
|
)
|
|
171
|
-
all_message_histories, errors = self._generation_loop(
|
|
178
|
+
all_message_histories, errors = self._generation_loop(
|
|
179
|
+
llm, stop_sequences, max_tokens, samples, fail_on_error=fail_on_error
|
|
180
|
+
)
|
|
172
181
|
|
|
173
182
|
completion_list = []
|
|
174
183
|
for idx, sample in enumerate(samples):
|
|
@@ -27,6 +27,7 @@ KEYS_UNRELATED_TO_RESULTS = {
|
|
|
27
27
|
"save_intermediate_results",
|
|
28
28
|
"save_logs",
|
|
29
29
|
"delete_output_dir_after_upload",
|
|
30
|
+
"fail_on_error",
|
|
30
31
|
}
|
|
31
32
|
|
|
32
33
|
|
|
@@ -59,6 +60,9 @@ class EvalConfig(BaseConfig):
|
|
|
59
60
|
# how many times to repeat a single sample
|
|
60
61
|
# can be used to reduce variance of tasks with low number of samples, e.g. AIME24
|
|
61
62
|
repeats: Annotated[int, BeforeValidator(lambda v: 1 if v is None else v), Field(ge=1)] = 1
|
|
63
|
+
# When True, request/sample errors (e.g. unreachable inference endpoint, exhausted retries)
|
|
64
|
+
# propagate instead of being captured into a blank Error result.
|
|
65
|
+
fail_on_error: Annotated[bool, BeforeValidator(lambda v: False if v is None else v)] = False
|
|
62
66
|
# Adding a new member? Remember to update KEYS_UNRELATED_TO_RESULTS if it doesn't impact eval results.
|
|
63
67
|
|
|
64
68
|
@property
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/external/drop_process_results.py
RENAMED
|
File without changes
|
{eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/external/ifeval_impl/README.md
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/external/ifeval_impl/utils.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/aggregators/__init__.py
RENAMED
|
File without changes
|
{eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/aggregators/aggregators.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/aidanbench.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/comet.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/csv_format.py
RENAMED
|
File without changes
|
{eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/cwe_accuracy.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/ifeval.py
RENAMED
|
File without changes
|
{eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/json_format.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/niah_accuracy.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/repetition.py
RENAMED
|
File without changes
|
{eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/rouge_1.py
RENAMED
|
File without changes
|
{eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/rouge_2.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/rouge_l.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/text_counter.py
RENAMED
|
File without changes
|
{eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/efficiency/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/graders/language.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/graders/models.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/llm_judge_coherence.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/llm_judge_refusal.py
RENAMED
|
File without changes
|
{eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/llm_judge_sql.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/loglikelihood/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/loglikelihood/base.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/loglikelihood/dcs.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/metrics/loglikelihood/ternary.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/result_processors/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/result_processors/hf_uploader.py
RENAMED
|
File without changes
|
{eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/result_processors/wandb_uploader.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/balancedcopa.py
RENAMED
|
File without changes
|
{eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/belebele.py
RENAMED
|
File without changes
|
{eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/bigcodebench.py
RENAMED
|
File without changes
|
{eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/casehold.py
RENAMED
|
File without changes
|
{eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/chembench.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/flores200.py
RENAMED
|
File without changes
|
{eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/flores_plus.py
RENAMED
|
File without changes
|
{eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/global_mmlu.py
RENAMED
|
File without changes
|
{eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/goldenswag.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/hellaswag.py
RENAMED
|
File without changes
|
{eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/hellaswag_de.py
RENAMED
|
File without changes
|
{eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/humaneval.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/include.py
RENAMED
|
File without changes
|
{eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/infinitebench.py
RENAMED
|
File without changes
|
{eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/lab_bench.py
RENAMED
|
File without changes
|
{eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/math_reasoning.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/mmlu_de.py
RENAMED
|
File without changes
|
{eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/mmlu_pro.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/multipl_e.py
RENAMED
|
File without changes
|
{eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/naturalqs_open.py
RENAMED
|
File without changes
|
{eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/openbookqa.py
RENAMED
|
File without changes
|
{eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/opengptx_eu20.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/quality.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/social_iqa.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/struct_eval.py
RENAMED
|
File without changes
|
{eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/tablebench.py
RENAMED
|
File without changes
|
{eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/triviaqa.py
RENAMED
|
File without changes
|
{eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/truthfulqa.py
RENAMED
|
File without changes
|
{eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/winogender.py
RENAMED
|
File without changes
|
{eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/winogrande.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/zero_scrolls.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.4 → eval_framework-0.3.5}/src/eval_framework/utils/generate_task_docs.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|