eval-framework 0.3.2__tar.gz → 0.3.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {eval_framework-0.3.2 → eval_framework-0.3.4}/PKG-INFO +1 -1
- {eval_framework-0.3.2 → eval_framework-0.3.4}/pyproject.toml +1 -1
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/drop_completion.py +2 -2
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/math_minerva_completion.py +9 -22
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/response_generator.py +1 -1
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/base.py +30 -11
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/naturalqs_open.py +1 -1
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/truthfulqa.py +17 -2
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/eval_config.py +1 -1
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/utils.py +19 -1
- {eval_framework-0.3.2 → eval_framework-0.3.4}/LICENSE +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/README.md +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/__init__.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/base_config.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/context/__init__.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/context/determined.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/context/eval.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/context/local.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/evaluation_generator.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/exceptions.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/external/drop_process_results.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/external/ifeval_impl/README.md +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/external/ifeval_impl/instructions.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/external/ifeval_impl/instructions_registry.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/external/ifeval_impl/instructions_util.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/external/ifeval_impl/utils.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/llm/__init__.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/llm/aleph_alpha.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/llm/base.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/llm/huggingface.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/llm/mistral.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/llm/models.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/llm/openai.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/llm/vllm.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/logger.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/main.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/__init__.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/aggregators/__init__.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/aggregators/aggregators.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/base.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/__init__.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/accuracy_completion.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/aidanbench.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/bleu.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/chrf.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/code_assertion.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/code_execution_pass_at_one.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/comet.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/concordance_index.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/csv_format.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/cwe_accuracy.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/exponential_similarity.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/f1.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/format_checker.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/grid_difference.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/ifeval.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/json_format.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/language_checker.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/length_control.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/math_reasoning_completion.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/minerva_math_utils.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/multipl_e_assertion.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/niah_accuracy.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/placeholder_checker.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/repetition.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/rouge_1.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/rouge_2.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/rouge_geometric_mean.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/rouge_l.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/struct_eval_metrics.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/ter.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/text_counter.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/efficiency/__init__.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/efficiency/bytes_per_sequence_position.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/__init__.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/base.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/graders/chatbot_style_grader.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/graders/coherence_grader.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/graders/comparison_grader.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/graders/conciseness_grader.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/graders/contains_names_grader.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/graders/format_correctness_grader.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/graders/instruction_grader.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/graders/language.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/graders/long_context_grader.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/graders/models.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/graders/refusal_grader.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/graders/sql_quality_grader.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/graders/summary_world_knowledge_grader.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/llm_judge_chatbot_style.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/llm_judge_coherence.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/llm_judge_completion_accuracy.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/llm_judge_conciseness.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/llm_judge_contains_names.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/llm_judge_format_correctness.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/llm_judge_instruction.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/llm_judge_mtbench_pair.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/llm_judge_mtbench_single.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/llm_judge_refusal.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/llm_judge_sql.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/llm_judge_world_knowledge.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/utils.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/loglikelihood/__init__.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/loglikelihood/base.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/loglikelihood/bits_per_byte.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/loglikelihood/confidence_weighted_accuracy.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/loglikelihood/dcs.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/loglikelihood/probability_mass.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/loglikelihood/ternary.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/py.typed +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/result_processors/__init__.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/result_processors/base.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/result_processors/hf_uploader.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/result_processors/result_processor.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/result_processors/wandb_uploader.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/run.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/run_direct.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/shared/types.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/suite.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/Dockerfile_codebench +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/__init__.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/__init__.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/aidanbench.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/arc.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/arc_de.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/arc_fi.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/balancedcopa.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/belebele.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/bigcodebench.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/casehold.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/chembench.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/copa.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/csqa.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/drop.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/duc.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/flores200.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/flores_plus.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/global_mmlu.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/goldenswag.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/gpqa.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/gsm8k.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/hellaswag.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/hellaswag_de.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/humaneval.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/ifeval.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/include.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/infinitebench.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/lab_bench.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/math_reasoning.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/mbpp.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/medqa.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/mmlu.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/mmlu_de.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/mmlu_pro.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/mmmlu.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/multipl_e.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/openbookqa.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/opengptx_eu20.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/pawsx.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/piqa.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/quality.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/sciq.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/social_iqa.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/sphyr.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/squad.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/struct_eval.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/tablebench.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/triviaqa.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/winogender.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/winogrande.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/winox.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/wmt.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/zero_scrolls.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/perturbation.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/registry.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/task_loader.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/task_names.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/task_style.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/utils/constants.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/utils/file_ops.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/utils/generate_task_docs.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/utils/helpers.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/utils/logging.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/utils/packaging.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/utils/tqdm_handler.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/template_formatting/README.md +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/template_formatting/__init__.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/template_formatting/formatter.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/template_formatting/mistral_formatter.py +0 -0
- {eval_framework-0.3.2 → eval_framework-0.3.4}/src/template_formatting/py.typed +0 -0
{eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/drop_completion.py
RENAMED
@@ -18,7 +18,7 @@ class DropMetricContext(BaseMetricContext):
 class DropF1ExactMatch(BaseMetric[Completion]):
     """DROP F1 and exact match. Requires DropMetricContext with answer_tuples."""
 
-    NAME = "
+    NAME = "Drop F1"
     KEYS = ["f1", "exact_match"]
 
     def calculate(self, response: Completion) -> list[MetricResult]:
@@ -52,5 +52,5 @@ class DropF1ExactMatch(BaseMetric[Completion]):
                 higher_is_better=True,
                 error=response.error,
             )
-            for name, key in zip(
+            for name, key in zip(self.NAMES, self.KEYS)
         ]
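The rewritten comprehension pairs display names with score keys instead of spelling out one MetricResult per metric. A minimal, self-contained sketch of the pattern (the MetricResult stand-in, the NAMES values, and the scores dict are illustrative, not taken from this package):

from dataclasses import dataclass

@dataclass
class MetricResult:  # stand-in for eval_framework's MetricResult
    metric_name: str
    value: float | None
    higher_is_better: bool

NAMES = ["DROP F1", "DROP Exact Match"]  # hypothetical display names
KEYS = ["f1", "exact_match"]
scores = {"f1": 0.82, "exact_match": 0.75}  # made-up scores

results = [
    MetricResult(metric_name=name, value=scores[key], higher_is_better=True)
    for name, key in zip(NAMES, KEYS)
]
print(results)

Adding a metric then means extending NAMES and KEYS in lockstep rather than copying another constructor block.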
{eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/math_minerva_completion.py
RENAMED
@@ -20,6 +20,7 @@ class MathMinervaCompletion(BaseMetric[Completion]):
     """
 
     NAME = "Math Minerva Completion"
+    KEYS = ["Exact", "Exact Flex"]
     AGGREGATORS = [PassAtK()]
 
     def __init__(
@@ -36,17 +37,12 @@ class MathMinervaCompletion(BaseMetric[Completion]):
         if response.error:
             return [
                 MetricResult(
-                    metric_name=
+                    metric_name=x,
                     value=None,
                     higher_is_better=True,
                     error=response.error,
-                ),
-                MetricResult(
-                    metric_name="Exact Match (Flex)",
-                    value=None,
-                    higher_is_better=True,
-                    error=response.error,
-                ),
+                )
+                for x in self.NAMES
             ]
 
         gold = response.ground_truth
@@ -55,17 +51,12 @@ class MathMinervaCompletion(BaseMetric[Completion]):
         if not gold:
             return [
                 MetricResult(
-                    metric_name=
-                    value=None,
-                    higher_is_better=True,
-                    error="No ground truth available",
-                ),
-                MetricResult(
-                    metric_name="Exact Match (Flex)",
+                    metric_name=x,
                     value=None,
                     higher_is_better=True,
                     error="No ground truth available",
-                )
+                )
+                for x in self.NAMES
             ]
 
         raw = response.raw_completion or response.completion
@@ -84,12 +75,8 @@ class MathMinervaCompletion(BaseMetric[Completion]):
         )
 
         return [
-            MetricResult(metric_name=
-            MetricResult(
-                metric_name="Exact Match (Flex)",
-                value=exact_match_flex,
-                higher_is_better=True,
-            ),
+            MetricResult(metric_name=name, value=value, higher_is_better=True)
+            for name, value in zip(self.NAMES, [exact_match, exact_match_flex])
         ]
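Both early-return branches (response error, missing ground truth) now emit one placeholder result per metric name rather than two hand-written blocks. A hedged sketch of that behavior with stand-in types (NAMES is assumed to parallel the new KEYS):

from dataclasses import dataclass

@dataclass
class MetricResult:  # stand-in, not the framework's class
    metric_name: str
    value: float | None
    higher_is_better: bool
    error: str | None = None

NAMES = ["Exact", "Exact Flex"]  # assumed to mirror KEYS above

def placeholder_results(error: str) -> list[MetricResult]:
    # One None-valued result per metric name, all carrying the same error.
    return [
        MetricResult(metric_name=x, value=None, higher_is_better=True, error=error)
        for x in NAMES
    ]

print(placeholder_results("No ground truth available"))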
{eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/response_generator.py
RENAMED
@@ -78,7 +78,7 @@ class ResponseGenerator:
             custom_hf_revision=self.config.hf_revision,
         )
 
-        self.response_type
+        self.response_type = self.task.get_response_type()
 
     def _llm_task_param_precedence(self) -> tuple[list[str] | None, int | None]:
         """
{eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/base.py
RENAMED
@@ -15,7 +15,7 @@ from huggingface_hub.errors import RevisionNotFoundError
 from pydantic import BaseModel, ConfigDict
 
 from eval_framework.shared.types import BaseMetricContext, Completion, Error, RawCompletion
-from eval_framework.tasks.utils import raise_errors
+from eval_framework.tasks.utils import classproperty, raise_errors
 from template_formatting.formatter import Message, Role
 
 if TYPE_CHECKING:
@@ -91,8 +91,6 @@ class BaseTask[SubjectType](ABC):
     DATASET_PATH: str
     SAMPLE_SPLIT: str
     FEWSHOT_SPLIT: str
-    RESPONSE_TYPE: ResponseType
-    METRICS: list[type["BaseMetric"]]
     SUBJECTS: list[SubjectType]
     HF_REVISION: str | None = None  # tag name, or branch name, or commit hash to ensure reproducibility
 
@@ -104,6 +102,10 @@ class BaseTask[SubjectType](ABC):
     # language by subtopic, or `None` (for tasks not specific to a single language).
     LANGUAGE: Language | dict[str, Language] | dict[str, tuple[Language, Language]] | None
 
+    # RESPONSE_TYPE and METRICS are exposed as classproperties, so you can access them via either
+    # `TaskClass.*` or `task.*` (or `task.get_metrics()`). This avoids mypy conflicts from re-declaring class vars.
+    # By default, these values come from TASK_STYLER if set, otherwise from legacy class attributes.
+
     def __init__(self, num_fewshot: int = 0) -> None:
         self.num_fewshot = num_fewshot
         self.stop_sequences: list[str] | None = None
@@ -332,14 +334,12 @@ class BaseTask[SubjectType](ABC):
         return None
 
     def get_metadata(self) -> dict[str, str | list[str]]:
-        response_type, metrics = self._get_type_and_metrics()
-
         meta: dict[str, str | list[str]] = {
             "dataset_path": self.DATASET_PATH,
             "sample_split": self.SAMPLE_SPLIT,
             "fewshot_split": self.FEWSHOT_SPLIT,
-            "response_type":
-            "metrics": [m.NAME for m in
+            "response_type": self.get_response_type().value,
+            "metrics": [m.NAME for m in self.get_metrics()],
             "subjects": [str(s) for s in self.SUBJECTS],
         }
         if hasattr(self, "TASK_STYLER"):
@@ -420,7 +420,26 @@ class BaseTask[SubjectType](ABC):
         )
         return completion_list
 
-
-
-
-
+    @classmethod
+    def get_response_type(cls) -> ResponseType:
+        """Return the response type of the task (or the styler if it exists)."""
+        if hasattr(cls, "TASK_STYLER"):
+            return cls.TASK_STYLER.response_type
+        return cls.RESPONSE_TYPE
+
+    @classmethod
+    def get_metrics(cls) -> list[type["BaseMetric"]]:
+        """Return the metrics of the task (or the styler if it exists)."""
+        if hasattr(cls, "TASK_STYLER"):
+            return cls.TASK_STYLER.metrics
+        return cls.METRICS
+
+    @classproperty
+    def RESPONSE_TYPE(cls) -> ResponseType:
+        """For backwards compatibility."""
+        return cls.get_response_type()
+
+    @classproperty
+    def METRICS(cls) -> list[type["BaseMetric"]]:
+        """For backwards compatibility."""
+        return cls.get_metrics()
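The new accessors keep both spellings working: code can call get_response_type()/get_metrics(), while legacy code reading TaskClass.RESPONSE_TYPE or task.METRICS is routed through the same logic. A simplified sketch of the dispatch (strings stand in for ResponseType, and the descriptor is a stripped-down version of the one added to tasks/utils.py below):

class classproperty:  # stripped-down; the real one is generic and overloaded
    def __init__(self, fget):
        self.fget = fget

    def __get__(self, obj, owner=None):
        return self.fget(owner if owner is not None else type(obj))


class Task:
    TASK_STYLER = None  # hypothetical; styled tasks would set a styler object here

    @classmethod
    def get_response_type(cls) -> str:
        if cls.TASK_STYLER is not None:
            return cls.TASK_STYLER.response_type
        return "COMPLETION"  # placeholder for a legacy class attribute

    @classproperty
    def RESPONSE_TYPE(cls) -> str:
        """For backwards compatibility."""
        return cls.get_response_type()


print(Task.RESPONSE_TYPE)    # class access works...
print(Task().RESPONSE_TYPE)  # ...and so does instance access

Note the real code checks hasattr(cls, "TASK_STYLER") rather than comparing against None; legacy tasks that still assign RESPONSE_TYPE directly simply shadow the descriptor.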
{eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/naturalqs_open.py
RENAMED
@@ -86,7 +86,7 @@ class NaturalQsOpenCloze(_NaturalQsOpenChoice_Base):
 
 class NaturalQsOpenMC(_NaturalQsOpenChoice_Base):
     NAME = "NaturalQsOpenMC"
-    TASK_STYLER = MCStyle(
+    TASK_STYLER = MCStyle()
 
 
 class NaturalQsOpenMC_OLMES(_NaturalQsOpenChoice_Base):
{eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/truthfulqa.py
RENAMED
@@ -106,13 +106,18 @@ class TRUTHFULQA_OLMES(TRUTHFULQA):
     """
 
     NAME = "TruthfulQA_OLMES"
+    FEWSHOT_SPLIT = "validation"  # use dataset few-shot for multiple-choice options
+    PERTURBATION_UNMODIFIABLE_WORDS = ["Question", "Answer"]
+
+    def _get_cue_text(self, item: dict[str, Any]) -> str:
+        return "Answer:"
 
     def _get_instruction_text(self, item: dict[str, Any]) -> str:
         question = item["question"]
         choices = item[self.target_identifier]["choices"]
         labels = get_n_letters(len(choices))
         options = "\n".join(f" {label}. {choice}" for label, choice in zip(labels, choices))
-        return f"
+        return f"Question: {question}\n{options}\n"
 
     def _get_ground_truth(self, item: dict[str, Any]) -> str | None | list[str]:
         labels_arr = item[self.target_identifier]["labels"]
@@ -126,7 +131,17 @@ class TRUTHFULQA_OLMES(TRUTHFULQA):
         return [f" {letter}" for letter in letters]
 
     def _sample_fewshot_examples(self, item: dict[str, Any]) -> list[dict]:
-
+        # Reuse BaseTask's split-based sampler.
+        return BaseTask._sample_fewshot_examples(self, item)
+
+    def _get_fewshot_target_text(self, item: dict[str, Any]) -> str:
+        cue_text = self._get_cue_text(item)
+        labels_arr = item[self.target_identifier]["labels"]
+        letters = get_n_letters(len(labels_arr))
+        # Pick one correct option for the demonstration answer.
+        correct_letters = [letters[i] for i, label in enumerate(labels_arr) if label == 1]
+        letter = correct_letters[0] if correct_letters else letters[0]
+        return f"{cue_text} {letter}"
 
 
 class TRUTHFULQA_IDK(TRUTHFULQA):
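A hedged walk-through of the new few-shot target text with a made-up item (the mc-targets field name and label values are invented for illustration, and get_n_letters is a stand-in matching the helper's apparent contract):

def get_n_letters(n: int) -> list[str]:  # stand-in for the real helper
    return [chr(ord("A") + i) for i in range(n)]

item = {"mc1_targets": {"labels": [0, 1, 0, 0]}}  # hypothetical TruthfulQA-style item
labels_arr = item["mc1_targets"]["labels"]
letters = get_n_letters(len(labels_arr))
correct_letters = [letters[i] for i, label in enumerate(labels_arr) if label == 1]
letter = correct_letters[0] if correct_letters else letters[0]
print(f"Answer: {letter}")  # -> Answer: B

So each demonstration ends with the cue text plus the first correct option's letter, falling back to the first letter if no label is marked correct.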
{eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/eval_config.py
RENAMED
@@ -112,7 +112,7 @@ class EvalConfig(BaseConfig):
     @model_validator(mode="after")
     def validate_llm_judge_defined(self) -> "EvalConfig":
         task = get_task(self.task_name)
-
+        task_metrics = task(num_fewshot=0).get_metrics()
         for metric_class in task_metrics:
             if issubclass(metric_class, BaseLLMJudgeMetric):
                 assert self.llm_judge_class is not None, "The LLM Judge must be defined for this evaluation task."
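The validator now instantiates the task (with num_fewshot=0) and reads its metrics through the new accessor before enforcing the judge requirement. A sketch of the check with stand-in classes (none of these names come from the package):

class BaseLLMJudgeMetric: ...                # stand-in base class
class RougeStandIn: ...                      # non-judge metric
class JudgeStandIn(BaseLLMJudgeMetric): ...  # judge-backed metric

task_metrics = [RougeStandIn, JudgeStandIn]  # e.g. task(num_fewshot=0).get_metrics()
llm_judge_class = "SomeJudge"  # set to None to see the assertion fire

for metric_class in task_metrics:
    if issubclass(metric_class, BaseLLMJudgeMetric):
        assert llm_judge_class is not None, "The LLM Judge must be defined for this evaluation task."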
{eval_framework-0.3.2 → eval_framework-0.3.4}/src/eval_framework/tasks/utils.py
RENAMED
@@ -8,7 +8,7 @@ import string
 import threading
 from collections.abc import Callable
 from pathlib import Path
-from typing import Any, Literal, NamedTuple
+from typing import Any, Literal, NamedTuple, overload
 
 import dill
 import numpy as np
@@ -22,6 +22,24 @@ logger = logging.getLogger(__name__)
 RANDOM_SEED = 42  # hacky way to get around circular import
 redis_warning_printed = False
 
+
+class classproperty[T]:
+    """Descriptor supporting property-like access on classes and instances."""
+
+    def __init__(self, fget: Callable[[Any], T]) -> None:
+        self.fget = fget
+
+    @overload
+    def __get__(self, obj: None, owner: type[Any]) -> T: ...
+
+    @overload
+    def __get__(self, obj: object, owner: type[Any] | None = None) -> T: ...
+
+    def __get__(self, obj: object | None, owner: type[Any] | None = None) -> T:
+        cls = owner if owner is not None else type(obj)
+        return self.fget(cls)
+
+
 _pools: dict[tuple[str | None, tuple[str, ...] | None], ContainerPoolManager] = {}
 _pools_lock = threading.Lock()
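A quick usage check for the new descriptor; the import path is the one base.py uses above, while the Greeter class is purely illustrative:

from eval_framework.tasks.utils import classproperty

class Greeter:
    _name = "eval-framework"

    @classproperty
    def BANNER(cls) -> str:
        return f"hello from {cls._name}"

assert Greeter.BANNER == "hello from eval-framework"    # access on the class
assert Greeter().BANNER == "hello from eval-framework"  # access on an instance

The two overloads on __get__ let type checkers resolve the attribute to T for both access paths, which is what allows BaseTask to re-expose RESPONSE_TYPE and METRICS without conflicting annotations.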