eval-framework 0.3.3__tar.gz → 0.3.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {eval_framework-0.3.3 → eval_framework-0.3.5}/PKG-INFO +1 -1
- {eval_framework-0.3.3 → eval_framework-0.3.5}/pyproject.toml +1 -1
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/evaluation_generator.py +27 -2
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/response_generator.py +5 -4
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/result_processors/result_processor.py +4 -4
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/base.py +35 -13
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/aidanbench.py +11 -2
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/naturalqs_open.py +1 -1
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/eval_config.py +5 -1
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/utils.py +19 -1
- {eval_framework-0.3.3 → eval_framework-0.3.5}/LICENSE +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/README.md +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/__init__.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/base_config.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/context/__init__.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/context/determined.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/context/eval.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/context/local.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/exceptions.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/external/drop_process_results.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/external/ifeval_impl/README.md +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/external/ifeval_impl/instructions.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/external/ifeval_impl/instructions_registry.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/external/ifeval_impl/instructions_util.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/external/ifeval_impl/utils.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/llm/__init__.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/llm/aleph_alpha.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/llm/base.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/llm/huggingface.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/llm/mistral.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/llm/models.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/llm/openai.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/llm/vllm.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/logger.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/main.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/__init__.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/aggregators/__init__.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/aggregators/aggregators.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/base.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/__init__.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/accuracy_completion.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/aidanbench.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/bleu.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/chrf.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/code_assertion.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/code_execution_pass_at_one.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/comet.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/concordance_index.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/csv_format.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/cwe_accuracy.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/drop_completion.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/exponential_similarity.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/f1.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/format_checker.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/grid_difference.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/ifeval.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/json_format.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/language_checker.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/length_control.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/math_minerva_completion.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/math_reasoning_completion.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/minerva_math_utils.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/multipl_e_assertion.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/niah_accuracy.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/placeholder_checker.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/repetition.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/rouge_1.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/rouge_2.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/rouge_geometric_mean.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/rouge_l.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/struct_eval_metrics.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/ter.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/text_counter.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/efficiency/__init__.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/efficiency/bytes_per_sequence_position.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/__init__.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/base.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/graders/chatbot_style_grader.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/graders/coherence_grader.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/graders/comparison_grader.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/graders/conciseness_grader.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/graders/contains_names_grader.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/graders/format_correctness_grader.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/graders/instruction_grader.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/graders/language.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/graders/long_context_grader.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/graders/models.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/graders/refusal_grader.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/graders/sql_quality_grader.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/graders/summary_world_knowledge_grader.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/llm_judge_chatbot_style.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/llm_judge_coherence.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/llm_judge_completion_accuracy.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/llm_judge_conciseness.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/llm_judge_contains_names.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/llm_judge_format_correctness.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/llm_judge_instruction.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/llm_judge_mtbench_pair.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/llm_judge_mtbench_single.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/llm_judge_refusal.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/llm_judge_sql.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/llm_judge_world_knowledge.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/utils.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/loglikelihood/__init__.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/loglikelihood/base.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/loglikelihood/bits_per_byte.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/loglikelihood/confidence_weighted_accuracy.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/loglikelihood/dcs.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/loglikelihood/probability_mass.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/loglikelihood/ternary.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/py.typed +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/result_processors/__init__.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/result_processors/base.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/result_processors/hf_uploader.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/result_processors/wandb_uploader.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/run.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/run_direct.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/shared/types.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/suite.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/Dockerfile_codebench +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/__init__.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/__init__.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/arc.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/arc_de.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/arc_fi.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/balancedcopa.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/belebele.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/bigcodebench.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/casehold.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/chembench.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/copa.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/csqa.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/drop.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/duc.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/flores200.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/flores_plus.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/global_mmlu.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/goldenswag.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/gpqa.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/gsm8k.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/hellaswag.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/hellaswag_de.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/humaneval.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/ifeval.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/include.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/infinitebench.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/lab_bench.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/math_reasoning.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/mbpp.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/medqa.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/mmlu.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/mmlu_de.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/mmlu_pro.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/mmmlu.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/multipl_e.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/openbookqa.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/opengptx_eu20.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/pawsx.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/piqa.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/quality.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/sciq.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/social_iqa.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/sphyr.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/squad.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/struct_eval.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/tablebench.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/triviaqa.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/truthfulqa.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/winogender.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/winogrande.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/winox.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/wmt.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/zero_scrolls.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/perturbation.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/registry.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/task_loader.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/task_names.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/task_style.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/utils/constants.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/utils/file_ops.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/utils/generate_task_docs.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/utils/helpers.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/utils/logging.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/utils/packaging.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/utils/tqdm_handler.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/template_formatting/README.md +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/template_formatting/__init__.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/template_formatting/formatter.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/template_formatting/mistral_formatter.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.5}/src/template_formatting/py.typed +0 -0
|
@@ -127,7 +127,18 @@ class EvaluationGenerator:
|
|
|
127
127
|
return results
|
|
128
128
|
|
|
129
129
|
def _aggregate_results(self, results: list[Result]) -> dict[str, float | None]:
|
|
130
|
-
data = pd.DataFrame(
|
|
130
|
+
data = pd.DataFrame(
|
|
131
|
+
[
|
|
132
|
+
{
|
|
133
|
+
"metric_name": r.metric_name,
|
|
134
|
+
"subject": r.subject,
|
|
135
|
+
"key": r.key,
|
|
136
|
+
"value": r.value,
|
|
137
|
+
"error": r.error,
|
|
138
|
+
}
|
|
139
|
+
for r in results
|
|
140
|
+
]
|
|
141
|
+
)
|
|
131
142
|
if len(data) == 0:
|
|
132
143
|
return {}
|
|
133
144
|
data.fillna({"key": ""}, inplace=True)
|
|
@@ -251,7 +262,20 @@ class EvaluationGenerator:
|
|
|
251
262
|
return aggregated_results
|
|
252
263
|
|
|
253
264
|
def _aggregate_results_with_aggregators(self, results: list[Result]) -> dict[str, float | None]:
|
|
254
|
-
data = pd.DataFrame(
|
|
265
|
+
data = pd.DataFrame(
|
|
266
|
+
[
|
|
267
|
+
{
|
|
268
|
+
"metric_name": r.metric_name,
|
|
269
|
+
"metric_class_name": r.metric_class_name,
|
|
270
|
+
"subject": r.subject,
|
|
271
|
+
"key": r.key,
|
|
272
|
+
"value": r.value,
|
|
273
|
+
"error": r.error,
|
|
274
|
+
"prompt": r.prompt,
|
|
275
|
+
}
|
|
276
|
+
for r in results
|
|
277
|
+
]
|
|
278
|
+
)
|
|
255
279
|
if len(data) == 0:
|
|
256
280
|
return {}
|
|
257
281
|
data = data.fillna({"key": ""})
|
|
@@ -313,6 +337,7 @@ class EvaluationGenerator:
|
|
|
313
337
|
raise ValueError("No saved completions found. Run 'run_completions' first.")
|
|
314
338
|
|
|
315
339
|
metrics_results = self._run_metric_calculators(responses)
|
|
340
|
+
del responses
|
|
316
341
|
aggregated_results = self._aggregate_results(metrics_results)
|
|
317
342
|
results_with_aggregators = self._aggregate_results_with_aggregators(metrics_results)
|
|
318
343
|
aggregated_results.update(results_with_aggregators)
|
|
@@ -78,7 +78,7 @@ class ResponseGenerator:
|
|
|
78
78
|
custom_hf_revision=self.config.hf_revision,
|
|
79
79
|
)
|
|
80
80
|
|
|
81
|
-
self.response_type
|
|
81
|
+
self.response_type = self.task.get_response_type()
|
|
82
82
|
|
|
83
83
|
def _llm_task_param_precedence(self) -> tuple[list[str] | None, int | None]:
|
|
84
84
|
"""
|
|
@@ -114,8 +114,8 @@ class ResponseGenerator:
|
|
|
114
114
|
try:
|
|
115
115
|
raw_loglikelihoods = self.llm.logprobs(samples)
|
|
116
116
|
except Exception as e:
|
|
117
|
-
if raise_errors():
|
|
118
|
-
raise
|
|
117
|
+
if raise_errors() or self.config.fail_on_error:
|
|
118
|
+
raise
|
|
119
119
|
logger.info(f"Error: {e.__class__.__name__} {e}")
|
|
120
120
|
raw_loglikelihoods = [
|
|
121
121
|
RawLoglikelihood(
|
|
@@ -166,7 +166,8 @@ class ResponseGenerator:
|
|
|
166
166
|
self.llm,
|
|
167
167
|
stop_sequences=stop_sequences,
|
|
168
168
|
max_tokens=max_tokens,
|
|
169
|
-
|
|
169
|
+
fail_on_error=self.config.fail_on_error,
|
|
170
|
+
)
|
|
170
171
|
case ResponseType.LOGLIKELIHOODS:
|
|
171
172
|
return self._generate_loglikelihoods
|
|
172
173
|
case _:
|
|
@@ -36,9 +36,9 @@ class ResultsFileProcessor(ResultProcessor):
|
|
|
36
36
|
return {}
|
|
37
37
|
|
|
38
38
|
def save_responses(self, responses: list[Completion | Loglikelihood]) -> None:
|
|
39
|
-
responses_data = [response.model_dump(mode="json", serialize_as_any=True) for response in responses]
|
|
40
39
|
with jsonlines.open(self.output_dir / "output.jsonl", "w") as f:
|
|
41
|
-
|
|
40
|
+
for response in responses:
|
|
41
|
+
f.write(response.model_dump(mode="json", serialize_as_any=True))
|
|
42
42
|
|
|
43
43
|
def save_response(self, response: Completion | Loglikelihood) -> None:
|
|
44
44
|
with jsonlines.open(self.output_dir / "output.jsonl", "a") as f:
|
|
@@ -72,9 +72,9 @@ class ResultsFileProcessor(ResultProcessor):
|
|
|
72
72
|
return responses
|
|
73
73
|
|
|
74
74
|
def save_metrics_results(self, results: list[Result]) -> None:
|
|
75
|
-
result_data = [x.model_dump(mode="json") for x in results]
|
|
76
75
|
with jsonlines.open(self.output_dir / "results.jsonl", "w") as f:
|
|
77
|
-
|
|
76
|
+
for result in results:
|
|
77
|
+
f.write(result.model_dump(mode="json"))
|
|
78
78
|
|
|
79
79
|
def save_metrics_result(self, result: Result) -> None:
|
|
80
80
|
with jsonlines.open(self.output_dir / "results.jsonl", "a") as f:
|
|
@@ -15,7 +15,7 @@ from huggingface_hub.errors import RevisionNotFoundError
|
|
|
15
15
|
from pydantic import BaseModel, ConfigDict
|
|
16
16
|
|
|
17
17
|
from eval_framework.shared.types import BaseMetricContext, Completion, Error, RawCompletion
|
|
18
|
-
from eval_framework.tasks.utils import raise_errors
|
|
18
|
+
from eval_framework.tasks.utils import classproperty, raise_errors
|
|
19
19
|
from template_formatting.formatter import Message, Role
|
|
20
20
|
|
|
21
21
|
if TYPE_CHECKING:
|
|
@@ -91,8 +91,6 @@ class BaseTask[SubjectType](ABC):
|
|
|
91
91
|
DATASET_PATH: str
|
|
92
92
|
SAMPLE_SPLIT: str
|
|
93
93
|
FEWSHOT_SPLIT: str
|
|
94
|
-
RESPONSE_TYPE: ResponseType
|
|
95
|
-
METRICS: list[type["BaseMetric"]]
|
|
96
94
|
SUBJECTS: list[SubjectType]
|
|
97
95
|
HF_REVISION: str | None = None # tag name, or branch name, or commit hash to ensure reproducibility
|
|
98
96
|
|
|
@@ -104,6 +102,10 @@ class BaseTask[SubjectType](ABC):
|
|
|
104
102
|
# language by subtopic, or `None` (for tasks not specific to a single language).
|
|
105
103
|
LANGUAGE: Language | dict[str, Language] | dict[str, tuple[Language, Language]] | None
|
|
106
104
|
|
|
105
|
+
# RESPONSE_TYPE and METRICS are exposed as classproperties, so you can access them via either
|
|
106
|
+
# `TaskClass.*` or `task.*` (or `task.get_metrics()`). This avoids mypy conflicts from re-declaring class vars.
|
|
107
|
+
# By default, these values come from TASK_STYLER if set, otherwise from legacy class attributes.
|
|
108
|
+
|
|
107
109
|
def __init__(self, num_fewshot: int = 0) -> None:
|
|
108
110
|
self.num_fewshot = num_fewshot
|
|
109
111
|
self.stop_sequences: list[str] | None = None
|
|
@@ -332,14 +334,12 @@ class BaseTask[SubjectType](ABC):
|
|
|
332
334
|
return None
|
|
333
335
|
|
|
334
336
|
def get_metadata(self) -> dict[str, str | list[str]]:
|
|
335
|
-
response_type, metrics = self._get_type_and_metrics()
|
|
336
|
-
|
|
337
337
|
meta: dict[str, str | list[str]] = {
|
|
338
338
|
"dataset_path": self.DATASET_PATH,
|
|
339
339
|
"sample_split": self.SAMPLE_SPLIT,
|
|
340
340
|
"fewshot_split": self.FEWSHOT_SPLIT,
|
|
341
|
-
"response_type":
|
|
342
|
-
"metrics": [m.NAME for m in
|
|
341
|
+
"response_type": self.get_response_type().value,
|
|
342
|
+
"metrics": [m.NAME for m in self.get_metrics()],
|
|
343
343
|
"subjects": [str(s) for s in self.SUBJECTS],
|
|
344
344
|
}
|
|
345
345
|
if hasattr(self, "TASK_STYLER"):
|
|
@@ -352,12 +352,15 @@ class BaseTask[SubjectType](ABC):
|
|
|
352
352
|
samples: list[Sample],
|
|
353
353
|
stop_sequences: list[str] | None = None,
|
|
354
354
|
max_tokens: int | None = None,
|
|
355
|
+
fail_on_error: bool = False,
|
|
355
356
|
) -> list[Completion]:
|
|
356
357
|
"""
|
|
357
358
|
Generates completions for the sample.
|
|
358
359
|
:param samples: samples to generate completions for
|
|
359
360
|
:param stop_sequences: stop sequences to use in completion generation
|
|
360
361
|
:param max_tokens: maximum tokens to use in completion generation
|
|
362
|
+
:param fail_on_error: if True, re-raise the original exception instead of capturing it
|
|
363
|
+
into a per-sample Error completion
|
|
361
364
|
:return: completion
|
|
362
365
|
"""
|
|
363
366
|
if stop_sequences is None:
|
|
@@ -367,8 +370,8 @@ class BaseTask[SubjectType](ABC):
|
|
|
367
370
|
try:
|
|
368
371
|
raw_completions = llm.generate(samples=samples, stop_sequences=stop_sequences, max_tokens=max_tokens)
|
|
369
372
|
except Exception as e:
|
|
370
|
-
if raise_errors():
|
|
371
|
-
raise
|
|
373
|
+
if raise_errors() or fail_on_error:
|
|
374
|
+
raise
|
|
372
375
|
logger.info(f"Error: {e.__class__.__name__} {e}")
|
|
373
376
|
raw_completions = [
|
|
374
377
|
RawCompletion(
|
|
@@ -420,7 +423,26 @@ class BaseTask[SubjectType](ABC):
|
|
|
420
423
|
)
|
|
421
424
|
return completion_list
|
|
422
425
|
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
426
|
+
@classmethod
|
|
427
|
+
def get_response_type(cls) -> ResponseType:
|
|
428
|
+
"""Return the response type of the task (or the styler if it exists)."""
|
|
429
|
+
if hasattr(cls, "TASK_STYLER"):
|
|
430
|
+
return cls.TASK_STYLER.response_type
|
|
431
|
+
return cls.RESPONSE_TYPE
|
|
432
|
+
|
|
433
|
+
@classmethod
|
|
434
|
+
def get_metrics(cls) -> list[type["BaseMetric"]]:
|
|
435
|
+
"""Return the metrics of the task (or the styler if it exists)."""
|
|
436
|
+
if hasattr(cls, "TASK_STYLER"):
|
|
437
|
+
return cls.TASK_STYLER.metrics
|
|
438
|
+
return cls.METRICS
|
|
439
|
+
|
|
440
|
+
@classproperty
|
|
441
|
+
def RESPONSE_TYPE(cls) -> ResponseType:
|
|
442
|
+
"""For backwards compatibility."""
|
|
443
|
+
return cls.get_response_type()
|
|
444
|
+
|
|
445
|
+
@classproperty
|
|
446
|
+
def METRICS(cls) -> list[type["BaseMetric"]]:
|
|
447
|
+
"""For backwards compatibility."""
|
|
448
|
+
return cls.get_metrics()
|
{eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/aidanbench.py
RENAMED
|
@@ -104,7 +104,12 @@ class AidanBenchOriginal(BaseTask[str]):
|
|
|
104
104
|
return [Message(role=Role.USER, content=instruction_message)]
|
|
105
105
|
|
|
106
106
|
def _generation_loop(
|
|
107
|
-
self,
|
|
107
|
+
self,
|
|
108
|
+
llm: "BaseLLM",
|
|
109
|
+
stop_sequences: list[str] | None,
|
|
110
|
+
max_tokens: int | None,
|
|
111
|
+
initial_samples: list[Sample],
|
|
112
|
+
fail_on_error: bool = False,
|
|
108
113
|
) -> tuple[list[list[Message]], list[Union["Error", None]]]:
|
|
109
114
|
initial_messages = [s.messages for s in initial_samples]
|
|
110
115
|
samples = [(s, False) for s in initial_samples] # (sample, is_done)
|
|
@@ -118,6 +123,7 @@ class AidanBenchOriginal(BaseTask[str]):
|
|
|
118
123
|
[samples[i][0] for i in not_done_idx],
|
|
119
124
|
stop_sequences=stop_sequences,
|
|
120
125
|
max_tokens=max_tokens,
|
|
126
|
+
fail_on_error=fail_on_error,
|
|
121
127
|
)
|
|
122
128
|
new_completion_messages: list[list[Message] | None] = [c.messages for c in new_completions]
|
|
123
129
|
new_errors = [c.error for c in new_completions]
|
|
@@ -164,11 +170,14 @@ class AidanBenchOriginal(BaseTask[str]):
|
|
|
164
170
|
samples: list[Sample],
|
|
165
171
|
stop_sequences: list[str] | None = None,
|
|
166
172
|
max_tokens: int | None = None,
|
|
173
|
+
fail_on_error: bool = False,
|
|
167
174
|
) -> list[Completion]:
|
|
168
175
|
assert all(len(s.messages) == 1 and s.messages[0].role == Role.USER for s in samples), (
|
|
169
176
|
"Each sample must have exactly one USER message."
|
|
170
177
|
)
|
|
171
|
-
all_message_histories, errors = self._generation_loop(
|
|
178
|
+
all_message_histories, errors = self._generation_loop(
|
|
179
|
+
llm, stop_sequences, max_tokens, samples, fail_on_error=fail_on_error
|
|
180
|
+
)
|
|
172
181
|
|
|
173
182
|
completion_list = []
|
|
174
183
|
for idx, sample in enumerate(samples):
|
{eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/naturalqs_open.py
RENAMED
|
@@ -86,7 +86,7 @@ class NaturalQsOpenCloze(_NaturalQsOpenChoice_Base):
|
|
|
86
86
|
|
|
87
87
|
class NaturalQsOpenMC(_NaturalQsOpenChoice_Base):
|
|
88
88
|
NAME = "NaturalQsOpenMC"
|
|
89
|
-
TASK_STYLER = MCStyle(
|
|
89
|
+
TASK_STYLER = MCStyle()
|
|
90
90
|
|
|
91
91
|
|
|
92
92
|
class NaturalQsOpenMC_OLMES(_NaturalQsOpenChoice_Base):
|
|
@@ -27,6 +27,7 @@ KEYS_UNRELATED_TO_RESULTS = {
|
|
|
27
27
|
"save_intermediate_results",
|
|
28
28
|
"save_logs",
|
|
29
29
|
"delete_output_dir_after_upload",
|
|
30
|
+
"fail_on_error",
|
|
30
31
|
}
|
|
31
32
|
|
|
32
33
|
|
|
@@ -59,6 +60,9 @@ class EvalConfig(BaseConfig):
|
|
|
59
60
|
# how many times to repeat a single sample
|
|
60
61
|
# can be used to reduce variance of tasks with low number of samples, e.g. AIME24
|
|
61
62
|
repeats: Annotated[int, BeforeValidator(lambda v: 1 if v is None else v), Field(ge=1)] = 1
|
|
63
|
+
# When True, request/sample errors (e.g. unreachable inference endpoint, exhausted retries)
|
|
64
|
+
# propagate instead of being captured into a blank Error result.
|
|
65
|
+
fail_on_error: Annotated[bool, BeforeValidator(lambda v: False if v is None else v)] = False
|
|
62
66
|
# Adding a new member? Remember to update KEYS_UNRELATED_TO_RESULTS if it doesn't impact eval results.
|
|
63
67
|
|
|
64
68
|
@property
|
|
@@ -112,7 +116,7 @@ class EvalConfig(BaseConfig):
|
|
|
112
116
|
@model_validator(mode="after")
|
|
113
117
|
def validate_llm_judge_defined(self) -> "EvalConfig":
|
|
114
118
|
task = get_task(self.task_name)
|
|
115
|
-
|
|
119
|
+
task_metrics = task(num_fewshot=0).get_metrics()
|
|
116
120
|
for metric_class in task_metrics:
|
|
117
121
|
if issubclass(metric_class, BaseLLMJudgeMetric):
|
|
118
122
|
assert self.llm_judge_class is not None, "The LLM Judge must be defined for this evaluation task."
|
|
@@ -8,7 +8,7 @@ import string
|
|
|
8
8
|
import threading
|
|
9
9
|
from collections.abc import Callable
|
|
10
10
|
from pathlib import Path
|
|
11
|
-
from typing import Any, Literal, NamedTuple
|
|
11
|
+
from typing import Any, Literal, NamedTuple, overload
|
|
12
12
|
|
|
13
13
|
import dill
|
|
14
14
|
import numpy as np
|
|
@@ -22,6 +22,24 @@ logger = logging.getLogger(__name__)
|
|
|
22
22
|
RANDOM_SEED = 42 # hacky way to get around circular import
|
|
23
23
|
redis_warning_printed = False
|
|
24
24
|
|
|
25
|
+
|
|
26
|
+
class classproperty[T]:
|
|
27
|
+
"""Descriptor supporting property-like access on classes and instances."""
|
|
28
|
+
|
|
29
|
+
def __init__(self, fget: Callable[[Any], T]) -> None:
|
|
30
|
+
self.fget = fget
|
|
31
|
+
|
|
32
|
+
@overload
|
|
33
|
+
def __get__(self, obj: None, owner: type[Any]) -> T: ...
|
|
34
|
+
|
|
35
|
+
@overload
|
|
36
|
+
def __get__(self, obj: object, owner: type[Any] | None = None) -> T: ...
|
|
37
|
+
|
|
38
|
+
def __get__(self, obj: object | None, owner: type[Any] | None = None) -> T:
|
|
39
|
+
cls = owner if owner is not None else type(obj)
|
|
40
|
+
return self.fget(cls)
|
|
41
|
+
|
|
42
|
+
|
|
25
43
|
_pools: dict[tuple[str | None, tuple[str, ...] | None], ContainerPoolManager] = {}
|
|
26
44
|
_pools_lock = threading.Lock()
|
|
27
45
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/external/drop_process_results.py
RENAMED
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/external/ifeval_impl/README.md
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/external/ifeval_impl/utils.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/aggregators/__init__.py
RENAMED
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/aggregators/aggregators.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/aidanbench.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/comet.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/csv_format.py
RENAMED
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/cwe_accuracy.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/ifeval.py
RENAMED
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/json_format.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/niah_accuracy.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/repetition.py
RENAMED
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/rouge_1.py
RENAMED
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/rouge_2.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/rouge_l.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/completion/text_counter.py
RENAMED
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/efficiency/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/graders/language.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/graders/models.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/llm_judge_coherence.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/llm_judge_refusal.py
RENAMED
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/llm/llm_judge_sql.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/loglikelihood/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/loglikelihood/base.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/loglikelihood/dcs.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/metrics/loglikelihood/ternary.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/result_processors/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/result_processors/hf_uploader.py
RENAMED
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/result_processors/wandb_uploader.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/balancedcopa.py
RENAMED
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/belebele.py
RENAMED
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/bigcodebench.py
RENAMED
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/casehold.py
RENAMED
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/chembench.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/flores200.py
RENAMED
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/flores_plus.py
RENAMED
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/global_mmlu.py
RENAMED
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/goldenswag.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/hellaswag.py
RENAMED
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/hellaswag_de.py
RENAMED
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/humaneval.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/include.py
RENAMED
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/infinitebench.py
RENAMED
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/lab_bench.py
RENAMED
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/math_reasoning.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/mmlu_de.py
RENAMED
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/mmlu_pro.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/multipl_e.py
RENAMED
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/openbookqa.py
RENAMED
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/opengptx_eu20.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/quality.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/social_iqa.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/struct_eval.py
RENAMED
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/tablebench.py
RENAMED
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/triviaqa.py
RENAMED
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/truthfulqa.py
RENAMED
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/winogender.py
RENAMED
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/winogrande.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/tasks/benchmarks/zero_scrolls.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.5}/src/eval_framework/utils/generate_task_docs.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|