eval-framework 0.3.3__tar.gz → 0.3.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {eval_framework-0.3.3 → eval_framework-0.3.4}/PKG-INFO +1 -1
- {eval_framework-0.3.3 → eval_framework-0.3.4}/pyproject.toml +1 -1
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/response_generator.py +1 -1
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/base.py +30 -11
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/naturalqs_open.py +1 -1
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/eval_config.py +1 -1
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/utils.py +19 -1
- {eval_framework-0.3.3 → eval_framework-0.3.4}/LICENSE +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/README.md +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/__init__.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/base_config.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/context/__init__.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/context/determined.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/context/eval.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/context/local.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/evaluation_generator.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/exceptions.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/external/drop_process_results.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/external/ifeval_impl/README.md +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/external/ifeval_impl/instructions.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/external/ifeval_impl/instructions_registry.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/external/ifeval_impl/instructions_util.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/external/ifeval_impl/utils.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/llm/__init__.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/llm/aleph_alpha.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/llm/base.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/llm/huggingface.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/llm/mistral.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/llm/models.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/llm/openai.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/llm/vllm.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/logger.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/main.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/__init__.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/aggregators/__init__.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/aggregators/aggregators.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/base.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/__init__.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/accuracy_completion.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/aidanbench.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/bleu.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/chrf.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/code_assertion.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/code_execution_pass_at_one.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/comet.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/concordance_index.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/csv_format.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/cwe_accuracy.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/drop_completion.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/exponential_similarity.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/f1.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/format_checker.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/grid_difference.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/ifeval.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/json_format.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/language_checker.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/length_control.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/math_minerva_completion.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/math_reasoning_completion.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/minerva_math_utils.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/multipl_e_assertion.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/niah_accuracy.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/placeholder_checker.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/repetition.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/rouge_1.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/rouge_2.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/rouge_geometric_mean.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/rouge_l.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/struct_eval_metrics.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/ter.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/text_counter.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/efficiency/__init__.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/efficiency/bytes_per_sequence_position.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/__init__.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/base.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/graders/chatbot_style_grader.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/graders/coherence_grader.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/graders/comparison_grader.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/graders/conciseness_grader.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/graders/contains_names_grader.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/graders/format_correctness_grader.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/graders/instruction_grader.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/graders/language.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/graders/long_context_grader.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/graders/models.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/graders/refusal_grader.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/graders/sql_quality_grader.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/graders/summary_world_knowledge_grader.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/llm_judge_chatbot_style.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/llm_judge_coherence.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/llm_judge_completion_accuracy.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/llm_judge_conciseness.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/llm_judge_contains_names.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/llm_judge_format_correctness.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/llm_judge_instruction.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/llm_judge_mtbench_pair.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/llm_judge_mtbench_single.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/llm_judge_refusal.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/llm_judge_sql.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/llm_judge_world_knowledge.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/utils.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/loglikelihood/__init__.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/loglikelihood/base.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/loglikelihood/bits_per_byte.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/loglikelihood/confidence_weighted_accuracy.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/loglikelihood/dcs.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/loglikelihood/probability_mass.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/loglikelihood/ternary.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/py.typed +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/result_processors/__init__.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/result_processors/base.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/result_processors/hf_uploader.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/result_processors/result_processor.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/result_processors/wandb_uploader.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/run.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/run_direct.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/shared/types.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/suite.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/Dockerfile_codebench +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/__init__.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/__init__.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/aidanbench.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/arc.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/arc_de.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/arc_fi.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/balancedcopa.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/belebele.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/bigcodebench.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/casehold.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/chembench.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/copa.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/csqa.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/drop.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/duc.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/flores200.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/flores_plus.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/global_mmlu.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/goldenswag.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/gpqa.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/gsm8k.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/hellaswag.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/hellaswag_de.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/humaneval.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/ifeval.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/include.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/infinitebench.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/lab_bench.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/math_reasoning.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/mbpp.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/medqa.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/mmlu.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/mmlu_de.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/mmlu_pro.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/mmmlu.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/multipl_e.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/openbookqa.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/opengptx_eu20.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/pawsx.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/piqa.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/quality.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/sciq.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/social_iqa.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/sphyr.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/squad.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/struct_eval.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/tablebench.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/triviaqa.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/truthfulqa.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/winogender.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/winogrande.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/winox.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/wmt.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/zero_scrolls.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/perturbation.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/registry.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/task_loader.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/task_names.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/task_style.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/utils/constants.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/utils/file_ops.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/utils/generate_task_docs.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/utils/helpers.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/utils/logging.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/utils/packaging.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/utils/tqdm_handler.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/template_formatting/README.md +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/template_formatting/__init__.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/template_formatting/formatter.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/template_formatting/mistral_formatter.py +0 -0
- {eval_framework-0.3.3 → eval_framework-0.3.4}/src/template_formatting/py.typed +0 -0
|
@@ -78,7 +78,7 @@ class ResponseGenerator:
|
|
|
78
78
|
custom_hf_revision=self.config.hf_revision,
|
|
79
79
|
)
|
|
80
80
|
|
|
81
|
-
self.response_type
|
|
81
|
+
self.response_type = self.task.get_response_type()
|
|
82
82
|
|
|
83
83
|
def _llm_task_param_precedence(self) -> tuple[list[str] | None, int | None]:
|
|
84
84
|
"""
|
|
@@ -15,7 +15,7 @@ from huggingface_hub.errors import RevisionNotFoundError
|
|
|
15
15
|
from pydantic import BaseModel, ConfigDict
|
|
16
16
|
|
|
17
17
|
from eval_framework.shared.types import BaseMetricContext, Completion, Error, RawCompletion
|
|
18
|
-
from eval_framework.tasks.utils import raise_errors
|
|
18
|
+
from eval_framework.tasks.utils import classproperty, raise_errors
|
|
19
19
|
from template_formatting.formatter import Message, Role
|
|
20
20
|
|
|
21
21
|
if TYPE_CHECKING:
|
|
@@ -91,8 +91,6 @@ class BaseTask[SubjectType](ABC):
|
|
|
91
91
|
DATASET_PATH: str
|
|
92
92
|
SAMPLE_SPLIT: str
|
|
93
93
|
FEWSHOT_SPLIT: str
|
|
94
|
-
RESPONSE_TYPE: ResponseType
|
|
95
|
-
METRICS: list[type["BaseMetric"]]
|
|
96
94
|
SUBJECTS: list[SubjectType]
|
|
97
95
|
HF_REVISION: str | None = None # tag name, or branch name, or commit hash to ensure reproducibility
|
|
98
96
|
|
|
@@ -104,6 +102,10 @@ class BaseTask[SubjectType](ABC):
|
|
|
104
102
|
# language by subtopic, or `None` (for tasks not specific to a single language).
|
|
105
103
|
LANGUAGE: Language | dict[str, Language] | dict[str, tuple[Language, Language]] | None
|
|
106
104
|
|
|
105
|
+
# RESPONSE_TYPE and METRICS are exposed as classproperties, so you can access them via either
|
|
106
|
+
# `TaskClass.*` or `task.*` (or `task.get_metrics()`). This avoids mypy conflicts from re-declaring class vars.
|
|
107
|
+
# By default, these values come from TASK_STYLER if set, otherwise from legacy class attributes.
|
|
108
|
+
|
|
107
109
|
def __init__(self, num_fewshot: int = 0) -> None:
|
|
108
110
|
self.num_fewshot = num_fewshot
|
|
109
111
|
self.stop_sequences: list[str] | None = None
|
|
@@ -332,14 +334,12 @@ class BaseTask[SubjectType](ABC):
|
|
|
332
334
|
return None
|
|
333
335
|
|
|
334
336
|
def get_metadata(self) -> dict[str, str | list[str]]:
|
|
335
|
-
response_type, metrics = self._get_type_and_metrics()
|
|
336
|
-
|
|
337
337
|
meta: dict[str, str | list[str]] = {
|
|
338
338
|
"dataset_path": self.DATASET_PATH,
|
|
339
339
|
"sample_split": self.SAMPLE_SPLIT,
|
|
340
340
|
"fewshot_split": self.FEWSHOT_SPLIT,
|
|
341
|
-
"response_type":
|
|
342
|
-
"metrics": [m.NAME for m in
|
|
341
|
+
"response_type": self.get_response_type().value,
|
|
342
|
+
"metrics": [m.NAME for m in self.get_metrics()],
|
|
343
343
|
"subjects": [str(s) for s in self.SUBJECTS],
|
|
344
344
|
}
|
|
345
345
|
if hasattr(self, "TASK_STYLER"):
|
|
@@ -420,7 +420,26 @@ class BaseTask[SubjectType](ABC):
|
|
|
420
420
|
)
|
|
421
421
|
return completion_list
|
|
422
422
|
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
423
|
+
@classmethod
|
|
424
|
+
def get_response_type(cls) -> ResponseType:
|
|
425
|
+
"""Return the response type of the task (or the styler if it exists)."""
|
|
426
|
+
if hasattr(cls, "TASK_STYLER"):
|
|
427
|
+
return cls.TASK_STYLER.response_type
|
|
428
|
+
return cls.RESPONSE_TYPE
|
|
429
|
+
|
|
430
|
+
@classmethod
|
|
431
|
+
def get_metrics(cls) -> list[type["BaseMetric"]]:
|
|
432
|
+
"""Return the metrics of the task (or the styler if it exists)."""
|
|
433
|
+
if hasattr(cls, "TASK_STYLER"):
|
|
434
|
+
return cls.TASK_STYLER.metrics
|
|
435
|
+
return cls.METRICS
|
|
436
|
+
|
|
437
|
+
@classproperty
|
|
438
|
+
def RESPONSE_TYPE(cls) -> ResponseType:
|
|
439
|
+
"""For backwards compatibility."""
|
|
440
|
+
return cls.get_response_type()
|
|
441
|
+
|
|
442
|
+
@classproperty
|
|
443
|
+
def METRICS(cls) -> list[type["BaseMetric"]]:
|
|
444
|
+
"""For backwards compatibility."""
|
|
445
|
+
return cls.get_metrics()
|
{eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/naturalqs_open.py
RENAMED
|
@@ -86,7 +86,7 @@ class NaturalQsOpenCloze(_NaturalQsOpenChoice_Base):
|
|
|
86
86
|
|
|
87
87
|
class NaturalQsOpenMC(_NaturalQsOpenChoice_Base):
|
|
88
88
|
NAME = "NaturalQsOpenMC"
|
|
89
|
-
TASK_STYLER = MCStyle(
|
|
89
|
+
TASK_STYLER = MCStyle()
|
|
90
90
|
|
|
91
91
|
|
|
92
92
|
class NaturalQsOpenMC_OLMES(_NaturalQsOpenChoice_Base):
|
|
@@ -112,7 +112,7 @@ class EvalConfig(BaseConfig):
|
|
|
112
112
|
@model_validator(mode="after")
|
|
113
113
|
def validate_llm_judge_defined(self) -> "EvalConfig":
|
|
114
114
|
task = get_task(self.task_name)
|
|
115
|
-
|
|
115
|
+
task_metrics = task(num_fewshot=0).get_metrics()
|
|
116
116
|
for metric_class in task_metrics:
|
|
117
117
|
if issubclass(metric_class, BaseLLMJudgeMetric):
|
|
118
118
|
assert self.llm_judge_class is not None, "The LLM Judge must be defined for this evaluation task."
|
|
@@ -8,7 +8,7 @@ import string
|
|
|
8
8
|
import threading
|
|
9
9
|
from collections.abc import Callable
|
|
10
10
|
from pathlib import Path
|
|
11
|
-
from typing import Any, Literal, NamedTuple
|
|
11
|
+
from typing import Any, Literal, NamedTuple, overload
|
|
12
12
|
|
|
13
13
|
import dill
|
|
14
14
|
import numpy as np
|
|
@@ -22,6 +22,24 @@ logger = logging.getLogger(__name__)
|
|
|
22
22
|
RANDOM_SEED = 42 # hacky way to get around circular import
|
|
23
23
|
redis_warning_printed = False
|
|
24
24
|
|
|
25
|
+
|
|
26
|
+
class classproperty[T]:
|
|
27
|
+
"""Descriptor supporting property-like access on classes and instances."""
|
|
28
|
+
|
|
29
|
+
def __init__(self, fget: Callable[[Any], T]) -> None:
|
|
30
|
+
self.fget = fget
|
|
31
|
+
|
|
32
|
+
@overload
|
|
33
|
+
def __get__(self, obj: None, owner: type[Any]) -> T: ...
|
|
34
|
+
|
|
35
|
+
@overload
|
|
36
|
+
def __get__(self, obj: object, owner: type[Any] | None = None) -> T: ...
|
|
37
|
+
|
|
38
|
+
def __get__(self, obj: object | None, owner: type[Any] | None = None) -> T:
|
|
39
|
+
cls = owner if owner is not None else type(obj)
|
|
40
|
+
return self.fget(cls)
|
|
41
|
+
|
|
42
|
+
|
|
25
43
|
_pools: dict[tuple[str | None, tuple[str, ...] | None], ContainerPoolManager] = {}
|
|
26
44
|
_pools_lock = threading.Lock()
|
|
27
45
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/external/drop_process_results.py
RENAMED
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/external/ifeval_impl/README.md
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/external/ifeval_impl/utils.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/aggregators/__init__.py
RENAMED
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/aggregators/aggregators.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/aidanbench.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/comet.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/csv_format.py
RENAMED
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/cwe_accuracy.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/ifeval.py
RENAMED
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/json_format.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/niah_accuracy.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/repetition.py
RENAMED
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/rouge_1.py
RENAMED
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/rouge_2.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/rouge_l.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/completion/text_counter.py
RENAMED
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/efficiency/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/graders/language.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/graders/models.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/llm_judge_coherence.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/llm_judge_refusal.py
RENAMED
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/llm/llm_judge_sql.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/loglikelihood/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/loglikelihood/base.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/loglikelihood/dcs.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/metrics/loglikelihood/ternary.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/result_processors/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/result_processors/hf_uploader.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/result_processors/wandb_uploader.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/__init__.py
RENAMED
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/aidanbench.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/balancedcopa.py
RENAMED
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/belebele.py
RENAMED
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/bigcodebench.py
RENAMED
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/casehold.py
RENAMED
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/chembench.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/flores200.py
RENAMED
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/flores_plus.py
RENAMED
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/global_mmlu.py
RENAMED
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/goldenswag.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/hellaswag.py
RENAMED
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/hellaswag_de.py
RENAMED
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/humaneval.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/include.py
RENAMED
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/infinitebench.py
RENAMED
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/lab_bench.py
RENAMED
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/math_reasoning.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/mmlu_de.py
RENAMED
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/mmlu_pro.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/multipl_e.py
RENAMED
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/openbookqa.py
RENAMED
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/opengptx_eu20.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/quality.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/social_iqa.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/struct_eval.py
RENAMED
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/tablebench.py
RENAMED
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/triviaqa.py
RENAMED
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/truthfulqa.py
RENAMED
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/winogender.py
RENAMED
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/winogrande.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/tasks/benchmarks/zero_scrolls.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.3 → eval_framework-0.3.4}/src/eval_framework/utils/generate_task_docs.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|