eval-framework 0.3.6__tar.gz → 0.3.7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {eval_framework-0.3.6 → eval_framework-0.3.7}/PKG-INFO +1 -1
- {eval_framework-0.3.6 → eval_framework-0.3.7}/pyproject.toml +1 -1
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/llm/openai.py +23 -10
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/ifeval.py +2 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/LICENSE +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/README.md +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/__init__.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/base_config.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/context/__init__.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/context/determined.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/context/eval.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/context/local.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/evaluation_generator.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/exceptions.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/external/drop_process_results.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/external/ifeval_impl/README.md +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/external/ifeval_impl/instructions.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/external/ifeval_impl/instructions_registry.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/external/ifeval_impl/instructions_util.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/external/ifeval_impl/utils.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/llm/__init__.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/llm/aleph_alpha.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/llm/base.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/llm/huggingface.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/llm/mistral.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/llm/models.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/llm/vllm.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/logger.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/main.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/__init__.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/aggregators/__init__.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/aggregators/aggregators.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/base.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/__init__.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/accuracy_completion.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/aidanbench.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/bleu.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/chrf.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/code_assertion.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/code_execution_pass_at_one.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/comet.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/concordance_index.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/csv_format.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/cwe_accuracy.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/drop_completion.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/exponential_similarity.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/f1.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/format_checker.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/grid_difference.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/ifeval.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/json_format.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/language_checker.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/length_control.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/math_minerva_completion.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/math_reasoning_completion.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/minerva_math_utils.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/multipl_e_assertion.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/niah_accuracy.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/placeholder_checker.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/repetition.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/rouge_1.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/rouge_2.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/rouge_geometric_mean.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/rouge_l.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/struct_eval_metrics.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/ter.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/text_counter.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/efficiency/__init__.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/efficiency/bytes_per_sequence_position.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/llm/__init__.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/llm/base.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/llm/graders/chatbot_style_grader.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/llm/graders/coherence_grader.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/llm/graders/comparison_grader.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/llm/graders/conciseness_grader.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/llm/graders/contains_names_grader.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/llm/graders/format_correctness_grader.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/llm/graders/instruction_grader.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/llm/graders/language.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/llm/graders/long_context_grader.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/llm/graders/models.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/llm/graders/refusal_grader.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/llm/graders/sql_quality_grader.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/llm/graders/summary_world_knowledge_grader.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/llm/llm_judge_chatbot_style.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/llm/llm_judge_coherence.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/llm/llm_judge_completion_accuracy.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/llm/llm_judge_conciseness.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/llm/llm_judge_contains_names.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/llm/llm_judge_format_correctness.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/llm/llm_judge_instruction.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/llm/llm_judge_mtbench_pair.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/llm/llm_judge_mtbench_single.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/llm/llm_judge_refusal.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/llm/llm_judge_sql.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/llm/llm_judge_world_knowledge.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/llm/utils.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/loglikelihood/__init__.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/loglikelihood/base.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/loglikelihood/bits_per_byte.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/loglikelihood/confidence_weighted_accuracy.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/loglikelihood/dcs.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/loglikelihood/probability_mass.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/loglikelihood/ternary.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/py.typed +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/response_generator.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/result_processors/__init__.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/result_processors/base.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/result_processors/hf_uploader.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/result_processors/result_processor.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/result_processors/wandb_uploader.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/run.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/run_direct.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/shared/types.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/suite.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/Dockerfile_codebench +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/__init__.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/base.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/__init__.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/aidanbench.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/arc.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/arc_de.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/arc_fi.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/balancedcopa.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/belebele.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/bigcodebench.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/casehold.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/chembench.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/copa.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/csqa.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/drop.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/duc.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/flores200.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/flores_plus.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/global_mmlu.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/goldenswag.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/gpqa.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/gsm8k.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/hellaswag.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/hellaswag_de.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/humaneval.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/include.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/infinitebench.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/lab_bench.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/math_reasoning.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/mbpp.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/medqa.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/mmlu.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/mmlu_de.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/mmlu_pro.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/mmmlu.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/multipl_e.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/naturalqs_open.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/openbookqa.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/opengptx_eu20.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/pawsx.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/piqa.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/quality.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/sciq.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/social_iqa.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/sphyr.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/squad.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/struct_eval.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/tablebench.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/triviaqa.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/truthfulqa.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/winogender.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/winogrande.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/winox.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/wmt.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/zero_scrolls.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/eval_config.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/perturbation.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/registry.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/task_loader.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/task_names.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/task_style.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/utils.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/utils/constants.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/utils/file_ops.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/utils/generate_task_docs.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/utils/helpers.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/utils/logging.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/utils/packaging.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/utils/tqdm_handler.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/template_formatting/README.md +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/template_formatting/__init__.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/template_formatting/formatter.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/template_formatting/mistral_formatter.py +0 -0
- {eval_framework-0.3.6 → eval_framework-0.3.7}/src/template_formatting/py.typed +0 -0
|
@@ -8,18 +8,38 @@ from functools import partial
|
|
|
8
8
|
|
|
9
9
|
import tiktoken
|
|
10
10
|
from openai import OpenAI
|
|
11
|
-
from openai.types.chat import
|
|
11
|
+
from openai.types.chat import (
|
|
12
|
+
ChatCompletionAssistantMessageParam,
|
|
13
|
+
ChatCompletionMessageParam,
|
|
14
|
+
ChatCompletionSystemMessageParam,
|
|
15
|
+
ChatCompletionUserMessageParam,
|
|
16
|
+
)
|
|
12
17
|
from tokenizers import Tokenizer
|
|
13
18
|
from transformers import AutoTokenizer
|
|
14
19
|
|
|
15
20
|
from eval_framework.llm.base import BaseLLM
|
|
16
21
|
from eval_framework.shared.types import ConcatCompression, Error, RawCompletion, RawLoglikelihood
|
|
17
22
|
from eval_framework.tasks.base import Sample
|
|
18
|
-
from template_formatting.formatter import BaseFormatter, ConcatFormatter, HFFormatter, Message
|
|
23
|
+
from template_formatting.formatter import BaseFormatter, ConcatFormatter, HFFormatter, Message, Role
|
|
19
24
|
|
|
20
25
|
logger = logging.getLogger(__name__)
|
|
21
26
|
|
|
22
27
|
|
|
28
|
+
def _to_chat_completion_message(message: Message) -> ChatCompletionMessageParam:
|
|
29
|
+
match message.role:
|
|
30
|
+
case Role.SYSTEM:
|
|
31
|
+
return ChatCompletionSystemMessageParam(role="system", content=message.content)
|
|
32
|
+
case Role.USER:
|
|
33
|
+
return ChatCompletionUserMessageParam(role="user", content=message.content)
|
|
34
|
+
case Role.ASSISTANT:
|
|
35
|
+
return ChatCompletionAssistantMessageParam(role="assistant", content=message.content)
|
|
36
|
+
case None:
|
|
37
|
+
raise ValueError(
|
|
38
|
+
"Cannot send a Message without a role through the chat completion API; "
|
|
39
|
+
"the legacy roleless format is only supported for fine-tuning."
|
|
40
|
+
)
|
|
41
|
+
|
|
42
|
+
|
|
23
43
|
class OpenAIModel(BaseLLM):
|
|
24
44
|
"""
|
|
25
45
|
LLM wrapper for OpenAI API providing text/chat completions and log-probability evaluation output.
|
|
@@ -158,14 +178,7 @@ class OpenAIModel(BaseLLM):
|
|
|
158
178
|
|
|
159
179
|
else:
|
|
160
180
|
# Use chat completion API
|
|
161
|
-
chat_messages = [
|
|
162
|
-
(
|
|
163
|
-
ChatCompletionUserMessageParam(role="user", content=m.content)
|
|
164
|
-
if m.role is not None and m.role.value.lower() == "user"
|
|
165
|
-
else ChatCompletionAssistantMessageParam(role="assistant", content=m.content)
|
|
166
|
-
)
|
|
167
|
-
for m in single_messages
|
|
168
|
-
]
|
|
181
|
+
chat_messages = [_to_chat_completion_message(m) for m in single_messages]
|
|
169
182
|
assert self._model_name is not None
|
|
170
183
|
chat_response = self._client.chat.completions.create(
|
|
171
184
|
model=self._model_name,
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
from typing import Any
|
|
2
2
|
|
|
3
3
|
from eval_framework.metrics.completion.ifeval import IFEvalMetric, IFEvalMetricContext
|
|
4
|
+
from eval_framework.metrics.completion.language_checker import LanguageRawConsistencyChecker
|
|
4
5
|
from eval_framework.tasks.base import NO_SUBJECT, BaseTask, Language, ResponseType
|
|
5
6
|
|
|
6
7
|
|
|
@@ -76,3 +77,4 @@ class IFEvalDe(IFEval):
|
|
|
76
77
|
DATASET_PATH = "jzhang86/de_ifeval"
|
|
77
78
|
SUBJECTS = [NO_SUBJECT]
|
|
78
79
|
LANGUAGE = {NO_SUBJECT: Language.DEU}
|
|
80
|
+
METRICS = [IFEvalMetric, LanguageRawConsistencyChecker]
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/external/drop_process_results.py
RENAMED
|
File without changes
|
{eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/external/ifeval_impl/README.md
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/external/ifeval_impl/utils.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/aggregators/__init__.py
RENAMED
|
File without changes
|
{eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/aggregators/aggregators.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/aidanbench.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/comet.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/csv_format.py
RENAMED
|
File without changes
|
{eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/cwe_accuracy.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/ifeval.py
RENAMED
|
File without changes
|
{eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/json_format.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/niah_accuracy.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/repetition.py
RENAMED
|
File without changes
|
{eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/rouge_1.py
RENAMED
|
File without changes
|
{eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/rouge_2.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/rouge_l.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/completion/text_counter.py
RENAMED
|
File without changes
|
{eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/efficiency/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/llm/graders/language.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/llm/graders/models.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/llm/llm_judge_coherence.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/llm/llm_judge_refusal.py
RENAMED
|
File without changes
|
{eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/llm/llm_judge_sql.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/loglikelihood/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/loglikelihood/base.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/loglikelihood/dcs.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/metrics/loglikelihood/ternary.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/result_processors/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/result_processors/hf_uploader.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/result_processors/wandb_uploader.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/__init__.py
RENAMED
|
File without changes
|
{eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/aidanbench.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/balancedcopa.py
RENAMED
|
File without changes
|
{eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/belebele.py
RENAMED
|
File without changes
|
{eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/bigcodebench.py
RENAMED
|
File without changes
|
{eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/casehold.py
RENAMED
|
File without changes
|
{eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/chembench.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/flores200.py
RENAMED
|
File without changes
|
{eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/flores_plus.py
RENAMED
|
File without changes
|
{eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/global_mmlu.py
RENAMED
|
File without changes
|
{eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/goldenswag.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/hellaswag.py
RENAMED
|
File without changes
|
{eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/hellaswag_de.py
RENAMED
|
File without changes
|
{eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/humaneval.py
RENAMED
|
File without changes
|
{eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/include.py
RENAMED
|
File without changes
|
{eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/infinitebench.py
RENAMED
|
File without changes
|
{eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/lab_bench.py
RENAMED
|
File without changes
|
{eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/math_reasoning.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/mmlu_de.py
RENAMED
|
File without changes
|
{eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/mmlu_pro.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/multipl_e.py
RENAMED
|
File without changes
|
{eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/naturalqs_open.py
RENAMED
|
File without changes
|
{eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/openbookqa.py
RENAMED
|
File without changes
|
{eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/opengptx_eu20.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/quality.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/social_iqa.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/struct_eval.py
RENAMED
|
File without changes
|
{eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/tablebench.py
RENAMED
|
File without changes
|
{eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/triviaqa.py
RENAMED
|
File without changes
|
{eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/truthfulqa.py
RENAMED
|
File without changes
|
{eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/winogender.py
RENAMED
|
File without changes
|
{eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/winogrande.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/tasks/benchmarks/zero_scrolls.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.3.6 → eval_framework-0.3.7}/src/eval_framework/utils/generate_task_docs.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|