eval-framework 0.3.7__tar.gz → 0.5.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {eval_framework-0.3.7 → eval_framework-0.5.0}/PKG-INFO +30 -34
- {eval_framework-0.3.7 → eval_framework-0.5.0}/README.md +0 -1
- {eval_framework-0.3.7 → eval_framework-0.5.0}/pyproject.toml +43 -63
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/evaluation_generator.py +1 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/llm/openai.py +65 -14
- eval_framework-0.5.0/src/eval_framework/llm/vllm_local_server.py +217 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/base.py +18 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/code_assertion.py +11 -14
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/code_execution_pass_at_one.py +20 -26
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/multipl_e_assertion.py +7 -1
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/graders/comparison_grader.py +2 -2
- eval_framework-0.5.0/src/eval_framework/tasks/__init__.py +12 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/base.py +14 -5
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/aidanbench.py +2 -2
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/flores200.py +3 -3
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/flores_plus.py +1 -2
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/gsm8k.py +0 -1
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/squad.py +21 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/tablebench.py +5 -1
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/triviaqa.py +27 -1
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/wmt.py +2 -2
- eval_framework-0.5.0/src/eval_framework/tasks/dataset_revisions.py +106 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/perturbation.py +2 -2
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/registry.py +76 -45
- eval_framework-0.5.0/src/eval_framework/tasks/task-dataset-revisions.json +62 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/task_names.py +2 -122
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/task_style.py +64 -2
- eval_framework-0.3.7/src/eval_framework/metrics/completion/comet.py +0 -56
- eval_framework-0.3.7/src/eval_framework/tasks/__init__.py +0 -6
- {eval_framework-0.3.7 → eval_framework-0.5.0}/LICENSE +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/__init__.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/base_config.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/context/__init__.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/context/determined.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/context/eval.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/context/local.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/exceptions.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/external/drop_process_results.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/external/ifeval_impl/README.md +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/external/ifeval_impl/instructions.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/external/ifeval_impl/instructions_registry.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/external/ifeval_impl/instructions_util.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/external/ifeval_impl/utils.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/llm/__init__.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/llm/aleph_alpha.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/llm/base.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/llm/huggingface.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/llm/mistral.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/llm/models.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/llm/vllm.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/logger.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/main.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/__init__.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/aggregators/__init__.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/aggregators/aggregators.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/__init__.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/accuracy_completion.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/aidanbench.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/bleu.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/chrf.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/concordance_index.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/csv_format.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/cwe_accuracy.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/drop_completion.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/exponential_similarity.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/f1.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/format_checker.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/grid_difference.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/ifeval.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/json_format.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/language_checker.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/length_control.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/math_minerva_completion.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/math_reasoning_completion.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/minerva_math_utils.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/niah_accuracy.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/placeholder_checker.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/repetition.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/rouge_1.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/rouge_2.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/rouge_geometric_mean.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/rouge_l.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/struct_eval_metrics.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/ter.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/text_counter.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/efficiency/__init__.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/efficiency/bytes_per_sequence_position.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/__init__.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/base.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/graders/chatbot_style_grader.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/graders/coherence_grader.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/graders/conciseness_grader.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/graders/contains_names_grader.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/graders/format_correctness_grader.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/graders/instruction_grader.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/graders/language.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/graders/long_context_grader.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/graders/models.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/graders/refusal_grader.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/graders/sql_quality_grader.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/graders/summary_world_knowledge_grader.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/llm_judge_chatbot_style.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/llm_judge_coherence.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/llm_judge_completion_accuracy.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/llm_judge_conciseness.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/llm_judge_contains_names.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/llm_judge_format_correctness.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/llm_judge_instruction.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/llm_judge_mtbench_pair.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/llm_judge_mtbench_single.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/llm_judge_refusal.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/llm_judge_sql.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/llm_judge_world_knowledge.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/utils.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/loglikelihood/__init__.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/loglikelihood/base.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/loglikelihood/bits_per_byte.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/loglikelihood/confidence_weighted_accuracy.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/loglikelihood/dcs.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/loglikelihood/probability_mass.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/metrics/loglikelihood/ternary.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/py.typed +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/response_generator.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/result_processors/__init__.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/result_processors/base.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/result_processors/hf_uploader.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/result_processors/result_processor.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/result_processors/wandb_uploader.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/run.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/run_direct.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/shared/types.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/suite.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/Dockerfile_codebench +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/__init__.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/arc.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/arc_de.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/arc_fi.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/balancedcopa.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/belebele.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/bigcodebench.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/casehold.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/chembench.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/copa.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/csqa.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/drop.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/duc.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/global_mmlu.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/goldenswag.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/gpqa.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/hellaswag.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/hellaswag_de.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/humaneval.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/ifeval.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/include.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/infinitebench.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/lab_bench.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/math_reasoning.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/mbpp.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/medqa.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/mmlu.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/mmlu_de.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/mmlu_pro.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/mmmlu.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/multipl_e.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/naturalqs_open.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/openbookqa.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/opengptx_eu20.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/pawsx.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/piqa.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/quality.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/sciq.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/social_iqa.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/sphyr.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/struct_eval.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/truthfulqa.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/winogender.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/winogrande.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/winox.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/zero_scrolls.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/eval_config.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/task_loader.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/tasks/utils.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/utils/constants.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/utils/file_ops.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/utils/generate_task_docs.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/utils/helpers.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/utils/logging.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/utils/packaging.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/eval_framework/utils/tqdm_handler.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/template_formatting/README.md +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/template_formatting/__init__.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/template_formatting/formatter.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/template_formatting/mistral_formatter.py +0 -0
- {eval_framework-0.3.7 → eval_framework-0.5.0}/src/template_formatting/py.typed +0 -0
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: eval-framework
|
|
3
|
-
Version: 0.
|
|
4
|
-
Summary:
|
|
3
|
+
Version: 0.5.0
|
|
4
|
+
Summary: Evaluation Framework
|
|
5
5
|
Author: Aleph Alpha Research
|
|
6
6
|
License: Apache License
|
|
7
7
|
Version 2.0, January 2004
|
|
@@ -211,49 +211,47 @@ Classifier: Programming Language :: Python :: 3.12
|
|
|
211
211
|
Classifier: Programming Language :: Python :: 3 :: Only
|
|
212
212
|
Classifier: Topic :: Software Development :: Libraries
|
|
213
213
|
Classifier: Typing :: Typed
|
|
214
|
-
Requires-Dist: pyyaml>=6.0.
|
|
215
|
-
Requires-Dist: xmltodict>=0.
|
|
216
|
-
Requires-Dist: pydantic>=2.
|
|
217
|
-
Requires-Dist: datasets>=
|
|
218
|
-
Requires-Dist: sacrebleu>=2.
|
|
219
|
-
Requires-Dist: pycountry>=
|
|
220
|
-
Requires-Dist: nltk>=3.9.
|
|
221
|
-
Requires-Dist: python-dotenv>=1.
|
|
222
|
-
Requires-Dist: lingua-language-detector>=2.0
|
|
223
|
-
Requires-Dist: google-crc32c>=1.
|
|
224
|
-
Requires-Dist: kubernetes>=31.0.0,<32
|
|
214
|
+
Requires-Dist: pyyaml>=6.0.3,<7
|
|
215
|
+
Requires-Dist: xmltodict>=1.0.4,<1.1
|
|
216
|
+
Requires-Dist: pydantic>=2.13.4,<3
|
|
217
|
+
Requires-Dist: datasets>=5.0.0,<6
|
|
218
|
+
Requires-Dist: sacrebleu>=2.6.0,<3
|
|
219
|
+
Requires-Dist: pycountry>=26.2.16,<27
|
|
220
|
+
Requires-Dist: nltk>=3.9.4,<4
|
|
221
|
+
Requires-Dist: python-dotenv>=1.2.2,<2
|
|
222
|
+
Requires-Dist: lingua-language-detector>=2.2.0,<3
|
|
223
|
+
Requires-Dist: google-crc32c>=1.8.0,<2
|
|
225
224
|
Requires-Dist: langdetect>=1.0.9,<2
|
|
226
|
-
Requires-Dist: spacy>=3.8.
|
|
227
|
-
Requires-Dist: jsonschema>=4.
|
|
228
|
-
Requires-Dist: mysql-connector-python>=9.
|
|
229
|
-
Requires-Dist: psycopg2-binary>=2.9.
|
|
225
|
+
Requires-Dist: spacy>=3.8.14,<4
|
|
226
|
+
Requires-Dist: jsonschema>=4.26.0,<5
|
|
227
|
+
Requires-Dist: mysql-connector-python>=9.7.0,<10
|
|
228
|
+
Requires-Dist: psycopg2-binary>=2.9.12,<3
|
|
230
229
|
Requires-Dist: sympy>=1.13.1,<2
|
|
231
|
-
Requires-Dist: llm-sandbox[docker]==0.3.
|
|
230
|
+
Requires-Dist: llm-sandbox[docker]==0.3.39
|
|
232
231
|
Requires-Dist: jsonlines>=4,<5
|
|
233
|
-
Requires-Dist: lxml>=6,<7
|
|
234
|
-
Requires-Dist: python-iso639>=
|
|
235
|
-
Requires-Dist: wandb>=0.
|
|
236
|
-
Requires-Dist: boto3>=1.
|
|
237
|
-
Requires-Dist: numpy>=
|
|
232
|
+
Requires-Dist: lxml>=6.1.1,<7
|
|
233
|
+
Requires-Dist: python-iso639>=2026.4.20
|
|
234
|
+
Requires-Dist: wandb>=0.27.2,<1
|
|
235
|
+
Requires-Dist: boto3>=1.43.19,<2
|
|
236
|
+
Requires-Dist: numpy>=2.2.6
|
|
238
237
|
Requires-Dist: antlr4-python3-runtime==4.11.0
|
|
239
|
-
Requires-Dist: scipy>=1.
|
|
238
|
+
Requires-Dist: scipy>=1.17.1,<2
|
|
240
239
|
Requires-Dist: accelerate ; extra == 'accelerate'
|
|
241
|
-
Requires-Dist: eval-framework[determined,api,openai,transformers,accelerate,vllm,
|
|
240
|
+
Requires-Dist: eval-framework[determined,api,openai,transformers,accelerate,vllm,optional,mistral] ; extra == 'all'
|
|
242
241
|
Requires-Dist: aleph-alpha-client>=11.5.1 ; extra == 'api'
|
|
243
|
-
Requires-Dist:
|
|
244
|
-
Requires-Dist:
|
|
245
|
-
Requires-Dist:
|
|
246
|
-
Requires-Dist:
|
|
247
|
-
Requires-Dist: huggingface-hub>=0.33.2,<0.34 ; extra == 'mistral'
|
|
242
|
+
Requires-Dist: determined>=0.38.1,<0.39 ; extra == 'determined'
|
|
243
|
+
Requires-Dist: tensorboard==2.20.0 ; extra == 'determined'
|
|
244
|
+
Requires-Dist: mistral-common>=1.11.3,<2 ; extra == 'mistral'
|
|
245
|
+
Requires-Dist: huggingface-hub>=0.36.2,<0.37 ; extra == 'mistral'
|
|
248
246
|
Requires-Dist: eval-framework[vllm] ; extra == 'mistral'
|
|
249
247
|
Requires-Dist: openai>=1.62,<3 ; extra == 'openai'
|
|
250
|
-
Requires-Dist: tiktoken>=0.
|
|
248
|
+
Requires-Dist: tiktoken>=0.13.0,<1 ; extra == 'openai'
|
|
251
249
|
Requires-Dist: transformers>=4.45.2,<5 ; extra == 'openai'
|
|
252
250
|
Requires-Dist: transformers>=4.45.2,<5 ; extra == 'optional'
|
|
253
251
|
Requires-Dist: jinja2>=3.1.6,<4 ; extra == 'optional'
|
|
254
252
|
Requires-Dist: transformers>=4.45.2,<5 ; extra == 'transformers'
|
|
255
253
|
Requires-Dist: torch>=2.5,<3 ; extra == 'transformers'
|
|
256
|
-
Requires-Dist: accelerate>=
|
|
254
|
+
Requires-Dist: accelerate>=1.14.0,<2 ; extra == 'transformers'
|
|
257
255
|
Requires-Dist: vllm>=0.8.5,<0.9 ; extra == 'vllm'
|
|
258
256
|
Requires-Dist: torch>=2.5,<3 ; extra == 'vllm'
|
|
259
257
|
Requires-Python: >=3.12, <3.13
|
|
@@ -261,7 +259,6 @@ Project-URL: repository, https://github.com/Aleph-Alpha-Research/eval-framework
|
|
|
261
259
|
Provides-Extra: accelerate
|
|
262
260
|
Provides-Extra: all
|
|
263
261
|
Provides-Extra: api
|
|
264
|
-
Provides-Extra: comet
|
|
265
262
|
Provides-Extra: determined
|
|
266
263
|
Provides-Extra: mistral
|
|
267
264
|
Provides-Extra: openai
|
|
@@ -319,7 +316,6 @@ pip install eval_framework
|
|
|
319
316
|
|
|
320
317
|
There are optional extras available to unlock specific features of the library:
|
|
321
318
|
- `api` for inference using the aleph-alpha client.
|
|
322
|
-
- `comet` for the COMET metric.
|
|
323
319
|
- `determined` for running jobs via determined.
|
|
324
320
|
- `mistral` for inference on Mistral models.
|
|
325
321
|
- `transformers` for inference using the transformers library.
|
|
@@ -47,7 +47,6 @@ pip install eval_framework
|
|
|
47
47
|
|
|
48
48
|
There are optional extras available to unlock specific features of the library:
|
|
49
49
|
- `api` for inference using the aleph-alpha client.
|
|
50
|
-
- `comet` for the COMET metric.
|
|
51
50
|
- `determined` for running jobs via determined.
|
|
52
51
|
- `mistral` for inference on Mistral models.
|
|
53
52
|
- `transformers` for inference using the transformers library.
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "eval-framework"
|
|
3
|
-
version = "0.
|
|
4
|
-
description = "
|
|
3
|
+
version = "0.5.0"
|
|
4
|
+
description = "Evaluation Framework"
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
license = { file = "LICENSE" }
|
|
7
7
|
requires-python = ">=3.12,<3.13"
|
|
@@ -18,53 +18,51 @@ classifiers = [
|
|
|
18
18
|
"Typing :: Typed",
|
|
19
19
|
]
|
|
20
20
|
dependencies = [
|
|
21
|
-
"pyyaml>=6.0.
|
|
22
|
-
"xmltodict>=0.
|
|
23
|
-
"pydantic>=2.
|
|
24
|
-
"datasets>=
|
|
25
|
-
"sacrebleu>=2.
|
|
26
|
-
"pycountry>=
|
|
27
|
-
"nltk>=3.9.
|
|
28
|
-
"python-dotenv>=1.
|
|
29
|
-
"lingua-language-detector>=2.0
|
|
30
|
-
"google-crc32c>=1.
|
|
31
|
-
"kubernetes>=31.0.0,<32", # required by llm-sandbox though actually not needed
|
|
21
|
+
"pyyaml>=6.0.3,<7",
|
|
22
|
+
"xmltodict>=1.0.4,<1.1",
|
|
23
|
+
"pydantic>=2.13.4,<3",
|
|
24
|
+
"datasets>=5.0.0,<6",
|
|
25
|
+
"sacrebleu>=2.6.0,<3",
|
|
26
|
+
"pycountry>=26.2.16,<27",
|
|
27
|
+
"nltk>=3.9.4,<4",
|
|
28
|
+
"python-dotenv>=1.2.2,<2",
|
|
29
|
+
"lingua-language-detector>=2.2.0,<3",
|
|
30
|
+
"google-crc32c>=1.8.0,<2",
|
|
32
31
|
"langdetect>=1.0.9,<2", # required by the original ifeval implementation
|
|
33
|
-
"spacy>=3.8.
|
|
34
|
-
"jsonschema>=4.
|
|
35
|
-
"mysql-connector-python>=9.
|
|
36
|
-
"psycopg2-binary>=2.9.
|
|
32
|
+
"spacy>=3.8.14,<4",
|
|
33
|
+
"jsonschema>=4.26.0,<5",
|
|
34
|
+
"mysql-connector-python>=9.7.0,<10", # required for sql-related tasks
|
|
35
|
+
"psycopg2-binary>=2.9.12,<3", # required for sql-related tasks
|
|
37
36
|
"sympy>=1.13.1,<2",
|
|
38
|
-
"llm-sandbox[docker]==0.3.
|
|
37
|
+
"llm-sandbox[docker]==0.3.39",
|
|
39
38
|
"jsonlines>=4,<5",
|
|
40
|
-
"lxml>=6,<7",
|
|
41
|
-
"python-iso639>=
|
|
42
|
-
"wandb>=0.
|
|
43
|
-
"boto3>=1.
|
|
44
|
-
"numpy>=
|
|
39
|
+
"lxml>=6.1.1,<7",
|
|
40
|
+
"python-iso639>=2026.4.20",
|
|
41
|
+
"wandb>=0.27.2,<1",
|
|
42
|
+
"boto3>=1.43.19,<2",
|
|
43
|
+
"numpy>=2.2.6",
|
|
45
44
|
# is a dependency of sympy, but not explicitly listed in the requirements.txt
|
|
46
45
|
# https://github.com/sympy/sympy/blob/0204fa34e8f6f6f8ccb4de01209be9a2345c9d6e/doc/src/contributing/dependencies.md?plain=1#L125
|
|
47
46
|
"antlr4-python3-runtime==4.11.0",
|
|
48
|
-
"scipy>=1.
|
|
49
|
-
|
|
47
|
+
"scipy>=1.17.1,<2", # required for the aggregation of pass@k metrics
|
|
50
48
|
]
|
|
51
49
|
|
|
52
50
|
[project.optional-dependencies]
|
|
53
51
|
# Model-specific extras
|
|
54
52
|
determined = [
|
|
55
|
-
"determined>=0.38,<0.39",
|
|
56
|
-
"tensorboard==2.
|
|
53
|
+
"determined>=0.38.1,<0.39",
|
|
54
|
+
"tensorboard==2.20.0"
|
|
57
55
|
]
|
|
58
56
|
api = ["aleph-alpha-client>=11.5.1"]
|
|
59
57
|
openai = [
|
|
60
58
|
"openai>=1.62,<3",
|
|
61
|
-
"tiktoken>=0.
|
|
59
|
+
"tiktoken>=0.13.0,<1",
|
|
62
60
|
"transformers>=4.45.2,<5",
|
|
63
61
|
]
|
|
64
62
|
transformers = [
|
|
65
63
|
"transformers>=4.45.2,<5",
|
|
66
64
|
"torch>=2.5,<3",
|
|
67
|
-
"accelerate>=
|
|
65
|
+
"accelerate>=1.14.0,<2",
|
|
68
66
|
]
|
|
69
67
|
accelerate = ["accelerate"]
|
|
70
68
|
vllm = [
|
|
@@ -72,21 +70,17 @@ vllm = [
|
|
|
72
70
|
"torch>=2.5,<3"
|
|
73
71
|
]
|
|
74
72
|
mistral = [
|
|
75
|
-
"mistral-common>=1.
|
|
76
|
-
"huggingface-hub>=0.
|
|
73
|
+
"mistral-common>=1.11.3,<2",
|
|
74
|
+
"huggingface-hub>=0.36.2,<0.37",
|
|
77
75
|
"eval_framework[vllm]",
|
|
78
76
|
]
|
|
79
|
-
# Benchmark/metric specific extras
|
|
80
|
-
comet = [
|
|
81
|
-
"unbabel-comet>=2.2.6,<3",
|
|
82
|
-
]
|
|
83
77
|
# from template-formatting
|
|
84
78
|
optional = [
|
|
85
79
|
"transformers>=4.45.2,<5",
|
|
86
80
|
"jinja2>=3.1.6,<4"
|
|
87
81
|
]
|
|
88
82
|
all = [
|
|
89
|
-
"eval_framework[determined,api,openai,transformers,accelerate,vllm,
|
|
83
|
+
"eval_framework[determined,api,openai,transformers,accelerate,vllm,optional,mistral]"
|
|
90
84
|
]
|
|
91
85
|
|
|
92
86
|
[project.urls]
|
|
@@ -97,24 +91,25 @@ eval_framework = "eval_framework.run:run"
|
|
|
97
91
|
|
|
98
92
|
[dependency-groups]
|
|
99
93
|
dev = [
|
|
100
|
-
"mypy>=1.
|
|
101
|
-
"pytest>=
|
|
102
|
-
"pytest-mock>=3.
|
|
103
|
-
"pytest-xdist>=3.
|
|
94
|
+
"mypy>=2.1.0,<3",
|
|
95
|
+
"pytest>=9.1.0,<10",
|
|
96
|
+
"pytest-mock>=3.15.1",
|
|
97
|
+
"pytest-xdist>=3.8.0,<4",
|
|
104
98
|
"pytest-sugar>1.1,<2",
|
|
105
|
-
"types-pyyaml>=6.0.12.
|
|
106
|
-
"types-python-dateutil>=2.9.0.
|
|
107
|
-
"types-requests>=2.
|
|
108
|
-
"plotly>=
|
|
109
|
-
"ruff>=0.
|
|
99
|
+
"types-pyyaml>=6.0.12.20260518,<7",
|
|
100
|
+
"types-python-dateutil>=2.9.0.20260518,<3",
|
|
101
|
+
"types-requests>=2.33.0.20260518,<3",
|
|
102
|
+
"plotly>=6.8.0,<7",
|
|
103
|
+
"ruff>=0.15.18",
|
|
104
|
+
"pip-licenses>=5.5.5",
|
|
110
105
|
]
|
|
111
106
|
flash-attn = [
|
|
112
|
-
"flash-attn>=2.
|
|
107
|
+
"flash-attn>=2.8.3,<2.9",
|
|
113
108
|
"torch"
|
|
114
109
|
]
|
|
115
110
|
|
|
116
111
|
[build-system]
|
|
117
|
-
requires = ["uv_build>=0.
|
|
112
|
+
requires = ["uv_build>=0.11.22,<0.11.23"]
|
|
118
113
|
build-backend = "uv_build"
|
|
119
114
|
|
|
120
115
|
[tool.uv.build-backend]
|
|
@@ -125,22 +120,6 @@ override-dependencies = [
|
|
|
125
120
|
"requests>=2.32,<3", # fix for determined
|
|
126
121
|
]
|
|
127
122
|
|
|
128
|
-
[tool.uv.sources]
|
|
129
|
-
torch = [
|
|
130
|
-
{ index = "pytorch-default", marker = "sys_platform != 'linux'" },
|
|
131
|
-
{ index = "pytorch-cu124", marker = "sys_platform == 'linux'" },
|
|
132
|
-
]
|
|
133
|
-
|
|
134
|
-
[[tool.uv.index]]
|
|
135
|
-
name = "pytorch-cu124"
|
|
136
|
-
url = "https://download.pytorch.org/whl/cu124"
|
|
137
|
-
explicit = true
|
|
138
|
-
|
|
139
|
-
[[tool.uv.index]]
|
|
140
|
-
name = "pytorch-default"
|
|
141
|
-
url = "https://pypi.org/simple"
|
|
142
|
-
explicit = true
|
|
143
|
-
|
|
144
123
|
[tool.uv.extra-build-dependencies]
|
|
145
124
|
# Build flash-attn with the same torch version as in the container. Details at:
|
|
146
125
|
# https://docs.astral.sh/uv/concepts/projects/config/#augmenting-build-dependencies
|
|
@@ -166,6 +145,7 @@ known-third-party = ["wandb"]
|
|
|
166
145
|
|
|
167
146
|
[tool.ruff.lint.extend-per-file-ignores]
|
|
168
147
|
"__init__.py" = ["F401"]
|
|
148
|
+
"tests/tests_eval_framework/tasks/benchmarks/test_mmlu_de.py" = ["E501"]
|
|
169
149
|
|
|
170
150
|
[tool.mypy]
|
|
171
151
|
plugins = "pydantic.mypy"
|
|
@@ -55,7 +55,7 @@ class OpenAIModel(BaseLLM):
|
|
|
55
55
|
formatter: BaseFormatter | None = None,
|
|
56
56
|
temperature: float | None = None,
|
|
57
57
|
top_p: float | None = None,
|
|
58
|
-
api_key: str | None =
|
|
58
|
+
api_key: str | None = None,
|
|
59
59
|
organization: str | None = None,
|
|
60
60
|
base_url: str | None = None,
|
|
61
61
|
bytes_per_token: float | None = None,
|
|
@@ -86,13 +86,12 @@ class OpenAIModel(BaseLLM):
|
|
|
86
86
|
self._top_p = top_p
|
|
87
87
|
|
|
88
88
|
self._client = OpenAI(
|
|
89
|
-
api_key=api_key,
|
|
89
|
+
api_key=api_key if api_key is not None else os.getenv("OPENAI_API_KEY", ""),
|
|
90
90
|
organization=organization,
|
|
91
91
|
base_url=base_url,
|
|
92
92
|
)
|
|
93
93
|
|
|
94
|
-
|
|
95
|
-
self._encoder = self._get_encoder()
|
|
94
|
+
self._encoder: tiktoken.Encoding | Tokenizer | None = self._get_encoder_or_none()
|
|
96
95
|
|
|
97
96
|
# set bytes_per_token_scalar for non-standard models
|
|
98
97
|
if bytes_per_token is not None and bytes_per_token <= 0:
|
|
@@ -101,9 +100,23 @@ class OpenAIModel(BaseLLM):
|
|
|
101
100
|
4.0 / bytes_per_token if bytes_per_token is not None else 4.0 / self.BYTES_PER_TOKEN
|
|
102
101
|
)
|
|
103
102
|
|
|
104
|
-
def
|
|
103
|
+
def _get_encoder_or_none(self) -> tiktoken.Encoding | None:
|
|
105
104
|
assert self._model_name is not None
|
|
106
|
-
|
|
105
|
+
try:
|
|
106
|
+
return tiktoken.encoding_for_model(self._model_name)
|
|
107
|
+
except KeyError:
|
|
108
|
+
logger.info(
|
|
109
|
+
"tiktoken could not map model_name=%r. Disabling token counting for this model.",
|
|
110
|
+
self._model_name,
|
|
111
|
+
)
|
|
112
|
+
return None
|
|
113
|
+
except Exception as e:
|
|
114
|
+
logger.warning(
|
|
115
|
+
"Failed to initialize tiktoken encoder for model_name=%r (%s). Disabling token counting.",
|
|
116
|
+
self._model_name,
|
|
117
|
+
e.__class__.__name__,
|
|
118
|
+
)
|
|
119
|
+
return None
|
|
107
120
|
|
|
108
121
|
def _count_tokens(self, text: str) -> int:
|
|
109
122
|
"""
|
|
@@ -115,6 +128,8 @@ class OpenAIModel(BaseLLM):
|
|
|
115
128
|
Returns:
|
|
116
129
|
Number of tokens.
|
|
117
130
|
"""
|
|
131
|
+
if self._encoder is None:
|
|
132
|
+
raise RuntimeError("Token counting is not available (no encoder configured).")
|
|
118
133
|
return len(self._encoder.encode(text))
|
|
119
134
|
|
|
120
135
|
def generate_from_messages(
|
|
@@ -166,14 +181,31 @@ class OpenAIModel(BaseLLM):
|
|
|
166
181
|
stop=stop_sequences,
|
|
167
182
|
)
|
|
168
183
|
completion = response.choices[0].text
|
|
184
|
+
usage = getattr(response, "usage", None)
|
|
185
|
+
prompt_tokens = getattr(usage, "prompt_tokens", None) if usage is not None else None
|
|
186
|
+
completion_tokens = getattr(usage, "completion_tokens", None) if usage is not None else None
|
|
169
187
|
return RawCompletion(
|
|
170
188
|
prompt=prompt,
|
|
171
|
-
prompt_sequence_positions=
|
|
172
|
-
|
|
173
|
-
|
|
189
|
+
prompt_sequence_positions=(
|
|
190
|
+
prompt_tokens
|
|
191
|
+
if prompt_tokens is not None
|
|
192
|
+
else (self._count_tokens(prompt) if self._encoder is not None else None)
|
|
193
|
+
),
|
|
194
|
+
concat_compression=(
|
|
195
|
+
ConcatCompression.calculate(
|
|
196
|
+
single_messages,
|
|
197
|
+
count_tokens=self._count_tokens,
|
|
198
|
+
completion=completion,
|
|
199
|
+
)
|
|
200
|
+
if self._encoder is not None
|
|
201
|
+
else None
|
|
174
202
|
),
|
|
175
203
|
completion=completion,
|
|
176
|
-
completion_sequence_positions=
|
|
204
|
+
completion_sequence_positions=(
|
|
205
|
+
completion_tokens
|
|
206
|
+
if completion_tokens is not None
|
|
207
|
+
else (self._count_tokens(completion) if self._encoder is not None else None)
|
|
208
|
+
),
|
|
177
209
|
)
|
|
178
210
|
|
|
179
211
|
else:
|
|
@@ -190,15 +222,26 @@ class OpenAIModel(BaseLLM):
|
|
|
190
222
|
)
|
|
191
223
|
prompt = "\n".join([f"{m.get('role', '')}: {m.get('content', '')}" for m in chat_messages])
|
|
192
224
|
prompt_tokens = getattr(chat_response.usage, "prompt_tokens", None)
|
|
225
|
+
completion_tokens = getattr(chat_response.usage, "completion_tokens", None)
|
|
193
226
|
completion = chat_response.choices[0].message.content or ""
|
|
194
227
|
return RawCompletion(
|
|
195
228
|
prompt=prompt,
|
|
196
229
|
prompt_sequence_positions=prompt_tokens,
|
|
197
|
-
concat_compression=
|
|
198
|
-
|
|
230
|
+
concat_compression=(
|
|
231
|
+
ConcatCompression.calculate(
|
|
232
|
+
single_messages,
|
|
233
|
+
count_tokens=self._count_tokens,
|
|
234
|
+
completion=completion,
|
|
235
|
+
)
|
|
236
|
+
if self._encoder is not None
|
|
237
|
+
else None
|
|
199
238
|
),
|
|
200
239
|
completion=completion,
|
|
201
|
-
completion_sequence_positions=
|
|
240
|
+
completion_sequence_positions=(
|
|
241
|
+
completion_tokens
|
|
242
|
+
if completion_tokens is not None
|
|
243
|
+
else (self._count_tokens(completion) if self._encoder is not None else None)
|
|
244
|
+
),
|
|
202
245
|
)
|
|
203
246
|
|
|
204
247
|
with concurrent.futures.ThreadPoolExecutor() as executor:
|
|
@@ -218,6 +261,10 @@ class OpenAIModel(BaseLLM):
|
|
|
218
261
|
Note:
|
|
219
262
|
Uses the OpenAI completions API with echo=True; chat logprobs are not supported.
|
|
220
263
|
"""
|
|
264
|
+
if self._encoder is None:
|
|
265
|
+
raise NotImplementedError(
|
|
266
|
+
"OpenAIModel.logprobs() requires a local tokenizer/encoder, but none is available."
|
|
267
|
+
)
|
|
221
268
|
assert self._model_name in ["babbage-002", "davinci-002"], (
|
|
222
269
|
"Log-probs for prompt tokens are only supported for a limited set of models."
|
|
223
270
|
)
|
|
@@ -383,12 +430,16 @@ class DeepseekModel(OpenAIModel):
|
|
|
383
430
|
base_url="https://api.deepseek.com/beta",
|
|
384
431
|
)
|
|
385
432
|
self._tokenizer_name = tokenizer_name if tokenizer_name is not None else "deepseek-ai/DeepSeek-V3.2-Exp"
|
|
433
|
+
# DeepSeek uses HF tokenization; override the base encoder (which may be None).
|
|
434
|
+
self._encoder = self._get_encoder()
|
|
386
435
|
|
|
387
436
|
def _get_encoder(self) -> Tokenizer:
|
|
388
437
|
return AutoTokenizer.from_pretrained(self._tokenizer_name)
|
|
389
438
|
|
|
390
439
|
def _count_tokens(self, text: str) -> int:
|
|
391
|
-
|
|
440
|
+
encoder = self._encoder
|
|
441
|
+
assert encoder is not None
|
|
442
|
+
return len(encoder.encode(text)) # type: ignore[union-attr]
|
|
392
443
|
|
|
393
444
|
|
|
394
445
|
### Model Aliases ###
|
|
@@ -0,0 +1,217 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import atexit
|
|
4
|
+
import logging
|
|
5
|
+
import os
|
|
6
|
+
import signal
|
|
7
|
+
import socket
|
|
8
|
+
import subprocess
|
|
9
|
+
import time
|
|
10
|
+
import urllib.error
|
|
11
|
+
import urllib.request
|
|
12
|
+
from collections.abc import Sequence
|
|
13
|
+
|
|
14
|
+
from eval_framework.llm.base import BaseLLM
|
|
15
|
+
from eval_framework.llm.openai import OpenAIModel
|
|
16
|
+
from eval_framework.shared.types import RawCompletion, RawLoglikelihood
|
|
17
|
+
from eval_framework.tasks.base import Sample
|
|
18
|
+
from template_formatting.formatter import BaseFormatter, Message
|
|
19
|
+
|
|
20
|
+
logger = logging.getLogger(__name__)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _pick_free_port(host: str) -> int:
    """
    Ask the OS for a currently unused TCP port on `host`.

    The port is obtained by binding a throwaway socket to port 0 and reading back the
    port the OS assigned. The socket is closed before returning, so another process
    could in principle grab the port before the caller binds it.

    Args:
        host: Hostname or IP address (IPv4 or IPv6) the port must be free on.

    Returns:
        The port number assigned by the OS.

    Raises:
        OSError: If `host` cannot be resolved or bound.
    """
    # Prefer IPv4 (the historical behaviour); only fall back to other address
    # families when the host has no IPv4 address, e.g. the literal "::1".
    try:
        candidates = socket.getaddrinfo(host, 0, family=socket.AF_INET, type=socket.SOCK_STREAM)
    except socket.gaierror:
        candidates = socket.getaddrinfo(host, 0, type=socket.SOCK_STREAM)

    family, socktype, proto, _canonname, sockaddr = candidates[0]
    with socket.socket(family, socktype, proto) as s:
        s.bind(sockaddr)
        return int(s.getsockname()[1])
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _wait_for_http_ready(url: str, *, timeout_s: float) -> None:
    """
    Poll `url` until the server answers with any HTTP status below 500.

    Args:
        url: Endpoint probed with GET requests (each request has a 2 second timeout).
        timeout_s: Overall time budget in seconds.

    Raises:
        RuntimeError: If no response with a status below 500 arrives within `timeout_s`.
            The message includes the last error that was observed, if any.
    """
    # Monotonic clock: wall-clock adjustments must not stretch or shorten the wait.
    deadline = time.monotonic() + timeout_s
    last_err: Exception | None = None
    while time.monotonic() < deadline:
        try:
            with urllib.request.urlopen(url, timeout=2) as resp:
                if 200 <= resp.status < 500:
                    return
        except urllib.error.HTTPError as e:
            # urlopen raises for 4xx/5xx instead of returning the response. A 4xx still
            # proves the server is up and answering requests, so it counts as ready.
            e.close()
            if e.code < 500:
                return
            last_err = e
        except OSError as e:
            # URLError, TimeoutError and ConnectionError are all OSError subclasses.
            last_err = e
        time.sleep(0.25)
    raise RuntimeError(f"Timed out waiting for server readiness at {url}. Last error: {last_err}")
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def _wait_for_http_ready_or_proc_exit(url: str, *, timeout_s: float, proc: subprocess.Popen[str]) -> None:
    """
    Like `_wait_for_http_ready`, but fail fast if the server process exits.

    This avoids long timeouts that hide the real root cause (e.g. invalid CLI flags,
    missing dependencies, CUDA issues).

    Args:
        url: Endpoint probed with GET requests (each request has a 2 second timeout).
        timeout_s: Overall time budget in seconds.
        proc: The server subprocess. If it was started with a stdout pipe, the last
            8000 characters of its output are included in the error when it exits early.

    Raises:
        RuntimeError: If `proc` exits before the endpoint answers, or if no response with
            a status below 500 arrives within `timeout_s`.
    """
    # Monotonic clock: wall-clock adjustments must not stretch or shorten the wait.
    deadline = time.monotonic() + timeout_s
    last_err: Exception | None = None

    while time.monotonic() < deadline:
        if proc.poll() is not None:
            out = ""
            try:
                if proc.stdout is not None:
                    out = proc.stdout.read() or ""
            except (OSError, ValueError):
                # Best-effort diagnostics only: a closed or broken pipe must not mask
                # the "process exited" error raised below.
                out = ""
            tail = out.strip()[-8000:]
            raise RuntimeError(
                f"vLLM server process exited before becoming ready. exit_code={proc.returncode}. Output (tail):\n{tail}"
            )

        try:
            with urllib.request.urlopen(url, timeout=2) as resp:
                if 200 <= resp.status < 500:
                    return
        except urllib.error.HTTPError as e:
            # urlopen raises for 4xx/5xx instead of returning the response. A 4xx still
            # proves the server is up and answering requests, so it counts as ready.
            e.close()
            if e.code < 500:
                return
            last_err = e
        except OSError as e:
            # URLError, TimeoutError and ConnectionError are all OSError subclasses.
            last_err = e
        time.sleep(0.25)

    raise RuntimeError(f"Timed out waiting for server readiness at {url}. Last error: {last_err}")
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
class VLLMLocalServerModel(BaseLLM):
    """
    Provider-style model: start a local vLLM OpenAI-compatible server, then talk to it via `OpenAIModel(base_url=...)`.

    This gives you a stable HTTP boundary (good for VCR cassettes) while keeping "local vLLM" as a selectable backend.

    Notes:
    - The server is started in a subprocess using `vllm serve`.
    - Cleanup is best-effort (terminate, then a hard kill if the server does not exit in time).
    - Not all OpenAI API features are guaranteed to be supported by the local server (e.g. logprobs).
    """

    # Seconds to wait for the server to exit after terminate() (and again after kill()).
    _SHUTDOWN_GRACE_S = 10.0

    def __init__(
        self,
        *,
        model_name: str,
        host: str = "127.0.0.1",
        port: int | None = None,
        startup_timeout_s: float = 120.0,
        # `OpenAIModel` parameters:
        formatter: BaseFormatter | None = None,
        temperature: float | None = None,
        top_p: float | None = None,
        api_key: str | None = None,
        bytes_per_token: float | None = None,
        # vLLM "serve" parameters (subset, passed through):
        tensor_parallel_size: int | None = None,
        dtype: str | None = None,
        max_model_len: int | None = None,
        gpu_memory_utilization: float | None = None,
        enforce_eager: bool | None = None,
        # Escape hatch:
        vllm_command: str | None = None,
        vllm_extra_args: list[str] | None = None,
        env: dict[str, str] | None = None,
    ) -> None:
        """
        Launch `vllm serve` and block until its `/v1/models` endpoint responds.

        Args:
            model_name: Model passed to `vllm serve` and to the wrapped `OpenAIModel`.
            host: Interface the server binds to.
            port: Port to serve on; a free port is picked when omitted.
            startup_timeout_s: Seconds to wait for the server to become ready.
            formatter, temperature, top_p, bytes_per_token: Forwarded to `OpenAIModel`.
            api_key: API key for the client. Falls back to `OPENAI_API_KEY`, then to
                the placeholder "local-vllm".
            tensor_parallel_size, dtype, max_model_len, gpu_memory_utilization:
                Forwarded as the corresponding `vllm serve` flags when not None.
            enforce_eager: Adds `--enforce-eager` when truthy.
            vllm_command: Executable to run instead of `vllm`.
            vllm_extra_args: Extra CLI arguments appended verbatim.
            env: Environment variables layered on top of the current environment.

        Raises:
            RuntimeError: If the server exits or does not become ready in time. The
                subprocess is stopped before the error propagates.
        """
        # Assigned first so `_cleanup` / `__del__` are safe even if construction fails early.
        self._proc: subprocess.Popen[str] | None = None

        self._model_name = model_name
        self._host = host
        self._port = port if port is not None else _pick_free_port(host)
        self._startup_timeout_s = float(startup_timeout_s)

        self._server_url = f"http://{self._host}:{self._port}/v1"

        cmd = [vllm_command or "vllm", "serve", self._model_name, "--host", self._host, "--port", str(self._port)]

        # A small, intentionally conservative subset of flags.
        if tensor_parallel_size is not None:
            cmd += ["--tensor-parallel-size", str(tensor_parallel_size)]
        if dtype is not None:
            cmd += ["--dtype", str(dtype)]
        if max_model_len is not None:
            cmd += ["--max-model-len", str(max_model_len)]
        if gpu_memory_utilization is not None:
            cmd += ["--gpu-memory-utilization", str(gpu_memory_utilization)]
        if enforce_eager:
            # vLLM exposes this as a boolean flag; passing a value breaks CLI parsing.
            cmd += ["--enforce-eager"]

        if vllm_extra_args:
            cmd += list(vllm_extra_args)

        merged_env = os.environ.copy()
        if env:
            merged_env.update(env)

        logger.info("Starting local vLLM server: %s", " ".join(cmd))
        proc = subprocess.Popen(
            cmd,
            env=merged_env,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
        )
        self._proc = proc

        # Ensure we don't leave it around if the process exits abruptly.
        # Note: atexit keeps a reference to this instance until interpreter exit.
        atexit.register(self._cleanup)

        # Wait until the OpenAI-compatible endpoints respond.
        try:
            _wait_for_http_ready_or_proc_exit(
                f"{self._server_url}/models",
                timeout_s=self._startup_timeout_s,
                proc=proc,
            )
        except BaseException:
            # Don't leave a half-started server (and its GPU memory) behind on failure.
            self._cleanup()
            raise

        # From here on nobody reads the pipe; without a reader the server would block
        # on its next log write once the OS pipe buffer fills up.
        self._start_output_drain(proc)

        # Configure client to talk to the local server.
        # For local servers, any non-empty API key typically works; allow explicit override.
        effective_api_key = api_key if api_key is not None else (os.getenv("OPENAI_API_KEY") or "local-vllm")

        self._client = OpenAIModel(
            model_name=self._model_name,
            formatter=formatter,
            temperature=temperature,
            top_p=top_p,
            api_key=effective_api_key,
            base_url=self._server_url,
            bytes_per_token=bytes_per_token,
        )

    @property
    def name(self) -> str:
        """Identifier of this backend: the model name prefixed with `vllm_local::`."""
        return f"vllm_local::{self._model_name}"

    def generate_from_messages(
        self,
        messages: list[Sequence[Message]],
        stop_sequences: list[str] | None = None,
        max_tokens: int | None = None,
        temperature: float | None = None,
        top_p: float | None = None,
    ) -> list[RawCompletion]:
        """Delegate generation to the wrapped `OpenAIModel`, which talks to the local server."""
        return self._client.generate_from_messages(messages, stop_sequences, max_tokens, temperature, top_p)

    def logprobs(self, samples: list[Sample]) -> list[RawLoglikelihood]:
        """Delegate log-probability computation to the wrapped `OpenAIModel`."""
        return self._client.logprobs(samples)

    @staticmethod
    def _start_output_drain(proc: subprocess.Popen[str]) -> None:
        """Consume the server's combined stdout/stderr in a daemon thread, forwarding it to the debug log."""
        import threading

        stream = proc.stdout
        if stream is None:
            return

        def _drain() -> None:
            try:
                for line in stream:
                    logger.debug("vllm: %s", line.rstrip())
            except (OSError, ValueError):
                # The pipe was closed underneath us during shutdown; nothing left to read.
                return

        threading.Thread(target=_drain, name="vllm-output-drain", daemon=True).start()

    def _cleanup(self) -> None:
        """Stop the server subprocess if it is still running. Best-effort and safe to call repeatedly."""
        # getattr: `__del__` can run on an instance whose `__init__` raised before `_proc` was set.
        proc = getattr(self, "_proc", None)
        self._proc = None
        if proc is None or proc.poll() is not None:
            return

        try:
            proc.terminate()
            proc.wait(timeout=self._SHUTDOWN_GRACE_S)
        except subprocess.TimeoutExpired:
            # Graceful shutdown took too long: force-kill, then reap so no zombie is left.
            try:
                proc.kill()
                proc.wait(timeout=self._SHUTDOWN_GRACE_S)
            except (OSError, subprocess.SubprocessError) as e:
                logger.warning("Failed to kill local vLLM server (pid=%s): %s", proc.pid, e)
        except OSError as e:
            logger.warning("Failed to terminate local vLLM server (pid=%s): %s", proc.pid, e)

    def __del__(self) -> None:
        """Stop the server when the instance is garbage-collected."""
        self._cleanup()
|