evalscope 0.10.0__tar.gz → 0.10.1__tar.gz
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- {evalscope-0.10.0/evalscope.egg-info → evalscope-0.10.1}/PKG-INFO +7 -7
- {evalscope-0.10.0 → evalscope-0.10.1}/README.md +6 -6
- evalscope-0.10.1/evalscope/benchmarks/gpqa/chain_of_thought.txt +81 -0
- evalscope-0.10.1/evalscope/benchmarks/gpqa/gpqa_adapter.py +103 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/benchmarks/ifeval/ifeval_adapter.py +0 -1
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/benchmarks/ifeval/instructions.py +3 -4
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/cli/start_app.py +3 -2
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/models/chat_adapter.py +2 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/report/app.py +268 -81
- evalscope-0.10.1/evalscope/version.py +4 -0
- {evalscope-0.10.0 → evalscope-0.10.1/evalscope.egg-info}/PKG-INFO +7 -7
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope.egg-info/SOURCES.txt +3 -0
- evalscope-0.10.1/tests/rag/__init__.py +0 -0
- evalscope-0.10.0/evalscope/version.py +0 -4
- {evalscope-0.10.0 → evalscope-0.10.1}/LICENSE +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/MANIFEST.in +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/arguments.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/backend/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/backend/base.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/backend/opencompass/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/backend/opencompass/api_meta_template.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/backend/opencompass/backend_manager.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/backend/opencompass/tasks/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/backend/opencompass/tasks/eval_api.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/backend/opencompass/tasks/eval_datasets.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/backend_manager.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/clip_benchmark/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/clip_benchmark/arguments.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/clip_benchmark/task_template.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/clip_benchmark/tasks/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/clip_benchmark/utils/webdatasets.txt +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/cmteb/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/cmteb/arguments.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/cmteb/base.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/cmteb/task_template.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/cmteb/tasks/Classification.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/cmteb/tasks/STS.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/cmteb/tasks/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/ragas/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/ragas/arguments.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/ragas/task_template.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/ragas/tasks/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/ragas/tasks/build_transform.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/utils/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/utils/clip.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/utils/embedding.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/utils/llm.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/utils/tools.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/backend/vlm_eval_kit/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/backend/vlm_eval_kit/backend_manager.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/benchmarks/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/benchmarks/arc/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/benchmarks/arc/ai2_arc.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/benchmarks/arc/arc_adapter.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/benchmarks/bbh/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/benchmarks/bbh/bbh_adapter.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/benchmarks/bbh/cot_prompts/navigate.txt +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/benchmarks/bbh/cot_prompts/snarks.txt +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/benchmarks/benchmark.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/benchmarks/ceval/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/benchmarks/ceval/ceval_adapter.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/benchmarks/ceval/ceval_exam.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/benchmarks/ceval/samples.jsonl +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/benchmarks/cmmlu/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/benchmarks/cmmlu/cmmlu.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/benchmarks/cmmlu/cmmlu_adapter.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/benchmarks/cmmlu/samples.jsonl +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/benchmarks/competition_math/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/benchmarks/competition_math/competition_math.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/benchmarks/competition_math/competition_math_adapter.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/benchmarks/data_adapter.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/benchmarks/general_qa/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/benchmarks/general_qa/general_qa_adapter.py +0 -0
- {evalscope-0.10.0/evalscope/benchmarks/ifeval → evalscope-0.10.1/evalscope/benchmarks/gpqa}/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/benchmarks/gsm8k/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/benchmarks/gsm8k/gsm8k.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/benchmarks/gsm8k/gsm8k_adapter.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/benchmarks/hellaswag/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/benchmarks/hellaswag/hellaswag.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/benchmarks/hellaswag/hellaswag_adapter.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/benchmarks/humaneval/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/benchmarks/humaneval/humaneval.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/benchmarks/humaneval/humaneval_adapter.py +0 -0
- {evalscope-0.10.0/evalscope/benchmarks/iquiz → evalscope-0.10.1/evalscope/benchmarks/ifeval}/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/benchmarks/ifeval/instructions_registry.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/benchmarks/ifeval/instructions_util.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/benchmarks/ifeval/utils.py +0 -0
- {evalscope-0.10.0/evalscope/benchmarks/mmlu_pro → evalscope-0.10.1/evalscope/benchmarks/iquiz}/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/benchmarks/iquiz/iquiz_adapter.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/benchmarks/mmlu/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/benchmarks/mmlu/mmlu.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/benchmarks/mmlu/mmlu_adapter.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/benchmarks/mmlu/samples.jsonl +0 -0
- {evalscope-0.10.0/evalscope/perf/utils → evalscope-0.10.1/evalscope/benchmarks/mmlu_pro}/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/benchmarks/race/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/benchmarks/race/race.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/benchmarks/race/race_adapter.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/benchmarks/race/samples.jsonl +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/benchmarks/trivia_qa/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/benchmarks/trivia_qa/samples.jsonl +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/benchmarks/truthful_qa/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/cli/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/cli/base.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/cli/cli.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/cli/start_eval.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/cli/start_perf.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/cli/start_server.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/collections/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/collections/evaluator.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/collections/sampler.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/collections/schema.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/config.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/constants.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/evaluator/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/evaluator/evaluator.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/evaluator/rating_eval.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/evaluator/reviewer/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/evaluator/reviewer/auto_reviewer.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/metrics/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/metrics/bundled_rouge_score/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/metrics/bundled_rouge_score/rouge_scorer.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/metrics/code_metric.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/metrics/math_accuracy.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/metrics/metrics.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/metrics/named_metrics.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/metrics/rouge_metric.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/models/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/models/base_adapter.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/models/choice_adapter.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/models/custom/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/models/custom/custom_model.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/models/custom/dummy_model.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/models/custom_adapter.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/models/local_model.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/models/model.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/models/server_adapter.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/perf/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/perf/arguments.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/perf/benchmark.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/perf/http_client.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/perf/main.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/perf/plugin/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/perf/plugin/api/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/perf/plugin/api/base.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/perf/plugin/api/custom_api.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/perf/plugin/api/dashscope_api.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/perf/plugin/api/openai_api.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/perf/plugin/datasets/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/perf/plugin/datasets/base.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/perf/plugin/datasets/custom.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/perf/plugin/datasets/flickr8k.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/perf/plugin/datasets/line_by_line.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/perf/plugin/datasets/longalpaca.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/perf/plugin/datasets/openqa.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/perf/plugin/datasets/speed_benchmark.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/perf/plugin/registry.py +0 -0
- {evalscope-0.10.0/tests/rag → evalscope-0.10.1/evalscope/perf/utils}/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/perf/utils/analysis_result.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/perf/utils/benchmark_util.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/perf/utils/db_util.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/perf/utils/handler.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/perf/utils/local_server.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/registry/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/registry/config/cfg_arena.yaml +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/registry/config/cfg_arena_zhihu.yaml +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/registry/config/cfg_single.yaml +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/registry/data/qa_browser/battle.jsonl +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/registry/data/qa_browser/category_mapping.yaml +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/registry/data/question.jsonl +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/registry/tasks/arc.yaml +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/registry/tasks/bbh.yaml +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/registry/tasks/bbh_mini.yaml +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/registry/tasks/ceval.yaml +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/registry/tasks/ceval_mini.yaml +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/registry/tasks/cmmlu.yaml +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/registry/tasks/general_qa.yaml +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/registry/tasks/gsm8k.yaml +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/registry/tasks/mmlu.yaml +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/registry/tasks/mmlu_mini.yaml +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/report/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/report/combinator.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/report/generator.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/report/utils.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/run.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/run_arena.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/summarizer.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/third_party/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/third_party/longbench_write/README.md +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/third_party/longbench_write/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/third_party/longbench_write/default_task.json +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/third_party/longbench_write/default_task.yaml +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/third_party/longbench_write/eval.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/third_party/longbench_write/infer.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/third_party/longbench_write/longbench_write.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/third_party/longbench_write/resources/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/third_party/longbench_write/resources/judge.txt +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/third_party/longbench_write/resources/longbench_write.jsonl +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/third_party/longbench_write/resources/longbench_write_en.jsonl +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/third_party/longbench_write/resources/longwrite_ruler.jsonl +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/third_party/longbench_write/tools/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/third_party/longbench_write/tools/data_etl.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/third_party/longbench_write/tools/openai_api.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/third_party/longbench_write/utils.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/third_party/toolbench_static/README.md +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/third_party/toolbench_static/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/third_party/toolbench_static/config_default.json +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/third_party/toolbench_static/config_default.yaml +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/third_party/toolbench_static/eval.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/third_party/toolbench_static/infer.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/third_party/toolbench_static/llm/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/third_party/toolbench_static/requirements.txt +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/third_party/toolbench_static/toolbench_static.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/utils/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/utils/arena_utils.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/utils/chat_service.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/utils/completion_parsers.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/utils/io_utils.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/utils/logger.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/utils/model_utils.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope/utils/utils.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope.egg-info/dependency_links.txt +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope.egg-info/entry_points.txt +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope.egg-info/not-zip-safe +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope.egg-info/requires.txt +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/evalscope.egg-info/top_level.txt +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/requirements/app.txt +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/requirements/docs.txt +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/requirements/framework.txt +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/requirements/inner.txt +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/requirements/opencompass.txt +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/requirements/perf.txt +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/requirements/rag.txt +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/requirements/tests.txt +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/requirements/vlmeval.txt +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/requirements.txt +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/setup.cfg +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/setup.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/tests/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/tests/cli/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/tests/cli/test_collection.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/tests/cli/test_run.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/tests/perf/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/tests/perf/test_perf.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/tests/rag/test_clip_benchmark.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/tests/rag/test_mteb.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/tests/rag/test_ragas.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/tests/swift/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/tests/swift/test_run_swift_eval.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/tests/swift/test_run_swift_vlm_eval.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/tests/swift/test_run_swift_vlm_jugde_eval.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/tests/test_run_all.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/tests/vlm/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.10.1}/tests/vlm/test_vlmeval.py +0 -0
{evalscope-0.10.0/evalscope.egg-info → evalscope-0.10.1}/PKG-INFO

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: evalscope
-Version: 0.10.0
+Version: 0.10.1
 Summary: EvalScope: Lightweight LLMs Evaluation Framework
 Home-page: https://github.com/modelscope/evalscope
 Author: ModelScope team
```
```diff
@@ -215,7 +215,7 @@ Please scan the QR code below to join our community groups:


 ## 🎉 News
-- 🔥 **[2025.01.20]** Support for visualizing evaluation results, including single model evaluation results and multi-model comparison, refer to the [📖 Visualizing Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/
+- 🔥 **[2025.01.20]** Support for visualizing evaluation results, including single model evaluation results and multi-model comparison, refer to the [📖 Visualizing Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visualization.html) for more details; Added [`iquiz`](https://modelscope.cn/datasets/AI-ModelScope/IQuiz/summary) evaluation example, evaluating the IQ and EQ of the model.
 - 🔥 **[2025.01.07]** Native backend: Support for model API evaluation is now available. Refer to the [📖 Model API Evaluation Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html#api) for more details. Additionally, support for the `ifeval` evaluation benchmark has been added.
 - 🔥🔥 **[2024.12.31]** Support for adding benchmark evaluations, refer to the [📖 Benchmark Evaluation Addition Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/add_benchmark.html); support for custom mixed dataset evaluations, allowing for more comprehensive model evaluations with less data, refer to the [📖 Mixed Dataset Evaluation Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/collection/index.html).
 - 🔥 **[2024.12.13]** Model evaluation optimization: no need to pass the `--template-type` parameter anymore; supports starting evaluation with `evalscope eval --args`. Refer to the [📖 User Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html) for more details.
```
```diff
@@ -416,27 +416,27 @@ To create a public link, set `share=True` in `launch()`.
 <table>
 <tr>
 <td style="text-align: center;">
-<img src="docs/
+<img src="docs/en/get_started/images/setting.png" alt="Setting" style="width: 75%;" />
 <p>Setting Interface</p>
 </td>
 <td style="text-align: center;">
-<img src="docs/
+<img src="docs/en/get_started/images/model_compare.png" alt="Model Compare" style="width: 100%;" />
 <p>Model Comparison</p>
 </td>
 </tr>
 <tr>
 <td style="text-align: center;">
-<img src="docs/
+<img src="docs/en/get_started/images/report_overview.png" alt="Report Overview" style="width: 100%;" />
 <p>Report Overview</p>
 </td>
 <td style="text-align: center;">
-<img src="docs/
+<img src="docs/en/get_started/images/report_details.png" alt="Report Details" style="width: 80%;" />
 <p>Report Details</p>
 </td>
 </tr>
 </table>

-For more details, refer to: [📖 Visualization of Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/
+For more details, refer to: [📖 Visualization of Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visualization.html)

 ## 🌐 Evaluation of Specified Model API

```
{evalscope-0.10.0 → evalscope-0.10.1}/README.md

```diff
@@ -74,7 +74,7 @@ Please scan the QR code below to join our community groups:


 ## 🎉 News
-- 🔥 **[2025.01.20]** Support for visualizing evaluation results, including single model evaluation results and multi-model comparison, refer to the [📖 Visualizing Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/
+- 🔥 **[2025.01.20]** Support for visualizing evaluation results, including single model evaluation results and multi-model comparison, refer to the [📖 Visualizing Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visualization.html) for more details; Added [`iquiz`](https://modelscope.cn/datasets/AI-ModelScope/IQuiz/summary) evaluation example, evaluating the IQ and EQ of the model.
 - 🔥 **[2025.01.07]** Native backend: Support for model API evaluation is now available. Refer to the [📖 Model API Evaluation Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html#api) for more details. Additionally, support for the `ifeval` evaluation benchmark has been added.
 - 🔥🔥 **[2024.12.31]** Support for adding benchmark evaluations, refer to the [📖 Benchmark Evaluation Addition Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/add_benchmark.html); support for custom mixed dataset evaluations, allowing for more comprehensive model evaluations with less data, refer to the [📖 Mixed Dataset Evaluation Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/collection/index.html).
 - 🔥 **[2024.12.13]** Model evaluation optimization: no need to pass the `--template-type` parameter anymore; supports starting evaluation with `evalscope eval --args`. Refer to the [📖 User Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html) for more details.
```
```diff
@@ -275,27 +275,27 @@ To create a public link, set `share=True` in `launch()`.
 <table>
 <tr>
 <td style="text-align: center;">
-<img src="docs/
+<img src="docs/en/get_started/images/setting.png" alt="Setting" style="width: 75%;" />
 <p>Setting Interface</p>
 </td>
 <td style="text-align: center;">
-<img src="docs/
+<img src="docs/en/get_started/images/model_compare.png" alt="Model Compare" style="width: 100%;" />
 <p>Model Comparison</p>
 </td>
 </tr>
 <tr>
 <td style="text-align: center;">
-<img src="docs/
+<img src="docs/en/get_started/images/report_overview.png" alt="Report Overview" style="width: 100%;" />
 <p>Report Overview</p>
 </td>
 <td style="text-align: center;">
-<img src="docs/
+<img src="docs/en/get_started/images/report_details.png" alt="Report Details" style="width: 80%;" />
 <p>Report Details</p>
 </td>
 </tr>
 </table>

-For more details, refer to: [📖 Visualization of Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/
+For more details, refer to: [📖 Visualization of Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visualization.html)

 ## 🌐 Evaluation of Specified Model API

```
evalscope-0.10.1/evalscope/benchmarks/gpqa/chain_of_thought.txt (new file)

```diff
@@ -0,0 +1,81 @@
+Question: In a given population, 1 out of every 400 people has a cancer caused by a completely recessive allele, b. Assuming the population is in Hardy-Weinberg equilibrium, which of the following is the expected proportion of individuals who carry the b allele but are not expected to develop the cancer?
+Choices:
+(A) 1/400
+(B) 19/400
+(C) 20/400
+(D) 38/400
+Let's think step by step:
+The expected proportion of individuals who carry the b allele but are not expected to develop the cancer equals to the frequency of heterozygous allele in the given population.
+According to the Hardy-Weinberg equation p∧2 + 2pq + q∧2 = 1, where p is the frequency of dominant allele frequency, q is the frequency of recessive allele frequency, p∧2 is the frequency of the homozygous dominant allele, q∧2 is the frequency of the recessive allele, and 2pq is the frequency of the heterozygous allele.
+Given that q∧2=1/400, hence, q=0.05 and p=1-q=0.95.
+The frequency of the heterozygous allele is 2pq=2*0.05*0.95=38/400.
+The correct answer is (D)
+Question: A Fe pellet of 0.056 g is first dissolved in 10 mL of hydrobromic acid HBr (0.1 M). The resulting solution is then titrated by KMnO4 (0.02 M). How many equivalence points are there?
+Choices:
+(A) Two points, 25 ml and 35 ml
+(B) One point, 25 mL
+(C) One point, 10 ml
+(D) Two points, 25 ml and 30 ml
+Let's think step by step:
+HBr will react with Fe to produce Fe2+. MnO4- will first react with Fe2+ then Br-.
+Two equivalence points will exist 25 ml and 35 ml.
+HBr will react with Fe to produce Fe2+. MnO4- will first react with Fe2+ then Br-.
+Two equivalence points will exist 25 ml and 35 ml.
+In the beaker there is Fe2+ and Br-.
+When considering titration with two analytes one will have to consider which reaction will occur first.
+Since it is a redox titration consider the reduction potential of:
+E0 (Br2 /Br- ) = 1.09 V E0 (MnO4-/ Mn2+) = 1.49 V E0 (Fe3+/Fe2+) =0.77 V
+[Fe2+]=m/MV=0.1M.
+Reaction 1: MnO4- + 5Fe2+ + 8H+ → Mn2+ + 5Fe3+ + 4H2O
+Reaction 2: 2MnO4- + 10Br- + 16H+ → 2Mn2+ + 5Br2 + 8H2O
+So MnO4- will first react with Fe2+ with a stoichiometry of 1:5 so Veq1 will be 10 ml.
+Then when Fe2+ is used up, MnO4- will react with Br- with a stoichiometry of 2:10 then V added will be 25 ml so Veq2=25+10=35 ml.
+The correct answer is (A)
+Question: Consider a quantum mechanical system containing a particle of mass $m$ moving in an istropic three dimensional potential of the form $V(r) = 1/2 m \omega^2 r^2$ corresponding to the acted force obeying Hooke’s law. Here, $\omega$ is the angular frequency of oscillation and $r$ is the radial distance of the particle from the origin in spherical polar coordinate. What is the value of energy of the third excited state, and how many linearly independent eigenfunctions are possible for the same energy eigenvalue?
+Choices:
+(A) 11 \pi^2 \hbar^2 / (2m r^2), 3
+(B) (9/2) \hbar \omega , 10
+(C) 11 \pi^2 \hbar^2 / (2m r^2), 10
+(D) (9/2) \hbar \omega, 3
+Let's think step by step:
+This problem is nothing but the three dimensional simple harmonic oscillator (SHO) problem.
+The energy spectrum of three dimensional SHO is $E_n= (n+3/2)\hbar \omega$ where $n=0,1,2,3….$.
+For third excited state n=3.
+3+3/2=6/2+3/2=9/2.
+Thus the corresponding energy is $(9/2)\hbar \omega$.
+The degeneracy of the state is $g_n= (n+1)(n+2)/2$.
+For n=3, degeneracy is (3+1)*(3+2)/2=4*5/2=10.
+The correct answer is (B)
+Question: "Your overhear two chemists talking to each other as they leave a synthetic organic chemistry lab. One asks the other "So, how did it go?" The second chemist replies, "Not well - my compounds are on top of each other." What is the second chemist most likely referring to?"
+Choices:
+(A) The compounds they are working with have similar polarities.
+(B) The compounds they are working with have similar boiling points.
+(C) The compounds they are working with are bonding to each other through non-covalent/van der Waals interactions.
+(D) The compounds they are working with have similar optical rotations.
+Let's think step by step:
+"On top of each other" commonly refers to two compounds that have similar Rf values on chromatography (a common operation in synthetic chemistry).
+Similar Rf values arise for compounds with similar polarities.
+The correct answer is (A)
+Question: Two people are playing the following game. A fair coin is tossed into the air. Person A says that in a single toss of the coin, the tail will come. So it's like the first shot or the third shot or the fifth shot. Person B says that the coin will come with a double toss. So like the second, fourth, sixth or eighth shot. Imagine this game played forever. What is the probability that person A wins this game?
+Choices:
+(A) 1/2
+(B) 1/4
+(C) 2/3
+(D) 1/8
+Let's think step by step:
+When finding the correct answer, the probability of playing forever and the coin's single-point toss will be calculated.
+For example, a tail may appear on the first shot.
+This probability is 1/2. if the first toss doesn't come up, it shouldn't come to the second roll either, because the second throw is an even number.
+So it can come in the third shot.
+This is (1/2)(1/2)(1/2).
+So (1/2)^3=1/8.
+Or it could come on the fifth shot.
+This is (1/2)^5=1/32.
+This is actually a geometric series that goes on forever.
+We can write this series as follows.
+(1/2) + (1/2)^3 + (1/2)^5 + (1/2)^7 + ……….
+The solution for this series is as follows : a1/(1-r) where a1 is the first number and r is the sequence or r= a2/a1 or a3/a2 etc.
+a1=1/2
+r=(1/2)^2=1/4
+So a1/(1-r)=(1/2)/(1-1/4)=(1/2)/(3/4)=2/3.
+The correct answer is (C)
```
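Two of the arithmetic-heavy walkthroughs in the new few-shot file are easy to verify in closed form: the Hardy-Weinberg carrier frequency from the first example, and the coin-game geometric series from the last one.

```latex
q = \sqrt{1/400} = 0.05, \qquad p = 1 - q = 0.95, \qquad
2pq = 2 \times 0.95 \times 0.05 = 0.095 = \tfrac{38}{400}

P(A) = \sum_{k=0}^{\infty}\left(\tfrac{1}{2}\right)^{2k+1}
     = \frac{1/2}{1 - 1/4} = \frac{2}{3}
```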
evalscope-0.10.1/evalscope/benchmarks/gpqa/gpqa_adapter.py (new file)

```diff
@@ -0,0 +1,103 @@
+import os
+import random
+import re
+
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.constants import AnswerKeys, EvalType
+from evalscope.metrics import Pass1, exact_match
+from evalscope.models import ChatGenerationModelAdapter
+from evalscope.utils.utils import ResponseParser
+
+
+@Benchmark.register(
+    name='gpqa',
+    dataset_id='modelscope/gpqa',
+    model_adapter=ChatGenerationModelAdapter,
+    subset_list=['gpqa_extended', 'gpqa_main', 'gpqa_diamond'],
+    metric_list=[Pass1],
+    few_shot_num=5,
+    train_split='train',
+    eval_split='train',  # only have train split
+    prompt_template='',
+)
+class GPQAAdapter(DataAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        self.choices = ['A', 'B', 'C', 'D']
+        if self.few_shot_num and self.few_shot_num > 0:
+            self.prompt_prefix = 'Here are some example questions from experts. Answer the final question yourself, following the format of the previous questions exactly.\n'  # noqa: E501
+            self.prompt_prefix += open(os.path.join(os.path.dirname(__file__), 'chain_of_thought.txt'),
+                                       'r').read() + '\nQuestion: '
+        else:
+            self.prompt_prefix = 'What is the correct answer to this question:'
+
+    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
+        """
+        Generate model prompt from input data.
+        example:
+        {
+            "question":"Two people are playing the following game. A fair coin is tossed into the air. Person A says that in a single toss of the coin, the tail will come. So it's like the first shot or the third shot or the fifth shot. Person B says that the coin will come with a double toss. So like the second, fourth, sixth or eighth shot. Imagine this game played forever. What is the probability that person A wins this game?",
+            "choice1":"1/2",
+            "choice2":"1/4",
+            "choice3":"2/3",
+            "choice4":"1/8",
+            "answer":"C",
+        }
+        """  # noqa: E501
+        processed_input_d = self.__process_input(input_d)
+        input_d['answer'] = processed_input_d['answer']  # add answer to input_d for answer extraction
+        prompt = self.prompt_prefix + f"{input_d['Question']}\n{self.__form_options(processed_input_d['choices'])}Let's think step by step: "  # noqa: E501
+
+        return {'data': [prompt], 'multi_choices': self.choices, 'system_prompt': self.prompt_template}
+
+    def __process_input(self, input_d: dict) -> dict:
+
+        def preprocess(text):
+            if text is None:
+                return ' '
+            text = text.strip()
+            text = text.replace(' [title]', '. ')
+            text = re.sub('\\[.*?\\]', '', text)
+            text = text.replace('  ', ' ')
+            return text
+
+        choices = [
+            preprocess(input_d['Incorrect Answer 1']),
+            preprocess(input_d['Incorrect Answer 2']),
+            preprocess(input_d['Incorrect Answer 3']),
+            preprocess(input_d['Correct Answer']),
+        ]
+        random.shuffle(choices)
+        correct_answer_index = choices.index(preprocess(input_d['Correct Answer']))
+
+        out_doc = {
+            'choices': [choices[0], choices[1], choices[2], choices[3]],
+            'answer': f'{chr(65 + correct_answer_index)}',
+        }
+        return out_doc
+
+    def __form_options(self, options: list):
+        option_str = 'Choices:\n'
+        for opt, choice in zip(options, self.choices):
+            option_str += f'({choice}) {opt}' + '\n'
+        return option_str
+
+    def get_gold_answer(self, input_d: dict) -> str:
+        """
+        Parse the raw input labels (gold).
+        """
+        return input_d['answer']
+
+    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
+        """
+        Parse the predicted result and extract proper answer.
+        """
+        return ResponseParser.parse_first_option_with_choices(result, self.choices)
+
+    def match(self, gold: str, pred: str) -> float:
+        """
+        Match the gold answer and the predicted answer.
+        """
+        return exact_match(gold=gold, pred=pred)
```
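The core of the new adapter is `__process_input`: the three distractors and the correct answer are shuffled together, and the gold label becomes the letter of whichever slot the correct answer lands in. A standalone sketch of that idea, using a made-up record for illustration:

```python
import random

# Hypothetical GPQA-style record; the field names mirror the adapter above.
record = {
    'Incorrect Answer 1': '1/2',
    'Incorrect Answer 2': '1/4',
    'Incorrect Answer 3': '1/8',
    'Correct Answer': '2/3',
}

choices = [record['Incorrect Answer 1'], record['Incorrect Answer 2'],
           record['Incorrect Answer 3'], record['Correct Answer']]
random.shuffle(choices)  # randomize option order so position carries no signal

# The gold label is the letter of the slot the correct answer ended up in.
gold = chr(65 + choices.index(record['Correct Answer']))  # chr(65) == 'A'
print(choices, '->', gold)
```

Since `list.index` returns the first match, the derived letter is only reliable when no distractor is textually identical to the correct answer; the adapter makes the same assumption.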
{evalscope-0.10.0 → evalscope-0.10.1}/evalscope/benchmarks/ifeval/ifeval_adapter.py

```diff
@@ -6,7 +6,6 @@ from evalscope.benchmarks.ifeval.utils import agg_inst_level_acc, process_result
 from evalscope.constants import EvalType
 from evalscope.metrics import Metric, mean
 from evalscope.models import ChatGenerationModelAdapter
-from evalscope.utils.utils import normalize_score


 @Benchmark.register(
```
{evalscope-0.10.0 → evalscope-0.10.1}/evalscope/benchmarks/ifeval/instructions.py

```diff
@@ -15,7 +15,6 @@

 import collections
 import json
-import langdetect
 import logging
 import random
 import re
```
```diff
@@ -163,7 +162,7 @@ class ResponseLanguageChecker(Instruction):
           True if the language of `value` follows instruction; otherwise False.
         """
         assert isinstance(value, str)
-
+        import langdetect
         try:
             return langdetect.detect(value) == self._language
         except langdetect.LangDetectException as e:
```
```diff
@@ -1339,7 +1338,7 @@ class CapitalLettersEnglishChecker(Instruction):
     def check_following(self, value):
         """Checks that the response is in English and in all capital letters."""
         assert isinstance(value, str)
-
+        import langdetect
         try:
             return value.isupper() and langdetect.detect(value) == 'en'
         except langdetect.LangDetectException as e:
```
```diff
@@ -1367,7 +1366,7 @@ class LowercaseLettersEnglishChecker(Instruction):
     def check_following(self, value):
         """Checks that the response is in English and in all lowercase letters."""
         assert isinstance(value, str)
-
+        import langdetect
         try:
             return value.islower() and langdetect.detect(value) == 'en'
         except langdetect.LangDetectException as e:
```
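The three `instructions.py` hunks above all apply the same refactor: the module-level `import langdetect` removed earlier is re-introduced as a deferred import inside each checker that uses it, so the module can be imported even when the optional dependency is not installed. A minimal sketch of the pattern (illustrative function, not from the package):

```python
def detect_language(text: str) -> str:
    # Deferred import: langdetect is only required when this function is
    # called, not when the enclosing module is imported.
    import langdetect
    try:
        return langdetect.detect(text)
    except langdetect.LangDetectException:
        return 'unknown'
```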
{evalscope-0.10.0 → evalscope-0.10.1}/evalscope/cli/start_app.py

```diff
@@ -3,7 +3,7 @@ import os
 from argparse import ArgumentParser

 from evalscope.cli.base import CLICommand
-from evalscope.report.app import create_app
+from evalscope.report.app import add_argument, create_app


 def subparser_func(args):
@@ -23,7 +23,8 @@ class StartAppCMD(CLICommand):
         """ define args for create pipeline template command.
         """
         parser = parsers.add_parser(StartAppCMD.name)
+        add_argument(parser)
         parser.set_defaults(func=subparser_func)

     def execute(self):
-        create_app()
+        create_app(self.args)
```
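The `start_app.py` change delegates the subcommand's CLI surface to the report app itself: `add_argument` registers the app's options on the subparser, and `create_app` now receives the parsed namespace instead of being called with no arguments. A sketch of that wiring pattern, with hypothetical flag names (the real options are defined in `evalscope/report/app.py` and are not shown in this diff):

```python
from argparse import ArgumentParser, Namespace


def add_argument(parser: ArgumentParser) -> None:
    # Hypothetical flags for illustration only; the actual options live in
    # evalscope/report/app.py.
    parser.add_argument('--outputs', type=str, default='./outputs')
    parser.add_argument('--share', action='store_true')


def create_app(args: Namespace) -> None:
    print(f'serving reports from {args.outputs} (share={args.share})')


parser = ArgumentParser(prog='evalscope app')
add_argument(parser)
create_app(parser.parse_args(['--outputs', './my_results']))
```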
{evalscope-0.10.0 → evalscope-0.10.1}/evalscope/models/chat_adapter.py

```diff
@@ -76,6 +76,8 @@ class ChatGenerationModelAdapter(BaseModelAdapter):
             # For base model, use the query as the input
             formatted_prompt = query

+        logger.debug(f'formatted_prompt: {formatted_prompt}')
+
         inputs = self.tokenizer(formatted_prompt, return_tensors='pt', padding=True).to(self.device)
         input_ids = inputs['input_ids']

```