evalscope 0.17.1__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (273)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/__init__.py +0 -0
  3. evalscope/api/benchmark/__init__.py +3 -0
  4. evalscope/api/benchmark/adapters/__init__.py +3 -0
  5. evalscope/api/benchmark/adapters/default_data_adapter.py +683 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
  7. evalscope/api/benchmark/adapters/text2image_adapter.py +155 -0
  8. evalscope/api/benchmark/benchmark.py +321 -0
  9. evalscope/api/benchmark/meta.py +115 -0
  10. evalscope/api/dataset/__init__.py +2 -0
  11. evalscope/api/dataset/dataset.py +349 -0
  12. evalscope/api/dataset/loader.py +261 -0
  13. evalscope/api/dataset/utils.py +143 -0
  14. evalscope/api/evaluator/__init__.py +3 -0
  15. evalscope/api/evaluator/cache.py +355 -0
  16. evalscope/api/evaluator/evaluator.py +56 -0
  17. evalscope/api/evaluator/state.py +264 -0
  18. evalscope/api/filter/__init__.py +1 -0
  19. evalscope/api/filter/filter.py +72 -0
  20. evalscope/api/messages/__init__.py +11 -0
  21. evalscope/api/messages/chat_message.py +198 -0
  22. evalscope/api/messages/content.py +102 -0
  23. evalscope/api/messages/utils.py +35 -0
  24. evalscope/api/metric/__init__.py +2 -0
  25. evalscope/api/metric/metric.py +55 -0
  26. evalscope/api/metric/scorer.py +105 -0
  27. evalscope/api/mixin/__init__.py +2 -0
  28. evalscope/api/mixin/dataset_mixin.py +105 -0
  29. evalscope/api/mixin/llm_judge_mixin.py +168 -0
  30. evalscope/api/model/__init__.py +12 -0
  31. evalscope/api/model/generate_config.py +157 -0
  32. evalscope/api/model/model.py +383 -0
  33. evalscope/api/model/model_output.py +285 -0
  34. evalscope/api/registry.py +182 -0
  35. evalscope/api/tool/__init__.py +3 -0
  36. evalscope/api/tool/tool_call.py +101 -0
  37. evalscope/api/tool/tool_info.py +173 -0
  38. evalscope/api/tool/utils.py +64 -0
  39. evalscope/app/ui/app_ui.py +2 -1
  40. evalscope/app/ui/multi_model.py +50 -25
  41. evalscope/app/ui/single_model.py +23 -11
  42. evalscope/app/utils/data_utils.py +42 -26
  43. evalscope/app/utils/text_utils.py +0 -2
  44. evalscope/app/utils/visualization.py +9 -4
  45. evalscope/arguments.py +6 -7
  46. evalscope/backend/opencompass/api_meta_template.py +2 -1
  47. evalscope/backend/opencompass/backend_manager.py +6 -3
  48. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
  49. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  50. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  51. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  52. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  53. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
  54. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
  55. evalscope/backend/rag_eval/utils/embedding.py +2 -1
  56. evalscope/backend/rag_eval/utils/llm.py +13 -12
  57. evalscope/benchmarks/__init__.py +0 -2
  58. evalscope/benchmarks/aigc/i2i/__init__.py +0 -0
  59. evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +44 -0
  60. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +53 -55
  61. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +41 -46
  62. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +29 -45
  63. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +34 -44
  64. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +16 -27
  65. evalscope/benchmarks/aime/aime24_adapter.py +38 -40
  66. evalscope/benchmarks/aime/aime25_adapter.py +34 -40
  67. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
  68. evalscope/benchmarks/arc/arc_adapter.py +34 -147
  69. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
  70. evalscope/benchmarks/arena_hard/utils.py +37 -1
  71. evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
  72. evalscope/benchmarks/bfcl/bfcl_adapter.py +181 -160
  73. evalscope/benchmarks/bfcl/generation.py +222 -0
  74. evalscope/benchmarks/ceval/ceval_adapter.py +94 -162
  75. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
  76. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
  77. evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
  78. evalscope/benchmarks/data_collection/data_collection_adapter.py +183 -45
  79. evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
  80. evalscope/benchmarks/docmath/utils.py +4 -5
  81. evalscope/benchmarks/drop/drop_adapter.py +88 -40
  82. evalscope/benchmarks/frames/frames_adapter.py +135 -52
  83. evalscope/benchmarks/general_arena/general_arena_adapter.py +136 -98
  84. evalscope/benchmarks/general_arena/utils.py +23 -27
  85. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
  86. evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
  87. evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
  88. evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
  89. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
  90. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
  91. evalscope/benchmarks/hle/hle_adapter.py +127 -93
  92. evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
  93. evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
  94. evalscope/benchmarks/ifeval/instructions.py +109 -64
  95. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  96. evalscope/benchmarks/ifeval/utils.py +6 -7
  97. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
  98. evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
  99. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
  100. evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
  101. evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
  102. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
  103. evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
  104. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
  105. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
  106. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
  107. evalscope/benchmarks/musr/musr_adapter.py +33 -64
  108. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +192 -152
  109. evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
  110. evalscope/benchmarks/race/race_adapter.py +33 -119
  111. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
  112. evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
  113. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
  114. evalscope/benchmarks/super_gpqa/utils.py +2 -1
  115. evalscope/benchmarks/tau_bench/generation.py +147 -0
  116. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +112 -54
  117. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
  118. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
  119. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -265
  120. evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
  121. evalscope/cli/cli.py +2 -0
  122. evalscope/cli/start_server.py +6 -3
  123. evalscope/collections/__init__.py +2 -10
  124. evalscope/collections/sampler.py +10 -10
  125. evalscope/collections/schema.py +13 -11
  126. evalscope/config.py +95 -54
  127. evalscope/constants.py +29 -61
  128. evalscope/evaluator/__init__.py +1 -1
  129. evalscope/evaluator/evaluator.py +277 -423
  130. evalscope/filters/__init__.py +2 -0
  131. evalscope/filters/extraction.py +126 -0
  132. evalscope/filters/selection.py +57 -0
  133. evalscope/metrics/__init__.py +13 -13
  134. evalscope/metrics/llm_judge.py +32 -30
  135. evalscope/metrics/math_parser.py +27 -22
  136. evalscope/metrics/metric.py +307 -0
  137. evalscope/metrics/metrics.py +22 -18
  138. evalscope/metrics/t2v_metrics/__init__.py +0 -52
  139. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
  140. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
  141. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
  142. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
  143. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
  144. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
  145. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
  146. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
  147. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
  148. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
  149. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
  150. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
  151. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
  152. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
  153. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
  154. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
  155. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
  156. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
  157. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
  158. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
  159. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
  160. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
  161. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
  162. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
  163. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
  164. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
  165. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
  166. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
  167. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
  168. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
  169. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
  170. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
  171. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
  172. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
  173. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
  174. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
  175. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
  176. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
  177. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
  178. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
  179. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
  180. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
  181. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
  182. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
  183. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
  184. evalscope/models/__init__.py +6 -29
  185. evalscope/models/mockllm.py +65 -0
  186. evalscope/models/model_apis.py +47 -0
  187. evalscope/models/modelscope.py +455 -0
  188. evalscope/models/openai_compatible.py +123 -0
  189. evalscope/models/text2image_model.py +124 -0
  190. evalscope/models/utils/openai.py +698 -0
  191. evalscope/perf/benchmark.py +2 -1
  192. evalscope/perf/http_client.py +4 -2
  193. evalscope/perf/plugin/api/custom_api.py +5 -4
  194. evalscope/perf/plugin/api/openai_api.py +11 -9
  195. evalscope/perf/plugin/datasets/custom.py +2 -1
  196. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  197. evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
  198. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  199. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  200. evalscope/perf/plugin/datasets/openqa.py +4 -2
  201. evalscope/perf/utils/benchmark_util.py +7 -5
  202. evalscope/perf/utils/db_util.py +9 -6
  203. evalscope/perf/utils/local_server.py +8 -3
  204. evalscope/perf/utils/rich_display.py +16 -10
  205. evalscope/report/__init__.py +2 -2
  206. evalscope/report/combinator.py +18 -12
  207. evalscope/report/generator.py +101 -6
  208. evalscope/report/{utils.py → report.py} +8 -6
  209. evalscope/run.py +26 -44
  210. evalscope/summarizer.py +1 -1
  211. evalscope/utils/__init__.py +21 -2
  212. evalscope/utils/chat_service.py +2 -1
  213. evalscope/utils/deprecation_utils.py +12 -1
  214. evalscope/utils/function_utils.py +29 -0
  215. evalscope/utils/io_utils.py +100 -5
  216. evalscope/utils/json_schema.py +208 -0
  217. evalscope/utils/logger.py +51 -12
  218. evalscope/utils/model_utils.py +10 -7
  219. evalscope/utils/multi_choices.py +271 -0
  220. evalscope/utils/url_utils.py +65 -0
  221. evalscope/version.py +2 -2
  222. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/METADATA +98 -49
  223. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/RECORD +234 -216
  224. tests/aigc/test_t2i.py +22 -4
  225. tests/benchmark/__init__.py +1 -0
  226. tests/benchmark/test_eval.py +386 -0
  227. tests/cli/test_all.py +3 -5
  228. tests/cli/test_collection.py +13 -4
  229. tests/cli/test_custom.py +22 -15
  230. tests/rag/test_clip_benchmark.py +1 -0
  231. evalscope/benchmarks/aigc/t2i/base.py +0 -56
  232. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  233. evalscope/benchmarks/benchmark.py +0 -81
  234. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  235. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  236. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  237. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  238. evalscope/benchmarks/data_adapter.py +0 -528
  239. evalscope/benchmarks/filters.py +0 -59
  240. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  241. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  242. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  243. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  244. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  245. evalscope/benchmarks/process_bench/critique_template.txt +0 -13
  246. evalscope/benchmarks/race/race.py +0 -104
  247. evalscope/benchmarks/race/samples.jsonl +0 -5
  248. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
  249. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  250. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  251. evalscope/benchmarks/utils.py +0 -60
  252. evalscope/collections/evaluator.py +0 -375
  253. evalscope/metrics/completion_parsers.py +0 -227
  254. evalscope/metrics/named_metrics.py +0 -55
  255. evalscope/models/adapters/__init__.py +0 -14
  256. evalscope/models/adapters/base_adapter.py +0 -84
  257. evalscope/models/adapters/bfcl_adapter.py +0 -246
  258. evalscope/models/adapters/chat_adapter.py +0 -207
  259. evalscope/models/adapters/choice_adapter.py +0 -222
  260. evalscope/models/adapters/custom_adapter.py +0 -71
  261. evalscope/models/adapters/server_adapter.py +0 -236
  262. evalscope/models/adapters/t2i_adapter.py +0 -79
  263. evalscope/models/adapters/tau_bench_adapter.py +0 -189
  264. evalscope/models/custom/__init__.py +0 -4
  265. evalscope/models/custom/custom_model.py +0 -50
  266. evalscope/models/custom/dummy_model.py +0 -99
  267. evalscope/models/local_model.py +0 -128
  268. evalscope/models/register.py +0 -41
  269. tests/cli/test_run.py +0 -489
  270. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/LICENSE +0 -0
  271. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/WHEEL +0 -0
  272. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/entry_points.txt +0 -0
  273. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/top_level.txt +0 -0
evalscope/api/evaluator/state.py
@@ -0,0 +1,264 @@
+ from dataclasses import dataclass
+ from random import Random
+ from typing import Any, Dict, List, Optional, Sequence, Union, overload
+
+ from evalscope.api.dataset import Sample
+ from evalscope.api.messages import ChatMessage, ChatMessageUser, messages_pretty_str
+ from evalscope.api.model import ModelOutput
+
+
+ class Target(Sequence[str]):
+     """Target for scoring against the current TaskState.
+
+     Target is a sequence of one or more strings. Use the
+     `text` property to access the value as a single string.
+     """
+
+     def __init__(self, target: Union[str, List[str]]) -> None:
+         self.target = target if isinstance(target, list) else [target]
+
+     @overload
+     def __getitem__(self, index: int) -> str:
+         ...
+
+     @overload
+     def __getitem__(self, index: slice) -> Sequence[str]:
+         ...
+
+     def __getitem__(self, index: Union[int, slice]) -> Union[str, Sequence[str]]:
+         return self.target[index]
+
+     def __len__(self) -> int:
+         return len(self.target)
+
+     @property
+     def text(self) -> str:
+         return ''.join(self.target)
+
+
+ @dataclass
+ class Choice:
+     """
+     A `Choice` represents a single choice in a multiple choice question.
+
+     It is only relevant for the `multiple_choice` solver and corresponding
+     `choice` scorer.
+     """
+
+     value: str
+     """The original value of the choice from the `Sample`."""
+
+     correct: Optional[bool]
+     """Did the model think this choice satisfies the question? `None`
+     indicates this has not been set yet."""
+
+     original_position: int
+     """Choices may be re-ordered during processing; this records the
+     original position in the sample's list of choices."""
+
+
+ class Choices(Sequence[Choice]):
+     """
+     Wrapper class for a list of `Choice` objects.
+
+     Primarily serves to abstract choice-specific functionality away
+     from the already-large `TaskState` class.
+     """
+
+     def __init__(self, choices: Union[List[str], List[Choice]]) -> None:
+         """
+         Setter for choices, intended to be used only with the `multiple_choice` scorer.
+
+         Choices come from the sample's list of choices and are used
+         specifically by the `multiple_choice` scorer.
+
+         For example, if the sample was a multiple choice question like "What is
+         the capital of France? A) Paris B) London C) Berlin", we would store the
+         possible answers here.
+         """
+         self._choices: List[Choice] = []
+
+         for i, choice in enumerate(choices):
+             if isinstance(choice, str):
+                 self._choices.append(Choice(value=choice, correct=None, original_position=i))
+             elif isinstance(choice, Choice):
+                 self._choices.append(choice)
+
+     @overload
+     def __getitem__(self, index: int) -> Choice:
+         ...
+
+     @overload
+     def __getitem__(self, index: slice) -> Sequence[Choice]:
+         ...
+
+     def __getitem__(self, index: Union[int, slice]) -> Union[Choice, Sequence[Choice]]:
+         return self._choices[index]
+
+     def __len__(self) -> int:
+         return len(self._choices)
+
+     def mark_choice(self, index: int, correct: bool) -> None:
+         """Set the value of a specific choice."""
+         self._choices[index].correct = correct
+
+     def shuffle(self, rand: Random = Random()) -> None:
+         """
+         Shuffle the choice order, setting `original_position` so choices can be mapped back to their original order.
+
+         Some evals will shuffle the choices from the original sample to try to
+         avoid the model answering correctly due to fine-tuning (or similar) on
+         specific datasets.
+         """
+         shuffled_positions = list(range(len(self._choices)))
+         rand.shuffle(shuffled_positions)
+
+         shuffled_choices = [Choice('notachoice', None, -1)] * len(self._choices)
+
+         for i, shuffled_position in enumerate(shuffled_positions):
+             shuffled_choices[i] = self._choices[shuffled_position]
+             shuffled_choices[i].original_position = shuffled_position
+
+         self._choices = shuffled_choices
+
+
+ class TaskState:
+     """
+     The `TaskState` represents the internal state of the `Task` being run for a single `Sample`.
+
+     The `TaskState` is passed to and returned from each solver during a sample's
+     evaluation. It allows us to maintain the manipulated message history, the tools
+     available to the model, the final output of the model, and whether the task
+     is completed or has hit a limit.
+     """
+
+     def __init__(
+         self,
+         model: str,
+         sample: Sample,
+         messages: List[ChatMessage] = [],
+         output: Optional[ModelOutput] = None,
+         completed: bool = False,
+     ) -> None:
+         self._model = model
+         self._sample = sample
+         self._sample_id = sample.id
+         self._group_id = sample.group_id
+         self._input = sample.input
+         self._target = Target(sample.target)
+         self._metadata = sample.metadata
+         self._messages: List[ChatMessage] = messages
+         self._output = output if output else ModelOutput(model=str(model))
+         self._completed = completed
+         if sample.choices:
+             self._choices = Choices(sample.choices)
+         else:
+             self._choices = Choices([])
+
+     @property
+     def model(self) -> str:
+         """Name of model being evaluated."""
+         return self._model
+
+     @property
+     def sample_id(self) -> int:
+         """Unique id for sample."""
+         return self._sample_id
+
+     @property
+     def group_id(self) -> int:
+         """Group id for sample."""
+         return self._group_id
+
+     @property
+     def input(self) -> Union[str, List[ChatMessage]]:
+         """Input from the `Sample`; should be considered immutable."""
+         return self._input
+
+     @property
+     def input_text(self) -> str:
+         """
+         Convenience function for accessing the initial input from the `Sample` as a string.
+
+         If the `input` is a `List[ChatMessage]`, this returns a pretty-printed
+         rendering of those messages.
+         """
+         if isinstance(self._input, str):
+             return self._input
+         else:
+             return messages_pretty_str(self._input)
+
+     @property
+     def choices(self) -> Choices:
+         """Choices for the sample, if applicable."""
+         return self._choices
+
+     @property
+     def user_prompt(self) -> ChatMessageUser:
+         """User prompt for this state.
+
+         Tasks are very general and can have many types of inputs.
+         However, in many cases solvers assume they can interact with
+         the state as a "chat" in a predictable fashion (e.g. prompt
+         engineering solvers). This property enables easy read and
+         write access to the user chat prompt. Raises an
+         exception if there is no user prompt.
+         """
+         prompt = next((m for m in reversed(self.messages) if m.role == 'user'), None)
+         if prompt:
+             return prompt
+         else:
+             raise ValueError('user_prompt requested from TaskState but none available')
+
+     @property
+     def metadata(self) -> Dict[str, Any]:
+         """Metadata from the `Sample` for this `TaskState`."""
+         return self._metadata
+
+     @metadata.setter
+     def metadata(self, metadata: Dict[str, Any]) -> None:
+         self._metadata = metadata
+
+     @property
+     def messages(self) -> List[ChatMessage]:
+         """
+         Chat conversation history for sample.
+
+         This will generally get appended to every time a `generate` call is made
+         to the model. Useful both for debugging and for solvers/scorers to assess
+         model performance or choose the next step.
+         """
+         return self._messages
+
+     @messages.setter
+     def messages(self, messages: List[ChatMessage]) -> None:
+         self._messages = messages
+
+     @property
+     def output(self) -> ModelOutput:
+         """
+         The 'final' model output once we've completed all solving.
+
+         For simple evals this may just be the last `message` from the
+         conversation history, but more complex solvers may set this directly.
+         """
+         return self._output
+
+     @output.setter
+     def output(self, output: ModelOutput) -> None:
+         self._output = output
+
+     @property
+     def completed(self) -> bool:
+         """Is the task completed."""
+         return self._completed
+
+     @completed.setter
+     def completed(self, completed: bool) -> None:
+         """Set the completed status."""
+         self._completed = completed
+
+     @property
+     def target(self) -> str:
+         """The scoring target for this `Sample`."""
+         return self._target.text
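For orientation, a short sketch (not part of the diff) of how the new `Target` and `Choices` types behave, using only classes defined in this hunk:

from random import Random

target = Target(['Paris'])
print(target.text)  # 'Paris'

choices = Choices(['Paris', 'London', 'Berlin'])
choices.shuffle(Random(42))   # reorder; original_position records where each choice came from
choices.mark_choice(0, correct=True)
print([(c.value, c.original_position, c.correct) for c in choices])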
evalscope/api/filter/__init__.py
@@ -0,0 +1 @@
+ from .filter import Filter, FilterEnsemble, build_filter_ensemble
evalscope/api/filter/filter.py
@@ -0,0 +1,72 @@
+ from abc import ABC, abstractmethod
+ from dataclasses import dataclass
+ from typing import Any, Callable, Dict, Iterable, List, Union
+
+ from evalscope.api.registry import get_filter
+
+
+ class Filter(ABC):
+     """
+     Filter classes operate on a sample level.
+     """
+
+     def __init__(self, *args, **kwargs) -> None:
+         """
+         Can define custom behavior here, if an individual instantiation of a Filter class should have state.
+         """
+
+     @abstractmethod
+     def apply(self, instance: List[str]) -> List[str]:
+
+         return instance
+
+     def __call__(self, instance: str) -> str:
+         """
+         Allows the filter to be called like a function.
+         """
+         return self.apply([instance])[0]
+
+
+ @dataclass
+ class FilterEnsemble:
+     """
+     FilterEnsemble creates a pipeline applying multiple filters.
+     Its intended usage is to stack multiple post-processing steps in order.
+     """
+
+     name: str
+     filters: List[Callable[[], Filter]]
+
+     def apply(self, instance: List[str]) -> List[str]:
+
+         for f in self.filters:
+             # apply filters in sequence
+             instance = f.apply(instance)
+
+         return instance
+
+     def __call__(self, instance: str) -> str:
+         """
+         Allows the filter ensemble to be called like a function.
+         """
+         return self.apply([instance])[0]
+
+
+ def build_filter_ensemble(name: str = 'default', filters: Dict[str, Any] = {}) -> FilterEnsemble:
+     """
+     Create a filtering pipeline.
+     """
+     filter_funcs = []
+     for filter_name, filter_args in filters.items():
+         filter_cls = get_filter(filter_name)
+         if isinstance(filter_args, list):
+             filter_function = filter_cls(*filter_args)
+         elif isinstance(filter_args, dict):
+             filter_function = filter_cls(**filter_args)
+         else:
+             # Assume single value for simple filters
+             filter_function = filter_cls(filter_args)
+         # add the filter as a pipeline step
+         filter_funcs.append(filter_function)
+
+     return FilterEnsemble(name=name, filters=filter_funcs)
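A sketch (not part of the diff) of assembling a pipeline with `build_filter_ensemble`. The filter name 'take_first' is hypothetical; real names must be registered and resolvable via `get_filter` in evalscope/api/registry.py:

# Hypothetical usage; assumes a filter named 'take_first' taking no args is registered.
ensemble = build_filter_ensemble(name='postprocess', filters={'take_first': []})
cleaned = ensemble('Answer: B')  # wraps the string in a list, applies each filter in order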
evalscope/api/messages/__init__.py
@@ -0,0 +1,11 @@
+ from .chat_message import (
+     ChatMessage,
+     ChatMessageAssistant,
+     ChatMessageSystem,
+     ChatMessageTool,
+     ChatMessageUser,
+     dict_to_chat_message,
+     messages_pretty_str,
+ )
+ from .content import Content, ContentAudio, ContentData, ContentImage, ContentReasoning, ContentText, ContentVideo
+ from .utils import parse_content_with_reasoning
evalscope/api/messages/chat_message.py
@@ -0,0 +1,198 @@
+ import uuid
+ from pydantic import BaseModel, Field, JsonValue, model_validator
+ from typing import Any, Dict, List, Literal, Optional, Type, Union
+
+ from evalscope.api.tool import ToolCall, ToolCallError
+ from .content import Content, ContentReasoning, ContentText
+ from .utils import parse_content_with_reasoning
+
+
+ class ChatMessageBase(BaseModel):
+     """Base class for chat messages."""
+
+     id: Optional[str] = Field(default=None)
+     """Unique identifier for message."""
+
+     content: Union[str, List[Content]]
+     """Content (simple string or list of content objects)."""
+
+     source: Optional[Literal['input', 'generate']] = Field(default=None)
+     """Source of message."""
+
+     metadata: Optional[Dict[str, Any]] = Field(default=None)
+     """Additional message metadata."""
+
+     internal: Optional[JsonValue] = Field(default=None)
+     """Model provider specific payload - typically used to aid transformation back to model types."""
+
+     def model_post_init(self, __context: Any) -> None:
+         # Generate an ID if none was provided
+         if self.id is None:
+             self.id = uuid.uuid4().hex[:8]  # Shorten to 8 characters for simplicity
+
+     @property
+     def text(self) -> str:
+         """Get the text content of this message.
+
+         ChatMessage content is very general and can contain either
+         a simple text value or a list of content parts (each of which
+         can either be text or an image). Solvers (e.g. for prompt
+         engineering) often need to interact with chat messages with
+         the assumption that they are a simple string. The text
+         property returns either the plain str content, or if the
+         content is a list of text and images, the text items
+         concatenated together (separated by newline).
+         """
+         if isinstance(self.content, str):
+             return self.content
+         else:
+             all_text = [content.text for content in self.content if content.type == 'text']
+             return '\n'.join(all_text)
+
+     @text.setter
+     def text(self, text: str) -> None:
+         """Set the primary text content for this message.
+
+         ChatMessage content is very general and can contain either
+         a simple text value or a list of content parts (each of which
+         can either be text or an image). Solvers (e.g. for prompt
+         engineering) often need to interact with chat messages with
+         the assumption that they are a simple string. The text property
+         sets text either to the content directly (if it is a `str`) or
+         replaces any text content items with a single new text item,
+         leaving non-text content (e.g. images) in place.
+         """
+         if isinstance(self.content, str):
+             self.content = text
+         else:
+             all_other = [content for content in self.content if content.type != 'text']
+             self.content = all_other + [ContentText(text=text)]
+
+
+ class ChatMessageSystem(ChatMessageBase):
+     """System chat message."""
+
+     role: Literal['system'] = Field(default='system')
+     """Conversation role."""
+
+
+ class ChatMessageUser(ChatMessageBase):
+     """User chat message."""
+
+     role: Literal['user'] = Field(default='user')
+     """Conversation role."""
+
+     tool_call_id: Optional[List[str]] = Field(default=None)
+     """ID(s) of tool call(s) this message has the content payload for."""
+
+
+ class ChatMessageAssistant(ChatMessageBase):
+     """Assistant chat message."""
+
+     role: Literal['assistant'] = Field(default='assistant')
+     """Conversation role."""
+
+     tool_calls: Optional[List[ToolCall]] = Field(default=None)
+     """Tool calls made by the model."""
+
+     model: Optional[str] = Field(default=None)
+     """Model used to generate assistant message."""
+
+     # Some OpenAI compatible REST endpoints include reasoning as a field alongside
+     # content, however since this field doesn't exist in the OpenAI interface,
+     # hosting providers (so far we've seen this with Together and Groq) may
+     # include the reasoning in a <think></think> tag before the main response.
+     # We expect this pattern to be repeated elsewhere, so include this hook to
+     # automatically extract the reasoning content when the response is prefaced
+     # with a <think> block. If this ends up being an overreach we can fall back
+     # to each provider manually parsing out <think> using a helper function.
+     # The implementation isn't important here; the critical thing to establish
+     # is that EvalScope makes reasoning content available separately.
+     @model_validator(mode='before')
+     @classmethod
+     def extract_reasoning(cls, data: Any) -> Any:
+         if isinstance(data, dict):
+             # cleave apart <think> blocks
+             content = data.get('content', None)
+             if isinstance(content, str):
+                 content_text, content_reasoning = parse_content_with_reasoning(content)
+                 if content_reasoning:
+                     data['content'] = [
+                         content_reasoning,
+                         ContentText(text=content_text),
+                     ]
+             # migrate messages that have an explicit 'reasoning' field
+             # (which was our original representation of reasoning)
+             reasoning = data.get('reasoning', None)
+             if isinstance(reasoning, str):
+                 # ensure that content is a list
+                 content = data.get('content', None)
+                 if content is None:
+                     data['content'] = []
+                 elif isinstance(content, str):
+                     data['content'] = [ContentText(text=content)]
+                 elif not isinstance(content, list):
+                     data['content'] = []
+                 data['content'].insert(0, ContentReasoning(reasoning=reasoning))
+
+                 del data['reasoning']
+         return data
+
+
+ class ChatMessageTool(ChatMessageBase):
+     """Tool chat message."""
+
+     role: Literal['tool'] = Field(default='tool')
+     """Conversation role."""
+
+     tool_call_id: Optional[str] = Field(default=None)
+     """ID of tool call."""
+
+     function: Optional[str] = Field(default=None)
+     """Name of function called."""
+
+     error: Optional[ToolCallError] = Field(default=None)
+     """Error which occurred during tool call."""
+
+
+ ChatMessage = Union[ChatMessageSystem, ChatMessageUser, ChatMessageAssistant, ChatMessageTool]
+ """Message in a chat conversation."""
+
+
+ def dict_to_chat_message(data: Dict[str, Any]) -> ChatMessage:
+     """Convert a dictionary to a ChatMessage."""
+
+     if isinstance(data, ChatMessage):
+         return data
+
+     if 'role' not in data:
+         raise ValueError('ChatMessage must have a "role" field')
+
+     role = data['role']
+     if role == 'system':
+         return ChatMessageSystem.model_validate(data)
+     elif role == 'user':
+         return ChatMessageUser.model_validate(data)
+     elif role == 'assistant':
+         return ChatMessageAssistant.model_validate(data)
+     elif role == 'tool':
+         return ChatMessageTool.model_validate(data)
+     else:
+         raise ValueError(f'Unknown chat message role: {role}')
+
+
+ def messages_pretty_str(messages: List[ChatMessage]) -> str:
+     """Pretty print a list of chat messages."""
+     output = []
+     for message in messages:
+         role = message.role.capitalize()
+         content = message.text
+         if isinstance(message, ChatMessageTool):
+             if message.error:
+                 content += f'\nError: {message.error.message}'
+             if message.function:
+                 content += f'\nFunction: {message.function}'
+         output.append(f'**{role}**: {content}')
+     return '\n\n'.join(output)
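A sketch (not part of the diff) of the dictionary round-trip these helpers support, illustrating the <think> extraction performed by the `extract_reasoning` validator above:

msg = dict_to_chat_message({
    'role': 'assistant',
    'content': '<think>Check the atlas.</think>Paris',
})
# The validator splits content into [ContentReasoning, ContentText('Paris')].
print(msg.text)  # 'Paris'
print(messages_pretty_str([ChatMessageUser(content='Capital of France?'), msg]))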
evalscope/api/messages/content.py
@@ -0,0 +1,102 @@
+ from pydantic import BaseModel, Field, JsonValue
+ from typing import Dict, Literal, Optional, Sequence, Union
+
+
+ class ContentBase(BaseModel):
+     internal: Optional[JsonValue] = Field(default=None)
+     """Model provider specific payload - typically used to aid transformation back to model types."""
+
+
+ class ContentText(ContentBase):
+     """Text content."""
+
+     type: Literal['text'] = Field(default='text')
+     """Type."""
+
+     text: str
+     """Text content."""
+
+     refusal: Optional[bool] = Field(default=None)
+     """Was this a refusal message?"""
+
+
+ class ContentReasoning(ContentBase):
+     """Reasoning content.
+
+     See the specification for [thinking blocks](https://docs.anthropic.com/en/docs/build-with-claude/extended-thinking#understanding-thinking-blocks) for Claude models.
+     """  # noqa: E501
+
+     type: Literal['reasoning'] = Field(default='reasoning')
+     """Type."""
+
+     reasoning: str
+     """Reasoning content."""
+
+     signature: Optional[str] = Field(default=None)
+     """Signature for reasoning content (used by some models to ensure that reasoning content is not modified for replay)."""  # noqa: E501
+
+     redacted: bool = Field(default=False)
+     """Indicates that the explicit content of this reasoning block has been redacted."""
+
+
+ class ContentImage(ContentBase):
+     """Image content."""
+
+     type: Literal['image'] = Field(default='image')
+     """Type."""
+
+     image: str
+     """Either a URL of the image or the base64 encoded image data."""
+
+     detail: Literal['auto', 'low', 'high'] = Field(default='auto')
+     """Specifies the detail level of the image.
+
+     Currently only supported for OpenAI. Learn more in the [Vision guide](https://platform.openai.com/docs/guides/vision/low-or-high-fidelity-image-understanding).
+     """  # noqa: E501
+
+
+ class ContentAudio(ContentBase):
+     """Audio content."""
+
+     type: Literal['audio'] = Field(default='audio')
+     """Type."""
+
+     audio: str
+     """Audio file path or base64 encoded data URL."""
+
+     format: Literal['wav', 'mp3']
+     """Format of audio data ('mp3' or 'wav')."""
+
+
+ class ContentVideo(ContentBase):
+     """Video content."""
+
+     type: Literal['video'] = Field(default='video')
+     """Type."""
+
+     video: str
+     """Video file path or base64 encoded data URL."""
+
+     format: Literal['mp4', 'mpeg', 'mov']
+     """Format of video data ('mp4', 'mpeg', or 'mov')."""
+
+
+ class ContentData(ContentBase):
+     """Model internal."""
+
+     type: Literal['data'] = Field(default='data')
+     """Type."""
+
+     data: Dict[str, JsonValue]
+     """Model provider specific payload - required for internal content."""
+
+
+ Content = Union[
+     ContentText,
+     ContentReasoning,
+     ContentImage,
+     ContentAudio,
+     ContentVideo,
+     ContentData,
+ ]
+ """Content sent to or received from a model."""
evalscope/api/messages/utils.py
@@ -0,0 +1,35 @@
+ import re
+ from typing import Optional
+
+ from .content import ContentReasoning
+
+
+ def parse_content_with_reasoning(content: str) -> tuple[str, Optional[ContentReasoning]]:
+     """
+     Looks for and extracts <think/> tags into reasoning text.
+
+     Returns a tuple:
+     - The first element is the input content with the <think> tag and its contents fully removed.
+     - The second element is a ContentReasoning object (or None if no <think> tag is found).
+     """
+     # Match <think> tag with optional attributes anywhere in the string
+     pattern = (r'<think(?:\s+signature="([^"]*)")?(?:\s+redacted="(true)")?\s*>(.*?)</think>')
+     match = re.search(pattern, content, re.DOTALL)
+
+     if match:
+         signature = match.group(1)  # This will be None if not present
+         redacted_value = match.group(2)  # This will be "true" or None
+         reasoning = match.group(3).strip()
+         # Remove the matched <think>...</think> from the input
+         start, end = match.span()
+
+         return (
+             (content[:start] + content[end:]).strip(),
+             ContentReasoning(
+                 reasoning=reasoning,
+                 signature=signature,
+                 redacted=redacted_value == 'true',
+             ),
+         )
+     else:
+         return content, None
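A worked example of the parser (not part of the diff): the tag and its attributes are stripped from the text and returned separately as a ContentReasoning object.

text, reasoning = parse_content_with_reasoning(
    '<think signature="abc123">Consider the options.</think>The answer is B.'
)
print(text)                 # 'The answer is B.'
print(reasoning.reasoning)  # 'Consider the options.'
print(reasoning.signature)  # 'abc123'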
evalscope/api/metric/__init__.py
@@ -0,0 +1,2 @@
+ from .metric import Metric, T2IMetric
+ from .scorer import Aggregator, AggScore, SampleScore, Score, Value