evalscope 0.17.1__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/__init__.py +4 -1
- evalscope/api/__init__.py +0 -0
- evalscope/api/benchmark/__init__.py +3 -0
- evalscope/api/benchmark/adapters/__init__.py +3 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +683 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +155 -0
- evalscope/api/benchmark/benchmark.py +321 -0
- evalscope/api/benchmark/meta.py +115 -0
- evalscope/api/dataset/__init__.py +2 -0
- evalscope/api/dataset/dataset.py +349 -0
- evalscope/api/dataset/loader.py +261 -0
- evalscope/api/dataset/utils.py +143 -0
- evalscope/api/evaluator/__init__.py +3 -0
- evalscope/api/evaluator/cache.py +355 -0
- evalscope/api/evaluator/evaluator.py +56 -0
- evalscope/api/evaluator/state.py +264 -0
- evalscope/api/filter/__init__.py +1 -0
- evalscope/api/filter/filter.py +72 -0
- evalscope/api/messages/__init__.py +11 -0
- evalscope/api/messages/chat_message.py +198 -0
- evalscope/api/messages/content.py +102 -0
- evalscope/api/messages/utils.py +35 -0
- evalscope/api/metric/__init__.py +2 -0
- evalscope/api/metric/metric.py +55 -0
- evalscope/api/metric/scorer.py +105 -0
- evalscope/api/mixin/__init__.py +2 -0
- evalscope/api/mixin/dataset_mixin.py +105 -0
- evalscope/api/mixin/llm_judge_mixin.py +168 -0
- evalscope/api/model/__init__.py +12 -0
- evalscope/api/model/generate_config.py +157 -0
- evalscope/api/model/model.py +383 -0
- evalscope/api/model/model_output.py +285 -0
- evalscope/api/registry.py +182 -0
- evalscope/api/tool/__init__.py +3 -0
- evalscope/api/tool/tool_call.py +101 -0
- evalscope/api/tool/tool_info.py +173 -0
- evalscope/api/tool/utils.py +64 -0
- evalscope/app/ui/app_ui.py +2 -1
- evalscope/app/ui/multi_model.py +50 -25
- evalscope/app/ui/single_model.py +23 -11
- evalscope/app/utils/data_utils.py +42 -26
- evalscope/app/utils/text_utils.py +0 -2
- evalscope/app/utils/visualization.py +9 -4
- evalscope/arguments.py +6 -7
- evalscope/backend/opencompass/api_meta_template.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +6 -3
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
- evalscope/backend/rag_eval/ragas/task_template.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
- evalscope/backend/rag_eval/utils/embedding.py +2 -1
- evalscope/backend/rag_eval/utils/llm.py +13 -12
- evalscope/benchmarks/__init__.py +0 -2
- evalscope/benchmarks/aigc/i2i/__init__.py +0 -0
- evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +44 -0
- evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +53 -55
- evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +41 -46
- evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +29 -45
- evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +34 -44
- evalscope/benchmarks/aigc/t2i/tifa_adapter.py +16 -27
- evalscope/benchmarks/aime/aime24_adapter.py +38 -40
- evalscope/benchmarks/aime/aime25_adapter.py +34 -40
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
- evalscope/benchmarks/arc/arc_adapter.py +34 -147
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
- evalscope/benchmarks/arena_hard/utils.py +37 -1
- evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
- evalscope/benchmarks/bfcl/bfcl_adapter.py +181 -160
- evalscope/benchmarks/bfcl/generation.py +222 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +94 -162
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
- evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
- evalscope/benchmarks/data_collection/data_collection_adapter.py +183 -45
- evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
- evalscope/benchmarks/docmath/utils.py +4 -5
- evalscope/benchmarks/drop/drop_adapter.py +88 -40
- evalscope/benchmarks/frames/frames_adapter.py +135 -52
- evalscope/benchmarks/general_arena/general_arena_adapter.py +136 -98
- evalscope/benchmarks/general_arena/utils.py +23 -27
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
- evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
- evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
- evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
- evalscope/benchmarks/hle/hle_adapter.py +127 -93
- evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
- evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
- evalscope/benchmarks/ifeval/instructions.py +109 -64
- evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
- evalscope/benchmarks/ifeval/utils.py +6 -7
- evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
- evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
- evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
- evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
- evalscope/benchmarks/musr/musr_adapter.py +33 -64
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +192 -152
- evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
- evalscope/benchmarks/race/race_adapter.py +33 -119
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
- evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
- evalscope/benchmarks/super_gpqa/utils.py +2 -1
- evalscope/benchmarks/tau_bench/generation.py +147 -0
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +112 -54
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -265
- evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_server.py +6 -3
- evalscope/collections/__init__.py +2 -10
- evalscope/collections/sampler.py +10 -10
- evalscope/collections/schema.py +13 -11
- evalscope/config.py +95 -54
- evalscope/constants.py +29 -61
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +277 -423
- evalscope/filters/__init__.py +2 -0
- evalscope/filters/extraction.py +126 -0
- evalscope/filters/selection.py +57 -0
- evalscope/metrics/__init__.py +13 -13
- evalscope/metrics/llm_judge.py +32 -30
- evalscope/metrics/math_parser.py +27 -22
- evalscope/metrics/metric.py +307 -0
- evalscope/metrics/metrics.py +22 -18
- evalscope/metrics/t2v_metrics/__init__.py +0 -52
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
- evalscope/models/__init__.py +6 -29
- evalscope/models/mockllm.py +65 -0
- evalscope/models/model_apis.py +47 -0
- evalscope/models/modelscope.py +455 -0
- evalscope/models/openai_compatible.py +123 -0
- evalscope/models/text2image_model.py +124 -0
- evalscope/models/utils/openai.py +698 -0
- evalscope/perf/benchmark.py +2 -1
- evalscope/perf/http_client.py +4 -2
- evalscope/perf/plugin/api/custom_api.py +5 -4
- evalscope/perf/plugin/api/openai_api.py +11 -9
- evalscope/perf/plugin/datasets/custom.py +2 -1
- evalscope/perf/plugin/datasets/flickr8k.py +1 -1
- evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
- evalscope/perf/plugin/datasets/line_by_line.py +2 -1
- evalscope/perf/plugin/datasets/longalpaca.py +2 -1
- evalscope/perf/plugin/datasets/openqa.py +4 -2
- evalscope/perf/utils/benchmark_util.py +7 -5
- evalscope/perf/utils/db_util.py +9 -6
- evalscope/perf/utils/local_server.py +8 -3
- evalscope/perf/utils/rich_display.py +16 -10
- evalscope/report/__init__.py +2 -2
- evalscope/report/combinator.py +18 -12
- evalscope/report/generator.py +101 -6
- evalscope/report/{utils.py → report.py} +8 -6
- evalscope/run.py +26 -44
- evalscope/summarizer.py +1 -1
- evalscope/utils/__init__.py +21 -2
- evalscope/utils/chat_service.py +2 -1
- evalscope/utils/deprecation_utils.py +12 -1
- evalscope/utils/function_utils.py +29 -0
- evalscope/utils/io_utils.py +100 -5
- evalscope/utils/json_schema.py +208 -0
- evalscope/utils/logger.py +51 -12
- evalscope/utils/model_utils.py +10 -7
- evalscope/utils/multi_choices.py +271 -0
- evalscope/utils/url_utils.py +65 -0
- evalscope/version.py +2 -2
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/METADATA +98 -49
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/RECORD +234 -216
- tests/aigc/test_t2i.py +22 -4
- tests/benchmark/__init__.py +1 -0
- tests/benchmark/test_eval.py +386 -0
- tests/cli/test_all.py +3 -5
- tests/cli/test_collection.py +13 -4
- tests/cli/test_custom.py +22 -15
- tests/rag/test_clip_benchmark.py +1 -0
- evalscope/benchmarks/aigc/t2i/base.py +0 -56
- evalscope/benchmarks/arc/ai2_arc.py +0 -151
- evalscope/benchmarks/benchmark.py +0 -81
- evalscope/benchmarks/ceval/ceval_exam.py +0 -146
- evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
- evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
- evalscope/benchmarks/competition_math/competition_math.py +0 -79
- evalscope/benchmarks/data_adapter.py +0 -528
- evalscope/benchmarks/filters.py +0 -59
- evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
- evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
- evalscope/benchmarks/humaneval/humaneval.py +0 -79
- evalscope/benchmarks/mmlu/mmlu.py +0 -160
- evalscope/benchmarks/mmlu/samples.jsonl +0 -5
- evalscope/benchmarks/process_bench/critique_template.txt +0 -13
- evalscope/benchmarks/race/race.py +0 -104
- evalscope/benchmarks/race/samples.jsonl +0 -5
- evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
- evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
- evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
- evalscope/benchmarks/utils.py +0 -60
- evalscope/collections/evaluator.py +0 -375
- evalscope/metrics/completion_parsers.py +0 -227
- evalscope/metrics/named_metrics.py +0 -55
- evalscope/models/adapters/__init__.py +0 -14
- evalscope/models/adapters/base_adapter.py +0 -84
- evalscope/models/adapters/bfcl_adapter.py +0 -246
- evalscope/models/adapters/chat_adapter.py +0 -207
- evalscope/models/adapters/choice_adapter.py +0 -222
- evalscope/models/adapters/custom_adapter.py +0 -71
- evalscope/models/adapters/server_adapter.py +0 -236
- evalscope/models/adapters/t2i_adapter.py +0 -79
- evalscope/models/adapters/tau_bench_adapter.py +0 -189
- evalscope/models/custom/__init__.py +0 -4
- evalscope/models/custom/custom_model.py +0 -50
- evalscope/models/custom/dummy_model.py +0 -99
- evalscope/models/local_model.py +0 -128
- evalscope/models/register.py +0 -41
- tests/cli/test_run.py +0 -489
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/LICENSE +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/WHEEL +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/top_level.txt +0 -0
evalscope/api/registry.py ADDED
```diff
@@ -0,0 +1,182 @@
+import copy
+from typing import TYPE_CHECKING, Callable, Dict, Optional, Type, Union
+
+if TYPE_CHECKING:
+    from evalscope.api.benchmark import BenchmarkMeta, DataAdapter
+    from evalscope.api.filter import Filter
+    from evalscope.api.metric import Aggregator, Metric
+    from evalscope.api.model.model import ModelAPI
+    from evalscope.config import TaskConfig
+
+# BEGIN: Registry for benchmarks
+# Registry for benchmarks, allowing dynamic registration and retrieval of benchmark metadata and data adapters.
+BENCHMARK_REGISTRY: Dict[str, 'BenchmarkMeta'] = {}
+
+
+def register_benchmark(metadata: 'BenchmarkMeta'):
+    """Register a benchmark with its metadata."""
+
+    def register_wrapper(data_adapter: Type['DataAdapter']):
+        if metadata.name in BENCHMARK_REGISTRY:
+            raise ValueError(f'Benchmark {metadata.name} already registered')
+        metadata.data_adapter = data_adapter
+        BENCHMARK_REGISTRY[metadata.name] = metadata
+        return data_adapter
+
+    return register_wrapper
+
+
+def get_benchmark(name: str, config: Optional['TaskConfig'] = None) -> 'DataAdapter':
+    """
+    Retrieve a registered benchmark by name.
+
+    Args:
+        name (str): The name of the benchmark.
+        config (Optional['TaskConfig']): The task configuration.
+    """
+    # copy to avoid modifying the original metadata
+    metadata = copy.deepcopy(BENCHMARK_REGISTRY.get(name))
+    if not metadata:
+        raise ValueError(f'Benchmark {name} not found, available benchmarks: {list(sorted(BENCHMARK_REGISTRY.keys()))}')
+
+    # Update metadata with dataset-specific configuration
+    if config is not None:
+        metadata._update(config.dataset_args.get(name, {}))
+    # Return the data adapter initialized with the benchmark metadata
+    data_adapter_cls = metadata.data_adapter
+    return data_adapter_cls(benchmark_meta=metadata, task_config=config)
+
+
+# END: Registry for benchmarks
+
+# BEGIN: Registry for model APIs
+# Registry for model APIs, allowing dynamic registration and retrieval of model API classes.
+MODEL_APIS: Dict[str, Type['ModelAPI']] = {}
+
+
+def register_model_api(name: str):
+    """
+    Decorator to register a model API class with a given name.
+
+    :param name: The name of the model API.
+    """
+
+    def decorator(api_class: Type['ModelAPI']):
+        if name in MODEL_APIS:
+            raise ValueError(f"Model API '{name}' is already registered.")
+        MODEL_APIS[name] = api_class
+        return api_class
+
+    return decorator
+
+
+def get_model_api(name: str) -> Type['ModelAPI']:
+    """
+    Retrieve a registered model API class by name.
+
+    :param name: The name of the model API.
+    :return: The model API class.
+    """
+    if name not in MODEL_APIS:
+        raise ValueError(f"Model API '{name}' is not registered. Available model APIs: {list(MODEL_APIS.keys())}")
+
+    wrapped = MODEL_APIS[name]
+    if not isinstance(wrapped, type):
+        return wrapped()
+    else:
+        return wrapped
+
+
+# END: Registry for model APIs
+
+# BEGIN: Registry for metrics
+METRIC_REGISTRY: Dict[str, Type['Metric']] = {}
+
+
+def register_metric(name: str):
+
+    def decorate(fn):
+        if name in METRIC_REGISTRY:
+            raise ValueError(f"Metric named '{name}' conflicts with existing registered metric!")
+
+        METRIC_REGISTRY[name] = fn
+        return fn
+
+    return decorate
+
+
+def get_metric(name: str) -> Type['Metric']:
+    if name in METRIC_REGISTRY:
+        return METRIC_REGISTRY[name]
+    else:
+        raise ValueError(
+            f"Metric '{name}' not found in the registry. Available metrics: {list(METRIC_REGISTRY.keys())}"
+        )
+
+
+# END: Registry for metrics
+
+# BEGIN: Registry for filters
+
+FILTER_REGISTRY: Dict[str, Type['Filter']] = {}
+
+
+def register_filter(name):
+
+    def decorate(cls):
+        if name in FILTER_REGISTRY:
+            raise ValueError(f'Registering filter `{name}` that is already in Registry {FILTER_REGISTRY}')
+        FILTER_REGISTRY[name] = cls
+        return cls
+
+    return decorate
+
+
+def get_filter(filter_name: str) -> Type['Filter']:
+    if filter_name not in FILTER_REGISTRY:
+        raise KeyError(
+            f"Filter '{filter_name}' not found in the registry. Available filters: {list(FILTER_REGISTRY.keys())}"
+        )
+    return FILTER_REGISTRY[filter_name]
+
+
+# END: Registry for filters
+
+# BEGIN: Registry for aggregation functions
+AGGREGATION_REGISTRY: Dict[str, Type['Aggregator']] = {}
+
+
+def register_aggregation(name: str):
+    """
+    Decorator to register an aggregation function with a given name.
+
+    :param name: The name of the aggregation function.
+    """
+
+    def decorator(aggregation_fn: 'Aggregator'):
+        if name in AGGREGATION_REGISTRY:
+            raise ValueError(f"Aggregation function '{name}' is already registered.")
+        AGGREGATION_REGISTRY[name] = aggregation_fn
+        return aggregation_fn
+
+    return decorator
+
+
+def get_aggregation(name: str) -> Type['Aggregator']:
+    """
+    Retrieve a registered aggregation function by name.
+
+    :param name: The name of the aggregation function.
+    :return: The aggregation function.
+    """
+    if name not in AGGREGATION_REGISTRY:
+        raise ValueError(
+            f"Aggregation function '{name}' is not registered. "
+            f'Available aggregations: {list(AGGREGATION_REGISTRY.keys())}'
+        )
+    return AGGREGATION_REGISTRY[name]
+
+
+# END: Registry for aggregation functions
```
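
All five registries follow the same pattern: a decorator stores an object under a unique name, and a getter either returns it or raises with the list of available names. A minimal usage sketch of the metric registry, derived only from the code above; the `exact_match` name and scoring function are illustrative, not part of the package:

```python
from evalscope.api.registry import METRIC_REGISTRY, get_metric, register_metric


@register_metric('exact_match')  # illustrative name; registering it twice raises ValueError
def exact_match(prediction: str, reference: str) -> float:
    """Return 1.0 when prediction and reference match exactly after stripping."""
    return float(prediction.strip() == reference.strip())


# Lookup goes through the same registry; unknown names raise ValueError
# with the list of available metrics.
metric_fn = get_metric('exact_match')
assert metric_fn('42', ' 42 ') == 1.0
assert 'exact_match' in METRIC_REGISTRY
```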
evalscope/api/tool/tool_call.py ADDED
```diff
@@ -0,0 +1,101 @@
+import json
+from pydantic import BaseModel, Field, JsonValue, field_validator
+from typing import Any, Callable, Dict, List, Literal, Optional, Union
+
+
+class ToolFunction(BaseModel):
+    """Indicate that a specific tool function should be called."""
+
+    name: str
+    """The name of the tool function to call."""
+
+    arguments: Dict[str, Any]
+    """The arguments of the tool function to call"""
+
+    @field_validator('arguments', mode='before')
+    @classmethod
+    def parse_arguments(cls, v):
+        if isinstance(v, str):
+            try:
+                v = json.loads(v)
+            except Exception as e:
+                raise ValueError(f'arguments field string is not valid JSON: {e}')
+        if not isinstance(v, dict):
+            raise ValueError('arguments must be a dict or a JSON string representing a dict')
+        return v
+
+
+class ToolCallContent(BaseModel):
+    """Content to include in tool call view."""
+
+    title: Optional[str] = Field(default=None)
+    """Optional (plain text) title for tool call content."""
+
+    format: Literal['text', 'markdown']
+    """Format (text or markdown)."""
+
+    content: str
+    """Text or markdown content."""
+
+
+class ToolCallView(BaseModel):
+    """Custom view of a tool call.
+
+    Both `context` and `call` are optional. If `call` is not specified
+    then the view will default to a syntax highlighted Python function call.
+    """
+
+    context: Optional[ToolCallContent] = Field(default=None)
+    """Context for the tool call (i.e. current tool state)."""
+
+    call: Optional[ToolCallContent] = Field(default=None)
+    """Custom representation of tool call."""
+
+
+class ToolCall(BaseModel):
+    id: str
+    """Unique identifier for tool call."""
+
+    function: ToolFunction
+    """Function to call."""
+
+    internal: Optional[JsonValue] = Field(default=None)
+    """Model provider specific payload - typically used to aid transformation back to model types."""
+
+    parse_error: Optional[str] = Field(default=None)
+    """Error which occurred parsing tool call."""
+
+    view: Optional[ToolCallContent] = Field(default=None)
+    """Custom view of tool call input."""
+
+    type: Optional[str] = Field(default=None)
+    """Tool call type (deprecated)."""
+
+
+class ToolCallError(BaseModel):
+    """Error raised by a tool call."""
+
+    type: Literal[
+        'parsing',
+        'timeout',
+        'unicode_decode',
+        'permission',
+        'file_not_found',
+        'is_a_directory',
+        'limit',
+        'approval',
+        'unknown',
+    ]
+    """Error type."""
+
+    message: str
+    """Error message."""
+
+
+ToolChoice = Union[Literal['auto', 'any', 'none'], ToolFunction]
+"""Specify which tool to call.
+
+"auto" means the model decides; "any" means use at least one tool,
+"none" means never call a tool; ToolFunction instructs the model
+to call a specific function.
+"""
```
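
The `arguments` validator means a provider integration can hand over the raw JSON string from a model's tool-call response and get a typed dict back. A short sketch of that behavior, assuming the import path mirrors the file location above:

```python
from evalscope.api.tool.tool_call import ToolCall, ToolFunction

# The mode='before' validator coerces a JSON string into a dict.
call = ToolCall(
    id='call_0',
    function=ToolFunction(name='get_weather', arguments='{"city": "Paris"}'),
)
assert call.function.arguments == {'city': 'Paris'}

# Non-JSON strings are rejected: pydantic raises a ValidationError,
# which subclasses ValueError.
try:
    ToolFunction(name='get_weather', arguments='not json')
except ValueError as err:
    print(f'rejected: {err}')
```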
evalscope/api/tool/tool_info.py ADDED
```diff
@@ -0,0 +1,173 @@
+import inspect
+from dataclasses import dataclass
+from docstring_parser import Docstring, parse
+from pydantic import BaseModel, Field
+from typing import Any, Callable, Dict, List, Literal, Optional, TypeAlias, Union, get_args, get_type_hints
+
+from evalscope.utils.json_schema import JSONSchema, JSONType, json_schema, python_type_to_json_type
+
+ToolParam: TypeAlias = JSONSchema
+"""Description of tool parameter in JSON Schema format."""
+
+
+class Tool:
+
+    def __call__(
+        self,
+        *args: Any,
+        **kwargs: Any,
+    ) -> Any:
+        ...
+
+
+class ToolParams(BaseModel):
+    """Description of tool parameters object in JSON Schema format."""
+
+    type: Literal['object'] = Field(default='object')
+    """Params type (always 'object')"""
+
+    properties: Dict[str, ToolParam] = Field(default_factory=dict)
+    """Tool function parameters."""
+
+    required: List[str] = Field(default_factory=list)
+    """List of required fields."""
+
+    additionalProperties: bool = Field(default=False)
+    """Are additional object properties allowed? (always `False`)"""
+
+
+@dataclass
+class ToolDescription:
+    name: Optional[str] = None
+    description: Optional[str] = None
+    parameters: Optional[ToolParams] = None
+
+
+def tool_description(tool: Tool) -> ToolDescription:
+    return getattr(tool, TOOL_DESCRIPTION, ToolDescription())
+
+
+def set_tool_description(tool: Tool, description: ToolDescription) -> None:
+    setattr(tool, TOOL_DESCRIPTION, description)
+
+
+TOOL_DESCRIPTION = '__TOOL_DESCRIPTION__'
+
+
+class ToolInfo(BaseModel):
+    """Specification of a tool (JSON Schema compatible)
+
+    If you are implementing a ModelAPI, most LLM libraries can
+    be passed this object (dumped to a dict) directly as a function
+    specification. For example, in the OpenAI provider:
+
+    ```python
+    ChatCompletionToolParam(
+        type="function",
+        function=tool.model_dump(exclude_none=True),
+    )
+    ```
+
+    In some cases the field names don't match up exactly. In that case
+    call `model_dump()` on the `parameters` field. For example, in the
+    Anthropic provider:
+
+    ```python
+    ToolParam(
+        name=tool.name,
+        description=tool.description,
+        input_schema=tool.parameters.model_dump(exclude_none=True),
+    )
+    ```
+    """
+
+    name: str
+    """Name of tool."""
+    description: str
+    """Short description of tool."""
+    parameters: ToolParams = Field(default_factory=ToolParams)
+    """JSON Schema of tool parameters object."""
+    options: Optional[Dict[str, object]] = Field(default=None)
+    """Optional property bag that can be used by the model provider to customize the implementation of the tool"""
+
+
+def parse_tool_info(func: Callable[..., Any]) -> ToolInfo:
+    # tool may already have registry attributes w/ tool info
+    description = tool_description(func)
+    if (description.name and description.description and description.parameters is not None):
+        return ToolInfo(
+            name=description.name,
+            description=description.description,
+            parameters=description.parameters,
+        )
+
+    signature = inspect.signature(func)
+    type_hints = get_type_hints(func)
+    docstring = inspect.getdoc(func)
+    parsed_docstring: Optional[Docstring] = parse(docstring) if docstring else None
+
+    info = ToolInfo(name=func.__name__, description='')
+
+    for param_name, param in signature.parameters.items():
+        tool_param = ToolParam()
+
+        # Parse docstring
+        docstring_info = parse_docstring(docstring, param_name)
+
+        # Get type information from type annotations
+        if param_name in type_hints:
+            tool_param = json_schema(type_hints[param_name])
+        # as a fallback try to parse it from the docstring
+        # (this is minimally necessary for backwards compatibility
+        # with tools gen1 type parsing, which only used docstrings)
+        elif 'docstring_type' in docstring_info:
+            json_type = python_type_to_json_type(docstring_info['docstring_type'])
+            if json_type and (json_type in get_args(JSONType)):
+                tool_param = ToolParam(type=json_type)
+
+        # Get default value
+        if param.default is param.empty:
+            info.parameters.required.append(param_name)
+        else:
+            tool_param.default = param.default
+
+        # Add description from docstring
+        if 'description' in docstring_info:
+            tool_param.description = docstring_info['description']
+
+        # append the tool param
+        info.parameters.properties[param_name] = tool_param
+
+    # Add function description if available
+    if parsed_docstring:
+        if parsed_docstring.description:
+            info.description = parsed_docstring.description.strip()
+        elif parsed_docstring.long_description:
+            info.description = parsed_docstring.long_description.strip()
+        elif parsed_docstring.short_description:
+            info.description = parsed_docstring.short_description.strip()
+
+        # Add examples if available
+        if parsed_docstring.examples:
+            examples = '\n\n'.join([(example.description or '') for example in parsed_docstring.examples])
+            info.description = f'{info.description}\n\nExamples\n\n{examples}'
+
+    return info
+
+
+def parse_docstring(docstring: Optional[str], param_name: str) -> Dict[str, str]:
+    if not docstring:
+        return {}
+
+    parsed_docstring: Docstring = parse(docstring)
+
+    for param in parsed_docstring.params:
+        if param.arg_name == param_name:
+            schema: Dict[str, str] = {'description': param.description or ''}
+
+            if param.type_name:
+                schema['docstring_type'] = param.type_name
+
+            return schema
+
+    return {}
```
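
`parse_tool_info` turns an ordinary annotated Python function into a JSON Schema tool spec, preferring type hints and falling back to docstring types. A sketch, again assuming the import path mirrors the file location; the `add` function is illustrative:

```python
from evalscope.api.tool.tool_info import parse_tool_info


def add(a: int, b: int = 0) -> int:
    """Add two integers.

    Args:
        a: First addend.
        b: Second addend (defaults to 0).
    """
    return a + b


info = parse_tool_info(add)
assert info.name == 'add'
assert info.parameters.required == ['a']       # `b` has a default, so it is optional
assert set(info.parameters.properties) == {'a', 'b'}
# Per the ToolInfo docstring, this dump is directly usable as an
# OpenAI-style function specification.
print(info.model_dump(exclude_none=True))
```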
evalscope/api/tool/utils.py ADDED
```diff
@@ -0,0 +1,64 @@
+import json
+import yaml
+from typing import Any, Dict, List, Optional
+
+from evalscope.utils import get_logger
+from .tool_call import ToolCall, ToolFunction
+from .tool_info import ToolInfo
+
+logger = get_logger()
+
+
+def parse_tool_call(id: str, function: str, arguments: str, tools: Optional[List[ToolInfo]] = None) -> ToolCall:
+    """Parse a tool call from a JSON payload.
+
+    Note that this function doesn't know about internal tool names, so the caller
+    should amend the returned `ToolCall` by mapping the parsed `function` field
+    from an internal name to an inspect tool name and fixing up the `ToolCall`
+    object as required to reflect this change.
+    """
+    error: Optional[str] = None
+    arguments_dict: Dict[str, Any] = {}
+
+    def report_parse_error(ex: Exception) -> None:
+        nonlocal error
+        error = tool_parse_error_message(arguments, ex)
+        logger.info(error)
+
+    # if the arguments is a dict, then handle it with a plain json.loads
+    arguments = arguments.strip()
+    if arguments.startswith('{'):
+        try:
+            arguments_dict = json.loads(arguments)
+        except json.JSONDecodeError as ex:
+            report_parse_error(ex)
+
+    # otherwise parse it as yaml (which will pick up unquoted strings, numbers, and true/false)
+    # and then create a dict that maps it to the first function argument
+    elif function and tools:
+        tool_info = next(
+            (tool for tool in tools if tool.name == function and len(tool.parameters.properties) > 0),
+            None,
+        )
+        if tool_info:
+            param_names = list(tool_info.parameters.properties.keys())
+            try:
+                value = yaml.safe_load(arguments)
+                arguments_dict[param_names[0]] = value
+            except yaml.error.YAMLError:
+                # If the yaml parser fails, we treat it as a string argument.
+                arguments_dict[param_names[0]] = arguments
+
+    # return ToolCall with error payload
+    return ToolCall(
+        id=id,
+        function=ToolFunction(
+            name=function,
+            arguments=arguments_dict,
+        ),
+        parse_error=error,
+    )
+
+
+def tool_parse_error_message(arguments: str, ex: Exception) -> str:
+    return f'Error parsing the following tool call arguments:\n\n{arguments}\n\nError details: {ex}'
```
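
`parse_tool_call` has two paths: payloads that look like JSON objects go through `json.loads`, while bare scalars are YAML-parsed and mapped onto the first declared parameter of the matching tool. A sketch with an illustrative `search` tool, assuming import paths mirror the file layout:

```python
from evalscope.api.tool.tool_info import ToolInfo, ToolParam, ToolParams
from evalscope.api.tool.utils import parse_tool_call

# Path 1: a JSON object payload is parsed directly.
call = parse_tool_call(id='1', function='search', arguments='{"query": "rust"}')
assert call.function.arguments == {'query': 'rust'}
assert call.parse_error is None

# Path 2: a bare payload is YAML-parsed and assigned to the first parameter
# of the matching tool (only taken when `tools` metadata is supplied).
tools = [
    ToolInfo(
        name='search',
        description='Illustrative search tool',
        parameters=ToolParams(properties={'query': ToolParam()}),
    )
]
call = parse_tool_call(id='2', function='search', arguments='rust language', tools=tools)
assert call.function.arguments == {'query': 'rust language'}
```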
evalscope/app/ui/app_ui.py CHANGED
```diff
@@ -32,7 +32,8 @@ def create_app_ui(args: argparse.Namespace):
 
     @sidebar.load_btn.click(
         inputs=[sidebar.reports_dropdown],
-        outputs=[visualization.single_model.report_name, visualization.multi_model.multi_report_name])
+        outputs=[visualization.single_model.report_name, visualization.multi_model.multi_report_name]
+    )
     def update_displays(reports_dropdown):
         if not reports_dropdown:
             gr.Warning(locale_dict['note'], duration=3)
```