evalscope 0.17.1__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/__init__.py +4 -1
- evalscope/api/__init__.py +0 -0
- evalscope/api/benchmark/__init__.py +3 -0
- evalscope/api/benchmark/adapters/__init__.py +3 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +683 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +155 -0
- evalscope/api/benchmark/benchmark.py +321 -0
- evalscope/api/benchmark/meta.py +115 -0
- evalscope/api/dataset/__init__.py +2 -0
- evalscope/api/dataset/dataset.py +349 -0
- evalscope/api/dataset/loader.py +261 -0
- evalscope/api/dataset/utils.py +143 -0
- evalscope/api/evaluator/__init__.py +3 -0
- evalscope/api/evaluator/cache.py +355 -0
- evalscope/api/evaluator/evaluator.py +56 -0
- evalscope/api/evaluator/state.py +264 -0
- evalscope/api/filter/__init__.py +1 -0
- evalscope/api/filter/filter.py +72 -0
- evalscope/api/messages/__init__.py +11 -0
- evalscope/api/messages/chat_message.py +198 -0
- evalscope/api/messages/content.py +102 -0
- evalscope/api/messages/utils.py +35 -0
- evalscope/api/metric/__init__.py +2 -0
- evalscope/api/metric/metric.py +55 -0
- evalscope/api/metric/scorer.py +105 -0
- evalscope/api/mixin/__init__.py +2 -0
- evalscope/api/mixin/dataset_mixin.py +105 -0
- evalscope/api/mixin/llm_judge_mixin.py +168 -0
- evalscope/api/model/__init__.py +12 -0
- evalscope/api/model/generate_config.py +157 -0
- evalscope/api/model/model.py +383 -0
- evalscope/api/model/model_output.py +285 -0
- evalscope/api/registry.py +182 -0
- evalscope/api/tool/__init__.py +3 -0
- evalscope/api/tool/tool_call.py +101 -0
- evalscope/api/tool/tool_info.py +173 -0
- evalscope/api/tool/utils.py +64 -0
- evalscope/app/ui/app_ui.py +2 -1
- evalscope/app/ui/multi_model.py +50 -25
- evalscope/app/ui/single_model.py +23 -11
- evalscope/app/utils/data_utils.py +42 -26
- evalscope/app/utils/text_utils.py +0 -2
- evalscope/app/utils/visualization.py +9 -4
- evalscope/arguments.py +6 -7
- evalscope/backend/opencompass/api_meta_template.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +6 -3
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
- evalscope/backend/rag_eval/ragas/task_template.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
- evalscope/backend/rag_eval/utils/embedding.py +2 -1
- evalscope/backend/rag_eval/utils/llm.py +13 -12
- evalscope/benchmarks/__init__.py +0 -2
- evalscope/benchmarks/aigc/i2i/__init__.py +0 -0
- evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +44 -0
- evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +53 -55
- evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +41 -46
- evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +29 -45
- evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +34 -44
- evalscope/benchmarks/aigc/t2i/tifa_adapter.py +16 -27
- evalscope/benchmarks/aime/aime24_adapter.py +38 -40
- evalscope/benchmarks/aime/aime25_adapter.py +34 -40
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
- evalscope/benchmarks/arc/arc_adapter.py +34 -147
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
- evalscope/benchmarks/arena_hard/utils.py +37 -1
- evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
- evalscope/benchmarks/bfcl/bfcl_adapter.py +181 -160
- evalscope/benchmarks/bfcl/generation.py +222 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +94 -162
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
- evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
- evalscope/benchmarks/data_collection/data_collection_adapter.py +183 -45
- evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
- evalscope/benchmarks/docmath/utils.py +4 -5
- evalscope/benchmarks/drop/drop_adapter.py +88 -40
- evalscope/benchmarks/frames/frames_adapter.py +135 -52
- evalscope/benchmarks/general_arena/general_arena_adapter.py +136 -98
- evalscope/benchmarks/general_arena/utils.py +23 -27
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
- evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
- evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
- evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
- evalscope/benchmarks/hle/hle_adapter.py +127 -93
- evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
- evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
- evalscope/benchmarks/ifeval/instructions.py +109 -64
- evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
- evalscope/benchmarks/ifeval/utils.py +6 -7
- evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
- evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
- evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
- evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
- evalscope/benchmarks/musr/musr_adapter.py +33 -64
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +192 -152
- evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
- evalscope/benchmarks/race/race_adapter.py +33 -119
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
- evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
- evalscope/benchmarks/super_gpqa/utils.py +2 -1
- evalscope/benchmarks/tau_bench/generation.py +147 -0
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +112 -54
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -265
- evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_server.py +6 -3
- evalscope/collections/__init__.py +2 -10
- evalscope/collections/sampler.py +10 -10
- evalscope/collections/schema.py +13 -11
- evalscope/config.py +95 -54
- evalscope/constants.py +29 -61
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +277 -423
- evalscope/filters/__init__.py +2 -0
- evalscope/filters/extraction.py +126 -0
- evalscope/filters/selection.py +57 -0
- evalscope/metrics/__init__.py +13 -13
- evalscope/metrics/llm_judge.py +32 -30
- evalscope/metrics/math_parser.py +27 -22
- evalscope/metrics/metric.py +307 -0
- evalscope/metrics/metrics.py +22 -18
- evalscope/metrics/t2v_metrics/__init__.py +0 -52
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
- evalscope/models/__init__.py +6 -29
- evalscope/models/mockllm.py +65 -0
- evalscope/models/model_apis.py +47 -0
- evalscope/models/modelscope.py +455 -0
- evalscope/models/openai_compatible.py +123 -0
- evalscope/models/text2image_model.py +124 -0
- evalscope/models/utils/openai.py +698 -0
- evalscope/perf/benchmark.py +2 -1
- evalscope/perf/http_client.py +4 -2
- evalscope/perf/plugin/api/custom_api.py +5 -4
- evalscope/perf/plugin/api/openai_api.py +11 -9
- evalscope/perf/plugin/datasets/custom.py +2 -1
- evalscope/perf/plugin/datasets/flickr8k.py +1 -1
- evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
- evalscope/perf/plugin/datasets/line_by_line.py +2 -1
- evalscope/perf/plugin/datasets/longalpaca.py +2 -1
- evalscope/perf/plugin/datasets/openqa.py +4 -2
- evalscope/perf/utils/benchmark_util.py +7 -5
- evalscope/perf/utils/db_util.py +9 -6
- evalscope/perf/utils/local_server.py +8 -3
- evalscope/perf/utils/rich_display.py +16 -10
- evalscope/report/__init__.py +2 -2
- evalscope/report/combinator.py +18 -12
- evalscope/report/generator.py +101 -6
- evalscope/report/{utils.py → report.py} +8 -6
- evalscope/run.py +26 -44
- evalscope/summarizer.py +1 -1
- evalscope/utils/__init__.py +21 -2
- evalscope/utils/chat_service.py +2 -1
- evalscope/utils/deprecation_utils.py +12 -1
- evalscope/utils/function_utils.py +29 -0
- evalscope/utils/io_utils.py +100 -5
- evalscope/utils/json_schema.py +208 -0
- evalscope/utils/logger.py +51 -12
- evalscope/utils/model_utils.py +10 -7
- evalscope/utils/multi_choices.py +271 -0
- evalscope/utils/url_utils.py +65 -0
- evalscope/version.py +2 -2
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/METADATA +98 -49
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/RECORD +234 -216
- tests/aigc/test_t2i.py +22 -4
- tests/benchmark/__init__.py +1 -0
- tests/benchmark/test_eval.py +386 -0
- tests/cli/test_all.py +3 -5
- tests/cli/test_collection.py +13 -4
- tests/cli/test_custom.py +22 -15
- tests/rag/test_clip_benchmark.py +1 -0
- evalscope/benchmarks/aigc/t2i/base.py +0 -56
- evalscope/benchmarks/arc/ai2_arc.py +0 -151
- evalscope/benchmarks/benchmark.py +0 -81
- evalscope/benchmarks/ceval/ceval_exam.py +0 -146
- evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
- evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
- evalscope/benchmarks/competition_math/competition_math.py +0 -79
- evalscope/benchmarks/data_adapter.py +0 -528
- evalscope/benchmarks/filters.py +0 -59
- evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
- evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
- evalscope/benchmarks/humaneval/humaneval.py +0 -79
- evalscope/benchmarks/mmlu/mmlu.py +0 -160
- evalscope/benchmarks/mmlu/samples.jsonl +0 -5
- evalscope/benchmarks/process_bench/critique_template.txt +0 -13
- evalscope/benchmarks/race/race.py +0 -104
- evalscope/benchmarks/race/samples.jsonl +0 -5
- evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
- evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
- evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
- evalscope/benchmarks/utils.py +0 -60
- evalscope/collections/evaluator.py +0 -375
- evalscope/metrics/completion_parsers.py +0 -227
- evalscope/metrics/named_metrics.py +0 -55
- evalscope/models/adapters/__init__.py +0 -14
- evalscope/models/adapters/base_adapter.py +0 -84
- evalscope/models/adapters/bfcl_adapter.py +0 -246
- evalscope/models/adapters/chat_adapter.py +0 -207
- evalscope/models/adapters/choice_adapter.py +0 -222
- evalscope/models/adapters/custom_adapter.py +0 -71
- evalscope/models/adapters/server_adapter.py +0 -236
- evalscope/models/adapters/t2i_adapter.py +0 -79
- evalscope/models/adapters/tau_bench_adapter.py +0 -189
- evalscope/models/custom/__init__.py +0 -4
- evalscope/models/custom/custom_model.py +0 -50
- evalscope/models/custom/dummy_model.py +0 -99
- evalscope/models/local_model.py +0 -128
- evalscope/models/register.py +0 -41
- tests/cli/test_run.py +0 -489
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/LICENSE +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/WHEEL +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/top_level.txt +0 -0
evalscope/perf/benchmark.py
CHANGED
@@ -189,7 +189,8 @@ async def benchmark(args: Arguments) -> Tuple[Dict, Dict]:
     await connect_test(args, api_plugin)
     # start statistic benchmark metric
     statistic_benchmark_metric_task = asyncio.create_task(
-        statistic_benchmark_metric(benchmark_data_queue, args, api_plugin))
+        statistic_benchmark_metric(benchmark_data_queue, args, api_plugin)
+    )
     # start send request
     semaphore = asyncio.Semaphore(args.parallel)
     send_request_tasks: List[asyncio.Task] = []
evalscope/perf/http_client.py
CHANGED
@@ -26,7 +26,8 @@ class AioHttpClient:
         self.api_plugin = api_plugin
         self.client = aiohttp.ClientSession(
             timeout=aiohttp.ClientTimeout(connect=self.connect_timeout, sock_read=self.read_timeout),
-            trace_configs=[self._create_trace_config()] if args.debug else [])
+            trace_configs=[self._create_trace_config()] if args.debug else []
+        )

     async def __aenter__(self):
         pass
@@ -105,7 +106,8 @@ async def test_connection(args: Arguments, api_plugin: 'ApiPluginBase') -> bool:
     while True:
         try:
             is_error, state_code, response_data = await asyncio.wait_for(
-                attempt_connection(), timeout=args.connect_timeout)
+                attempt_connection(), timeout=args.connect_timeout
+            )
             if not is_error:
                 logger.info('Test connection successful.')
                 return True
evalscope/perf/plugin/api/custom_api.py
CHANGED
@@ -153,7 +153,8 @@ class CustomPlugin(ApiPluginBase):

                 # If no usage information and no tokenizer, raise an error
                 raise ValueError(
-                    'Cannot determine token counts: no usage information in response and no tokenizer provided.')
+                    'Cannot determine token counts: no usage information in response and no tokenizer provided.'
+                )

         except Exception as e:
             logger.error(f'Error parsing responses: {e}')
@@ -186,8 +187,7 @@ class CustomPlugin(ApiPluginBase):
            data = json.dumps(body, ensure_ascii=False)

            # Send the request
-           async with client_session.request(
-               'POST', url=url, data=data, headers=headers) as response:  # noqa: E125
+           async with client_session.request('POST', url=url, data=data, headers=headers) as response:  # noqa: E125
                # Get the status code
                status_code = response.status

@@ -244,6 +244,7 @@ if __name__ == '__main__':
        api='custom',  # Use the custom API plugin registered above
        dataset='openqa',
        number=1,
-       max_tokens=10)
+       max_tokens=10
+   )

    run_perf_benchmark(args)
evalscope/perf/plugin/api/openai_api.py
CHANGED
@@ -159,13 +159,15 @@ class OpenaiPlugin(DefaultApiPlugin):
            input_tokens += len(self.tokenizer.encode(request['messages'][0]['content']))
            output_tokens += len(self.tokenizer.encode(full_response_content))
        else:
-           raise ValueError(
-
-
-
-
-
-
-
-
+           raise ValueError(
+               'Error: Unable to retrieve usage information\n\n'
+               'This error occurs when:\n'
+               '1. The API response does not contain usage data, AND\n'
+               '2. No tokenizer has been specified or found.\n\n'
+               'To resolve this issue, do ONE of the following:\n'
+               "a) Ensure that the API you're using supports and returns usage information, OR\n"
+               'b) Specify a tokenizer using the `--tokenizer-path` parameter.\n\n'
+               'If you continue to experience issues, '
+               'please open an issue on our GitHub repository https://github.com/modelscope/evalscope .'
+           )
        return input_tokens, output_tokens
evalscope/perf/plugin/datasets/custom.py
CHANGED
@@ -17,7 +17,8 @@ class CustomDatasetPlugin(DatasetPluginBase):
        for item in self.dataset_line_by_line(self.query_parameters.dataset_path):
            prompt = item.strip()
            if len(prompt) > self.query_parameters.min_prompt_length and len(
-                   prompt) < self.query_parameters.max_prompt_length:
+                prompt
+            ) < self.query_parameters.max_prompt_length:
                if self.query_parameters.apply_chat_template:
                    message = self.create_message(prompt)
                    yield [message]
evalscope/perf/plugin/datasets/flickr8k.py
CHANGED
@@ -24,5 +24,5 @@ class FlickrDatasetPlugin(DatasetPluginBase):
            text = item['txt']
            base64_image = PIL_to_base64(pil_image)

-           message = self.create_message(text=text,
+           message = self.create_message(text=text, image_urls=f'data:image/jpeg;base64,{base64_image}')
            yield [message]
evalscope/perf/plugin/datasets/kontext_bench.py
CHANGED
@@ -24,5 +24,5 @@ class KontextDatasetPlugin(DatasetPluginBase):
            text = item['instruction']
            base64_image = PIL_to_base64(pil_image)

-           message = self.create_message(text=text,
+           message = self.create_message(text=text, image_urls=f'data:image/jpeg;base64,{base64_image}')
            yield [message]
evalscope/perf/plugin/datasets/line_by_line.py
CHANGED
@@ -18,7 +18,8 @@ class LineByLineDatasetPlugin(DatasetPluginBase):
        for item in self.dataset_line_by_line(self.query_parameters.dataset_path):
            prompt = item.strip()
            if len(prompt) > self.query_parameters.min_prompt_length and len(
-                   prompt) < self.query_parameters.max_prompt_length:
+                prompt
+            ) < self.query_parameters.max_prompt_length:
                if self.query_parameters.apply_chat_template:
                    message = self.create_message(prompt)
                    yield [message]
evalscope/perf/plugin/datasets/longalpaca.py
CHANGED
@@ -23,7 +23,8 @@ class LongAlpacaDatasetPlugin(DatasetPluginBase):
        for item in ds:
            prompt = item['instruction'].strip()
            if len(prompt) > self.query_parameters.min_prompt_length and len(
-                   prompt) < self.query_parameters.max_prompt_length:
+                prompt
+            ) < self.query_parameters.max_prompt_length:
                if self.query_parameters.apply_chat_template:
                    message = self.create_message(prompt)
                    yield [message]
evalscope/perf/plugin/datasets/openqa.py
CHANGED
@@ -27,8 +27,10 @@ class OpenqaDatasetPlugin(DatasetPluginBase):
        for item in self.dataset_line_by_line(self.query_parameters.dataset_path):
            item = json.loads(item)
            prompt = item['question'].strip()
-           if (
-                   len(prompt) > self.query_parameters.min_prompt_length and len(prompt) < self.query_parameters.max_prompt_length):
+           if (
+               len(prompt) > self.query_parameters.min_prompt_length
+               and len(prompt) < self.query_parameters.max_prompt_length
+           ):
                if self.query_parameters.apply_chat_template:
                    message = self.create_message(prompt)
                    yield [message]
evalscope/perf/utils/benchmark_util.py
CHANGED
@@ -33,8 +33,8 @@ class BenchmarkData:
        if len(self.chunk_times) > 1:
            self.first_chunk_latency = self.chunk_times[0] - self.start_time
            # remove the first chunk time from the total latency
-           self.time_per_output_token = (self.query_latency - self.first_chunk_latency
-                                         ) / (self.completion_tokens - 1) if self.completion_tokens > 1 else 0.0
+           self.time_per_output_token = (self.query_latency - self.first_chunk_latency
+                                         ) / (self.completion_tokens - 1) if self.completion_tokens > 1 else 0.0
            self.inter_chunk_latency = [t2 - t1 for t1, t2 in zip(self.chunk_times[:-1], self.chunk_times[1:])]
        else:
            self.first_chunk_latency = self.query_latency
@@ -126,11 +126,13 @@ class BenchmarkMetrics:
            self.avg_completion_tokens = self.n_total_completion_tokens / self.n_succeed_queries
            self.avg_input_token_per_seconds = self.n_total_prompt_tokens / self.total_first_chunk_latency
            self.avg_output_token_per_seconds = self.n_total_completion_tokens / self.total_time
-           self.avg_total_token_per_seconds = (
-               self.n_total_prompt_tokens + self.n_total_completion_tokens) / self.total_time
+           self.avg_total_token_per_seconds = (
+               self.n_total_prompt_tokens + self.n_total_completion_tokens
+           ) / self.total_time
            self.avg_time_per_token = self.n_time_per_output_token / self.n_succeed_queries
            self.avg_inter_token_latency = sum(self.n_total_inter_token_latency) / len(
-               self.n_total_inter_token_latency) if self.n_total_inter_token_latency else 0.0
+               self.n_total_inter_token_latency
+           ) if self.n_total_inter_token_latency else 0.0
            self.qps = self.n_succeed_queries / self.total_time
        except ZeroDivisionError as e:
            logger.exception(e)
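The expanded OpenaiPlugin error above points at the `--tokenizer-path` option. As a hedged sketch only, mirroring the `Arguments(...)` example in the custom_api.py hunk: the import paths, the `tokenizer_path` field name (inferred from that flag), and the model/URL values are assumptions, not taken from this diff.

# Hedged sketch: supply a local tokenizer so token counts can be computed
# when the backend response carries no `usage` block.
from evalscope.perf.arguments import Arguments        # assumed import path
from evalscope.perf.main import run_perf_benchmark    # assumed import path

args = Arguments(
    api='openai',
    url='http://127.0.0.1:8000/v1/chat/completions',  # placeholder endpoint
    model='my-model',                                 # placeholder model name
    dataset='openqa',
    number=1,
    max_tokens=10,
    tokenizer_path='/path/to/local/tokenizer',        # mirrors --tokenizer-path
)
run_perf_benchmark(args)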
evalscope/perf/utils/db_util.py
CHANGED
@@ -56,7 +56,8 @@ def transpose_results(data):


 def create_result_table(cursor):
-    cursor.execute(f'''CREATE TABLE IF NOT EXISTS result(
+    cursor.execute(
+        f'''CREATE TABLE IF NOT EXISTS result(
             {DatabaseColumns.REQUEST} TEXT,
             {DatabaseColumns.START_TIME} REAL,
             {DatabaseColumns.CHUNK_TIMES} TEXT,
@@ -69,7 +70,8 @@ def create_result_table(cursor):
             {DatabaseColumns.COMPLETION_TOKENS} INTEGER,
             {DatabaseColumns.MAX_GPU_MEMORY_COST} REAL,
             {DatabaseColumns.TIME_PER_OUTPUT_TOKEN} REAL
-            )''')
+            )'''
+    )


 def insert_benchmark_data(cursor: sqlite3.Cursor, benchmark_data: BenchmarkData):
@@ -89,9 +91,10 @@ def insert_benchmark_data(cursor: sqlite3.Cursor, benchmark_data: BenchmarkData):

    if benchmark_data.success:
        # Add additional columns for success case
-       additional_columns = (
-           benchmark_data.query_latency, benchmark_data.first_chunk_latency, benchmark_data.prompt_tokens,
-           benchmark_data.completion_tokens, benchmark_data.max_gpu_memory_cost, benchmark_data.time_per_output_token)
+       additional_columns = (
+           benchmark_data.query_latency, benchmark_data.first_chunk_latency, benchmark_data.prompt_tokens,
+           benchmark_data.completion_tokens, benchmark_data.max_gpu_memory_cost, benchmark_data.time_per_output_token
+       )
        query = f"""INSERT INTO result(
            {DatabaseColumns.REQUEST}, {DatabaseColumns.START_TIME}, {DatabaseColumns.CHUNK_TIMES},
            {DatabaseColumns.SUCCESS}, {DatabaseColumns.RESPONSE_MESSAGES}, {DatabaseColumns.COMPLETED_TIME},
@@ -124,7 +127,7 @@ def get_result_db_path(args: Arguments):

    logger.info(f'Save the data base to: {result_db_path}')
    if os.path.exists(result_db_path):
-       logger.
+       logger.error(f'The db file {result_db_path} exists, delete it and start again!.')
        sys.exit(1)

    return result_db_path
evalscope/perf/utils/local_server.py
CHANGED
@@ -1,6 +1,5 @@
 import os
 import subprocess
-import torch
 import uvicorn
 from contextlib import asynccontextmanager
 from dataclasses import dataclass
@@ -61,8 +60,12 @@ class ServerSentEvent(object):
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     yield
-    if torch.cuda.is_available():
-        torch.cuda.empty_cache()
+    try:
+        import torch
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+    except ImportError:
+        pass


 def create_app(model, attn_implementation=None) -> FastAPI:
@@ -102,6 +105,8 @@ def start_app(args: Arguments):
        uvicorn.run(app, host='0.0.0.0', port=args.port, workers=1)

    elif args.api == 'local_vllm':
+       import torch
+
        os.environ['VLLM_USE_MODELSCOPE'] = 'True'
        os.environ['VLLM_ALLOW_LONG_MAX_MODEL_LEN'] = '1'
        os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'
evalscope/perf/utils/rich_display.py
CHANGED
@@ -32,8 +32,9 @@ def analyze_results(all_results):
            avg_tps = total_metrics.get(Metrics.OUTPUT_TOKEN_THROUGHPUT, 0)
            avg_ttft = total_metrics.get(Metrics.AVERAGE_TIME_TO_FIRST_TOKEN, 0)
            p99_ttft = percentile_metrics.get(PercentileMetrics.TTFT)[percentiles.index('99%')]
-           success_rate = (
-               total_metrics.get(Metrics.SUCCEED_REQUESTS, 0) / total_metrics.get(Metrics.TOTAL_REQUESTS, 1)) * 100
+           success_rate = (
+               total_metrics.get(Metrics.SUCCEED_REQUESTS, 0) / total_metrics.get(Metrics.TOTAL_REQUESTS, 1)
+           ) * 100
            avg_tpot = total_metrics.get(Metrics.AVERAGE_TIME_PER_OUTPUT_TOKEN, 0)
            p99_tpot = percentile_metrics.get(PercentileMetrics.TPOT)[percentiles.index('99%')]

@@ -55,12 +56,13 @@ def analyze_results(all_results):
                f'{p99_tpot:.3f}' if p99_tpot is not None else 'N/A',
            ])

-           total_tokens += total_metrics.get(Metrics.AVERAGE_OUTPUT_TOKENS_PER_REQUEST,
-                                             0) * total_metrics.get(Metrics.SUCCEED_REQUESTS, 0)
+           total_tokens += total_metrics.get(Metrics.AVERAGE_OUTPUT_TOKENS_PER_REQUEST,
+                                             0) * total_metrics.get(Metrics.SUCCEED_REQUESTS, 0)
            total_time += total_metrics.get(Metrics.TIME_TAKEN_FOR_TESTS, 0)
        except Exception as e:
            logger.warning(
-               f"Warning: Error processing results for concurrency {result.get('concurrency', 'unknown')}: {str(e)}")
+               f"Warning: Error processing results for concurrency {result.get('concurrency', 'unknown')}: {str(e)}"
+           )
            continue

    if not summary:
@@ -138,7 +140,8 @@ def print_summary(all_results, model_name):
                f'{float(row[8]):.3f}',  # Average TPOT
                f'{float(row[9]):.3f}',  # P99 TPOT
                row[6],  # Success Rate
-               style=row_style)
+               style=row_style
+           )
        except ValueError as e:
            console.print(f'Warning: Error processing row data: {str(e)}', style='bold red')
            continue
@@ -156,8 +159,9 @@ def print_summary(all_results, model_name):
    perf_info.add_column('Value', style='green', width=40)

    perf_info.add_row('Highest RPS', f'Concurrency {summary[best_rps_idx][0]} ({summary[best_rps_idx][1]} req/sec)')
-   perf_info.add_row(
-       'Lowest Latency', f'Concurrency {summary[best_latency_idx][0]} ({summary[best_latency_idx][2]} seconds)')
+   perf_info.add_row(
+       'Lowest Latency', f'Concurrency {summary[best_latency_idx][0]} ({summary[best_latency_idx][2]} seconds)'
+   )

    console.print('\n')
    console.print(perf_info)
@@ -166,7 +170,8 @@ def print_summary(all_results, model_name):
    recommendations = []
    if best_rps_idx == len(summary) - 1:
        recommendations.append(
-           'The system seems not to have reached its performance bottleneck, try higher concurrency')
+           'The system seems not to have reached its performance bottleneck, try higher concurrency'
+       )
    elif best_rps_idx == 0:
        recommendations.append('Consider lowering concurrency, current load may be too high')
    else:
@@ -175,7 +180,8 @@ def print_summary(all_results, model_name):
        success_rate = float(summary[-1][6][:-1])
        if success_rate < 95:
            recommendations.append(
-               'Success rate is low at high concurrency, check system resources or reduce concurrency')
+               'Success rate is low at high concurrency, check system resources or reduce concurrency'
+           )

    recommend_text = Text('\nPerformance Recommendations:', style='bold cyan')
    console.print(recommend_text)
evalscope/report/__init__.py
CHANGED
@@ -6,7 +6,7 @@ from evalscope.utils.import_utils import _LazyModule
 if TYPE_CHECKING:
     from .combinator import gen_table, get_data_frame, get_report_list
     from .generator import ReportGenerator
-    from .utils import Category, Report, ReportKey, Subset
+    from .report import Category, Report, ReportKey, Subset

 else:
     _import_structure = {
@@ -19,7 +19,7 @@ else:
         'generator': [
             'ReportGenerator',
         ],
-        'utils': [
+        'report': [
            'Category',
            'Report',
            'ReportKey',
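The two hunks above track the `evalscope/report/{utils.py → report.py}` rename from the file list. A minimal import sketch against 1.0.0, assuming the lazy `_import_structure` keeps re-exporting these names at package level:

# Package-level access still resolves through the lazy module table shown above.
from evalscope.report import Report, ReportKey

# Direct imports now target the renamed module (formerly evalscope.report.utils).
from evalscope.report.report import Category, Subset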
evalscope/report/combinator.py
CHANGED
@@ -6,7 +6,7 @@ import pandas as pd
 from tabulate import tabulate
 from typing import List, Tuple

-from evalscope.report.utils import Report
+from evalscope.report.report import Report
 from evalscope.utils.logger import get_logger

 logger = get_logger()
@@ -32,25 +32,30 @@ def get_report_list(reports_path_list: List[str]) -> List[Report]:
     return report_list


-def get_data_frame(report_list: List[Report],
-                   flatten_metrics: bool = True,
-                   flatten_categories: bool = True,
-                   add_overall_metric: bool = False) -> pd.DataFrame:
+def get_data_frame(
+    report_list: List[Report],
+    flatten_metrics: bool = True,
+    flatten_categories: bool = True,
+    add_overall_metric: bool = False
+) -> pd.DataFrame:
     tables = []
     for report in report_list:
         df = report.to_dataframe(
             flatten_metrics=flatten_metrics,
             flatten_categories=flatten_categories,
-            add_overall_metric=add_overall_metric)
+            add_overall_metric=add_overall_metric
+        )
         tables.append(df)
     return pd.concat(tables, ignore_index=True)


-def gen_table(reports_path_list: list[str] = None,
-              report_list: list[Report] = None,
-              flatten_metrics: bool = True,
-              flatten_categories: bool = True,
-              add_overall_metric: bool = False) -> str:
+def gen_table(
+    reports_path_list: list[str] = None,
+    report_list: list[Report] = None,
+    flatten_metrics: bool = True,
+    flatten_categories: bool = True,
+    add_overall_metric: bool = False
+) -> str:
     """
     Generates a formatted table from a list of report paths or Report objects.

@@ -78,7 +83,8 @@ def gen_table(reports_path_list: list[str] = None,
         report_list,
         flatten_metrics=flatten_metrics,
         flatten_categories=flatten_categories,
-        add_overall_metric=add_overall_metric)
+        add_overall_metric=add_overall_metric
+    )
     return tabulate(table, headers=table.columns, tablefmt='grid', showindex=False)

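A short, hedged usage sketch for the reworked `get_report_list`/`gen_table` signatures above; the report directory path is a placeholder, not taken from this diff.

from evalscope.report import gen_table, get_report_list

# Load saved report JSON files from a results directory (placeholder path).
reports = get_report_list(['outputs/my_eval/reports'])

# Render a grid table; keyword names follow the signature shown in the hunk above.
print(gen_table(report_list=reports, flatten_categories=True, add_overall_metric=True))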
evalscope/report/generator.py
CHANGED
@@ -3,16 +3,18 @@ from pandas import DataFrame
 from typing import TYPE_CHECKING

 from evalscope.constants import DataCollection
-from evalscope.report.utils import *
+from evalscope.report.report import *

 if TYPE_CHECKING:
-    from evalscope.benchmarks import DataAdapter
+    from evalscope.api.benchmark import DataAdapter
+    from evalscope.api.metric import AggScore
+    from evalscope.benchmarks import DataAdapter as OldDataAdapter


 class ReportGenerator:

     @staticmethod
-    def gen_report(subset_score_map: dict, model_name: str, data_adapter: 'DataAdapter', **kwargs) -> Report:
+    def gen_report(subset_score_map: dict, model_name: str, data_adapter: 'OldDataAdapter', **kwargs) -> Report:
         """
         Generate a report for a specific dataset based on provided subset scores.

@@ -59,7 +61,9 @@ class ReportGenerator:
                         score=score_item['score'],
                         num=score_item['num'],
                         metric_name=score_item['metric_name'],
-                        categories=tuple(categories)))
+                        categories=tuple(categories)
+                    )
+                )
         df = pd.DataFrame(subsets)
         return df

@@ -83,7 +87,8 @@ class ReportGenerator:
            dataset_name=dataset_name,
            model_name=model_name,
            dataset_description=data_adapter.description,
-           dataset_pretty_name=data_adapter.pretty_name)
+           dataset_pretty_name=data_adapter.pretty_name
+       )
        return report

    @staticmethod
@@ -101,4 +106,94 @@ class ReportGenerator:
            name=DataCollection.NAME,
            metrics=[Metric(name='Average', categories=categories)],
            dataset_name=all_dataset_name,
-           model_name=model_name)
+           model_name=model_name
+       )
+
+   @staticmethod
+   def generate_report(
+       score_dict: Dict[str, List['AggScore']],
+       model_name: str,
+       data_adapter: 'DataAdapter',
+       add_aggregation_name: bool = True
+   ) -> Report:
+       """
+       Generate a report for a specific dataset based on provided subset scores.
+
+       Args:
+           subset_score_map (dict): A mapping from subset names to a list of score dictionaries.
+               ```
+               {
+                   'subset_name': [
+                       AggScore={'metric_name': 'AverageAccuracy', 'score': 0.3389, 'num': 100},
+                       AggScore={'metric_name': 'WeightedAverageAccuracy', 'score': 0.3389, 'num': 100}
+                   ],
+                   ...
+               }
+               ```
+           data_adapter (DataAdapter): An adapter object for data handling.
+
+       Returns:
+           Report: A structured report object containing metrics, categories, and subsets.
+
+       >>> report = gen_report(subset_score_map, "My Report", data_adapter, dataset_name="Dataset", model_name="Model")
+       """  # noqa: E501
+
+       dataset_name = data_adapter.name
+       category_map = data_adapter.category_map
+       report_name = f'{model_name}@{dataset_name}'
+
+       def flatten_subset() -> DataFrame:
+           """
+           Flatten subset score map to a DataFrame.
+
+           Example:
+                         name  score  num categories      metric_name
+               0       ARC-Easy    0.5    2  [default]  AverageAccuracy
+               1  ARC-Challenge    0.5    2  [default]  AverageAccuracy
+           """
+           subsets = []
+           for subset_name, agg_scores in score_dict.items():
+               for agg_score_item in agg_scores:
+                   categories = category_map.get(subset_name, ['default'])
+                   if add_aggregation_name and agg_score_item.aggregation_name:
+                       metric_name = f'{agg_score_item.aggregation_name}_{agg_score_item.metric_name}'
+                   else:
+                       metric_name = agg_score_item.metric_name
+
+                   if isinstance(categories, str):
+                       categories = [categories]
+                   subsets.append(
+                       dict(
+                           name=subset_name,
+                           score=agg_score_item.score,
+                           num=agg_score_item.num,
+                           metric_name=metric_name,
+                           categories=tuple(categories)
+                       )
+                   )
+           df = pd.DataFrame(subsets)
+           return df
+
+       df = flatten_subset()
+
+       metrics_list = []
+       for metric_name, group_metric in df.groupby('metric_name', sort=False):
+           categories = []
+           for category_name, group_category in group_metric.groupby('categories'):
+               subsets = []
+               for _, row in group_category.iterrows():
+                   subsets.append(Subset(name=row['name'], score=row['score'], num=row['num']))
+
+               categories.append(Category(name=category_name, subsets=subsets))
+
+           metrics_list.append(Metric(name=metric_name, categories=categories))
+
+       report = Report(
+           name=report_name,
+           metrics=metrics_list,
+           dataset_name=dataset_name,
+           model_name=model_name,
+           dataset_description=data_adapter.description,
+           dataset_pretty_name=data_adapter.pretty_name
+       )
+       return report
evalscope/report/{utils.py → report.py}
CHANGED
@@ -152,10 +152,12 @@ class Report:
            data = json.load(f)
        return cls.from_dict(data)

-   def to_dataframe(self,
-                    flatten_metrics: bool = True,
-                    flatten_categories: bool = True,
-                    add_overall_metric: bool = False) -> pd.DataFrame:
+   def to_dataframe(
+       self,
+       flatten_metrics: bool = True,
+       flatten_categories: bool = True,
+       add_overall_metric: bool = False
+   ) -> pd.DataFrame:
        """
        Convert the report to a pandas DataFrame.
        Args:
@@ -201,8 +203,8 @@ class Report:
        # multi-level aggregation for categories
        max_depth = df_categories[ReportKey.category_name].apply(len).max()
        for level in range(max_depth):
-           df_categories[f'{ReportKey.category_prefix}{level}'] = df_categories[ReportKey.category_name].apply(
-               lambda x: x[level] if len(x) > level else None)
+           df_categories[f'{ReportKey.category_prefix}{level}'] = df_categories[
+               ReportKey.category_name].apply(lambda x: x[level] if len(x) > level else None)

        df_categories.drop(columns=[ReportKey.category_name], inplace=True)
        return df_categories