evalscope 0.14.0__py3-none-any.whl → 0.15.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- evalscope/arguments.py +2 -1
- evalscope/benchmarks/__init__.py +2 -2
- evalscope/benchmarks/aigc/__init__.py +0 -0
- evalscope/benchmarks/aigc/t2i/__init__.py +0 -0
- evalscope/benchmarks/aigc/t2i/base.py +56 -0
- evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +77 -0
- evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +58 -0
- evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +58 -0
- evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +57 -0
- evalscope/benchmarks/aigc/t2i/tifa_adapter.py +37 -0
- evalscope/benchmarks/aime/aime24_adapter.py +1 -1
- evalscope/benchmarks/aime/aime25_adapter.py +4 -4
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +1 -2
- evalscope/benchmarks/arc/arc_adapter.py +1 -1
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -3
- evalscope/benchmarks/ceval/ceval_adapter.py +2 -2
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +1 -3
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +1 -1
- evalscope/benchmarks/competition_math/competition_math_adapter.py +1 -2
- evalscope/benchmarks/data_adapter.py +16 -9
- evalscope/benchmarks/data_collection/data_collection_adapter.py +6 -4
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +2 -2
- evalscope/benchmarks/general_qa/general_qa_adapter.py +3 -3
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +16 -21
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +4 -1
- evalscope/benchmarks/live_code_bench/testing_util.py +6 -3
- evalscope/benchmarks/math_500/math_500_adapter.py +1 -1
- evalscope/benchmarks/mmlu/mmlu_adapter.py +3 -1
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -2
- evalscope/benchmarks/utils.py +7 -16
- evalscope/cli/start_app.py +1 -1
- evalscope/collections/evaluator.py +16 -4
- evalscope/config.py +7 -3
- evalscope/constants.py +11 -0
- evalscope/evaluator/evaluator.py +9 -3
- evalscope/evaluator/reviewer/auto_reviewer.py +1 -1
- evalscope/metrics/__init__.py +49 -4
- evalscope/metrics/llm_judge.py +1 -1
- evalscope/metrics/named_metrics.py +13 -0
- evalscope/metrics/t2v_metrics/__init__.py +66 -0
- evalscope/metrics/t2v_metrics/clipscore.py +14 -0
- evalscope/metrics/t2v_metrics/constants.py +12 -0
- evalscope/metrics/t2v_metrics/itmscore.py +14 -0
- evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +132 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +286 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +114 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +86 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +85 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +84 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +97 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +171 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +80 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +73 -0
- evalscope/metrics/t2v_metrics/models/model.py +45 -0
- evalscope/metrics/t2v_metrics/models/utils.py +25 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +300 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +82 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +218 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +150 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +188 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +106 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +307 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +191 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +318 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +208 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1093 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +452 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +364 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +755 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +273 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +880 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1844 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +81 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +56 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +185 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +178 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +112 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +344 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +858 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +271 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +503 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1270 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +473 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +31 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +392 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +127 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +17 -0
- evalscope/metrics/t2v_metrics/score.py +78 -0
- evalscope/metrics/t2v_metrics/vqascore.py +14 -0
- evalscope/models/__init__.py +50 -14
- evalscope/models/adapters/__init__.py +17 -0
- evalscope/models/{base_adapter.py → adapters/base_adapter.py} +17 -17
- evalscope/models/{chat_adapter.py → adapters/chat_adapter.py} +10 -7
- evalscope/models/{choice_adapter.py → adapters/choice_adapter.py} +2 -6
- evalscope/models/{custom_adapter.py → adapters/custom_adapter.py} +2 -4
- evalscope/models/{server_adapter.py → adapters/server_adapter.py} +1 -3
- evalscope/models/adapters/t2i_adapter.py +76 -0
- evalscope/models/custom/__init__.py +2 -1
- evalscope/models/custom/dummy_model.py +11 -13
- evalscope/models/local_model.py +82 -33
- evalscope/models/model.py +2 -42
- evalscope/models/register.py +26 -0
- evalscope/perf/benchmark.py +4 -3
- evalscope/perf/main.py +4 -2
- evalscope/perf/plugin/datasets/flickr8k.py +2 -1
- evalscope/perf/utils/benchmark_util.py +2 -2
- evalscope/perf/utils/db_util.py +16 -8
- evalscope/report/__init__.py +1 -0
- evalscope/report/app.py +117 -67
- evalscope/report/app_arguments.py +11 -0
- evalscope/report/generator.py +1 -1
- evalscope/run.py +3 -3
- evalscope/third_party/thinkbench/eval.py +19 -7
- evalscope/utils/chat_service.py +2 -2
- evalscope/utils/import_utils.py +66 -0
- evalscope/utils/utils.py +12 -4
- evalscope/version.py +2 -2
- {evalscope-0.14.0.dist-info → evalscope-0.15.1.dist-info}/METADATA +20 -3
- {evalscope-0.14.0.dist-info → evalscope-0.15.1.dist-info}/RECORD +178 -66
- tests/aigc/__init__.py +1 -0
- tests/aigc/test_t2i.py +87 -0
- tests/cli/test_run.py +20 -7
- tests/perf/test_perf.py +6 -3
- evalscope/metrics/code_metric.py +0 -98
- evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
- evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
- {evalscope-0.14.0.dist-info → evalscope-0.15.1.dist-info}/LICENSE +0 -0
- {evalscope-0.14.0.dist-info → evalscope-0.15.1.dist-info}/WHEEL +0 -0
- {evalscope-0.14.0.dist-info → evalscope-0.15.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.14.0.dist-info → evalscope-0.15.1.dist-info}/top_level.txt +0 -0
evalscope/models/local_model.py
CHANGED
@@ -1,7 +1,8 @@
-import torch
+import importlib
+from abc import ABC, abstractmethod
 from typing import TYPE_CHECKING, Optional
 
-from evalscope.constants import DEFAULT_MODEL_CACHE_DIR, DEFAULT_MODEL_REVISION, EvalType
+from evalscope.constants import DEFAULT_MODEL_CACHE_DIR, DEFAULT_MODEL_REVISION, EvalType, ModelTask
 from evalscope.utils.logger import get_logger
 from evalscope.utils.model_utils import get_device
 
@@ -11,31 +12,55 @@ if TYPE_CHECKING:
 logger = get_logger()
 
 
-class LocalModel:
+class LocalModel(ABC):
 
     def __init__(self,
                  model_id: str,
-                 model_revision: str =
-                 device_map: str =
+                 model_revision: str = None,
+                 device_map: str = None,
                  torch_dtype: str = 'auto',
                  cache_dir: str = None,
                  **kwargs):
-        from modelscope import AutoModelForCausalLM, AutoTokenizer
 
+        self.model_id = model_id
+        self.model_revision = model_revision or DEFAULT_MODEL_REVISION
+        self.device = device_map or get_device()
+        self.cache_dir = cache_dir or DEFAULT_MODEL_CACHE_DIR
+        self.kwargs = kwargs
+        self.model = None
+        self.tokenizer = None
 
         if isinstance(torch_dtype, str) and torch_dtype != 'auto':
+            import torch
             torch_dtype = eval(torch_dtype)
+        self.torch_dtype = torch_dtype
+
+        self.model_cfg = {
+            'model_id': self.model_id,
+            'device_map': self.device,
+            'torch_dtype': str(self.torch_dtype),
+        }
+
+    @abstractmethod
+    def load_model(self):
+        pass
 
+
+class LocalChatModel(LocalModel):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    def load_model(self):
+        from modelscope import AutoModelForCausalLM, AutoTokenizer
+
+        logger.info(f'Loading model {self.model_id} ...')
 
         self.tokenizer = AutoTokenizer.from_pretrained(
             self.model_id,
-            revision=model_revision,
+            revision=self.model_revision,
             trust_remote_code=True,
-            cache_dir=
+            cache_dir=self.cache_dir,
         )
 
         # Fix no padding
@@ -44,18 +69,45 @@ class LocalModel:
 
         self.model = AutoModelForCausalLM.from_pretrained(
             self.model_id,
-            revision=model_revision,
-            device_map=
+            revision=self.model_revision,
+            device_map=self.device,
             trust_remote_code=True,
-            torch_dtype=torch_dtype,
-            cache_dir=
+            torch_dtype=self.torch_dtype,
+            cache_dir=self.cache_dir,
         )
 
+
+class LocalImageModel(LocalModel):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        self.pipeline_cls = kwargs.pop('pipeline_cls', None)
+        # default to DiffusionPipeline if not specified
+        if self.pipeline_cls is None:
+            if 'flux' in self.model_id.lower():
+                self.pipeline_cls = 'FluxPipeline'
+            else:
+                self.pipeline_cls = 'DiffusionPipeline'
+
+    def load_model(self):
+        # from modelscope import pipeline_cls
+        module = getattr(importlib.import_module('modelscope'), self.pipeline_cls)
+
+        logger.info(f'Loading model {self.model_id} with {self.pipeline_cls} ...')
+
+        self.model = module.from_pretrained(
+            self.model_id,
+            revision=self.model_revision,
+            torch_dtype=self.torch_dtype,
+            cache_dir=self.cache_dir,
+            **self.kwargs,
+        )
+
+        self.model.to(self.device)
+
+    def __call__(self, *args, **kwargs):
+        return self.model(*args, **kwargs)
 
 
 def get_local_model(task_cfg: 'TaskConfig') -> Optional[LocalModel]:
@@ -64,16 +116,13 @@ def get_local_model(task_cfg: 'TaskConfig') -> Optional[LocalModel]:
     """
     if task_cfg.eval_type != EvalType.CHECKPOINT:
         return None
-    base_model
-        model_id=task_cfg.model,
-        model_revision=model_revision,
-        device_map=device_map,
-        torch_dtype=model_precision,
-        cache_dir=cache_dir)
+    elif task_cfg.model_task == ModelTask.TEXT_GENERATION:
+        base_model = LocalChatModel(model_id=task_cfg.model, **task_cfg.model_args)
+        base_model.load_model()
+        return base_model
+    elif task_cfg.model_task == ModelTask.IMAGE_GENERATION:
+        base_model = LocalImageModel(model_id=task_cfg.model, **task_cfg.model_args)
+        base_model.load_model()
         return base_model
+    else:
+        raise ValueError(f'Unsupported model task: {task_cfg.model_task} for model checkpoint.')
evalscope/models/model.py
CHANGED
@@ -1,9 +1,8 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 import os
-import random
 import time
 from abc import ABC, abstractmethod
-from typing import Any
+from typing import Any, List
 
 from evalscope.utils.logger import get_logger
 
@@ -95,6 +94,7 @@ class ChatBaseModel(BaseModel):
         raise NotImplementedError
 
 
+# TODO: Remove this class after refactoring all models
 class OpenAIModel(ChatBaseModel):
     """
     APIs of OpenAI models.
@@ -187,43 +187,3 @@ class OpenAIModel(ChatBaseModel):
             time.sleep(3)
         logger.error(f'OpenAI API call failed after {self.MAX_RETRIES} retries')
         return res
-
-
-class DummyChatModel(ChatBaseModel):
-
-    MODEL_ID = 'dummy_chat_model_0801'
-    REVISION = 'v1.0.0'
-
-    def __init__(self, model_cfg: dict, **kwargs):
-        model_cfg['model_id'] = self.MODEL_ID
-        model_cfg['revision'] = self.REVISION
-        super(DummyChatModel, self).__init__(model_cfg=model_cfg)
-
-    def predict(self, inputs: dict, **kwargs) -> dict:
-
-        debug: bool = False
-        if debug:
-            messages = inputs['messages']
-            history = inputs['history']
-
-            logger.info(f'** messages: {messages}')
-            logger.info(f'** history: {history}')
-
-        choice = random.choice(['A', 'B', 'C', 'D'])
-
-        # Build response
-        res = {
-            'choices': [{
-                'index': 0,
-                'message': {
-                    'content': choice,
-                    'role': 'assistant'
-                }
-            }],
-            'created': time.time(),
-            'model': self.MODEL_ID + '-' + self.REVISION,
-            'object': 'chat.completion',
-            'usage': {}
-        }
-
-        return res
evalscope/models/register.py
CHANGED
@@ -1,3 +1,6 @@
+from evalscope.constants import OutputType
+from .adapters import *
+
 MODEL_ADAPTERS = {}
 
 
@@ -26,3 +29,26 @@ def get_model_adapter(name):
         raise ValueError(
             f"Model adapter '{name}' is not registered. Available model adapters: {list(MODEL_ADAPTERS.keys())}")
     return MODEL_ADAPTERS[name]
+
+
+def register_model_adapter_class(cls, name=None):
+    """
+    Register a model adapter class.
+    :param cls: The model adapter class to register
+    :param name: Optional name for the model adapter. If not provided, the class name will be used.
+    """
+    if name is None:
+        name = cls.__name__
+    if name in MODEL_ADAPTERS:
+        raise ValueError(f"Model adapter class '{name}' is already registered.")
+    MODEL_ADAPTERS[name] = cls
+
+
+# register all model adapters
+register_model_adapter_class(BaseModelAdapter, name='base')
+register_model_adapter_class(ChatGenerationModelAdapter, name=OutputType.GENERATION)
+register_model_adapter_class(ContinuationLogitsModelAdapter, name=OutputType.LOGITS)
+register_model_adapter_class(MultiChoiceModelAdapter, name=OutputType.MULTIPLE_CHOICE)
+register_model_adapter_class(CustomModelAdapter, name='custom')
+register_model_adapter_class(ServerModelAdapter, name='server')
+register_model_adapter_class(T2IModelAdapter, name=OutputType.IMAGE_GENERATION)
evalscope/perf/benchmark.py
CHANGED
@@ -9,7 +9,7 @@ import threading
 import time
 from http import HTTPStatus
 from tqdm import tqdm
-from typing import AsyncGenerator, List
+from typing import AsyncGenerator, Dict, List, Tuple
 
 from evalscope.perf.arguments import Arguments
 from evalscope.perf.http_client import AioHttpClient, test_connection
@@ -180,7 +180,7 @@ async def connect_test(args: Arguments) -> bool:
 
 
 @exception_handler
-async def benchmark(args: Arguments) -> None:
+async def benchmark(args: Arguments) -> Tuple[Dict, Dict]:
     if platform.system() != 'Windows':
         loop = asyncio.get_running_loop()
         add_signal_handlers(loop)
@@ -205,4 +205,5 @@ async def benchmark(args: Arguments) -> None:
     data_process_completed_event.set()
 
     metrics, result_db_path = await statistic_benchmark_metric_task
-    summary_result(args, metrics, result_db_path)
+    metrics_result, percentile_result = summary_result(args, metrics, result_db_path)
+    return metrics_result, percentile_result
evalscope/perf/main.py
CHANGED
@@ -36,9 +36,11 @@ def run_perf_benchmark(args):
     if platform.system() != 'Windows':
         add_signal_handlers(loop)
 
-    loop.run_until_complete(benchmark(args))
+    return loop.run_until_complete(benchmark(args))
 
 
 if __name__ == '__main__':
     args = Arguments.from_args(parse_args())
-    run_perf_benchmark(args)
+    metrics_result, percentile_result = run_perf_benchmark(args)
+    print(metrics_result)
+    print(percentile_result)
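With this change, `run_perf_benchmark` returns the summary and percentile dictionaries instead of only logging them. A hedged programmatic sketch follows; the `Arguments` fields shown (`url`, `model`) are assumed from the CLI flags and may not match the dataclass exactly, while `number` and the returned key are taken from the diff above.

```python
# Sketch: calling the perf benchmark programmatically and using the returned dicts.
from evalscope.perf.arguments import Arguments
from evalscope.perf.main import run_perf_benchmark

args = Arguments(
    url='http://127.0.0.1:8000/v1/chat/completions',  # placeholder endpoint (assumed field)
    model='my-served-model',                          # placeholder model name (assumed field)
    number=10,
)
metrics_result, percentile_result = run_perf_benchmark(args)
print(metrics_result['Expected number of requests'])  # key added by summary_result
```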
evalscope/perf/plugin/datasets/flickr8k.py
CHANGED
@@ -30,6 +30,7 @@ class FlickrDatasetPlugin(DatasetPluginBase):
 
         for item in dataset:
             pil_image = item['jpg']
+            text = item['txt']
             base64_iamge = PIL_to_base64(pil_image)
 
             yield [{
@@ -38,7 +39,7 @@ class FlickrDatasetPlugin(DatasetPluginBase):
                 'content': [
                     {
                         'type': 'text',
-                        'text':
+                        'text': text,
                     },
                     {
                         'type': 'image_url',
evalscope/perf/utils/benchmark_util.py
CHANGED
@@ -32,13 +32,13 @@ class BenchmarkData:
         self.query_latency = self.completed_time - self.start_time
         if len(self.chunk_times) > 1:
             self.first_chunk_latency = self.chunk_times[0] - self.start_time
-            self.n_chunks = len(self.chunk_times) - 2
+            self.n_chunks = len(self.chunk_times) - 2  # remove last and first chunk
             self.n_chunks_time = self.chunk_times[-2] - self.chunk_times[0]
         else:
             self.first_chunk_latency = self.query_latency
             self.n_chunks = 1
             self.n_chunks_time = self.query_latency
-        self.time_per_output_token = self.
+        self.time_per_output_token = self.n_chunks_time / self.completion_tokens
 
     def _calculate_tokens(self, api_plugin):
         self.prompt_tokens, self.completion_tokens = \
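A standalone sketch of the per-request timing arithmetic used above (made-up numbers, not evalscope code):

```python
# Illustration of the streaming timing math; values are made up.
start_time = 0.0
chunk_times = [0.4, 0.9, 1.4, 1.9]      # arrival times of streamed chunks
completed_time = 2.0
completion_tokens = 30

first_chunk_latency = chunk_times[0] - start_time          # 0.4 s (TTFT)
n_chunks_time = chunk_times[-2] - chunk_times[0]            # 1.0 s, first and last chunk excluded
time_per_output_token = n_chunks_time / completion_tokens   # ~0.033 s per token (TPOT)
print(first_chunk_latency, time_per_output_token)
```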
evalscope/perf/utils/db_util.py
CHANGED
@@ -7,7 +7,7 @@ import sqlite3
 import sys
 from datetime import datetime
 from tabulate import tabulate
-from typing import Dict, List
+from typing import Dict, List, Tuple
 
 from evalscope.perf.arguments import Arguments
 from evalscope.perf.utils.benchmark_util import BenchmarkData, BenchmarkMetrics
@@ -165,6 +165,7 @@ def get_percentile_results(result_db_path: str) -> Dict[str, List[float]]:
     CHUNK_TIMES_INDEX = 1
     LATENCY_INDEX = 4
     FIRST_CHUNK_LATENCY_INDEX = 5
+    CHUNK_TIME_INDEX = 7
     PROMPT_TOKENS_INDEX = 8
     COMPLETION_TOKENS_INDEX = 9
 
@@ -177,12 +178,17 @@ def get_percentile_results(result_db_path: str) -> Dict[str, List[float]]:
         'TTFT (s)': [row[FIRST_CHUNK_LATENCY_INDEX] for row in rows],
         'ITL (s)':
         inter_token_latencies_all,
+        'TPOT (s)':
+        [(row[CHUNK_TIME_INDEX] / row[COMPLETION_TOKENS_INDEX]) if row[COMPLETION_TOKENS_INDEX] > 0 else float('nan')
+         for row in rows],
         'Latency (s)': [row[LATENCY_INDEX] for row in rows],
         'Input tokens': [row[PROMPT_TOKENS_INDEX] for row in rows],
         'Output tokens': [row[COMPLETION_TOKENS_INDEX] for row in rows],
-        '
+        'Output throughput(tok/s)':
         [(row[COMPLETION_TOKENS_INDEX] / row[LATENCY_INDEX]) if row[LATENCY_INDEX] > 0 else float('nan')
-         for row in rows]
+         for row in rows],
+        'Total throughput(tok/s)': [((row[PROMPT_TOKENS_INDEX] + row[COMPLETION_TOKENS_INDEX])
+                                     / row[LATENCY_INDEX]) if row[LATENCY_INDEX] > 0 else float('nan') for row in rows]
     }
 
     # Calculate percentiles for each metric
@@ -194,16 +200,16 @@ def get_percentile_results(result_db_path: str) -> Dict[str, List[float]]:
     return results
 
 
-def summary_result(args: Arguments, metrics: BenchmarkMetrics, result_db_path: str):
+def summary_result(args: Arguments, metrics: BenchmarkMetrics, result_db_path: str) -> Tuple[Dict, Dict]:
     result_path = os.path.dirname(result_db_path)
     write_json_file(args.to_dict(), os.path.join(result_path, 'benchmark_args.json'))
 
-    write_json_file(
+    metrics_result = metrics.create_message()
+    metrics_result.update({'Expected number of requests': args.number, 'Result DB path': result_db_path})
+    write_json_file(metrics_result, os.path.join(result_path, 'benchmark_summary.json'))
 
     # Print summary in a table
-    table = tabulate(list(
+    table = tabulate(list(metrics_result.items()), headers=['Key', 'Value'], tablefmt='grid')
     logger.info('\nBenchmarking summary:\n' + table)
 
     # Get percentile results
@@ -217,6 +223,8 @@ def summary_result(args: Arguments, metrics: BenchmarkMetrics, result_db_path: s
     if args.dataset.startswith('speed_benchmark'):
         speed_benchmark_result(result_db_path)
 
+    return metrics_result, percentile_result
+
 
 def speed_benchmark_result(result_db_path: str):
     query_sql = """
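The new percentile columns reduce to simple per-request ratios; a standalone sketch follows (the field names are illustrative, whereas the real code indexes sqlite result tuples).

```python
# Standalone sketch of the added per-request ratios; rows are illustrative dicts.
rows = [
    {'chunk_time': 1.0, 'latency': 2.0, 'prompt_tokens': 50, 'completion_tokens': 30},
    {'chunk_time': 0.5, 'latency': 1.2, 'prompt_tokens': 20, 'completion_tokens': 10},
]

tpot = [r['chunk_time'] / r['completion_tokens'] if r['completion_tokens'] > 0 else float('nan') for r in rows]
output_tps = [r['completion_tokens'] / r['latency'] if r['latency'] > 0 else float('nan') for r in rows]
total_tps = [(r['prompt_tokens'] + r['completion_tokens']) / r['latency'] if r['latency'] > 0 else float('nan')
             for r in rows]
print(tpot, output_tps, total_tps)
```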
evalscope/report/__init__.py
CHANGED
@@ -1,5 +1,6 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
+from evalscope.report.app_arguments import add_argument
 from evalscope.report.combinator import gen_table, get_data_frame, get_report_list
 from evalscope.report.generator import ReportGenerator
 from evalscope.report.utils import Category, Report, ReportKey, Subset
evalscope/report/app.py
CHANGED
@@ -11,7 +11,7 @@ from dataclasses import dataclass
 from typing import Any, List, Union
 
 from evalscope.constants import DataCollection
-from evalscope.report import Report, ReportKey, get_data_frame, get_report_list
+from evalscope.report import Report, ReportKey, add_argument, get_data_frame, get_report_list
 from evalscope.utils.io_utils import OutputsStructure, yaml_to_dict
 from evalscope.utils.logger import configure_logging, get_logger
 from evalscope.version import __version__
@@ -22,6 +22,23 @@ PLOTLY_THEME = 'plotly_dark'
 REPORT_TOKEN = '@@'
 MODEL_TOKEN = '::'
 DATASET_TOKEN = ', '
+LATEX_DELIMITERS = [{
+    'left': '$$',
+    'right': '$$',
+    'display': True
+}, {
+    'left': '$',
+    'right': '$',
+    'display': False
+}, {
+    'left': '\\(',
+    'right': '\\)',
+    'display': False
+}, {
+    'left': '\\[',
+    'right': '\\]',
+    'display': True
+}]
 
 
 def scan_for_report_folders(root_path):
@@ -234,6 +251,18 @@ def convert_html_tags(text):
     return text
 
 
+def convert_markdown_image(text):
+    if not os.path.isfile(text):
+        return text
+    # Convert the image path to a markdown image tag
+    if text.endswith('.png') or text.endswith('.jpg') or text.endswith('.jpeg'):
+        text = os.path.abspath(text)
+        image_tag = f''
+        logger.debug(f'Converting image path to markdown: {text} -> {image_tag}')
+        return image_tag
+    return text
+
+
 def process_string(string: str, max_length: int = 2048) -> str:
     string = convert_html_tags(string)  # for display labels e.g. `<think>`
     if len(string) > max_length:
@@ -285,7 +314,7 @@ def get_model_prediction(work_dir: str, model_name: str, dataset_name: str, subs
         'Input': raw_input,
         'Generated': raw_pred_answer,
         'Gold': parsed_gold_answer if parsed_gold_answer != raw_input else '*Same as Input*',
-        'Pred': parsed_pred_answer
+        'Pred': parsed_pred_answer,
         'Score': score,
         'NScore': normalize_score(score)
     }
@@ -295,22 +324,6 @@ def get_model_prediction(work_dir: str, model_name: str, dataset_name: str, subs
     return df_subset
 
 
-def get_table_data(data_review_df: pd.DataFrame, page: int = 1, rows_per_page: int = 1) -> pd.DataFrame:
-    if data_review_df is None:
-        return pd.DataFrame(), None
-
-    logger.debug(f'page: {page}, rows_per_page: {rows_per_page}')
-    start = (page - 1) * rows_per_page
-    end = start + rows_per_page
-    df_subset = data_review_df.iloc[start:end].copy()
-    df_subset['Input'] = df_subset['Input'].map(process_model_prediction).astype(str)
-    df_subset['Generated'] = df_subset['Generated'].map(process_model_prediction).astype(str)
-    df_subset['Pred'] = df_subset['Pred'].map(process_model_prediction).astype(str)
-    df_subset['Score'] = df_subset['Score'].map(process_model_prediction).astype(str)
-    styler = style_df(df_subset, columns=['NScore'])
-    return df_subset, styler
-
-
 @dataclass
 class SidebarComponents:
     root_path: gr.Textbox
@@ -457,7 +470,11 @@ def create_single_model_tab(sidebar: SidebarComponents, lang: str):
         'page': {
             'zh': '页码',
            'en': 'Page'
-        }
+        },
+        'score_threshold': {
+            'zh': '分数阈值',
+            'en': 'Score Threshold'
+        },
     }
 
     # Update the UI components with localized labels
@@ -489,37 +506,53 @@ def create_single_model_tab(sidebar: SidebarComponents, lang: str):
     gr.Markdown(f'### {locale_dict["model_prediction"][lang]}')
     subset_select = gr.Dropdown(
         label=locale_dict['select_subset'][lang], choices=[], show_label=True, interactive=True)
+
     with gr.Row():
         answer_mode_radio = gr.Radio(
             label=locale_dict['answer_mode'][lang], choices=['All', 'Pass', 'Fail'], value='All', interactive=True)
-        answer_mode_counts = gr.Markdown('', label='Counts')
+        score_threshold = gr.Number(value=0.99, label=locale_dict['score_threshold'][lang], interactive=True)
+
     data_review_df = gr.State(None)
     filtered_review_df = gr.State(None)
+
+    # show statistics
+    with gr.Row(variant='panel'):
+        with gr.Column():
+            gr.Markdown('### *Counts*')
+            answer_mode_counts = gr.Markdown('')
+        with gr.Column():
+            page_number = gr.Number(
+                value=1, label=locale_dict['page'][lang], minimum=1, maximum=1, step=1, interactive=True)
+
+    # show data review table
+    with gr.Row(variant='panel'):
+        with gr.Column():
+            gr.Markdown('### *Score*')
+            score_text = gr.Markdown(
+                '', elem_id='score_text', latex_delimiters=LATEX_DELIMITERS, show_copy_button=True)
+        with gr.Column():
+            gr.Markdown('### *Normalized Score*')
+            nscore = gr.Markdown('', elem_id='score_text', latex_delimiters=LATEX_DELIMITERS)
+
+    with gr.Row(variant='panel'):
+        with gr.Column():
+            gr.Markdown('### *Gold*')
+            gold_text = gr.Markdown(
+                '', elem_id='gold_text', latex_delimiters=LATEX_DELIMITERS, show_copy_button=True)
+        with gr.Column():
+            gr.Markdown('### *Pred*')
+            pred_text = gr.Markdown(
+                '', elem_id='pred_text', latex_delimiters=LATEX_DELIMITERS, show_copy_button=True)
+
+    with gr.Row(variant='panel'):
+        with gr.Column():
+            gr.Markdown('### *Input*')
+            input_text = gr.Markdown(
+                '', elem_id='input_text', latex_delimiters=LATEX_DELIMITERS, show_copy_button=True)
+        with gr.Column():
+            gr.Markdown('### *Generated*')
+            generated_text = gr.Markdown(
+                '', elem_id='generated_text', latex_delimiters=LATEX_DELIMITERS, show_copy_button=True)
 
     @report_name.change(
         inputs=[sidebar.root_path, report_name],
@@ -561,15 +594,15 @@ def create_single_model_tab(sidebar: SidebarComponents, lang: str):
         return data_review_df, 1
 
     @gr.on(
-        triggers=[data_review_df.change, answer_mode_radio.change],
-        inputs=[data_review_df, answer_mode_radio],
+        triggers=[data_review_df.change, answer_mode_radio.change, score_threshold.change],
+        inputs=[data_review_df, answer_mode_radio, score_threshold],
         outputs=[filtered_review_df, page_number, answer_mode_counts])
-    def filter_data(data_review_df, answer_mode):
+    def filter_data(data_review_df, answer_mode, score_threshold):
         if data_review_df is None:
             return None, gr.update(value=1, maximum=1), ''
 
         all_count = len(data_review_df)
-        pass_df = data_review_df[data_review_df['NScore'] >=
+        pass_df = data_review_df[data_review_df['NScore'] >= score_threshold]
         pass_count = len(pass_df)
         fail_count = all_count - pass_count
 
@@ -578,7 +611,7 @@ def create_single_model_tab(sidebar: SidebarComponents, lang: str):
         if answer_mode == 'Pass':
             filtered_df = pass_df
         elif answer_mode == 'Fail':
-            filtered_df = data_review_df[data_review_df['NScore'] <
+            filtered_df = data_review_df[data_review_df['NScore'] < score_threshold]
         else:
             filtered_df = data_review_df
 
@@ -588,13 +621,33 @@ def create_single_model_tab(sidebar: SidebarComponents, lang: str):
 
     @gr.on(
         triggers=[filtered_review_df.change, page_number.change],
-        inputs=[filtered_review_df, page_number],
-        outputs=[
-    def
-        if filtered_df is None:
-            return
+        inputs=[filtered_review_df, page_number, score_threshold],
+        outputs=[input_text, generated_text, gold_text, pred_text, score_text, nscore])
+    def update_table_components(filtered_df, page_number, score_threshold):
+        if filtered_df is None or len(filtered_df) == 0:
+            return '', '', '', '', '', ''
+
+        # Get single row data for the current page
+        start = (page_number - 1)
+        if start >= len(filtered_df):
+            return '', '', '', '', '', ''
+
+        row = filtered_df.iloc[start]
+
+        # Process the data for display
+        input_md = process_model_prediction(row['Input'])
+        generated_md = process_model_prediction(row['Generated'])
+        gold_md = process_model_prediction(row['Gold'])
+        pred_md = convert_markdown_image(process_model_prediction(row['Pred']))
+        score_md = process_model_prediction(row['Score'])
+        nscore_val = float(row['NScore']) if not pd.isna(row['NScore']) else 0.0
+
+        if nscore_val >= score_threshold:
+            nscore_val = f'<div style="background-color:rgb(45,104, 62); padding:10px;">{nscore_val}</div>'
+        else:
+            nscore_val = f'<div style="background-color:rgb(151, 31, 44); padding:10px;">{nscore_val}</div>'
+
+        return input_md, generated_md, gold_md, pred_md, score_md, nscore_val
 
     return SingleModelComponents(report_name=report_name)
 
@@ -696,16 +749,13 @@ def create_app(args: argparse.Namespace):
         text = '<' if new_visible else '>'
         return gr.update(visible=new_visible), new_visible, gr.update(value=text)
 
-    demo.launch(
-    parser.add_argument('--debug', action='store_true', help='Debug the app.')
-    parser.add_argument('--lang', type=str, default='zh', help='The locale.', choices=['zh', 'en'])
-    parser.add_argument('--outputs', type=str, default='./outputs', help='The outputs dir.')
+    demo.launch(
+        share=args.share,
+        server_name=args.server_name,
+        server_port=args.server_port,
+        debug=args.debug,
+        allowed_paths=args.allowed_paths,
+    )
 
 
 if __name__ == '__main__':
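The new score-threshold filter in the report app boils down to a plain pandas comparison; a small standalone sketch (column names mirror the review dataframe shown above):

```python
# Standalone sketch of the Pass/Fail split driven by the new score_threshold input.
import pandas as pd

data_review_df = pd.DataFrame({'Pred': ['A', 'B', 'C'], 'NScore': [1.0, 0.3, 0.99]})
score_threshold = 0.99  # default value of the new gr.Number control

pass_df = data_review_df[data_review_df['NScore'] >= score_threshold]
fail_df = data_review_df[data_review_df['NScore'] < score_threshold]
print(len(pass_df), len(fail_df))  # 2 1
```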