evalscope 1.0.2__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff shows the content of publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.
Potentially problematic release: this version of evalscope might be problematic.
- evalscope/api/benchmark/adapters/default_data_adapter.py +12 -0
- evalscope/app/ui/multi_model.py +6 -1
- evalscope/app/ui/single_model.py +8 -2
- evalscope/app/utils/data_utils.py +3 -2
- evalscope/app/utils/visualization.py +2 -2
- evalscope/benchmarks/ai2d/ai2d_adapter.py +3 -2
- evalscope/benchmarks/bfcl/bfcl_adapter.py +10 -45
- evalscope/benchmarks/blink/blink_adapter.py +61 -0
- evalscope/benchmarks/chartqa/__init__.py +0 -0
- evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
- evalscope/benchmarks/chartqa/utils.py +38 -0
- evalscope/benchmarks/docvqa/__init__.py +0 -0
- evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
- evalscope/benchmarks/general_arena/utils.py +2 -1
- evalscope/benchmarks/hle/hle_adapter.py +3 -2
- evalscope/benchmarks/infovqa/__init__.py +0 -0
- evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +2 -2
- evalscope/benchmarks/mmmu/mmmu_adapter.py +1 -1
- evalscope/benchmarks/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_adapter.py +101 -0
- evalscope/benchmarks/ocr_bench_v2/IoUscore_metric.py +87 -0
- evalscope/benchmarks/ocr_bench_v2/TEDS_metric.py +963 -0
- evalscope/benchmarks/ocr_bench_v2/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
- evalscope/benchmarks/ocr_bench_v2/page_ocr_metric.py +50 -0
- evalscope/benchmarks/ocr_bench_v2/parallel.py +46 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_eval/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_eval/readme.txt +26 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_eval/script.py +481 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_metric.py +179 -0
- evalscope/benchmarks/ocr_bench_v2/utils.py +432 -0
- evalscope/benchmarks/ocr_bench_v2/vqa_metric.py +254 -0
- evalscope/metrics/metric.py +51 -0
- evalscope/metrics/metrics.py +16 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
- evalscope/report/__init__.py +9 -1
- evalscope/report/combinator.py +52 -2
- evalscope/utils/json_schema.py +8 -6
- evalscope/utils/multi_choices.py +16 -1
- evalscope/version.py +2 -2
- {evalscope-1.0.2.dist-info → evalscope-1.1.0.dist-info}/METADATA +6 -32
- {evalscope-1.0.2.dist-info → evalscope-1.1.0.dist-info}/RECORD +51 -54
- {evalscope-1.0.2.dist-info → evalscope-1.1.0.dist-info}/top_level.txt +0 -1
- tests/__init__.py +0 -1
- tests/benchmark/__init__.py +0 -1
- tests/benchmark/test_eval.py +0 -429
- tests/benchmark/test_image_edit.py +0 -65
- tests/benchmark/test_sandbox.py +0 -81
- tests/benchmark/test_t2i.py +0 -142
- tests/benchmark/test_vlm.py +0 -137
- tests/cli/__init__.py +0 -1
- tests/cli/test_all.py +0 -269
- tests/cli/test_collection.py +0 -99
- tests/cli/test_custom.py +0 -268
- tests/cli/test_reasoning.py +0 -81
- tests/common.py +0 -73
- tests/perf/__init__.py +0 -1
- tests/perf/test_perf.py +0 -206
- tests/rag/test_clip_benchmark.py +0 -87
- tests/rag/test_mteb.py +0 -213
- tests/rag/test_ragas.py +0 -128
- tests/swift/__init__.py +0 -1
- tests/swift/test_run_swift_eval.py +0 -146
- tests/swift/test_run_swift_vlm_eval.py +0 -128
- tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
- tests/test_run_all.py +0 -12
- tests/utils.py +0 -13
- tests/vlm/__init__.py +0 -1
- tests/vlm/test_vlmeval.py +0 -102
- {tests/rag → evalscope/benchmarks/blink}/__init__.py +0 -0
- {evalscope-1.0.2.dist-info → evalscope-1.1.0.dist-info}/LICENSE +0 -0
- {evalscope-1.0.2.dist-info → evalscope-1.1.0.dist-info}/WHEEL +0 -0
- {evalscope-1.0.2.dist-info → evalscope-1.1.0.dist-info}/entry_points.txt +0 -0
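The adapter files added above introduce several new multimodal benchmarks (blink, chartqa, docvqa, infovqa, ocr_bench, ocr_bench_v2). As a rough, illustrative sketch only, the snippet below shows how such a benchmark might be invoked through evalscope's task runner (the same run_task entry point the removed tests in this diff use); the dataset name 'chartqa' and the model identifier are assumptions for illustration, not confirmed by this diff.

    # Minimal sketch (not part of this diff): running one of the newly added
    # benchmarks via evalscope's task runner. Dataset and model ids are assumed.
    from evalscope import TaskConfig, run_task

    task_cfg = TaskConfig(
        model='Qwen/Qwen2.5-VL-7B-Instruct',  # assumed model id for illustration
        datasets=['chartqa'],                 # one of the adapters added in 1.1.0 (assumed registry name)
        limit=5,                              # evaluate only a handful of samples
    )
    run_task(task_cfg=task_cfg)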
tests/swift/test_run_swift_vlm_eval.py
DELETED

@@ -1,128 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
-import json
-import os
-import requests
-import shutil
-import subprocess
-import time
-import unittest
-
-from evalscope.backend.vlm_eval_kit import VLMEvalKitBackendManager
-from evalscope.run import run_task
-from evalscope.summarizer import Summarizer
-from evalscope.utils.import_utils import is_module_installed
-from evalscope.utils.logger import get_logger
-from tests.utils import test_level_list
-
-logger = get_logger(__name__)
-
-DEFAULT_CHAT_MODEL_URL = 'http://127.0.0.1:8000/v1/chat/completions'
-DEFAULT_API_KEY = 'EMPTY'
-DEFAULT_MODEL_NAME = 'CustomAPIModel'
-DEFAULT_WORK_DIR = 'outputs/qwen-vl-chat'
-
-
-class TestRunSwiftVLMEval(unittest.TestCase):
-
-    def setUp(self) -> None:
-        logger.info('Init env for swift-eval UTs ...\n')
-        assert is_module_installed('evalscope'), 'Please install `llmuses` from pypi or source code.'
-
-        if not is_module_installed('vlmeval'):
-            logger.warning('Note: installing ms-vlmeval ...')
-            subprocess.run('pip3 install ms-vlmeval -U', shell=True, check=True)
-
-        if not is_module_installed('swift'):
-            logger.warning('Note: installing ms-swift ...')
-            subprocess.run('pip3 install ms-swift -U', shell=True, check=True)
-
-        if os.path.exists(DEFAULT_WORK_DIR):
-            shutil.rmtree(DEFAULT_WORK_DIR)
-            logger.info(f'Removed work dir: {os.path.abspath(DEFAULT_WORK_DIR)} \n')
-
-        logger.info('\nStaring run swift deploy ...')
-        self.model_name = 'qwen-vl-chat'
-        self.process_swift_deploy = subprocess.Popen(
-            f'swift deploy --model_type {self.model_name} --infer_backend pt', text=True, shell=True)
-
-        self.all_datasets = VLMEvalKitBackendManager.list_supported_datasets()
-        assert len(self.all_datasets) > 0, f'Failed to list datasets from VLMEvalKit backend: {self.all_datasets}'
-
-    def tearDown(self) -> None:
-        # Stop the swift deploy model service
-        logger.warning('Stopping swift deploy ...')
-        self.process_swift_deploy.terminate()
-        self.process_swift_deploy.wait()
-        logger.info('Process swift-deploy terminated successfully.')
-
-    @staticmethod
-    def _check_env(module_name: str):
-        if is_module_installed(module_name):
-            logger.info(f'{module_name} is installed.')
-        else:
-            raise ModuleNotFoundError(f'run: pip install {module_name}')
-
-    @staticmethod
-    def check_service_status(url: str, data: dict, retries: int = 20, delay: int = 10):
-        for i in range(retries):
-            try:
-                logger.info(f'Attempt {i + 1}: Checking service at {url} ...')
-                response = requests.post(
-                    url, data=json.dumps(data), headers={'Content-Type': 'application/json'}, timeout=30)
-                if response.status_code == 200:
-                    logger.info(f'Service at {url} is available !\n\n')
-                    return True
-                else:
-                    logger.info(f'Service at {url} returned status code {response.status_code}.')
-            except requests.exceptions.RequestException as e:
-                logger.info(f'Attempt {i + 1}: An error occurred: {e}')
-
-            time.sleep(delay)
-
-        logger.info(f'Service at {url} is not available after {retries} retries.')
-        return False
-
-    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-    def test_run_api(self):
-        api_base = DEFAULT_CHAT_MODEL_URL
-        task_cfg = {
-            'eval_backend': 'VLMEvalKit',
-            'eval_config': {
-                'data': ['SEEDBench_IMG', 'ChartQA_TEST'],
-                'limit':
-                30,
-                'mode':
-                'all',
-                'model': [{
-                    'api_base': api_base,  # swfit deploy model api
-                    'key': DEFAULT_API_KEY,
-                    'name': DEFAULT_MODEL_NAME,  # must be CustomAPIModel for swift
-                    'temperature': 0.0,
-                    'type': self.model_name
-                }],  # swift model type
-                'nproc':
-                1,
-                'reuse':
-                True,
-                'work_dir':
-                DEFAULT_WORK_DIR
-            }
-        }
-
-        # Check the service status
-        data = {'model': self.model_name, 'messages': [{'role': 'user', 'content': 'who are you?'}]}
-        assert self.check_service_status(api_base, data=data), f'Failed to check service status: {api_base}'
-
-        logger.info(f'>> Start to run task: {task_cfg}')
-
-        run_task(task_cfg)
-
-        logger.info('>> Start to get the report with summarizer ...')
-        report_list = Summarizer.get_report_from_cfg(task_cfg)
-        logger.info(f'\n>> The report list: {report_list}')
-
-        assert len(report_list) > 0, f'Failed to get report list: {report_list}'
-
-
-if __name__ == '__main__':
-    unittest.main(buffer=False)
tests/swift/test_run_swift_vlm_jugde_eval.py
DELETED

@@ -1,157 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
-import json
-import os
-import requests
-import shutil
-import subprocess
-import time
-import unittest
-
-from evalscope.backend.vlm_eval_kit import VLMEvalKitBackendManager
-from evalscope.run import run_task
-from evalscope.summarizer import Summarizer
-from evalscope.utils.import_utils import is_module_installed
-from evalscope.utils.logger import get_logger
-from tests.utils import test_level_list
-
-logger = get_logger(__name__)
-
-DEFAULT_CHAT_MODEL_URL = 'http://127.0.0.1:8000/v1/chat/completions'
-DEFAULT_JUDGE_MODEL_URL = 'http://127.0.0.1:8866/v1/chat/completions'
-
-DEFAULT_API_KEY = 'EMPTY'
-DEFAULT_MODEL_NAME = 'CustomAPIModel'
-DEFAULT_WORK_DIR = 'outputs'
-
-
-class TestRunSwiftVLMEval(unittest.TestCase):
-
-    def setUp(self) -> None:
-        logger.info('Init env for swift-eval UTs ...\n')
-        assert is_module_installed('llmuses'), 'Please install `llmuses` from pypi or source code.'
-
-        logger.warning('Note: installing ms-vlmeval ...')
-        subprocess.run('pip3 install ms-vlmeval -U', shell=True, check=True)
-
-        logger.warning('Note: installing ms-swift ...')
-        subprocess.run('pip3 install ms-swift -U', shell=True, check=True)
-
-        if os.path.exists(DEFAULT_WORK_DIR):
-            shutil.rmtree(DEFAULT_WORK_DIR)
-            logger.info(f'Removed work dir: {os.path.abspath(DEFAULT_WORK_DIR)} \n')
-
-        logger.info('\nStaring run swift deploy ...')
-        self.model_name = 'qwen-vl-chat'
-        self.process_swift_deploy = subprocess.Popen(
-            f'swift deploy --model_type {self.model_name}',
-            text=True,
-            shell=True,
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE)
-
-        logger.info('\nStaring run swift deploy judge ...')
-        self.judge_model_name = 'qwen2-7b-instruct'
-        self.process_swift_deploy_judge = subprocess.Popen(
-            f'swift deploy --model_type {self.judge_model_name} \
-            --port 8866',
-            text=True,
-            shell=True,
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE)
-
-        self.all_datasets = VLMEvalKitBackendManager.list_supported_datasets()
-        assert len(self.all_datasets) > 0, f'Failed to list datasets from VLMEvalKit backend: {self.all_datasets}'
-
-    def tearDown(self) -> None:
-        # Stop the swift deploy model service
-        logger.warning('\nStopping swift deploy ...')
-        self.process_swift_deploy.terminate()
-        self.process_swift_deploy.wait()
-
-        self.process_swift_deploy_judge.terminate()
-        self.process_swift_deploy_judge.wait()
-
-        logger.info('Process swift-deploy terminated successfully.')
-
-    @staticmethod
-    def _check_env(module_name: str):
-        if is_module_installed(module_name):
-            logger.info(f'{module_name} is installed.')
-        else:
-            raise ModuleNotFoundError(f'run: pip install {module_name}')
-
-    @staticmethod
-    def check_service_status(url: str, data: dict, retries: int = 20, delay: int = 10):
-        for i in range(retries):
-            try:
-                logger.info(f'Attempt {i + 1}: Checking service at {url} ...')
-                response = requests.post(
-                    url, data=json.dumps(data), headers={'Content-Type': 'application/json'}, timeout=30)
-                if response.status_code == 200:
-                    logger.info(f'Service at {url} is available !\n\n')
-                    return True
-                else:
-                    logger.info(f'Service at {url} returned status code {response.status_code}.')
-            except requests.exceptions.RequestException as e:
-                logger.info(f'Attempt {i + 1}: An error occurred: {e}')
-
-            time.sleep(delay)
-
-        logger.info(f'Service at {url} is not available after {retries} retries.')
-        return False
-
-    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-    def test_run_api_with_judge(self):
-        # Check the judge service status
-        data = {'model': self.judge_model_name, 'messages': [{'role': 'user', 'content': 'who are you?'}]}
-        assert self.check_service_status(DEFAULT_JUDGE_MODEL_URL, data=data), \
-            f'Failed to check judge service status: {DEFAULT_JUDGE_MODEL_URL}'
-
-        api_base = DEFAULT_CHAT_MODEL_URL
-        task_cfg = {
-            'eval_backend': 'VLMEvalKit',
-            'eval_config': {
-                'LOCAL_LLM':
-                self.judge_model_name,  # judge model id
-                'OPENAI_API_BASE':
-                DEFAULT_JUDGE_MODEL_URL,  # judge model api
-                'OPENAI_API_KEY':
-                DEFAULT_API_KEY,
-                'data': ['SEEDBench_IMG', 'ChartQA_TEST'],
-                'limit':
-                20,
-                'mode':
-                'all',
-                'model': [{
-                    'api_base': api_base,  # swfit deploy model api
-                    'key': DEFAULT_API_KEY,
-                    'name': DEFAULT_MODEL_NAME,  # must be CustomAPIModel for swift
-                    'temperature': 0.0,
-                    'type': self.model_name
-                }],  # swift model type
-                'nproc':
-                1,
-                'reuse':
-                True,
-                'work_dir':
-                DEFAULT_WORK_DIR
-            }
-        }
-
-        # Check the service status
-        data = {'model': self.model_name, 'messages': [{'role': 'user', 'content': 'who are you?'}]}
-        assert self.check_service_status(api_base, data=data), f'Failed to check service status: {api_base}'
-
-        logger.info(f'>> Start to run task: {task_cfg}')
-
-        run_task(task_cfg)
-
-        logger.info('>> Start to get the report with summarizer ...')
-        report_list = Summarizer.get_report_from_cfg(task_cfg)
-        logger.info(f'\n>> The report list: {report_list}')
-
-        assert len(report_list) > 0, f'Failed to get report list: {report_list}'
-
-
-if __name__ == '__main__':
-    unittest.main(buffer=False)
tests/test_run_all.py
DELETED

@@ -1,12 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
-
-import subprocess
-
-if __name__ == '__main__':
-    cmd = f'TEST_LEVEL_LIST=0,1 python3 -m unittest discover tests'
-    run_res = subprocess.run(cmd, text=True, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-
-    if run_res.returncode == 0:
-        print(f'>>test_run_all stdout: {run_res.stdout}')
-    else:
-        print(f'>>test_run_all stderr: {run_res.stderr}')
tests/utils.py
DELETED

@@ -1,13 +0,0 @@
-import os
-
-TEST_LEVEL_LIST = [0, 1]
-# Example: export TEST_LEVEL_LIST=0,1
-TEST_LEVEL_LIST_STR = 'TEST_LEVEL_LIST'
-
-
-def test_level_list():
-    global TEST_LEVEL_LIST
-    if TEST_LEVEL_LIST_STR in os.environ:
-        TEST_LEVEL_LIST = [int(x) for x in os.environ[TEST_LEVEL_LIST_STR].split(',')]
-
-    return TEST_LEVEL_LIST
tests/vlm/__init__.py
DELETED

@@ -1 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
tests/vlm/test_vlmeval.py
DELETED

@@ -1,102 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
-from dotenv import dotenv_values
-
-from tests.utils import test_level_list
-
-env = dotenv_values('.env')
-import unittest
-
-from evalscope.run import run_task
-from evalscope.summarizer import Summarizer
-from evalscope.utils.import_utils import is_module_installed
-from evalscope.utils.logger import get_logger
-
-logger = get_logger()
-
-
-class TestVLMEval(unittest.TestCase):
-
-    def setUp(self) -> None:
-        self._check_env('vlmeval')
-
-    def tearDown(self) -> None:
-        pass
-
-    @staticmethod
-    def _check_env(module_name: str):
-        if is_module_installed(module_name):
-            logger.info(f'{module_name} is installed.')
-        else:
-            raise ModuleNotFoundError(f'run: pip install {module_name}')
-
-    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-    def test_run_vlm_eval_local(self):
-        task_cfg = {
-            'eval_backend': 'VLMEvalKit',
-            'eval_config': {
-                'data': ['SEEDBench_IMG', 'ChartQA_TEST'],
-                'limit': 20,
-                'mode': 'all',
-                'model': [{
-                    'name': 'qwen-vl-chat',
-                    'model_path': '../models/Qwen-VL-Chat'
-                }],  # model name for VLMEval config
-                'nproc': 1,
-                'reuse': True,
-            },
-            'work_dir': 'outputs',
-            'use_cache': 'outputs/20241216_142838'
-        }
-
-        logger.info(f'>> Start to run task: {task_cfg}')
-
-        run_task(task_cfg)
-
-        logger.info('>> Start to get the report with summarizer ...')
-        report_list = Summarizer.get_report_from_cfg(task_cfg)
-        logger.info(f'\n>>The report list: {report_list}')
-
-        assert len(report_list) > 0, f'Failed to get report list: {report_list}'
-
-
-    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-    def test_run_vlm_api(self):
-        task_cfg = {
-            'eval_backend': 'VLMEvalKit',
-            'eval_config': {
-                'data': [
-                    # 'SEEDBench_IMG',
-                    # 'ChartQA_TEST',
-                    'MMDU'
-                ],
-                'limit': 5,
-                'mode': 'all',
-                'model': [
-                    {'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions',
-                     'key': env.get('DASHSCOPE_API_KEY'),
-                     'name': 'CustomAPIModel',
-                     'temperature': 0.0,
-                     'type': 'qwen2.5-vl-7b-instruct',
-                     'img_size': -1,
-                     'video_llm': False,
-                     'max_tokens': 512,}
-                ],
-                'nproc': 5,
-                'reuse': False,
-            },
-            'work_dir': 'outputs',
-            # 'use_cache': 'outputs/20241216_142838'
-        }
-
-        logger.info(f'>> Start to run task: {task_cfg}')
-
-        run_task(task_cfg)
-
-        logger.info('>> Start to get the report with summarizer ...')
-        report_list = Summarizer.get_report_from_cfg(task_cfg)
-        logger.info(f'\n>>The report list: {report_list}')
-
-        assert len(report_list) > 0, f'Failed to get report list: {report_list}'
-
-if __name__ == '__main__':
-    unittest.main(buffer=False)
The remaining four entries in the file list ({tests/rag → evalscope/benchmarks/blink}/__init__.py and the dist-info LICENSE, WHEEL, and entry_points.txt) are files without changes.