evalscope 1.0.2__py3-none-any.whl → 1.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.
Files changed (176)
  1. evalscope/api/benchmark/__init__.py +8 -1
  2. evalscope/api/benchmark/adapters/__init__.py +1 -0
  3. evalscope/api/benchmark/adapters/default_data_adapter.py +12 -0
  4. evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
  5. evalscope/api/benchmark/benchmark.py +14 -0
  6. evalscope/api/dataset/dataset.py +21 -0
  7. evalscope/api/dataset/loader.py +6 -2
  8. evalscope/api/mixin/sandbox_mixin.py +32 -54
  9. evalscope/api/model/generate_config.py +6 -0
  10. evalscope/app/ui/multi_model.py +6 -1
  11. evalscope/app/ui/single_model.py +8 -2
  12. evalscope/app/utils/data_utils.py +3 -2
  13. evalscope/app/utils/visualization.py +2 -2
  14. evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
  15. evalscope/benchmarks/ai2d/ai2d_adapter.py +3 -2
  16. evalscope/benchmarks/bfcl/bfcl_adapter.py +11 -46
  17. evalscope/benchmarks/blink/__init__.py +0 -0
  18. evalscope/benchmarks/blink/blink_adapter.py +61 -0
  19. evalscope/benchmarks/chartqa/__init__.py +0 -0
  20. evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
  21. evalscope/benchmarks/chartqa/utils.py +38 -0
  22. evalscope/benchmarks/data_collection/data_collection_adapter.py +2 -1
  23. evalscope/benchmarks/docvqa/__init__.py +0 -0
  24. evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
  25. evalscope/benchmarks/general_arena/general_arena_adapter.py +1 -1
  26. evalscope/benchmarks/general_arena/utils.py +2 -1
  27. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
  28. evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
  29. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +23 -4
  30. evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
  31. evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +158 -0
  32. evalscope/benchmarks/hle/hle_adapter.py +3 -2
  33. evalscope/benchmarks/humaneval/humaneval_adapter.py +2 -1
  34. evalscope/benchmarks/infovqa/__init__.py +0 -0
  35. evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
  36. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +3 -1
  37. evalscope/benchmarks/math_verse/__init__.py +0 -0
  38. evalscope/benchmarks/math_verse/math_verse_adapter.py +100 -0
  39. evalscope/benchmarks/math_vision/__init__.py +0 -0
  40. evalscope/benchmarks/math_vision/math_vision_adapter.py +111 -0
  41. evalscope/benchmarks/math_vista/math_vista_adapter.py +6 -26
  42. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +2 -2
  43. evalscope/benchmarks/mmmu/mmmu_adapter.py +1 -1
  44. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +1 -1
  45. evalscope/benchmarks/ner/__init__.py +0 -0
  46. evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
  47. evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
  48. evalscope/benchmarks/ner/copious_adapter.py +85 -0
  49. evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
  50. evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
  51. evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
  52. evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
  53. evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
  54. evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
  55. evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
  56. evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
  57. evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
  58. evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
  59. evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
  60. evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
  61. evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
  62. evalscope/benchmarks/ocr_bench/__init__.py +0 -0
  63. evalscope/benchmarks/ocr_bench/ocr_bench_adapter.py +101 -0
  64. evalscope/benchmarks/ocr_bench_v2/IoUscore_metric.py +87 -0
  65. evalscope/benchmarks/ocr_bench_v2/TEDS_metric.py +963 -0
  66. evalscope/benchmarks/ocr_bench_v2/__init__.py +0 -0
  67. evalscope/benchmarks/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
  68. evalscope/benchmarks/ocr_bench_v2/page_ocr_metric.py +50 -0
  69. evalscope/benchmarks/ocr_bench_v2/parallel.py +46 -0
  70. evalscope/benchmarks/ocr_bench_v2/spotting_eval/__init__.py +0 -0
  71. evalscope/benchmarks/ocr_bench_v2/spotting_eval/readme.txt +26 -0
  72. evalscope/benchmarks/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
  73. evalscope/benchmarks/ocr_bench_v2/spotting_eval/script.py +481 -0
  74. evalscope/benchmarks/ocr_bench_v2/spotting_metric.py +179 -0
  75. evalscope/benchmarks/ocr_bench_v2/utils.py +433 -0
  76. evalscope/benchmarks/ocr_bench_v2/vqa_metric.py +254 -0
  77. evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
  78. evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
  79. evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
  80. evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
  81. evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
  82. evalscope/benchmarks/poly_math/__init__.py +0 -0
  83. evalscope/benchmarks/poly_math/poly_math_adapter.py +127 -0
  84. evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
  85. evalscope/benchmarks/pope/__init__.py +0 -0
  86. evalscope/benchmarks/pope/pope_adapter.py +111 -0
  87. evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
  88. evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
  89. evalscope/benchmarks/simple_vqa/__init__.py +0 -0
  90. evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
  91. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +1 -1
  92. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +1 -1
  93. evalscope/benchmarks/visu_logic/__init__.py +0 -0
  94. evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
  95. evalscope/benchmarks/zerobench/__init__.py +0 -0
  96. evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
  97. evalscope/constants.py +4 -0
  98. evalscope/evaluator/evaluator.py +72 -79
  99. evalscope/metrics/math_parser.py +14 -0
  100. evalscope/metrics/metric.py +52 -1
  101. evalscope/metrics/metrics.py +16 -0
  102. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
  103. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
  104. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
  105. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
  106. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
  107. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
  108. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
  109. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
  110. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
  111. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
  112. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
  113. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
  114. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
  115. evalscope/models/utils/openai.py +4 -0
  116. evalscope/perf/arguments.py +24 -4
  117. evalscope/perf/benchmark.py +74 -89
  118. evalscope/perf/http_client.py +31 -16
  119. evalscope/perf/main.py +15 -2
  120. evalscope/perf/plugin/api/base.py +9 -7
  121. evalscope/perf/plugin/api/custom_api.py +13 -58
  122. evalscope/perf/plugin/api/default_api.py +179 -79
  123. evalscope/perf/plugin/api/openai_api.py +4 -3
  124. evalscope/perf/plugin/datasets/base.py +21 -0
  125. evalscope/perf/plugin/datasets/custom.py +2 -3
  126. evalscope/perf/plugin/datasets/line_by_line.py +2 -3
  127. evalscope/perf/plugin/datasets/longalpaca.py +2 -3
  128. evalscope/perf/plugin/datasets/openqa.py +2 -4
  129. evalscope/perf/plugin/datasets/random_dataset.py +1 -3
  130. evalscope/perf/utils/benchmark_util.py +36 -22
  131. evalscope/perf/utils/db_util.py +14 -19
  132. evalscope/perf/utils/local_server.py +0 -44
  133. evalscope/perf/utils/log_utils.py +21 -6
  134. evalscope/report/__init__.py +11 -2
  135. evalscope/report/combinator.py +52 -2
  136. evalscope/run.py +4 -0
  137. evalscope/utils/function_utils.py +195 -12
  138. evalscope/utils/io_utils.py +74 -0
  139. evalscope/utils/json_schema.py +8 -6
  140. evalscope/utils/logger.py +49 -17
  141. evalscope/utils/multi_choices.py +16 -1
  142. evalscope/utils/ner.py +377 -0
  143. evalscope/version.py +2 -2
  144. {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/METADATA +239 -393
  145. {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/RECORD +140 -98
  146. {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/WHEEL +1 -1
  147. {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/top_level.txt +0 -1
  148. tests/__init__.py +0 -1
  149. tests/benchmark/__init__.py +0 -1
  150. tests/benchmark/test_eval.py +0 -429
  151. tests/benchmark/test_image_edit.py +0 -65
  152. tests/benchmark/test_sandbox.py +0 -81
  153. tests/benchmark/test_t2i.py +0 -142
  154. tests/benchmark/test_vlm.py +0 -137
  155. tests/cli/__init__.py +0 -1
  156. tests/cli/test_all.py +0 -269
  157. tests/cli/test_collection.py +0 -99
  158. tests/cli/test_custom.py +0 -268
  159. tests/cli/test_reasoning.py +0 -81
  160. tests/common.py +0 -73
  161. tests/perf/__init__.py +0 -1
  162. tests/perf/test_perf.py +0 -206
  163. tests/rag/test_clip_benchmark.py +0 -87
  164. tests/rag/test_mteb.py +0 -213
  165. tests/rag/test_ragas.py +0 -128
  166. tests/swift/__init__.py +0 -1
  167. tests/swift/test_run_swift_eval.py +0 -146
  168. tests/swift/test_run_swift_vlm_eval.py +0 -128
  169. tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
  170. tests/test_run_all.py +0 -12
  171. tests/utils.py +0 -13
  172. tests/vlm/__init__.py +0 -1
  173. tests/vlm/test_vlmeval.py +0 -102
  174. {tests/rag → evalscope/benchmarks/aa_lcr}/__init__.py +0 -0
  175. {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/entry_points.txt +0 -0
  176. {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info/licenses}/LICENSE +0 -0
evalscope/benchmarks/ocr_bench_v2/spotting_eval/script.py
@@ -0,0 +1,481 @@
+ # flake8: noqa
+ #!/usr/bin/env python
+ # -*- coding: utf-8 -*-
+ # encoding=utf8
+ # File: E2E_iou_1_1.py
+ # Version: 1.1
+ # Version info: changes for Python 3
+ # Date: 2019-12-29
+ # Description: Evaluation script that computes End to End Recognition. For Text Localization the Intersection over Union criterion is used.
+ # Average Precision is also calculated when the 'CONFIDENCES' parameter is True
+ # There are 2 modes to determine if a detection is correct or not:
+ # with Word Spotting: the detected word must coincide (ignoring case) with a filtered Ground Truth containing only dictionary words (see include_in_dictionary and include_in_dictionary_transcription functions)
+ # without Word Spotting: words must be equal excluding a set of special characters
+
+ import importlib
+ from collections import namedtuple
+
+ from . import rrc_evaluation_funcs_1_1 as rrc_evaluation_funcs
+
+
+ def evaluation_imports():
+     """
+     evaluation_imports: Dictionary ( key = module name , value = alias ) with python modules used in the evaluation.
+     """
+     return {'Polygon': 'plg', 'numpy': 'np'}
+
+
+ def default_evaluation_params():
+     """
+     default_evaluation_params: Default parameters to use for the validation and evaluation.
+     """
+     return {
+         'IOU_CONSTRAINT': 0.5,
+         'AREA_PRECISION_CONSTRAINT': 0.5,
+         'WORD_SPOTTING': False,
+         'MIN_LENGTH_CARE_WORD': 3,
+         'GT_SAMPLE_NAME_2_ID': 'gt_img_([0-9]+).txt',
+         'DET_SAMPLE_NAME_2_ID': 'res_img_([0-9]+).txt',
+         'LTRB': False,  # LTRB: 2 points (left,top,right,bottom) or 4 points (x1,y1,x2,y2,x3,y3,x4,y4)
+         'CRLF': False,  # Lines are delimited by Windows CRLF format
+         'CONFIDENCES': False,  # Detections must include confidence value. AP will be calculated
+         'SPECIAL_CHARACTERS': "!?.:,*\"()·[]/'",
+         'ONLY_REMOVE_FIRST_LAST_CHARACTER': True,
+     }
+
+
+ def validate_data(gtFilePath, submFilePath, evaluationParams):
+     """
+     Method validate_data: validates that all files in the results folder are correct (have the correct name and contents).
+     Also validates that there are no missing files in the folder.
+     If an error is detected, the method raises it.
+     """
+     gt = rrc_evaluation_funcs.load_zip_file(gtFilePath, evaluationParams['GT_SAMPLE_NAME_2_ID'])
+
+     subm = rrc_evaluation_funcs.load_zip_file(submFilePath, evaluationParams['DET_SAMPLE_NAME_2_ID'], True)
+
+     # Validate format of GroundTruth
+     for k in gt:
+         rrc_evaluation_funcs.validate_lines_in_file(k, gt[k], evaluationParams['CRLF'], evaluationParams['LTRB'], True)
+
+     # Validate format of results
+     for k in subm:
+         if (k in gt) == False:
+             raise Exception('The sample %s not present in GT' % k)
+
+         rrc_evaluation_funcs.validate_lines_in_file(
+             k, subm[k], evaluationParams['CRLF'], evaluationParams['LTRB'], True, evaluationParams['CONFIDENCES']
+         )
+
+
+ def evaluate_method(gtFilePath, submFilePath, evaluationParams):
+     """
+     Method evaluate_method: evaluates the method and returns the results
+     Results. Dictionary with the following values:
+     - method (required) Global method metrics. Ex: { 'Precision':0.8,'Recall':0.9 }
+     - samples (optional) Per sample metrics. Ex: {'sample1' : { 'Precision':0.8,'Recall':0.9 } , 'sample2' : { 'Precision':0.8,'Recall':0.9 } }
+     """
+     for module, alias in evaluation_imports().items():
+         globals()[alias] = importlib.import_module(module)
+
+     def polygon_from_points(points, correctOffset=False):
+         """
+         Returns a Polygon object to use with the Polygon2 class from a list of 8 points: x1,y1,x2,y2,x3,y3,x4,y4
+         """
+
+         if correctOffset:  # this will subtract 1 from the coordinates that correspond to the xmax and ymax
+             points[2] -= 1
+             points[4] -= 1
+             points[5] -= 1
+             points[7] -= 1
+
+         resBoxes = np.empty([1, 8], dtype='int32')
+         resBoxes[0, 0] = int(points[0])
+         resBoxes[0, 4] = int(points[1])
+         resBoxes[0, 1] = int(points[2])
+         resBoxes[0, 5] = int(points[3])
+         resBoxes[0, 2] = int(points[4])
+         resBoxes[0, 6] = int(points[5])
+         resBoxes[0, 3] = int(points[6])
+         resBoxes[0, 7] = int(points[7])
+         pointMat = resBoxes[0].reshape([2, 4]).T
+         return plg.Polygon(pointMat)
+
+     def rectangle_to_polygon(rect):
+         resBoxes = np.empty([1, 8], dtype='int32')
+         resBoxes[0, 0] = int(rect.xmin)
+         resBoxes[0, 4] = int(rect.ymax)
+         resBoxes[0, 1] = int(rect.xmin)
+         resBoxes[0, 5] = int(rect.ymin)
+         resBoxes[0, 2] = int(rect.xmax)
+         resBoxes[0, 6] = int(rect.ymin)
+         resBoxes[0, 3] = int(rect.xmax)
+         resBoxes[0, 7] = int(rect.ymax)
+
+         pointMat = resBoxes[0].reshape([2, 4]).T
+
+         return plg.Polygon(pointMat)
+
+     def rectangle_to_points(rect):
+         points = [
+             int(rect.xmin),
+             int(rect.ymax),
+             int(rect.xmax),
+             int(rect.ymax),
+             int(rect.xmax),
+             int(rect.ymin),
+             int(rect.xmin),
+             int(rect.ymin)
+         ]
+         return points
+
+     def get_union(pD, pG):
+         areaA = pD.area()
+         areaB = pG.area()
+         return areaA + areaB - get_intersection(pD, pG)
+
+     def get_intersection_over_union(pD, pG):
+         try:
+             return get_intersection(pD, pG) / get_union(pD, pG)
+         except:
+             return 0
+
+     def get_intersection(pD, pG):
+         pInt = pD & pG
+         if len(pInt) == 0:
+             return 0
+         return pInt.area()
+
+     def compute_ap(confList, matchList, numGtCare):
+         correct = 0
+         AP = 0
+         if len(confList) > 0:
+             confList = np.array(confList)
+             matchList = np.array(matchList)
+             sorted_ind = np.argsort(-confList)
+             confList = confList[sorted_ind]
+             matchList = matchList[sorted_ind]
+             for n in range(len(confList)):
+                 match = matchList[n]
+                 if match:
+                     correct += 1
+                     AP += float(correct) / (n + 1)
+
+             if numGtCare > 0:
+                 AP /= numGtCare
+
+         return AP
+
+     def transcription_match(
+         transGt, transDet, specialCharacters="!?.:,*\"()·[]/'", onlyRemoveFirstLastCharacterGT=True
+     ):
+         if onlyRemoveFirstLastCharacterGT:
+             # special characters in GT are allowed only at initial or final position
+             if transGt == transDet:
+                 return True
+
+             if specialCharacters.find(transGt[0]) > -1:
+                 if transGt[1:] == transDet:
+                     return True
+
+             if specialCharacters.find(transGt[-1]) > -1:
+                 if transGt[0:len(transGt) - 1] == transDet:
+                     return True
+
+             if specialCharacters.find(transGt[0]) > -1 and specialCharacters.find(transGt[-1]) > -1:
+                 if transGt[1:len(transGt) - 1] == transDet:
+                     return True
+             return False
+         else:
+             # Special characters are removed from the beginning and the end of both Detection and GroundTruth
+             while len(transGt) > 0 and specialCharacters.find(transGt[0]) > -1:
+                 transGt = transGt[1:]
+
+             while len(transDet) > 0 and specialCharacters.find(transDet[0]) > -1:
+                 transDet = transDet[1:]
+
+             while len(transGt) > 0 and specialCharacters.find(transGt[-1]) > -1:
+                 transGt = transGt[0:len(transGt) - 1]
+
+             while len(transDet) > 0 and specialCharacters.find(transDet[-1]) > -1:
+                 transDet = transDet[0:len(transDet) - 1]
+
+             return transGt == transDet
+
+     def include_in_dictionary(transcription):
+         """
+         Function used in Word Spotting that finds if the Ground Truth transcription meets the rules to enter into the dictionary. If not, the transcription will be treated as don't care
+         """
+         # special case 's at final
+         if transcription[len(transcription) - 2:] == "'s" or transcription[len(transcription) - 2:] == "'S":
+             transcription = transcription[0:len(transcription) - 2]
+
+         # hyphens at init or final of the word
+         transcription = transcription.strip('-')
+
+         specialCharacters = "'!?.:,*\"()·[]/"
+         for character in specialCharacters:
+             transcription = transcription.replace(character, ' ')
+
+         transcription = transcription.strip()
+
+         if len(transcription) != len(transcription.replace(' ', '')):
+             return False
+
+         if len(transcription) < evaluationParams['MIN_LENGTH_CARE_WORD']:
+             return False
+
+         notAllowed = '×÷·'
+
+         range1 = [ord('a'), ord('z')]
+         range2 = [ord('A'), ord('Z')]
+         range3 = [ord('À'), ord('ƿ')]
+         range4 = [ord('DŽ'), ord('ɿ')]
+         range5 = [ord('Ά'), ord('Ͽ')]
+         range6 = [ord('-'), ord('-')]
+
+         for char in transcription:
+             charCode = ord(char)
+             if notAllowed.find(char) != -1:
+                 return False
+
+             valid = ((charCode >= range1[0] and charCode <= range1[1])
+                      or (charCode >= range2[0] and charCode <= range2[1])
+                      or (charCode >= range3[0] and charCode <= range3[1])
+                      or (charCode >= range4[0] and charCode <= range4[1])
+                      or (charCode >= range5[0] and charCode <= range5[1])
+                      or (charCode >= range6[0] and charCode <= range6[1]))
+             if valid == False:
+                 return False
+
+         return True
+
+     def include_in_dictionary_transcription(transcription):
+         """
+         Function applied to the Ground Truth transcriptions used in Word Spotting. It removes special characters or terminations
+         """
+         # special case 's at final
+         if transcription[len(transcription) - 2:] == "'s" or transcription[len(transcription) - 2:] == "'S":
+             transcription = transcription[0:len(transcription) - 2]
+
+         # hyphens at init or final of the word
+         transcription = transcription.strip('-')
+
+         specialCharacters = "'!?.:,*\"()·[]/"
+         for character in specialCharacters:
+             transcription = transcription.replace(character, ' ')
+
+         transcription = transcription.strip()
+
+         return transcription
+
+     perSampleMetrics = {}
+
+     matchedSum = 0
+
+     Rectangle = namedtuple('Rectangle', 'xmin ymin xmax ymax')
+
+     gt = rrc_evaluation_funcs.load_zip_file(gtFilePath, evaluationParams['GT_SAMPLE_NAME_2_ID'])
+     subm = rrc_evaluation_funcs.load_zip_file(submFilePath, evaluationParams['DET_SAMPLE_NAME_2_ID'], True)
+
+     numGlobalCareGt = 0
+     numGlobalCareDet = 0
+
+     arrGlobalConfidences = []
+     arrGlobalMatches = []
+
+     for resFile in gt:
+         gtFile = rrc_evaluation_funcs.decode_utf8(gt[resFile])
+         if gtFile is None:
+             raise Exception('The file %s is not UTF-8' % resFile)
+
+         recall = 0
+         precision = 0
+         hmean = 0
+         detCorrect = 0
+         iouMat = np.empty([1, 1])
+         gtPols = []
+         detPols = []
+         gtTrans = []
+         detTrans = []
+         gtPolPoints = []
+         detPolPoints = []
+         gtDontCarePolsNum = []  # Array of Ground Truth Polygons' keys marked as don't Care
+         detDontCarePolsNum = []  # Array of Detected Polygons' matched with a don't Care GT
+         detMatchedNums = []
+         pairs = []
+
+         arrSampleConfidences = []
+         arrSampleMatch = []
+         sampleAP = 0
+
+         evaluationLog = ''
+
+         pointsList, _, transcriptionsList = rrc_evaluation_funcs.get_tl_line_values_from_file_contents(
+             gtFile, evaluationParams['CRLF'], evaluationParams['LTRB'], True, False
+         )
+         for n in range(len(pointsList)):
+             points = pointsList[n]
+             transcription = transcriptionsList[n]
+             dontCare = transcription == '###'
+             if evaluationParams['LTRB']:
+                 gtRect = Rectangle(*points)
+                 gtPol = rectangle_to_polygon(gtRect)
+             else:
+                 gtPol = polygon_from_points(points)
+             gtPols.append(gtPol)
+             gtPolPoints.append(points)
+
+             # On word spotting we will filter some transcriptions with special characters
+             if evaluationParams['WORD_SPOTTING']:
+                 if dontCare == False:
+                     if include_in_dictionary(transcription) == False:
+                         dontCare = True
+                     else:
+                         transcription = include_in_dictionary_transcription(transcription)
+
+             gtTrans.append(transcription)
+             if dontCare:
+                 gtDontCarePolsNum.append(len(gtPols) - 1)
+
+         evaluationLog += 'GT polygons: ' + str(
+             len(gtPols)
+         ) + (' (' + str(len(gtDontCarePolsNum)) + " don't care)\n" if len(gtDontCarePolsNum) > 0 else '\n')
+
+         if resFile in subm:
+             detFile = rrc_evaluation_funcs.decode_utf8(subm[resFile])
+
+             pointsList, confidencesList, transcriptionsList = rrc_evaluation_funcs.get_tl_line_values_from_file_contents(
+                 detFile, evaluationParams['CRLF'], evaluationParams['LTRB'], True, evaluationParams['CONFIDENCES']
+             )
+
+             for n in range(len(pointsList)):
+                 points = pointsList[n]
+                 transcription = transcriptionsList[n]
+
+                 if evaluationParams['LTRB']:
+                     detRect = Rectangle(*points)
+                     detPol = rectangle_to_polygon(detRect)
+                 else:
+                     detPol = polygon_from_points(points)
+                 detPols.append(detPol)
+                 detPolPoints.append(points)
+                 detTrans.append(transcription)
+
+                 if len(gtDontCarePolsNum) > 0:
+                     for dontCarePol in gtDontCarePolsNum:
+                         dontCarePol = gtPols[dontCarePol]
+                         intersected_area = get_intersection(dontCarePol, detPol)
+                         pdDimensions = detPol.area()
+                         precision = 0 if pdDimensions == 0 else intersected_area / pdDimensions
+                         if precision > evaluationParams['AREA_PRECISION_CONSTRAINT']:
+                             detDontCarePolsNum.append(len(detPols) - 1)
+                             break
+
+             evaluationLog += 'DET polygons: ' + str(
+                 len(detPols)
+             ) + (' (' + str(len(detDontCarePolsNum)) + " don't care)\n" if len(detDontCarePolsNum) > 0 else '\n')
+
+             if len(gtPols) > 0 and len(detPols) > 0:
+                 # Calculate IoU and precision matrices
+                 outputShape = [len(gtPols), len(detPols)]
+                 iouMat = np.empty(outputShape)
+                 gtRectMat = np.zeros(len(gtPols), np.int8)
+                 detRectMat = np.zeros(len(detPols), np.int8)
+                 for gtNum in range(len(gtPols)):
+                     for detNum in range(len(detPols)):
+                         pG = gtPols[gtNum]
+                         pD = detPols[detNum]
+                         iouMat[gtNum, detNum] = get_intersection_over_union(pD, pG)
+
+                 for gtNum in range(len(gtPols)):
+                     for detNum in range(len(detPols)):
+                         if gtRectMat[gtNum] == 0 and detRectMat[
+                                 detNum] == 0 and gtNum not in gtDontCarePolsNum and detNum not in detDontCarePolsNum:
+                             if iouMat[gtNum, detNum] > evaluationParams['IOU_CONSTRAINT']:
+                                 gtRectMat[gtNum] = 1
+                                 detRectMat[detNum] = 1
+                                 # detection matched only if transcription is equal
+                                 if evaluationParams['WORD_SPOTTING']:
+                                     correct = gtTrans[gtNum].upper() == detTrans[detNum].upper()
+                                 else:
+                                     correct = transcription_match(
+                                         gtTrans[gtNum].upper(), detTrans[detNum].upper(),
+                                         evaluationParams['SPECIAL_CHARACTERS'],
+                                         evaluationParams['ONLY_REMOVE_FIRST_LAST_CHARACTER']
+                                     ) == True
+                                 detCorrect += 1 if correct else 0
+                                 if correct:
+                                     detMatchedNums.append(detNum)
+                                 pairs.append({'gt': gtNum, 'det': detNum, 'correct': correct})
+                                 evaluationLog += 'Match GT #' + str(gtNum) + ' with Det #' + str(
+                                     detNum
+                                 ) + ' trans. correct: ' + str(correct) + '\n'
+
+             if evaluationParams['CONFIDENCES']:
+                 for detNum in range(len(detPols)):
+                     if detNum not in detDontCarePolsNum:
+                         # we exclude the don't care detections
+                         match = detNum in detMatchedNums
+
+                         arrSampleConfidences.append(confidencesList[detNum])
+                         arrSampleMatch.append(match)
+
+                         arrGlobalConfidences.append(confidencesList[detNum])
+                         arrGlobalMatches.append(match)
+
+         numGtCare = len(gtPols) - len(gtDontCarePolsNum)
+         numDetCare = len(detPols) - len(detDontCarePolsNum)
+         if numGtCare == 0:
+             recall = float(1)
+             precision = float(0) if numDetCare > 0 else float(1)
+             sampleAP = precision
+         else:
+             recall = float(detCorrect) / numGtCare
+             precision = 0 if numDetCare == 0 else float(detCorrect) / numDetCare
+             if evaluationParams['CONFIDENCES']:
+                 sampleAP = compute_ap(arrSampleConfidences, arrSampleMatch, numGtCare)
+
+         hmean = 0 if (precision + recall) == 0 else 2.0 * precision * recall / (precision + recall)
+
+         matchedSum += detCorrect
+         numGlobalCareGt += numGtCare
+         numGlobalCareDet += numDetCare
+
+         perSampleMetrics[resFile] = {
+             'precision': precision,
+             'recall': recall,
+             'hmean': hmean,
+             'pairs': pairs,
+             'AP': sampleAP,
+             'iouMat': [] if len(detPols) > 100 else iouMat.tolist(),
+             'gtPolPoints': gtPolPoints,
+             'detPolPoints': detPolPoints,
+             'gtTrans': gtTrans,
+             'detTrans': detTrans,
+             'gtDontCare': gtDontCarePolsNum,
+             'detDontCare': detDontCarePolsNum,
+             'evaluationParams': evaluationParams,
+             'evaluationLog': evaluationLog,
+         }
+
+     # Compute AP
+     AP = 0
+     if evaluationParams['CONFIDENCES']:
+         AP = compute_ap(arrGlobalConfidences, arrGlobalMatches, numGlobalCareGt)
+
+     methodRecall = 0 if numGlobalCareGt == 0 else float(matchedSum) / numGlobalCareGt
+     methodPrecision = 0 if numGlobalCareDet == 0 else float(matchedSum) / numGlobalCareDet
+     methodHmean = 0 if methodRecall + methodPrecision == 0 else 2 * methodRecall * methodPrecision / (
+         methodRecall + methodPrecision
+     )
+
+     methodMetrics = {'precision': methodPrecision, 'recall': methodRecall, 'hmean': methodHmean, 'AP': AP}
+
+     resDict = {'calculated': True, 'Message': '', 'method': methodMetrics, 'per_sample': perSampleMetrics}
+
+     return resDict
+
+
+ if __name__ == '__main__':
+     rrc_evaluation_funcs.main_evaluation(None, default_evaluation_params, validate_data, evaluate_method)
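
The script above is normally driven end to end through rrc_evaluation_funcs.main_evaluation, but default_evaluation_params, validate_data and evaluate_method can also be called directly. A minimal sketch, assuming two hypothetical archives gt.zip and submit.zip whose members match the GT_SAMPLE_NAME_2_ID and DET_SAMPLE_NAME_2_ID patterns (gt_img_0.txt, res_img_0.txt, ...):

    from evalscope.benchmarks.ocr_bench_v2.spotting_eval.script import (
        default_evaluation_params, evaluate_method, validate_data)

    params = default_evaluation_params()
    params['WORD_SPOTTING'] = True  # match against dictionary-filtered GT instead of raw transcriptions

    # 'gt.zip' and 'submit.zip' are placeholder paths for this sketch
    validate_data('gt.zip', 'submit.zip', params)  # raises if either archive is malformed
    res = evaluate_method('gt.zip', 'submit.zip', params)
    print(res['method'])  # {'precision': ..., 'recall': ..., 'hmean': ..., 'AP': ...}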
evalscope/benchmarks/ocr_bench_v2/spotting_metric.py
@@ -0,0 +1,179 @@
+ import ast
+ import os
+ import re
+ import shutil
+ import zipfile
+
+ from evalscope.constants import DEFAULT_EVALSCOPE_CACHE_DIR
+ from evalscope.utils.function_utils import thread_safe
+ from .spotting_eval import rrc_evaluation_funcs_1_1 as rrc_evaluation_funcs
+ from .spotting_eval.script import default_evaluation_params, evaluate_method, validate_data
+
+
+ def extract_bounding_boxes_robust(predict_str):
+     """
+     Extract coordinates and text content from the given prediction string,
+     handling potential format issues.
+
+     Args:
+         predict_str (str): Model prediction output as a string.
+
+     Returns:
+         list: Extracted data in the format [[x1, y1, x2, y2, text_content], ...].
+             Returns None if no valid data is extracted.
+     """
+     results = []
+     seen = set()
+
+     # try parsing with ast.literal_eval
+     try:
+         data = ast.literal_eval(predict_str)
+     except Exception:
+         data = None
+
+     if data is not None:
+         if isinstance(data, (list, tuple)):
+             for item in data:
+                 if isinstance(item, (list, tuple)) and len(item) >= 5:
+                     x1_str, y1_str, x2_str, y2_str = item[:4]
+                     text_content = item[4]
+
+                     x1_str = str(x1_str).strip()
+                     y1_str = str(y1_str).strip()
+                     x2_str = str(x2_str).strip()
+                     y2_str = str(y2_str).strip()
+                     text_content = str(text_content).replace('\n', '').strip().strip('"').strip("'")
+
+                     try:
+                         x1 = int(x1_str)
+                         y1 = int(y1_str)
+                         x2 = int(x2_str)
+                         y2 = int(y2_str)
+
+                         if not (0 <= x1 <= 1000 and 0 <= y1 <= 1000 and 0 <= x2 <= 1000 and 0 <= y2 <= 1000):
+                             continue
+
+                         key = (x1, y1, x2, y2, text_content)
+                         if key in seen:
+                             continue
+
+                         seen.add(key)
+                         results.append([x1, y1, x2, y2, text_content])
+                     except ValueError:
+                         continue
+     else:
+         # try parsing with regular expression
+
+         list_content = predict_str
+         items = re.findall(r'[\[\(]\s*([^\[\]\(\)]*?)\s*[\]\)]', list_content)
+
+         if not items:
+             return None
+
+         for item in items:
+             parts = item.split(',', 4)
+             if len(parts) < 5:
+                 continue
+
+             x1_str, y1_str, x2_str, y2_str, text_content = parts
+
+             x1_str = x1_str.strip()
+             y1_str = y1_str.strip()
+             x2_str = x2_str.strip()
+             y2_str = y2_str.strip()
+             text_content = text_content.replace('\n', '').strip().strip('"').strip("'")
+
+             try:
+                 x1 = int(x1_str)
+                 y1 = int(y1_str)
+                 x2 = int(x2_str)
+                 y2 = int(y2_str)
+
+                 if not (0 <= x1 <= 1000 and 0 <= y1 <= 1000 and 0 <= x2 <= 1000 and 0 <= y2 <= 1000):
+                     continue
+
+                 key = (x1, y1, x2, y2, text_content)
+                 if key in seen:
+                     continue
+
+                 seen.add(key)
+                 results.append([x1, y1, x2, y2, text_content])
+             except ValueError:
+                 continue
+
+     if not results:
+         return None
+
+     return results
+
+
+ def zip_folder(source_folder, destination_zip):
+     abs_source = os.path.abspath(source_folder)
+     abs_destination = os.path.abspath(destination_zip)
+
+     with zipfile.ZipFile(abs_destination, 'w', zipfile.ZIP_DEFLATED) as zf:
+         for root, _, files in os.walk(abs_source):
+             for file in files:
+                 abs_file_path = os.path.join(root, file)
+
+                 relative_path = os.path.relpath(abs_file_path, abs_source)
+                 zf.write(abs_file_path, relative_path)
+
+
+ @thread_safe
+ def spotting_evaluation(prediction_list, img_metas):
+     score = 0
+
+     submit_path = os.path.join(DEFAULT_EVALSCOPE_CACHE_DIR, 'eval', 'ocrbench_v2', 'submit')
+     gt_path = os.path.join(DEFAULT_EVALSCOPE_CACHE_DIR, 'eval', 'ocrbench_v2', 'gt')
+     submit_zip_path = os.path.join(DEFAULT_EVALSCOPE_CACHE_DIR, 'eval', 'ocrbench_v2', 'submit.zip')
+     gt_zip_path = os.path.join(DEFAULT_EVALSCOPE_CACHE_DIR, 'eval', 'ocrbench_v2', 'gt.zip')
+     for file_path in [submit_path, gt_path, submit_zip_path, gt_zip_path]:
+         if 'zip' in file_path:
+             if os.path.exists(file_path):
+                 os.remove(file_path)
+         else:
+             if os.path.exists(file_path):
+                 shutil.rmtree(file_path)
+             os.makedirs(file_path, exist_ok=True)
+
+     res_submit_list = []
+     for item in prediction_list:
+         x1, y1, x2, y2, rec = item
+         if x1 >= x2 or y1 >= y2:
+             continue
+
+         res_submit_list.append(','.join([str(x1), str(y1), str(x2), str(y1), str(x2), str(y2), str(x1), str(y2), rec]))
+
+     res_gt_list = []
+     for bbox, rec in zip(img_metas['bbox_list'], img_metas['content']):
+         x_coords = bbox[0::2]
+         y_coords = bbox[1::2]
+
+         x1, y1 = min(x_coords), min(y_coords)
+         x2, y2 = max(x_coords), max(y_coords)
+
+         res_gt_list.append(','.join([str(x1), str(y1), str(x2), str(y1), str(x2), str(y2), str(x1), str(y2), rec]))
+
+     if len(res_submit_list) == 0 or len(res_gt_list) == 0:
+         return 0
+
+     with open(os.path.join(submit_path, 'res_img_0.txt'), 'w') as f:
+         for item in res_submit_list[:-1]:
+             f.write(item + '\n')
+         f.write(res_submit_list[-1])
+
+     with open(os.path.join(gt_path, 'gt_img_0.txt'), 'w') as f:
+         for item in res_gt_list[:-1]:
+             f.write(item + '\n')
+         f.write(res_gt_list[-1])
+
+     zip_folder(submit_path, submit_zip_path)
+     zip_folder(gt_path, gt_zip_path)
+
+     command = {'g': gt_zip_path, 's': submit_zip_path, 'o': DEFAULT_EVALSCOPE_CACHE_DIR, 'p': '{"IOU_CONSTRAINT":0.5}'}
+
+     # run rrc_evaluation_funcs
+     result = rrc_evaluation_funcs.main_evaluation(command, default_evaluation_params, validate_data, evaluate_method)
+     score = result['method']['hmean']
+     return score
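
A minimal sketch of how spotting_metric.py is meant to be used; the prediction string and img_metas below are made up for illustration, with 'bbox_list' holding flattened x,y polygon coordinates and 'content' the matching transcriptions, as the code above expects:

    from evalscope.benchmarks.ocr_bench_v2.spotting_metric import (
        extract_bounding_boxes_robust, spotting_evaluation)

    pred = "[[100, 200, 300, 240, 'HELLO'], [400, 500, 550, 540, 'WORLD']]"
    boxes = extract_bounding_boxes_robust(pred)  # [[100, 200, 300, 240, 'HELLO'], ...] or None

    img_metas = {
        'bbox_list': [[100, 200, 300, 200, 300, 240, 100, 240]],  # one axis-aligned quad
        'content': ['HELLO'],
    }
    score = spotting_evaluation(boxes, img_metas) if boxes else 0  # hmean in [0, 1]

Note that spotting_evaluation writes temporary gt/submit archives under DEFAULT_EVALSCOPE_CACHE_DIR before delegating to the ICDAR-style zip evaluation above, which is why it is wrapped in @thread_safe.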