PyPI - evalscope - Versions diffs - 1.0.2__py3-none-any.whl → 1.1.1__py3-none-any.whl - Mend

evalscope 1.0.2py3-none-any.whl → 1.1.1py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of evalscope might be problematic. Click here for more details.

Files changed (176) hide show

evalscope/api/benchmark/__init__.py +8 -1
evalscope/api/benchmark/adapters/__init__.py +1 -0
evalscope/api/benchmark/adapters/default_data_adapter.py +12 -0
evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
evalscope/api/benchmark/benchmark.py +14 -0
evalscope/api/dataset/dataset.py +21 -0
evalscope/api/dataset/loader.py +6 -2
evalscope/api/mixin/sandbox_mixin.py +32 -54
evalscope/api/model/generate_config.py +6 -0
evalscope/app/ui/multi_model.py +6 -1
evalscope/app/ui/single_model.py +8 -2
evalscope/app/utils/data_utils.py +3 -2
evalscope/app/utils/visualization.py +2 -2
evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
evalscope/benchmarks/ai2d/ai2d_adapter.py +3 -2
evalscope/benchmarks/bfcl/bfcl_adapter.py +11 -46
evalscope/benchmarks/blink/__init__.py +0 -0
evalscope/benchmarks/blink/blink_adapter.py +61 -0
evalscope/benchmarks/chartqa/__init__.py +0 -0
evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
evalscope/benchmarks/chartqa/utils.py +38 -0
evalscope/benchmarks/data_collection/data_collection_adapter.py +2 -1
evalscope/benchmarks/docvqa/__init__.py +0 -0
evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
evalscope/benchmarks/general_arena/general_arena_adapter.py +1 -1
evalscope/benchmarks/general_arena/utils.py +2 -1
evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
evalscope/benchmarks/gsm8k/gsm8k_adapter.py +23 -4
evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +158 -0
evalscope/benchmarks/hle/hle_adapter.py +3 -2
evalscope/benchmarks/humaneval/humaneval_adapter.py +2 -1
evalscope/benchmarks/infovqa/__init__.py +0 -0
evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +3 -1
evalscope/benchmarks/math_verse/__init__.py +0 -0
evalscope/benchmarks/math_verse/math_verse_adapter.py +100 -0
evalscope/benchmarks/math_vision/__init__.py +0 -0
evalscope/benchmarks/math_vision/math_vision_adapter.py +111 -0
evalscope/benchmarks/math_vista/math_vista_adapter.py +6 -26
evalscope/benchmarks/mm_bench/mm_bench_adapter.py +2 -2
evalscope/benchmarks/mmmu/mmmu_adapter.py +1 -1
evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +1 -1
evalscope/benchmarks/ner/__init__.py +0 -0
evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
evalscope/benchmarks/ner/copious_adapter.py +85 -0
evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
evalscope/benchmarks/ocr_bench/__init__.py +0 -0
evalscope/benchmarks/ocr_bench/ocr_bench_adapter.py +101 -0
evalscope/benchmarks/ocr_bench_v2/IoUscore_metric.py +87 -0
evalscope/benchmarks/ocr_bench_v2/TEDS_metric.py +963 -0
evalscope/benchmarks/ocr_bench_v2/__init__.py +0 -0
evalscope/benchmarks/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
evalscope/benchmarks/ocr_bench_v2/page_ocr_metric.py +50 -0
evalscope/benchmarks/ocr_bench_v2/parallel.py +46 -0
evalscope/benchmarks/ocr_bench_v2/spotting_eval/__init__.py +0 -0
evalscope/benchmarks/ocr_bench_v2/spotting_eval/readme.txt +26 -0
evalscope/benchmarks/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
evalscope/benchmarks/ocr_bench_v2/spotting_eval/script.py +481 -0
evalscope/benchmarks/ocr_bench_v2/spotting_metric.py +179 -0
evalscope/benchmarks/ocr_bench_v2/utils.py +433 -0
evalscope/benchmarks/ocr_bench_v2/vqa_metric.py +254 -0
evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
evalscope/benchmarks/poly_math/__init__.py +0 -0
evalscope/benchmarks/poly_math/poly_math_adapter.py +127 -0
evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
evalscope/benchmarks/pope/__init__.py +0 -0
evalscope/benchmarks/pope/pope_adapter.py +111 -0
evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
evalscope/benchmarks/simple_vqa/__init__.py +0 -0
evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
evalscope/benchmarks/tau_bench/tau_bench_adapter.py +1 -1
evalscope/benchmarks/tool_bench/tool_bench_adapter.py +1 -1
evalscope/benchmarks/visu_logic/__init__.py +0 -0
evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
evalscope/benchmarks/zerobench/__init__.py +0 -0
evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
evalscope/constants.py +4 -0
evalscope/evaluator/evaluator.py +72 -79
evalscope/metrics/math_parser.py +14 -0
evalscope/metrics/metric.py +52 -1
evalscope/metrics/metrics.py +16 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
evalscope/models/utils/openai.py +4 -0
evalscope/perf/arguments.py +24 -4
evalscope/perf/benchmark.py +74 -89
evalscope/perf/http_client.py +31 -16
evalscope/perf/main.py +15 -2
evalscope/perf/plugin/api/base.py +9 -7
evalscope/perf/plugin/api/custom_api.py +13 -58
evalscope/perf/plugin/api/default_api.py +179 -79
evalscope/perf/plugin/api/openai_api.py +4 -3
evalscope/perf/plugin/datasets/base.py +21 -0
evalscope/perf/plugin/datasets/custom.py +2 -3
evalscope/perf/plugin/datasets/line_by_line.py +2 -3
evalscope/perf/plugin/datasets/longalpaca.py +2 -3
evalscope/perf/plugin/datasets/openqa.py +2 -4
evalscope/perf/plugin/datasets/random_dataset.py +1 -3
evalscope/perf/utils/benchmark_util.py +36 -22
evalscope/perf/utils/db_util.py +14 -19
evalscope/perf/utils/local_server.py +0 -44
evalscope/perf/utils/log_utils.py +21 -6
evalscope/report/__init__.py +11 -2
evalscope/report/combinator.py +52 -2
evalscope/run.py +4 -0
evalscope/utils/function_utils.py +195 -12
evalscope/utils/io_utils.py +74 -0
evalscope/utils/json_schema.py +8 -6
evalscope/utils/logger.py +49 -17
evalscope/utils/multi_choices.py +16 -1
evalscope/utils/ner.py +377 -0
evalscope/version.py +2 -2
{evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/METADATA +239 -393
{evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/RECORD +140 -98
{evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/WHEEL +1 -1
{evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/top_level.txt +0 -1
tests/__init__.py +0 -1
tests/benchmark/__init__.py +0 -1
tests/benchmark/test_eval.py +0 -429
tests/benchmark/test_image_edit.py +0 -65
tests/benchmark/test_sandbox.py +0 -81
tests/benchmark/test_t2i.py +0 -142
tests/benchmark/test_vlm.py +0 -137
tests/cli/__init__.py +0 -1
tests/cli/test_all.py +0 -269
tests/cli/test_collection.py +0 -99
tests/cli/test_custom.py +0 -268
tests/cli/test_reasoning.py +0 -81
tests/common.py +0 -73
tests/perf/__init__.py +0 -1
tests/perf/test_perf.py +0 -206
tests/rag/test_clip_benchmark.py +0 -87
tests/rag/test_mteb.py +0 -213
tests/rag/test_ragas.py +0 -128
tests/swift/__init__.py +0 -1
tests/swift/test_run_swift_eval.py +0 -146
tests/swift/test_run_swift_vlm_eval.py +0 -128
tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
tests/test_run_all.py +0 -12
tests/utils.py +0 -13
tests/vlm/__init__.py +0 -1
tests/vlm/test_vlmeval.py +0 -102
{tests/rag → evalscope/benchmarks/aa_lcr}/__init__.py +0 -0
{evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/entry_points.txt +0 -0
{evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info/licenses}/LICENSE +0 -0

evalscope/benchmarks/ocr_bench_v2/TEDS_metric.py ADDED Viewed

@@ -0,0 +1,963 @@
+# flake8: noqa
+# Copyright 2020 IBM
+# Author: peter.zhong@au1.ibm.com
+#
+# This is free software; you can redistribute it and/or modify
+# it under the terms of the Apache 2.0 License.
+#
+# This software is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# Apache 2.0 License for more details.
+import ast
+import distance
+import editdistance
+import json
+import Levenshtein
+import numpy as np
+import re
+import string
+from apted import APTED, Config
+from apted.helpers import Tree
+from collections import deque
+from itertools import product
+from lxml import etree, html
+from tqdm import tqdm
+from typing import Any, Callable, Optional, Sequence
+from zss import Node, simple_distance
+from .parallel import parallel_process
+class TableTree(Tree):
+    def __init__(self, tag, colspan=None, rowspan=None, content=None, *children):
+        self.tag = tag
+        self.colspan = colspan
+        self.rowspan = rowspan
+        self.content = content
+        self.children = list(children)
+    def bracket(self):
+        """Show tree using brackets notation"""
+        if self.tag == 'td':
+            result = '"tag": %s, "colspan": %d, "rowspan": %d, "text": %s' % (
+                self.tag, self.colspan, self.rowspan, self.content
+            )
+        else:
+            result = '"tag": %s' % self.tag
+        for child in self.children:
+            result += child.bracket()
+        return '{{{}}}'.format(result)
+class CustomConfig(Config):
+    @staticmethod
+    def maximum(*sequences):
+        """Get maximum possible value"""
+        return max(map(len, sequences))
+    def normalized_distance(self, *sequences):
+        """Get distance from 0 to 1"""
+        return float(distance.levenshtein(*sequences)) / self.maximum(*sequences)
+    def rename(self, node1, node2):
+        """Compares attributes of trees"""
+        if (node1.tag != node2.tag) or (node1.colspan != node2.colspan) or (node1.rowspan != node2.rowspan):
+            return 1.0
+        if node1.tag == 'td':
+            if node1.content or node2.content:
+                return self.normalized_distance(node1.content, node2.content)
+        return 0.0
+class TEDS(object):
+    """Tree Edit Distance basead Similarity"""
+    def __init__(self, structure_only=False, n_jobs=1, ignore_nodes=None):
+        assert isinstance(n_jobs, int) and (n_jobs >= 1), 'n_jobs must be an integer greather than 1'
+        self.structure_only = structure_only
+        self.n_jobs = n_jobs
+        self.ignore_nodes = ignore_nodes
+        self.__tokens__ = []
+    def tokenize(self, node):
+        """Tokenizes table cells"""
+        self.__tokens__.append('<%s>' % node.tag)
+        if node.text is not None:
+            self.__tokens__ += list(node.text)
+        for n in node.getchildren():
+            self.tokenize(n)
+        if node.tag != 'unk':
+            self.__tokens__.append('</%s>' % node.tag)
+        if node.tag != 'td' and node.tail is not None:
+            self.__tokens__ += list(node.tail)
+    def load_html_tree(self, node, parent=None):
+        """Converts HTML tree to the format required by apted"""
+        global __tokens__
+        if node.tag == 'td':
+            if self.structure_only:
+                cell = []
+            else:
+                self.__tokens__ = []
+                self.tokenize(node)
+                cell = self.__tokens__[1:-1].copy()
+            new_node = TableTree(
+                node.tag, int(node.attrib.get('colspan', '1')), int(node.attrib.get('rowspan', '1')), cell, *deque()
+            )
+        else:
+            new_node = TableTree(node.tag, None, None, None, *deque())
+        if parent is not None:
+            parent.children.append(new_node)
+        if node.tag != 'td':
+            for n in node.getchildren():
+                self.load_html_tree(n, new_node)
+        if parent is None:
+            return new_node
+    def evaluate(self, pred, true):
+        """Computes TEDS score between the prediction and the ground truth of a
+        given sample
+        """
+        if (not pred) or (not true):
+            return 0.0
+        parser = html.HTMLParser(remove_comments=True, encoding='utf-8')
+        pred = html.fromstring(pred, parser=parser)
+        true = html.fromstring(true, parser=parser)
+        # print("pred:",pred)
+        # print("true:",true)
+        if pred.xpath('body/table') and true.xpath('body/table'):
+            pred = pred.xpath('body/table')[0]
+            true = true.xpath('body/table')[0]
+            if self.ignore_nodes:
+                etree.strip_tags(pred, *self.ignore_nodes)
+                etree.strip_tags(true, *self.ignore_nodes)
+            n_nodes_pred = len(pred.xpath('.//*'))
+            n_nodes_true = len(true.xpath('.//*'))
+            n_nodes = max(n_nodes_pred, n_nodes_true)
+            tree_pred = self.load_html_tree(pred)
+            tree_true = self.load_html_tree(true)
+            distance = APTED(tree_pred, tree_true, CustomConfig()).compute_edit_distance()
+            return 1.0 - (float(distance) / n_nodes)
+        else:
+            return 0.0
+    def batch_evaluate(self, pred_json, true_json):
+        """Computes TEDS score between the prediction and the ground truth of
+        a batch of samples
+        @params pred_json: {'FILENAME': 'HTML CODE', ...}
+        @params true_json: {'FILENAME': {'html': 'HTML CODE'}, ...}
+        @output: {'FILENAME': 'TEDS SCORE', ...}
+        """
+        samples = true_json.keys()
+        if self.n_jobs == 1:
+            scores = [
+                self.evaluate(pred_json.get(filename, ''), true_json[filename]['html']) for filename in tqdm(samples)
+            ]
+        else:
+            # inputs = [{'pred': pred_json.get(filename, ''), 'true': true_json[filename]['html']} for filename in samples]
+            inputs = [{'pred': pred_json.get(filename, ''), 'true': true_json[filename]} for filename in samples]
+            scores = parallel_process(inputs, self.evaluate, use_kwargs=True, n_jobs=self.n_jobs, front_num=1)
+        scores = dict(zip(samples, scores))
+        return scores
+def convert_table_to_html_str(table_row_list=[]):
+    """
+    Given a list of table rows, build the corresponding html string, which is used to compute the TEDS score.
+    We use the official code of PubTabNet to compute TEDS score, it does not consider '<th>' label.
+    We also remove unneccessary spaces within a table cell and extra '\n' as they will influence the TEDS score.
+    """
+    html_table_str = '<html><body><table>' + '\n'
+    for data_row in table_row_list:
+        html_table_str += '<tr>'
+        for cell_str in data_row:
+            html_table_str += f'<td>{cell_str}</td>'
+        html_table_str += '</tr>'
+        html_table_str += '\n'
+    html_table_str += '</table></body></html>'
+    html_table_str = html_table_str.replace('\n', '')
+    return html_table_str
+def convert_markdown_table_to_html(markdown_table):
+    """
+    Converts a markdown table to the corresponding html string for TEDS computation.
+    """
+    # remove extra code block tokens like '```markdown' and '```
+    markdown_table = markdown_table.strip('```markdown').strip('```').strip()
+    row_str_list = markdown_table.split('\n')
+    # extra the first header row and other data rows
+    valid_row_str_list = [row_str_list[0]] + row_str_list[2:]
+    table_rows = []
+    for row_str in valid_row_str_list:
+        one_row = []
+        for cell in row_str.strip().split('|')[1:-1]:
+            if set(cell) != set(' '):
+                one_row.append(cell.strip())
+            else:
+                one_row.append(' ')
+        table_rows.append(one_row)
+    # build html string based on table rows
+    html_str = convert_table_to_html_str(table_rows)
+    return html_str
+def dict_to_html(data):
+    html = '<html><body><table>\n'
+    for key, value in data.items():
+        if not isinstance(value, str):
+            value = str(value)
+        value_str = ' '.join(value)
+        html += f'  <tr><td>{key}</td><td>{value_str}</td></tr>\n'
+    html += '</table></body></html>'
+    return html
+def convert_str_to_dict(predict_str: str):
+    """
+    Parses the 'predict' string and returns a dictionary.
+    Missing or unparseable content is handled gracefully.
+    Parameters:
+    - predict_str (str): The prediction string containing the output dict.
+    Returns:
+    - dict: A dictionary extracted from the predict string.
+    """
+    # Remove code fences like ```python\n...\n```
+    code_fence_pattern = r'```(?:python|json)?\n(.*?)\n```'
+    match = re.search(code_fence_pattern, predict_str, re.DOTALL | re.IGNORECASE)
+    if match:
+        content = match.group(1)
+    else:
+        content = predict_str.strip()
+    data = {}
+    success = False
+    # try parsing with JSON
+    try:
+        data = json.loads(content)
+        success = True
+    except json.JSONDecodeError:
+        pass
+    # try parsing with ast.literal_eval
+    if not success:
+        try:
+            data = ast.literal_eval(content)
+            if isinstance(data, dict):
+                success = True
+        except (ValueError, SyntaxError):
+            pass
+    # try parsing with regex
+    if not success:
+        key_value_pattern = r'["\']?([\w\s]+)["\']?\s*[:=]\s*["\']?([^\n,"\'{}]+)["\']?'
+        matches = re.findall(key_value_pattern, content)
+        try:
+            for key, value in matches:
+                data[key.strip()] = value.strip()
+        except:
+            return {}
+    if not data:
+        return {}
+    try:
+        result = {k.strip(): str(v).strip() for k, v in data.items()}
+    except:
+        return {}
+    return result
+def convert_str_to_multi_dict(predict_str: str):
+    """
+    Parses the 'predict' string and returns a dictionary.
+    Handles nested dictionaries and missing or unparseable content gracefully.
+    Parameters:
+    - predict_str (str): The prediction string containing the output dict.
+    Returns:
+    - dict: A dictionary extracted from the predict string.
+    """
+    # Remove code fences like ```python\n...\n```
+    code_fence_pattern = r'```(?:python|json)?\n(.*?)\n```'
+    matches = re.findall(code_fence_pattern, predict_str, re.DOTALL | re.IGNORECASE)
+    if matches:
+        content = max(matches, key=len)
+    else:
+        content = predict_str.strip()
+    def strip_variable_assignment(s):
+        variable_assignment_pattern = r'^\s*\w+\s*=\s*'
+        return re.sub(variable_assignment_pattern, '', s.strip(), count=1)
+    content = strip_variable_assignment(content)
+    def remove_comments(s):
+        return re.sub(r'#.*', '', s)
+    content = remove_comments(content)
+    last_brace_pos = content.rfind('}')
+    if last_brace_pos != -1:
+        content = content[:last_brace_pos + 1]
+    data = {}
+    success = False
+    # try parsing with ast.literal_eval
+    try:
+        data = ast.literal_eval(content)
+        if isinstance(data, dict):
+            success = True
+    except (ValueError, SyntaxError, TypeError):
+        pass
+    if not success:
+        return {}
+    def process_data(obj):
+        if isinstance(obj, dict):
+            return {k: process_data(v) for k, v in obj.items()}
+        elif isinstance(obj, list):
+            return [process_data(elem) for elem in obj]
+        else:
+            return obj
+    data = process_data(data)
+    return data
+def generate_combinations(input_dict):
+    """
+    Function to generate all possible combinations of values from a dictionary.
+    """
+    kie_answer = input_dict
+    if not isinstance(kie_answer, dict):
+        kie_answer = kie_answer.strip('"')
+        try:
+            kie_answer = json.loads(kie_answer)
+        except json.JSONDecodeError:
+            try:
+                kie_answer = ast.literal_eval(kie_answer)
+                if not isinstance(kie_answer, dict):
+                    kie_answer = ast.literal_eval(kie_answer)
+            except (ValueError, SyntaxError):
+                print(f"Unable to parse 'answers' field: {kie_answer}")
+                return {}
+        # Ensure the parsed result is a dictionary.
+        if not isinstance(kie_answer, dict):
+            print("Parsed 'answers' is still not a dictionary.")
+            raise ValueError('Input could not be parsed into a dictionary.')
+        keys = list(kie_answer.keys())
+        value_lists = []
+        for single_key in keys:
+            sinlge_value = kie_answer[single_key]
+            if not isinstance(sinlge_value, list):
+                sinlge_value = [sinlge_value]
+            value_lists.append(sinlge_value)
+        # Compute the Cartesian product of the value lists.
+        combinations = list(product(*value_lists))
+        # Create a dictionary for each combination of values.
+        result = [dict(zip(keys, values)) for values in combinations]
+        return result
+    else:
+        keys = list(input_dict.keys())
+        value_lists = [input_dict[key] for key in keys]
+        # Compute the Cartesian product of the value lists.
+        combinations = list(product(*value_lists))
+        # Create a dictionary for each combination of values.
+        result = [dict(zip(keys, values)) for values in combinations]
+        return result
+def compute_f1_score(preds, gts, ignores=[]):
+    """Compute the F1-score for KIE task between predicted and ground truth dictionaries.
+    Args:
+        preds (dict): The predicted key-value pairs.
+        gts (dict): The ground truth key-value pairs.
+        ignores (list): The list of keys to ignore during evaluation.
+    Returns:
+        dict: A dictionary where keys are field names and values are their corresponding F1-scores.
+    """
+    # Optionally remove ignored keys from predictions and ground truths
+    keys = set(preds.keys()).union(set(gts.keys())) - set(ignores)
+    f1_scores = {}
+    for key in keys:
+        pred_value = preds.get(key, None)
+        gt_value = gts.get(key, None)
+        if pred_value:
+            pred_value = pred_value.lower().strip().replace('\n', ' ').replace(' ', '')
+        if gt_value:
+            gt_value = gt_value.lower().strip().replace('\n', ' ').replace(' ', '')
+        if pred_value is None and gt_value is None:
+            continue
+        elif pred_value is None:
+            precision = 0.0
+            recall = 0.0
+        elif gt_value is None:
+            # false positive
+            precision = 0.0
+            recall = 0.0
+        else:
+            if pred_value == gt_value:
+                # True positive
+                precision = 1.0
+                recall = 1.0
+            else:
+                precision = 0.0
+                recall = 0.0
+        # Compute F1-score
+        f1_score = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0
+        f1_scores[key] = f1_score
+    if len(f1_scores) == 0:
+        return 0
+    average_f1 = sum(f1_scores.values()) / len(f1_scores)
+    return average_f1
+def pre_clean(text):
+    text = re.sub(r'<bos>|<eos>|<pad>|<unk>', '', text)
+    text = re.sub(r'\s##(\S)', r'\1', text)
+    text = re.sub(r'\\\s', r'\\', text)
+    text = re.sub(r'\s\*\s\*\s', r'**', text)
+    text = re.sub(r'{\s', r'{', text)
+    text = re.sub(r'\s}', r'}', text)
+    text = re.sub(r'\s}', r'}', text)
+    text = re.sub(r'\\begin\s', r'\\begin', text)
+    text = re.sub(r'\\end\s', r'\\end', text)
+    text = re.sub(r'\\end{table}', r'\\end{table} \n\n', text)
+    text = text.replace('\n', ' ')
+    text = text.replace('*', ' ')
+    text = text.replace('_', ' ')
+    return text
+def get_tree(input_str):
+    tree = Node('ROOT').addkid(Node('TITLE'))
+    lines = input_str.split('\n')
+    lines = [pre_clean(line) for line in lines]
+    last_title = ''
+    for line in lines:
+        if line.startswith('#'):
+            child = tree.get('ROOT')
+            line = line.replace('#', '')
+            child.addkid(Node(line))
+            last_title = line
+        else:
+            if last_title == '':
+                child = tree.get('TITLE')
+                child.addkid(Node(line))
+            else:
+                child = tree.get(last_title)
+                child.addkid(Node(line))
+    return tree
+def STEDS(pred_tree, ref_tree):
+    def my_distance(pred, ref):
+        if len(pred.split()) == 0 or len(ref.split()) == 0:
+            return 1
+        else:
+            return 0
+    total_distance = simple_distance(pred_tree, ref_tree, label_dist=my_distance)
+    num_of_nodes = max(len(list(pred_tree.iter())), len(list(ref_tree.iter())))
+    return 1 - total_distance / num_of_nodes
+def doc_parsing_evaluation(pred, gt):
+    score = 0
+    if not isinstance(pred, str):
+        return 0
+    pred_tree = get_tree(pred)
+    gt_tree = get_tree(gt)
+    score = STEDS(pred_tree, gt_tree)
+    return score
+def wrap_html_table(html_table):
+    """
+    The TEDS computation from PubTabNet code requires that the input html table should have <html>, <body>, and <table> tags.
+    Add them if they are missing.
+    """
+    html_table = html_table.replace('\n', '')
+    # add missing <table> tag if missing
+    if '<table' in html_table and '</table>' not in html_table:
+        html_table = html_table + '</table>'
+    elif '<table' not in html_table and '</table>' in html_table:
+        html_table = '<table>' + html_table
+    elif '<table' not in html_table and '</table>' not in html_table:
+        html_table = '<table>' + html_table + '</table>'
+    else:
+        pass
+    # add <body> and <html> tags if missing
+    if '<body>' not in html_table:
+        html_table = '<body>' + html_table + '</body>'
+    if '<html>' not in html_table:
+        html_table = '<html>' + html_table + '</html>'
+    return html_table
+def get_anls(s1, s2):
+    try:
+        s1 = s1.lower()
+        s2 = s2.lower()
+    except:
+        pass
+    if s1 == s2:
+        return 1.0
+    iou = 1 - editdistance.eval(s1, s2) / max(len(s1), len(s2))
+    anls = iou
+    return anls
+def ocr_eval(references, predictions):
+    socre_ = 0.0
+    None_num = 0
+    for idx, ref_value in enumerate(references):
+        pred_value = predictions[idx]
+        pred_values, ref_values = [], []
+        if isinstance(pred_value, str):
+            pred_values.append(pred_value)
+        else:
+            pred_values = pred_value
+        if isinstance(ref_value, str):
+            ref_values.append(ref_value)
+        else:
+            ref_values = ref_value
+        temp_score = 0.0
+        temp_num = len(ref_values)
+        for tmpidx, tmpref in enumerate(ref_values):
+            tmppred = pred_values[tmpidx] if tmpidx < len(pred_values) else pred_values[0]
+            if len(pred_values) == 1 and tmppred != 'None' and 'None' not in ref_values:  # pred 1, and not None
+                temp_score = max(temp_score, get_anls(tmppred, tmpref))
+                temp_num = len(ref_values)
+            else:
+                if tmppred == 'None' and tmpref != 'None':
+                    temp_score += 0.0
+                elif tmpref == 'None':
+                    temp_num -= 1
+                else:
+                    temp_score += get_anls(tmppred, tmpref)
+        if temp_num == 0:
+            ocr_score = 0.0
+            None_num += 1
+        else:
+            ocr_score = temp_score / (temp_num)
+        socre_ += ocr_score
+    if None_num == len(references):
+        return 9999
+    else:
+        return round(socre_ / (len(references) - None_num), 5)
+def csv_eval(predictions, references, easy, pred_type='json'):
+    predictions = predictions
+    labels = references
+    def is_int(val):
+        try:
+            int(val)
+            return True
+        except ValueError:
+            return False
+    def is_float(val):
+        try:
+            float(val)
+            return True
+        except ValueError:
+            return False
+    def convert_dict_to_list(data):
+        """
+        Convert a dictionary to a list of tuples, handling both simple and nested dictionaries.
+        Args:
+        data (dict): The input dictionary, which might be nested or simple.
+        Returns:
+        list: A list of tuples generated from the input dictionary.
+        """
+        # print(data)
+        converted_list = []
+        for key, value in data.items():
+            # Check if the value is a dictionary (indicating a nested structure)
+            if isinstance(value, dict):
+                # Handle nested dictionary
+                for subkey, subvalue in value.items():
+                    # converted_list.append((key, subkey, subvalue))
+                    converted_list.append((key, subkey, re.sub(r'[^\d.-]', '', str(subvalue))))
+            else:
+                # Handle simple key-value pair
+                # converted_list.append((key, "value", value))
+                converted_list.append((key, 'value', re.sub(r'[^\d.-]', '', str(value))))
+        return converted_list
+    def csv2triples(csv, separator='\\t', delimiter='\\n'):
+        lines = csv.strip().split(delimiter)
+        header = lines[0].split(separator)
+        triples = []
+        for line in lines[1:]:
+            if not line:
+                continue
+            values = line.split(separator)
+            entity = values[0]
+            for i in range(1, len(values)):
+                if i >= len(header):
+                    break
+                # ---------------------------------------------------------
+                temp = [entity.strip(), header[i].strip()]
+                temp = [x if len(x) == 0 or x[-1] != ':' else x[:-1] for x in temp]
+                value = values[i].strip()
+                value = re.sub(r'[^\d.-]', '', str(value))
+                # value = value.replace("%","")
+                # value = value.replace("$","")
+                triples.append((temp[0], temp[1], value))
+                # ---------------------------------------------------------
+        return triples
+    def csv2triples_noheader(csv, separator='\\t', delimiter='\\n'):
+        lines = csv.strip().split(delimiter)
+        maybe_header = [x.strip() for x in lines[0].split(separator)]
+        not_header = False
+        if len(maybe_header) > 2:
+            for c in maybe_header[1:]:
+                try:
+                    num = float(c)
+                    not_header = True
+                except:
+                    continue
+                if not_header:
+                    break
+        header = None if not_header else maybe_header
+        data_start = 0 if not_header and separator in lines[0] else 1
+        triples = []
+        for line in lines[data_start:]:
+            if not line:
+                continue
+            values = [x.strip() for x in line.split(separator)]
+            entity = values[0]
+            for i in range(1, len(values)):
+                try:
+                    temp = [entity if entity[-1] != ':' else entity[:-1], '']
+                except:
+                    temp = [entity, '']
+                if header is not None:
+                    try:
+                        this_header = header[i]
+                        temp = [entity, this_header]
+                        temp = [x if x[-1] != ':' else x[:-1] for x in temp]
+                    except:
+                        this_header = entity.strip()
+                value = values[i].strip()
+                value = re.sub(r'[^\d.-]', '', str(value))
+                # value = value.replace("%","")
+                # value = value.replace("$","")
+                triples.append((temp[0], temp[1], value))
+                # ---------------------------------------------------------
+        return triples
+    def process_triplets(triplets):
+        new_triplets = []
+        for triplet in triplets:
+            new_triplet = []
+            triplet_temp = []
+            if len(triplet) > 2:
+                if is_int(triplet[2]) or is_float(triplet[2]):
+                    triplet_temp = (triplet[0].lower(), triplet[1].lower(), float(triplet[2]))
+                else:
+                    triplet_temp = (triplet[0].lower(), triplet[1].lower(), triplet[2].lower())
+            else:
+                triplet_temp = (triplet[0].lower(), triplet[1].lower(), 'no meaning')
+            new_triplets.append(triplet_temp)
+        return new_triplets
+    def intersection_with_tolerance(a, b, tol_word, tol_num):
+        a = set(a)
+        b = set(b)
+        c = set()
+        for elem1 in a:
+            for elem2 in b:
+                if is_float(elem1[-1]) and is_float(elem2[-1]):
+                    if (((Levenshtein.distance(''.join(elem1[:-1]), ''.join(elem2[:-1])) <= tol_word) and
+                         (abs(elem1[-1] - elem2[-1]) / (abs(elem2[-1]) + 0.000001) <= tol_num))
+                        or ((''.join(elem1[:-1]) in ''.join(elem2[:-1])) and
+                            (abs(elem1[-1] - elem2[-1]) / (abs(elem2[-1]) + 0.000001) <= tol_num))
+                        or ((''.join(elem2[:-1]) in ''.join(elem1[:-1])) and
+                            (abs(elem1[-1] - elem2[-1]) / (abs(elem2[-1]) + 0.000001) <= tol_num))):
+                        c.add(elem1)
+                else:
+                    if Levenshtein.distance(
+                        ''.join([str(i) for i in elem1]), ''.join([str(j) for j in elem2])
+                    ) <= tol_word:
+                        c.add(elem1)
+        return list(c)
+    def union_with_tolerance(a, b, tol_word, tol_num):
+        c = set(a) | set(b)
+        d = set(a) & set(b)
+        e = intersection_with_tolerance(a, b, tol_word, tol_num)
+        f = set(e)
+        g = c - (f - d)
+        return list(g)
+    def get_eval_list(
+        pred_csv, label_csv, separator='\\t', delimiter='\\n', tol_word=3, tol_num=0.05, pred_type='json'
+    ):
+        if pred_type == 'json':
+            pred_triple_list = []
+            for it in pred_csv:
+                pred_triple_temp = convert_dict_to_list(it)
+                pred_triple_pre = process_triplets(pred_triple_temp)
+                pred_triple_list.append(pred_triple_pre)
+        else:
+            pred_triple_list = []
+            for it in pred_csv:
+                pred_triple_temp = csv2triples(it, separator=separator, delimiter=delimiter)
+                # pred_triple_temp = csv2triples_noheader(it, separator=separator, delimiter=delimiter)
+                pred_triple_pre = process_triplets(pred_triple_temp)
+                pred_triple_list.append(pred_triple_pre)
+        label_triple_list = []
+        for it in label_csv:
+            label_triple_temp = convert_dict_to_list(it)
+            label_triple_pre = process_triplets(label_triple_temp)
+            label_triple_list.append(label_triple_pre)
+        intersection_list = []
+        union_list = []
+        sim_list = []
+        # for each chart image
+        for pred, label in zip(pred_triple_list, label_triple_list):
+            for idx in range(len(pred)):
+                try:
+                    if label[idx][1] == 'value' and 'value' not in pred[idx][:2]:
+                        pred[idx] = (pred[idx][0], 'value', pred[idx][2])
+                    temp_pred_head = sorted(pred[idx][:2])
+                    temp_gt_head = sorted(label[idx][:2])
+                    pred[idx] = (temp_pred_head[0], temp_pred_head[1], pred[idx][2])
+                    label[idx] = (temp_gt_head[0], temp_gt_head[1], label[idx][2])
+                except:
+                    continue
+            intersection = intersection_with_tolerance(pred, label, tol_word=tol_word, tol_num=tol_num)
+            union = union_with_tolerance(pred, label, tol_word=tol_word, tol_num=tol_num)
+            sim = len(intersection) / len(union)
+            intersection_list.append(intersection)
+            union_list.append(union)
+            sim_list.append(sim)
+        return intersection_list, union_list, sim_list
+    def get_ap(predictions, labels, sim_threhold, tolerance, separator='\\t', delimiter='\\n', easy=1):
+        if tolerance == 'strict':
+            tol_word = 0
+            if easy == 1:
+                tol_num = 0
+            else:
+                tol_num = 0.1
+        elif tolerance == 'slight':
+            tol_word = 2
+            if easy == 1:
+                tol_num = 0.05
+            else:
+                tol_num = 0.3
+        elif tolerance == 'high':
+            tol_word = 5
+            if easy == 1:
+                tol_num = 0.1
+            else:
+                tol_num = 0.5
+        intersection_list, union_list, sim_list = get_eval_list(
+            predictions,
+            labels,
+            separator=separator,
+            delimiter=delimiter,
+            tol_word=tol_word,
+            tol_num=tol_num,
+            pred_type=pred_type
+        )
+        ap = len([num for num in sim_list if num >= sim_threhold]) / (len(sim_list) + 1e-16)
+        return ap
+    map_strict = 0
+    map_slight = 0
+    map_high = 0
+    s = '\\t'
+    d = '\\n'
+    for sim_threhold in np.arange(0.5, 1, 0.05):
+        map_temp_strict = get_ap(
+            predictions, labels, sim_threhold=sim_threhold, tolerance='strict', separator=s, delimiter=d, easy=easy
+        )
+        map_temp_slight = get_ap(
+            predictions, labels, sim_threhold=sim_threhold, tolerance='slight', separator=s, delimiter=d, easy=easy
+        )
+        map_temp_high = get_ap(
+            predictions, labels, sim_threhold=sim_threhold, tolerance='high', separator=s, delimiter=d, easy=easy
+        )
+        map_strict += map_temp_strict / 10
+        map_slight += map_temp_slight / 10
+        map_high += map_temp_high / 10
+    em = get_ap(predictions, labels, sim_threhold=1, tolerance='strict', separator=s, delimiter=d, easy=easy)
+    ap_50_strict = get_ap(
+        predictions, labels, sim_threhold=0.5, tolerance='strict', separator=s, delimiter=d, easy=easy
+    )
+    ap_75_strict = get_ap(
+        predictions, labels, sim_threhold=0.75, tolerance='strict', separator=s, delimiter=d, easy=easy
+    )
+    ap_90_strict = get_ap(
+        predictions, labels, sim_threhold=0.90, tolerance='strict', separator=s, delimiter=d, easy=easy
+    )
+    ap_50_slight = get_ap(
+        predictions, labels, sim_threhold=0.5, tolerance='slight', separator=s, delimiter=d, easy=easy
+    )
+    ap_75_slight = get_ap(
+        predictions, labels, sim_threhold=0.75, tolerance='slight', separator=s, delimiter=d, easy=easy
+    )
+    ap_90_slight = get_ap(
+        predictions, labels, sim_threhold=0.90, tolerance='slight', separator=s, delimiter=d, easy=easy
+    )
+    ap_50_high = get_ap(predictions, labels, sim_threhold=0.5, tolerance='high', separator=s, delimiter=d, easy=easy)
+    ap_75_high = get_ap(predictions, labels, sim_threhold=0.75, tolerance='high', separator=s, delimiter=d, easy=easy)
+    ap_90_high = get_ap(predictions, labels, sim_threhold=0.90, tolerance='high', separator=s, delimiter=d, easy=easy)
+    return em, map_strict, map_slight, map_high, ap_50_strict, ap_75_strict, ap_90_strict, ap_50_slight, ap_75_slight, ap_90_slight, ap_50_high, ap_75_high, ap_90_high
+def draw_SCRM_table(
+    em,
+    map_strict,
+    map_slight,
+    map_high,
+    ap_50_strict,
+    ap_75_strict,
+    ap_90_strict,
+    ap_50_slight,
+    ap_75_slight,
+    ap_90_slight,
+    ap_50_high,
+    ap_75_high,
+    ap_90_high,
+    title_ocr_socre,
+    source_ocr_socre,
+    x_title_ocr_socre,
+    y_title_ocr_socre,
+    structure_accuracy,
+):
+    result = f"""
+            -----------------------------------------------------------\n
+            |  Metrics   |  Sim_threshold  |  Tolerance  |    Value    |\n
+            -----------------------------------------------------------\n
+            |             |                 |   strict    |    {'%.4f' % map_strict}    |     \n
+            |             |                 ----------------------------\n
+            |  mPrecison  |  0.5:0.05:0.95  |   slight    |    {'%.4f' % map_slight}    |\n
+            |             |                  ---------------------------\n
+            |             |                 |    high     |    {'%.4f' % map_high}    |\n
+            -----------------------------------------------------------\n
+            |             |                 |   strict    |    {'%.4f' % ap_50_strict}    |\n
+            |             |                  ---------------------------\n
+            |  Precison   |       0.5       |   slight    |    {'%.4f' % ap_50_slight }    |\n
+            |             |                  ---------------------------\n
+            |             |                 |    high     |    {'%.4f' % ap_50_high }    |\n
+            -----------------------------------------------------------\n
+            |             |                 |   strict    |    {'%.4f' % ap_75_strict}    |\n
+            |             |                  ---------------------------\n
+            |  Precison   |      0.75       |   slight    |    {'%.4f' % ap_75_slight}    |\n
+            |             |                  ---------------------------\n
+            |             |                 |    high     |    {'%.4f' % ap_75_high}    |\n
+            -----------------------------------------------------------\n
+            |             |                 |   strict    |    {'%.4f' % ap_90_strict}    |\n
+            |             |                  ---------------------------\n
+            |  Precison   |       0.9       |   slight    |    {'%.4f' % ap_90_slight }    |\n
+            |             |                  ---------------------------\n
+            |             |                 |    high     |    {'%.4f' % ap_90_high}    |\n
+            -----------------------------------------------------------\n
+            |Precison(EM) |                                    {'%.4f' % em}    |\n
+            -----------------------------------------------------------\n
+            |Title(EM)    |                                    {'%.4f' % title_ocr_socre}    |\n
+            -----------------------------------------------------------\n
+            |Source(EM)   |                                    {'%.4f' % source_ocr_socre}    |\n
+            -----------------------------------------------------------\n
+            |X_title(EM)  |                                    {'%.4f' % x_title_ocr_socre}    |\n
+            -----------------------------------------------------------\n
+            |Y_title(EM)  |                                    {'%.4f' % y_title_ocr_socre}    |\n
+            -----------------------------------------------------------\n
+            |structure_acc|                                    {'%.4f' % structure_accuracy}    |\n
+            -----------------------------------------------------------\n
+            """
+    return result
+if __name__ == '__main__':
+    import json
+    import pprint
+    # markdown structure for Table Parsing task
+    pred_markdown = '| 1 | august 5 , 1972 | detroit lions | l 23 - 31 | 0 - 1 |\n| 2 | august 12 , 1972 | green bay packers | l 13 - 14 | 0 - 2 |\n| 3 | august 19 , 1972 | cincinnati bengals | w 35 - 17 | 1 - 2 |\n| 4 | august 25 , 1972 | atlanta falcons | w 24 - 10 | 2 - 2 |\n| 5 | august 31 , 1972 | washington redskins | l 24 - 27 | 2 - 3 |\n| 6 | september 10 , 1972 | minnesota vikings | w 21 - 19 | 3 - 3 |'
+    true_markdown = '| week | date | opponent | result | record |\n| --- | --- | --- | --- | --- |\n| 1 | august 5 , 1972 | detroit lions | l 23 - 31 | 0 - 1 |\n| 2 | august 12 , 1972 | green bay packers | l 13 - 14 | 0 - 2 |\n| 3 | august 19 , 1972 | cincinnati bengals | w 35 - 17 | 1 - 2 |\n| 4 | august 25 , 1972 | atlanta falcons | w 24 - 10 | 2 - 2 |\n| 5 | august 31 , 1972 | washington redskins | l 24 - 27 | 2 - 3 |\n| 6 | september 10 , 1972 | minnesota vikings | w 21 - 19 | 3 - 3 |'
+    teds = TEDS(n_jobs=4)
+    pred_table_html = convert_markdown_table_to_html(pred_markdown)
+    true_table_html = convert_markdown_table_to_html(true_markdown)
+    scores = teds.evaluate(pred_table_html, true_table_html)
+    pp = pprint.PrettyPrinter()
+    pp.pprint(scores)
+    # dict structure for Key Information Extraction task
+    pred_dict = {'company': ['OLD TOWN '], 'date': ['2024'], 'address': ['SRI RAMPAI'], 'total': ['30']}
+    true_dict = {
+        'company': ['OLD TOWN KOPITAM SND BHD'],
+        'date': ['2024/9/27'],
+        'address': ['SRI RAMPAI'],
+        'total': ['30']
+    }
+    teds = TEDS(n_jobs=4)
+    pred_dict_html = dict_to_html(pred_dict)
+    true_dict_html = dict_to_html(true_dict)
+    print(pred_dict_html)
+    print(true_dict_html)
+    scores = teds.evaluate(pred_dict_html, true_dict_html)
+    pp = pprint.PrettyPrinter()
+    pp.pprint(scores)

evalscope 1.0.2__py3-none-any.whl → 1.1.1__py3-none-any.whl

Potentially problematic release.

evalscope 1.0.2py3-none-any.whl → 1.1.1py3-none-any.whl