evalscope 0.15.1__py3-none-any.whl → 0.16.1__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
Potentially problematic release: this version of evalscope has been flagged as potentially problematic.
- evalscope/app/__init__.py +28 -0
- evalscope/{report → app}/app.py +67 -59
- evalscope/app/constants.py +21 -0
- evalscope/arguments.py +12 -1
- evalscope/backend/opencompass/backend_manager.py +2 -1
- evalscope/backend/rag_eval/cmteb/arguments.py +4 -1
- evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
- evalscope/backend/rag_eval/utils/embedding.py +75 -35
- evalscope/backend/rag_eval/utils/llm.py +1 -1
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +0 -6
- evalscope/benchmarks/benchmark.py +1 -0
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +1 -0
- evalscope/benchmarks/data_adapter.py +101 -18
- evalscope/benchmarks/docmath/__init__.py +0 -0
- evalscope/benchmarks/docmath/docmath_adapter.py +84 -0
- evalscope/benchmarks/docmath/utils.py +220 -0
- evalscope/benchmarks/drop/__init__.py +0 -0
- evalscope/benchmarks/drop/drop_adapter.py +133 -0
- evalscope/benchmarks/drop/utils.py +59 -0
- evalscope/benchmarks/frames/__init__.py +0 -0
- evalscope/benchmarks/frames/frames_adapter.py +90 -0
- evalscope/benchmarks/frames/utils.py +37 -0
- evalscope/benchmarks/general_qa/general_qa_adapter.py +5 -1
- evalscope/benchmarks/needle_haystack/__init__.py +0 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +341 -0
- evalscope/benchmarks/needle_haystack/utils.py +79 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -0
- evalscope/benchmarks/tool_bench/__init__.py +0 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +70 -0
- evalscope/benchmarks/tool_bench/utils.py +203 -0
- evalscope/benchmarks/utils.py +28 -2
- evalscope/benchmarks/winogrande/__init__.py +0 -0
- evalscope/benchmarks/winogrande/winogrande_adapter.py +57 -0
- evalscope/cli/start_app.py +2 -2
- evalscope/collections/__init__.py +35 -3
- evalscope/collections/evaluator.py +94 -32
- evalscope/config.py +54 -17
- evalscope/evaluator/evaluator.py +80 -41
- evalscope/metrics/__init__.py +3 -1
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
- evalscope/metrics/llm_judge.py +15 -8
- evalscope/metrics/math_parser.py +1 -1
- evalscope/metrics/rouge_metric.py +11 -13
- evalscope/models/adapters/chat_adapter.py +51 -34
- evalscope/models/adapters/server_adapter.py +17 -25
- evalscope/perf/arguments.py +16 -7
- evalscope/perf/benchmark.py +0 -15
- evalscope/perf/main.py +72 -15
- evalscope/perf/plugin/datasets/custom.py +15 -0
- evalscope/perf/utils/benchmark_util.py +34 -16
- evalscope/perf/utils/db_util.py +25 -15
- evalscope/perf/utils/local_server.py +1 -0
- evalscope/perf/utils/log_utils.py +12 -5
- evalscope/perf/utils/rich_display.py +186 -0
- evalscope/report/__init__.py +36 -4
- evalscope/report/combinator.py +8 -0
- evalscope/report/generator.py +33 -9
- evalscope/report/utils.py +61 -4
- evalscope/run.py +12 -0
- evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
- evalscope/utils/deprecation_utils.py +42 -0
- evalscope/utils/logger.py +1 -1
- evalscope/utils/utils.py +12 -0
- evalscope/version.py +2 -2
- {evalscope-0.15.1.dist-info → evalscope-0.16.1.dist-info}/METADATA +57 -31
- {evalscope-0.15.1.dist-info → evalscope-0.16.1.dist-info}/RECORD +78 -57
- tests/aigc/test_t2i.py +40 -3
- tests/cli/test_all.py +39 -32
- tests/cli/test_collection.py +8 -6
- tests/cli/test_run.py +43 -17
- tests/perf/test_perf.py +23 -0
- tests/rag/test_mteb.py +5 -5
- /evalscope/{report/app_arguments.py → app/arguments.py} +0 -0
- {evalscope-0.15.1.dist-info → evalscope-0.16.1.dist-info}/LICENSE +0 -0
- {evalscope-0.15.1.dist-info → evalscope-0.16.1.dist-info}/WHEEL +0 -0
- {evalscope-0.15.1.dist-info → evalscope-0.16.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.15.1.dist-info → evalscope-0.16.1.dist-info}/top_level.txt +0 -0
evalscope/benchmarks/tool_bench/tool_bench_adapter.py ADDED
@@ -0,0 +1,70 @@
from typing import Dict, List

from evalscope.benchmarks import Benchmark, DataAdapter
from evalscope.constants import EvalType, OutputType
from evalscope.metrics import Metric, mean, metric_registry


@Benchmark.register(
    name='tool_bench',
    pretty_name='ToolBench-Static',
    dataset_id='AI-ModelScope/ToolBench-Static',
    subset_list=['in_domain', 'out_of_domain'],
    metric_list=['Act.EM', 'Plan.EM', 'F1', 'HalluRate', 'Rouge-L'],
    few_shot_num=0,
    train_split=None,
    eval_split='test',
)
class ToolBenchAdapter(DataAdapter):

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

        metric_registry.register(Metric(name='Rouge-L', object=mean))
        metric_registry.register(Metric(name='Act.EM', object=mean))
        metric_registry.register(Metric(name='Plan.EM', object=mean))
        metric_registry.register(Metric(name='F1', object=mean))
        metric_registry.register(Metric(name='HalluRate', object=mean))

    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
        """
        Generate model prompt from input data.
        """
        messages = input_d['messages']
        # use prepared messages and remove the name field
        for message in messages:
            if 'name' in message:
                del message['name']
        return self.gen_prompt_data(prompt='', messages=messages)

    def get_gold_answer(self, input_d: dict) -> str:
        """
        Parse the raw input labels (gold).
        """
        return input_d

    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
        """
        Parse the predicted result and extract proper answer.
        """
        return result

    def match(self, gold: dict, pred: str) -> Dict:
        """
        Match the gold answer and the predicted answer.
        """
        from .utils import calculate_metrics

        data = {
            'target': gold['target'],
            'predictions': pred,
            'tools': gold['tools'],
        }
        metrics = calculate_metrics(data)
        return metrics

    def compute_metric(self, review_res_list: List[dict], **kwargs) -> Dict:
        # aggregate review results
        res_dict = super().compute_dict_metric(review_res_list, **kwargs)

        return super().compute_metric(res_dict, **kwargs)
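For context, the sketch below shows one plausible way to smoke-test the newly registered `tool_bench` dataset through evalscope's task API. The model name, endpoint, and the `api_url`/`api_key` fields are placeholders and assumptions, not taken from this diff.

# Hedged sketch: exercising the new `tool_bench` benchmark via the task API.
# Endpoint, key and model name are placeholders; api_url/api_key are assumed fields.
from evalscope.config import TaskConfig
from evalscope.constants import EvalType
from evalscope.run import run_task

task_cfg = TaskConfig(
    model='my-model',                      # placeholder model identifier
    eval_type=EvalType.SERVICE,            # evaluate an OpenAI-compatible service
    api_url='http://127.0.0.1:8000/v1',    # placeholder endpoint (assumed field)
    api_key='EMPTY',                       # placeholder key (assumed field)
    datasets=['tool_bench'],               # name registered by ToolBenchAdapter above
    limit=10,                              # small smoke test
)
run_task(task_cfg=task_cfg)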
evalscope/benchmarks/tool_bench/utils.py ADDED
@@ -0,0 +1,203 @@
import json

from evalscope.metrics import compute_rouge_score_one_sample


def evaluate_rougel(cand_list: list, ref_list: list):
    if len(ref_list) == 0:
        return 0
    rouge_score = compute_rouge_score_one_sample(cand_list, ref_list)
    rougel = rouge_score.get('rouge-l-f', 0)

    return rougel


def evaluate_action_em(cand_list: list, ref_list: list):
    if len(ref_list) == 0:
        return 0
    em = 0
    for cand, ref in zip(cand_list, ref_list):
        em += (1 if cand == ref else 0)
    return em / len(cand_list)


def evaluate_action_input_f1(action_pred: list, action_ref: list, cand_list: list, ref_list: list):
    easy_f1 = []
    hard_f1 = []
    f1 = []
    for i in range(len(action_pred)):
        ref_action = action_ref[i]
        pred_action = action_pred[i]

        ref_input = ref_list[i]
        cand_input = cand_list[i]

        if ref_action != pred_action:
            easy_f1.append(0)
            hard_f1.append(0)
            f1.append(0)
        else:
            try:
                ref_input_json = json.loads(ref_input)
                try:
                    cand_input_json = json.loads(cand_input)
                    half_match = 0
                    full_match = 0
                    if ref_input_json == {}:
                        if cand_input_json == {}:
                            easy_f1.append(1)
                            f1.append(1)
                        else:
                            easy_f1.append(0)
                            f1.append(0)
                    else:
                        for k, v in ref_input_json.items():
                            if k in cand_input_json.keys():
                                if cand_input_json[k] == v:
                                    full_match += 1
                                else:
                                    half_match += 1

                        recall = (0.5 * half_match + full_match) / (len(ref_input_json) + 1e-30)
                        precision = (0.5 * half_match + full_match) / (len(cand_input_json) + 1e-30)
                        hard_f1.append((2 * recall * precision) / (recall + precision))
                        f1.append((2 * recall * precision) / (recall + precision))
                except Exception:
                    # cand_input = cand_input.replace("\n","").replace("\"","")
                    # ref_input = cand_input.replace("\n","").replace("\"","")
                    # rouge = Rouge()
                    # rouge_score = rouge.get_scores(hyps=[cand_input], refs=[ref_input], avg=True)
                    if ref_input_json == {}:
                        easy_f1.append(0)
                    else:
                        hard_f1.append(0)
                        # hard_f1.append(rouge_score["rouge-l"]["f"])
                        # f1.append(rouge_score["rouge-l"]["f"])
                    f1.append(0)
            except Exception:
                pass

    # If a list is empty, fall back to 0
    easy_f1_avg = sum(easy_f1) / len(easy_f1) if easy_f1 else 0
    hard_f1_avg = sum(hard_f1) / len(hard_f1) if hard_f1 else 0
    f1_avg = sum(f1) / len(f1) if f1 else 0

    return easy_f1_avg, hard_f1_avg, f1_avg


def parse_action(text):
    action = 'None'
    action_input = '{}'
    if 'Action Input:' in text:
        input_idx = text.rindex('Action Input:')
        action_input = text[input_idx + len('Action Input:'):].strip()
    else:
        action_input = '{}'

    if 'Action:' in text:
        action_idx = text.rindex('Action:')
        action = text[action_idx + len('Action:'):].strip()
        if 'Action Input:' in action:
            input_idx = action.index('Action Input:')
            action = action[:input_idx].strip()
    else:
        action = 'none'
    return action, action_input


def parse_output(text):
    action, action_input = parse_action(text)
    if action == 'Finish':
        try:
            action_input = json.loads(action_input)
            # print(action_input)
            # print(json.dumps(action_input,indent=2))
            return_type = action_input['return_type']
            if return_type == 'give_answer':
                if 'final_answer' in action_input.keys():
                    answer = str(action_input['final_answer'])
                    if answer.strip() in ['', '.', ',']:
                        answer = 'None'
                else:
                    answer = 'None'
                return 'finish', action, action_input, answer
            else:
                return 'give up', None, None, None
        except Exception:
            return 'give up', None, None, None
    else:
        plan = 'call'
        answer = None
        return plan, action, action_input, answer


def calculate_metrics(data):
    """
    Calculate the metrics for the given data.
    """
    plan_ref = []
    plan_pred = []
    hallu_cases = []
    answer_ref = []
    action_ref = []
    action_input_ref = []
    answer_pred = []
    action_pred = []
    action_input_pred = []
    hallu_pred = 0

    reference = data['target']
    prediction = data['predictions']
    ref_plan, ref_action, ref_input, ref_ans = parse_output(reference)
    # ref_plan: call
    # ref_action: spott
    # ref_input: {"is_id": "city center" }
    # ref_ans: None

    pred_plan, pred_action, pred_input, pred_ans = parse_output(prediction)
    if ref_action is not None and ref_action == 'invalid_hallucination_function_name':
        return {}
    if pred_action is not None and ref_action != 'none' and ref_action not in [t['name'] for t in data['tools']]:
        return {}

    if pred_action is not None and pred_action != 'none' and pred_action not in [t['name'] for t in data['tools']]:
        hallu_pred += 1
        hallu_cases.append(data)

    plan_ref.append(ref_plan)
    plan_pred.append(pred_plan)
    if ref_plan == 'give up':
        pass
    elif ref_plan == 'finish':
        answer_ref.append(ref_ans)
        if pred_ans is None:
            answer_pred.append('none')
        else:
            answer_pred.append(pred_ans)
    else:
        action_ref.append(ref_action)
        action_input_ref.append(ref_input)
        if pred_action is None:
            action_pred.append('none')
        else:
            action_pred.append(pred_action)

        if pred_input is None:
            action_input_pred.append('{}')
        else:
            action_input_pred.append(pred_input)

    metric = {}
    rouge = evaluate_rougel(answer_pred, answer_ref)
    plan_em = evaluate_action_em(cand_list=plan_pred, ref_list=plan_ref)
    action_em = evaluate_action_em(cand_list=action_pred, ref_list=action_ref)
    easy_f1, hard_f1, f1 = evaluate_action_input_f1(action_pred, action_ref, action_input_pred, action_input_ref)
    hallu_rate = hallu_pred
    metric['Act.EM'] = action_em
    metric['F1'] = f1
    metric['HalluRate'] = hallu_rate
    metric['plan_em'] = plan_em
    metric['Easy_F1'] = easy_f1
    metric['Hard_F1'] = hard_f1
    metric['Rouge-L'] = rouge
    return metric
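The ToolBench utilities above parse a ReAct-style `Action:` / `Action Input:` trace and score it. Below is a minimal, hedged sketch of how one (reference, prediction) pair flows through `parse_output` and `calculate_metrics`; the sample texts and tool list are made up for illustration.

# Hedged sketch: scoring one hypothetical ToolBench-style sample with the
# helpers defined in evalscope/benchmarks/tool_bench/utils.py.
from evalscope.benchmarks.tool_bench.utils import calculate_metrics, parse_output

reference = 'Action: Finish\nAction Input: {"return_type": "give_answer", "final_answer": "42"}'
prediction = 'Action: Finish\nAction Input: {"return_type": "give_answer", "final_answer": "42"}'

plan, action, action_input, answer = parse_output(prediction)
# plan == 'finish', action == 'Finish', answer == '42'

sample = {
    'target': reference,
    'predictions': prediction,
    'tools': [{'name': 'Finish'}],   # hypothetical tool list for this sample
}
# Deterministic parts: plan_em == 1.0, HalluRate == 0; Act.EM and F1 are 0 here
# because a 'finish' reference contributes no action/argument pairs to compare.
print(calculate_metrics(sample))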
evalscope/benchmarks/utils.py CHANGED
@@ -13,6 +13,7 @@ class PromptData:
     system_prompt: Optional[str] = None
     multi_choices: Optional[List[str]] = None
     id: Optional[str] = None
+    messages: Optional[List[dict]] = None

     def to_dict(self) -> Dict:
         return {k: v for k, v in asdict(self).items() if v is not None}
@@ -21,7 +22,7 @@ class PromptData:
 def preprocess_decorator(func):

     @wraps(func)
-    def wrapper(self, result: str, raw_input_d: dict = None, …
+    def wrapper(self, result: str, raw_input_d: dict = None, **kwargs):
         if result is None:
             result = ''
         filters = self.config_kwargs.get('filters', None)
@@ -29,6 +30,31 @@ def preprocess_decorator(func):
         # Apply filters to the result
         for filter_name, filter_value in filters.items():
             result = Filter.apply(filter_name, result, filter_value)
-        return func(self, result, raw_input_d, …
+        return func(self, result, raw_input_d, **kwargs)

     return wrapper
+
+
+def load_file_with_extension(file_path: Union[str, List[str]]) -> List[dict]:
+    """
+    Load a file with a specific extension and return its content as a list of dictionaries.
+    """
+    import json
+    import os
+
+    if isinstance(file_path, str):
+        file_path = [file_path]
+
+    data = []
+    for path in file_path:
+        if not os.path.exists(path):
+            raise FileNotFoundError(f'The file {path} does not exist.')
+
+        with open(path, 'r', encoding='utf-8') as f:
+            if path.endswith('.json'):
+                data.extend(json.load(f))
+            elif path.endswith('.jsonl'):
+                data.extend([json.loads(line) for line in f])
+            elif path.endswith('.txt'):
+                data.extend([{'text': f.read()}])
+    return data
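The new `load_file_with_extension` helper accepts a single path or a list of paths and normalizes `.json`, `.jsonl`, and `.txt` inputs into a list of dicts. A minimal, hedged usage sketch (the file names are hypothetical):

# Hedged sketch: the paths below are hypothetical; a .json file must hold a list
# of objects, .jsonl is read line by line, and .txt is wrapped as [{'text': ...}].
from evalscope.benchmarks.utils import load_file_with_extension

records = load_file_with_extension('data/custom_eval.jsonl')
records += load_file_with_extension(['data/extra.json', 'data/haystack.txt'])
print(len(records), list(records[0].keys()))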
evalscope/benchmarks/winogrande/__init__.py
File without changes
evalscope/benchmarks/winogrande/winogrande_adapter.py ADDED
@@ -0,0 +1,57 @@
from evalscope.benchmarks import Benchmark, DataAdapter
from evalscope.constants import EvalType, OutputType
from evalscope.metrics import exact_match
from evalscope.utils.utils import ResponseParser


@Benchmark.register(
    name='winogrande',
    pretty_name='Winogrande',
    dataset_id='AI-ModelScope/winogrande_val',
    model_adapter=OutputType.GENERATION,
    output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
    metric_list=['AverageAccuracy'],
    few_shot_num=0,
    train_split=None,
    eval_split='validation',
    prompt_template='Question: {query}\nA. {option1}\nB. {option2}\nAnswer:',  # noqa: E501
)
class WinograndeAdapter(DataAdapter):

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

        self.choices = ['A', 'B']

    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
        """
        Generate model prompt from input data.
        """
        prompt = self.prompt_template.format(
            query=input_d['sentence'],
            option1=input_d['option1'],
            option2=input_d['option2'],
        )
        return self.gen_prompt_data(prompt)

    def get_gold_answer(self, input_d: dict) -> str:
        """
        Parse the raw input labels (gold).
        """
        answer_index = int(input_d['answer']) - 1
        return self.choices[answer_index]

    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
        """
        Parse the predicted result and extract proper answer.
        """
        if self.model_adapter == OutputType.MULTIPLE_CHOICE:
            return result
        else:
            return ResponseParser.parse_first_option_with_choices(result, self.choices)

    def match(self, gold: str, pred: str) -> float:
        """
        Match the gold answer and the predicted answer.
        """
        return exact_match(gold=gold, pred=pred)
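The Winogrande adapter maps each record to an A/B multiple-choice prompt and converts the dataset's 1-based `answer` field to a letter. A small, hedged sketch of that mapping on a made-up record:

# Hedged sketch: the record below is illustrative, not taken from the dataset.
record = {
    'sentence': 'The trophy does not fit into the suitcase because _ is too small.',
    'option1': 'the trophy',
    'option2': 'the suitcase',
    'answer': '2',
}

template = 'Question: {query}\nA. {option1}\nB. {option2}\nAnswer:'
prompt = template.format(query=record['sentence'], option1=record['option1'], option2=record['option2'])
gold = ['A', 'B'][int(record['answer']) - 1]   # '2' -> 'B'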
evalscope/cli/start_app.py CHANGED
@@ -21,13 +21,13 @@ class StartAppCMD(CLICommand):
     def define_args(parsers: ArgumentParser):
         """ define args for create pipeline template command.
         """
-        from evalscope.…
+        from evalscope.app import add_argument

         parser = parsers.add_parser(StartAppCMD.name)
         add_argument(parser)
         parser.set_defaults(func=subparser_func)

     def execute(self):
-        from evalscope.…
+        from evalscope.app import create_app

         create_app(self.args)
evalscope/collections/__init__.py CHANGED
@@ -1,3 +1,35 @@
-
-from …
-
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from evalscope.utils.import_utils import _LazyModule
+
+if TYPE_CHECKING:
+    from .evaluator import EvaluatorCollection
+    from .sampler import StratifiedSampler, UniformSampler, WeightedSampler
+    from .schema import CollectionSchema, DatasetInfo
+
+else:
+    _import_structure = {
+        'evaluator': [
+            'EvaluatorCollection',
+        ],
+        'sampler': [
+            'StratifiedSampler',
+            'UniformSampler',
+            'WeightedSampler',
+        ],
+        'schema': [
+            'CollectionSchema',
+            'DatasetInfo',
+        ],
+    }
+
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
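The rewritten `evalscope/collections/__init__.py` switches to a lazy-module pattern: the package exports the same names, but the submodules are imported only on first attribute access. A minimal, hedged usage sketch:

# Hedged sketch: importing the package stays cheap; submodules load lazily.
import evalscope.collections as collections

sampler_cls = collections.WeightedSampler    # first access triggers import of .sampler
schema_cls = collections.CollectionSchema    # first access triggers import of .schema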
evalscope/collections/evaluator.py CHANGED
@@ -7,7 +7,7 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
 from copy import deepcopy
 from tabulate import tabulate
 from tqdm import tqdm
-from typing import List
+from typing import Any, Dict, List

 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.collections.sampler import DatasetEntry
@@ -70,9 +70,13 @@ class EvaluatorCollection:
         dataset_name = os.path.splitext(os.path.basename(self.data_adapter.dataset_id))[0]
         raw_dataset = self.data_adapter.load()
         # random limit the dataset
-…
+        limit = len(raw_dataset)
+        if self.task_cfg.limit is not None:
+            if isinstance(self.task_cfg.limit, int):
+                limit = self.task_cfg.limit
+            elif isinstance(self.task_cfg.limit, float):
+                limit = int(len(raw_dataset) * self.task_cfg.limit)
+        raw_dataset = random.sample(raw_dataset, min(limit, len(raw_dataset)))
         # index dataset
         datasets = []
         for sample in raw_dataset:
@@ -179,32 +183,43 @@ class EvaluatorCollection:
         logger.info(f'{level} Report:\n{table}')

         report = ReportGenerator.gen_collection_report(df, self.dataset_name, self.task_cfg.model_id)
+        # Make report analysis
+        if self.task_cfg.analysis_report:
+            logger.info('Generating report analysis, please wait ...')
+            analysis = report.generate_analysis(self.task_cfg.judge_model_args)
+            logger.info('Report analysis:\n%s', analysis)
+        else:
+            logger.info('Skipping report analysis (`analysis_report=False`).')
+
         # save report to JSON file
         report_file_path = os.path.join(self.outputs.reports_dir, self.task_cfg.model_id, f'{self.dataset_name}.json')
-…
+        report.to_json(report_file_path)
+
+        logger.info(f'Report saved to {report_file_path}')
         return report

     def _filter_answer(self, pred_file_path):
         answer_dict = defaultdict(dict)
         if self.task_cfg.use_cache and os.path.exists(pred_file_path):
             answers_list = jsonl_to_list(pred_file_path)
+            # Create a set of sample indices for which we have answers
             indices = set()
             for answer in answers_list:
                 index = answer.get(AnswerKeys.INDEX)
                 answer_dict[index] = answer
                 indices.add(index)

-            for sample in self.dataset…
+            # Filter dataset to only include samples that don't have answers
+            data = [sample for sample in self.dataset if sample.index not in indices]
+
+            # Initialize name map for the filtered dataset
             data_map = self._init_name_map(data)

             logger.info(f'Reuse from {pred_file_path}. Loaded {len(indices)} samples, remain {len(data)} samples.')
             return answer_dict, data, data_map
-…
+        else:
+            # If cache isn't enabled or file doesn't exist, return the full dataset
+            return answer_dict, self.dataset, self.dataset_name_map

     def get_answers(self):
         pred_file_path = os.path.join(self.outputs.predictions_dir, self.task_cfg.model_id,
@@ -214,13 +229,16 @@ class EvaluatorCollection:
         answers, dataset, dataset_name_map = self._filter_answer(pred_file_path)

         eval_batch_size = self.task_cfg.eval_batch_size
+        # Process samples and get answers
         with tqdm(total=len(dataset), desc='Getting answers') as pbar:
             if self.task_cfg.eval_type == EvalType.SERVICE:
+                # Create a thread pool for parallel processing
                 with ThreadPoolExecutor(max_workers=eval_batch_size) as executor:
                     futures = []
                     for sample in dataset:
                         evaluator = self.evaluators[sample.dataset_name]
                         futures.append(executor.submit(evaluator.get_answer, [sample], self.task_cfg.generation_config))
+                    # Process completed tasks
                     for future in as_completed(futures):
                         answer_list, samples = future.result()
                         answers[samples[0].index] = answer_list[0]
@@ -244,35 +262,79 @@ class EvaluatorCollection:
                     pbar.update(len(batch_ids))
         return answers

-    def get_reviews(self, answers):
+    def get_reviews(self, answers: Dict[int, Any]) -> Dict[int, Any]:
+        """
+        Retrieve or generate reviews for given answers.
+
+        Args:
+            answers: Dictionary of answers indexed by sample index.
+
+        Returns:
+            Dictionary of reviews indexed by sample index.
+        """
+        # Set up the review file path
         review_file_path = os.path.join(self.outputs.reviews_dir, self.task_cfg.model_id)
         os.makedirs(review_file_path, exist_ok=True)

-…
-                    if os.path.isfile(file_path):
-                        os.remove(file_path)
-                except Exception as e:
-                    logger.error(f'Error deleting file {file_path}: {e}')
+        review_history_map = defaultdict(dict)
+
+        # Handle caching logic
+        if os.path.exists(review_file_path):
+            if not self.task_cfg.use_cache:
+                # Clear existing reviews if not using cache
+                self._clear_review_files(review_file_path)
             else:
-…
+                # Load existing reviews if using cache
+                self._load_existing_reviews(review_file_path, review_history_map)

-        reviews = …
+        reviews = {}
         for sample in tqdm(self.dataset, desc='Getting reviews'):
-…
+            file_name = f'{self.dataset_name}_{sample.dataset_name}_{sample.subset_name}.jsonl'
+
+            if self.task_cfg.use_cache and sample.index in review_history_map.get(file_name, {}):
+                # Use cached review if available
+                review_d = review_history_map[file_name][sample.index]
+            else:
+                # Generate new review
+                evaluator = self.evaluators[sample.dataset_name]
+                review_d = evaluator.get_review(answers[sample.index])
+                # Only save the review if it's not in the cache
+                self._save_review(review_file_path, file_name, review_d)
+
             reviews[sample.index] = review_d
-            …(review_d,
-               os.path.join(review_file_path, f'{self.dataset_name}_{sample.dataset_name}_{sample.subset_name}.jsonl'),
-               dump_mode=DumpMode.APPEND)
+
         return reviews

+    def _clear_review_files(self, review_file_path: str) -> None:
+        """Clear existing review files."""
+        if os.path.isdir(review_file_path):
+            for filename in os.listdir(review_file_path):
+                file_path = os.path.join(review_file_path, filename)
+                try:
+                    if os.path.isfile(file_path):
+                        os.remove(file_path)
+                except Exception as e:
+                    logger.error(f'Error deleting file {file_path}: {e}')
+        else:
+            os.remove(review_file_path)
+
+    def _load_existing_reviews(self, review_file_path: str, review_history_map: Dict[str, Dict[int, Any]]) -> None:
+        """Load existing reviews from files."""
+        logger.info(f'use_cache={self.task_cfg.use_cache}, reloading the review file: {review_file_path}')
+        if os.path.isdir(review_file_path):
+            for filename in os.listdir(review_file_path):
+                if '.ipynb_checkpoints' in filename:
+                    continue
+                file_path = os.path.join(review_file_path, filename)
+                with open(file_path, 'r') as f:
+                    review_history = [json.loads(line.strip()) for line in f]
+                review_history_map[filename] = {item['index']: item for item in review_history}
+
+    def _save_review(self, review_file_path: str, file_name: str, review_d: Dict[str, Any]) -> None:
+        """Save a single review to file."""
+        file_path = os.path.join(review_file_path, file_name)
+        dump_jsonl_data(review_d, file_path, dump_mode=DumpMode.APPEND)
+
     def get_scores(self, reviews) -> float:
         scores = defaultdict(dict)
         for sample in tqdm(self.dataset, desc='Getting scores'):
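The `limit` handling added to `EvaluatorCollection` accepts either an absolute sample count (int) or a fraction of the mixed dataset (float). A small, hedged sketch of that arithmetic, assuming a hypothetical 2,000-sample mixture:

# Hedged sketch of the limit logic above, on a made-up 2,000-sample collection.
raw_len = 2000
for limit_cfg in (100, 0.1, None):
    limit = raw_len
    if limit_cfg is not None:
        limit = limit_cfg if isinstance(limit_cfg, int) else int(raw_len * limit_cfg)
    print(limit_cfg, '->', min(limit, raw_len))   # 100 -> 100, 0.1 -> 200, None -> 2000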