evalscope 0.7.2__py3-none-any.whl → 0.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



Files changed (234)
  1. evalscope/__init__.py +1 -1
  2. evalscope/arguments.py +73 -0
  3. evalscope/backend/base.py +6 -2
  4. evalscope/backend/opencompass/api_meta_template.py +8 -14
  5. evalscope/backend/opencompass/backend_manager.py +24 -15
  6. evalscope/backend/opencompass/tasks/eval_api.py +1 -6
  7. evalscope/backend/opencompass/tasks/eval_datasets.py +26 -28
  8. evalscope/backend/rag_eval/__init__.py +3 -3
  9. evalscope/backend/rag_eval/backend_manager.py +21 -25
  10. evalscope/backend/rag_eval/clip_benchmark/__init__.py +1 -1
  11. evalscope/backend/rag_eval/clip_benchmark/arguments.py +6 -6
  12. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +62 -79
  13. evalscope/backend/rag_eval/clip_benchmark/task_template.py +29 -43
  14. evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +20 -22
  15. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +16 -23
  16. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +14 -35
  17. evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +69 -90
  18. evalscope/backend/rag_eval/cmteb/__init__.py +3 -3
  19. evalscope/backend/rag_eval/cmteb/arguments.py +25 -27
  20. evalscope/backend/rag_eval/cmteb/base.py +22 -23
  21. evalscope/backend/rag_eval/cmteb/task_template.py +15 -17
  22. evalscope/backend/rag_eval/cmteb/tasks/Classification.py +98 -79
  23. evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +17 -22
  24. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +17 -19
  25. evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +35 -29
  26. evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +18 -5
  27. evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +163 -163
  28. evalscope/backend/rag_eval/cmteb/tasks/STS.py +126 -104
  29. evalscope/backend/rag_eval/cmteb/tasks/__init__.py +33 -34
  30. evalscope/backend/rag_eval/ragas/__init__.py +2 -2
  31. evalscope/backend/rag_eval/ragas/arguments.py +3 -8
  32. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +9 -9
  33. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +2 -2
  34. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +3 -3
  35. evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +5 -5
  36. evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +7 -0
  37. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +8 -8
  38. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +5 -5
  39. evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +7 -5
  40. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +2 -2
  41. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
  42. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
  43. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
  44. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
  45. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +2 -2
  46. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +5 -5
  47. evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +3 -3
  48. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +21 -4
  49. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +3 -3
  50. evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +4 -4
  51. evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +2 -2
  52. evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -1
  53. evalscope/backend/rag_eval/ragas/task_template.py +10 -15
  54. evalscope/backend/rag_eval/ragas/tasks/__init__.py +1 -1
  55. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +45 -0
  56. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +135 -0
  57. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +17 -133
  58. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +8 -18
  59. evalscope/backend/rag_eval/utils/clip.py +47 -51
  60. evalscope/backend/rag_eval/utils/embedding.py +13 -12
  61. evalscope/backend/rag_eval/utils/llm.py +8 -6
  62. evalscope/backend/rag_eval/utils/tools.py +12 -11
  63. evalscope/backend/vlm_eval_kit/__init__.py +1 -1
  64. evalscope/backend/vlm_eval_kit/custom_dataset.py +7 -8
  65. evalscope/benchmarks/arc/__init__.py +3 -2
  66. evalscope/benchmarks/arc/ai2_arc.py +19 -16
  67. evalscope/benchmarks/arc/arc_adapter.py +32 -24
  68. evalscope/benchmarks/bbh/__init__.py +1 -2
  69. evalscope/benchmarks/bbh/bbh_adapter.py +28 -25
  70. evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +1 -1
  71. evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +1 -1
  72. evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +1 -1
  73. evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +1 -1
  74. evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +1 -1
  75. evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +1 -1
  76. evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +1 -1
  77. evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +1 -1
  78. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +1 -1
  79. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +1 -1
  80. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +1 -1
  81. evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +1 -1
  82. evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +1 -1
  83. evalscope/benchmarks/bbh/cot_prompts/navigate.txt +1 -1
  84. evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +1 -1
  85. evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +1 -1
  86. evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +1 -1
  87. evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +1 -1
  88. evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +1 -1
  89. evalscope/benchmarks/bbh/cot_prompts/snarks.txt +1 -1
  90. evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +1 -1
  91. evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +1 -1
  92. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +1 -1
  93. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +1 -1
  94. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +1 -1
  95. evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +1 -1
  96. evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +1 -1
  97. evalscope/benchmarks/benchmark.py +16 -16
  98. evalscope/benchmarks/ceval/__init__.py +3 -2
  99. evalscope/benchmarks/ceval/ceval_adapter.py +80 -69
  100. evalscope/benchmarks/ceval/ceval_exam.py +18 -31
  101. evalscope/benchmarks/cmmlu/__init__.py +3 -2
  102. evalscope/benchmarks/cmmlu/cmmlu.py +87 -92
  103. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +109 -155
  104. evalscope/benchmarks/cmmlu/samples.jsonl +1 -1
  105. evalscope/benchmarks/competition_math/__init__.py +3 -2
  106. evalscope/benchmarks/competition_math/competition_math.py +7 -16
  107. evalscope/benchmarks/competition_math/competition_math_adapter.py +32 -34
  108. evalscope/benchmarks/data_adapter.py +24 -24
  109. evalscope/benchmarks/general_qa/__init__.py +3 -2
  110. evalscope/benchmarks/general_qa/general_qa_adapter.py +35 -39
  111. evalscope/benchmarks/gsm8k/__init__.py +1 -1
  112. evalscope/benchmarks/gsm8k/gsm8k.py +6 -12
  113. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +27 -24
  114. evalscope/benchmarks/hellaswag/__init__.py +3 -2
  115. evalscope/benchmarks/hellaswag/hellaswag.py +15 -19
  116. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +28 -23
  117. evalscope/benchmarks/humaneval/__init__.py +1 -1
  118. evalscope/benchmarks/humaneval/humaneval.py +15 -18
  119. evalscope/benchmarks/humaneval/humaneval_adapter.py +192 -7
  120. evalscope/benchmarks/mmlu/__init__.py +3 -2
  121. evalscope/benchmarks/mmlu/mmlu.py +15 -29
  122. evalscope/benchmarks/mmlu/mmlu_adapter.py +85 -77
  123. evalscope/benchmarks/race/__init__.py +3 -2
  124. evalscope/benchmarks/race/race.py +21 -35
  125. evalscope/benchmarks/race/race_adapter.py +33 -29
  126. evalscope/benchmarks/race/samples.jsonl +1 -1
  127. evalscope/benchmarks/trivia_qa/__init__.py +3 -2
  128. evalscope/benchmarks/trivia_qa/samples.jsonl +1 -1
  129. evalscope/benchmarks/trivia_qa/trivia_qa.py +19 -34
  130. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +27 -22
  131. evalscope/benchmarks/truthful_qa/__init__.py +3 -2
  132. evalscope/benchmarks/truthful_qa/truthful_qa.py +25 -29
  133. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +36 -37
  134. evalscope/cli/cli.py +6 -5
  135. evalscope/cli/start_eval.py +31 -0
  136. evalscope/cli/start_perf.py +0 -3
  137. evalscope/cli/start_server.py +27 -41
  138. evalscope/config.py +154 -96
  139. evalscope/constants.py +50 -32
  140. evalscope/evaluator/evaluator.py +97 -377
  141. evalscope/evaluator/rating_eval.py +12 -33
  142. evalscope/evaluator/reviewer/auto_reviewer.py +48 -76
  143. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +10 -20
  144. evalscope/metrics/code_metric.py +3 -9
  145. evalscope/metrics/math_accuracy.py +3 -6
  146. evalscope/metrics/metrics.py +21 -21
  147. evalscope/metrics/rouge_metric.py +11 -25
  148. evalscope/models/__init__.py +1 -2
  149. evalscope/models/api/openai_api.py +40 -29
  150. evalscope/models/custom/__init__.py +0 -1
  151. evalscope/models/custom/custom_model.py +3 -3
  152. evalscope/models/dummy_chat_model.py +7 -8
  153. evalscope/models/model_adapter.py +89 -156
  154. evalscope/models/openai_model.py +20 -20
  155. evalscope/perf/arguments.py +16 -3
  156. evalscope/perf/benchmark.py +9 -11
  157. evalscope/perf/http_client.py +3 -8
  158. evalscope/perf/main.py +8 -1
  159. evalscope/perf/plugin/api/custom_api.py +1 -2
  160. evalscope/perf/plugin/api/dashscope_api.py +1 -2
  161. evalscope/perf/plugin/api/openai_api.py +3 -4
  162. evalscope/perf/plugin/datasets/base.py +1 -2
  163. evalscope/perf/plugin/datasets/flickr8k.py +1 -2
  164. evalscope/perf/plugin/datasets/longalpaca.py +1 -2
  165. evalscope/perf/plugin/datasets/openqa.py +1 -2
  166. evalscope/perf/plugin/registry.py +3 -3
  167. evalscope/perf/utils/analysis_result.py +1 -2
  168. evalscope/perf/utils/benchmark_util.py +5 -6
  169. evalscope/perf/utils/db_util.py +77 -30
  170. evalscope/perf/utils/local_server.py +21 -13
  171. evalscope/registry/config/cfg_arena_zhihu.yaml +1 -1
  172. evalscope/registry/tasks/arc.yaml +2 -3
  173. evalscope/registry/tasks/bbh.yaml +3 -4
  174. evalscope/registry/tasks/bbh_mini.yaml +3 -4
  175. evalscope/registry/tasks/ceval.yaml +3 -3
  176. evalscope/registry/tasks/ceval_mini.yaml +3 -4
  177. evalscope/registry/tasks/cmmlu.yaml +3 -3
  178. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +1 -1
  179. evalscope/registry/tasks/general_qa.yaml +1 -1
  180. evalscope/registry/tasks/gsm8k.yaml +2 -2
  181. evalscope/registry/tasks/mmlu.yaml +3 -3
  182. evalscope/registry/tasks/mmlu_mini.yaml +3 -3
  183. evalscope/run.py +153 -381
  184. evalscope/run_arena.py +21 -25
  185. evalscope/summarizer.py +27 -40
  186. evalscope/third_party/longbench_write/README.md +99 -42
  187. evalscope/third_party/longbench_write/default_task.json +1 -1
  188. evalscope/third_party/longbench_write/default_task.yaml +8 -7
  189. evalscope/third_party/longbench_write/eval.py +29 -27
  190. evalscope/third_party/longbench_write/infer.py +16 -104
  191. evalscope/third_party/longbench_write/longbench_write.py +5 -4
  192. evalscope/third_party/longbench_write/resources/judge.txt +1 -1
  193. evalscope/third_party/longbench_write/tools/data_etl.py +5 -6
  194. evalscope/third_party/longbench_write/utils.py +0 -1
  195. evalscope/third_party/toolbench_static/eval.py +14 -15
  196. evalscope/third_party/toolbench_static/infer.py +48 -69
  197. evalscope/third_party/toolbench_static/llm/swift_infer.py +4 -12
  198. evalscope/third_party/toolbench_static/requirements.txt +1 -1
  199. evalscope/third_party/toolbench_static/toolbench_static.py +4 -3
  200. evalscope/tools/combine_reports.py +27 -34
  201. evalscope/tools/rewrite_eval_results.py +15 -47
  202. evalscope/utils/__init__.py +1 -1
  203. evalscope/utils/arena_utils.py +18 -48
  204. evalscope/{perf/utils → utils}/chat_service.py +4 -5
  205. evalscope/utils/completion_parsers.py +3 -8
  206. evalscope/utils/io_utils.py +162 -0
  207. evalscope/utils/logger.py +17 -7
  208. evalscope/utils/model_utils.py +11 -0
  209. evalscope/utils/utils.py +5 -306
  210. evalscope/version.py +2 -2
  211. {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/METADATA +123 -118
  212. evalscope-0.8.1.dist-info/RECORD +285 -0
  213. tests/cli/test_run.py +53 -15
  214. tests/perf/test_perf.py +6 -1
  215. tests/rag/test_clip_benchmark.py +38 -38
  216. tests/rag/test_mteb.py +3 -2
  217. tests/rag/test_ragas.py +5 -5
  218. tests/swift/test_run_swift_eval.py +2 -3
  219. tests/swift/test_run_swift_vlm_eval.py +2 -3
  220. tests/swift/test_run_swift_vlm_jugde_eval.py +2 -3
  221. tests/vlm/test_vlmeval.py +3 -2
  222. evalscope/backend/rag_eval/ragas/metrics/__init__.py +0 -2
  223. evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py +0 -91
  224. evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py +0 -99
  225. evalscope/cache.py +0 -98
  226. evalscope/models/template.py +0 -1446
  227. evalscope/run_ms.py +0 -140
  228. evalscope/utils/task_cfg_parser.py +0 -10
  229. evalscope/utils/task_utils.py +0 -22
  230. evalscope-0.7.2.dist-info/RECORD +0 -286
  231. {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/LICENSE +0 -0
  232. {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/WHEEL +0 -0
  233. {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/entry_points.txt +0 -0
  234. {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/top_level.txt +0 -0
evalscope/utils/model_utils.py ADDED
@@ -0,0 +1,11 @@
+ from transformers import GenerationConfig
+
+
+ def fix_do_sample_warning(generation_config: GenerationConfig) -> None:
+     # Use the default values of temperature/top_p/top_k in generation_config.
+     if generation_config.temperature == 0:
+         generation_config.do_sample = False
+     if generation_config.do_sample is False:
+         generation_config.temperature = 1.
+         generation_config.top_p = 1.
+         generation_config.top_k = 50
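
The hunk above is the whole of the new module. For context, a minimal usage sketch (the import path is inferred from the `evalscope/utils/model_utils.py +11 -0` entry in the file list; the config values are made up):

```python
from transformers import GenerationConfig

from evalscope.utils.model_utils import fix_do_sample_warning  # path assumed from the file list

# temperature == 0 is treated as a request for greedy decoding: the helper switches
# do_sample off and resets temperature/top_p/top_k to their defaults, which silences
# the transformers warning about sampling parameters being set while do_sample=False.
cfg = GenerationConfig(do_sample=True, temperature=0, top_p=0.8, top_k=20)
fix_do_sample_warning(cfg)
print(cfg.do_sample, cfg.temperature, cfg.top_p, cfg.top_k)  # False 1.0 1.0 50
```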
evalscope/utils/utils.py CHANGED
@@ -5,20 +5,13 @@ import functools
  import hashlib
  import importlib
  import importlib.util
+ import numpy as np
  import os
  import random
  import re
- import sys
- from typing import Any, Dict, List, Tuple, Union
-
- import json
- import jsonlines as jsonl
- import numpy as np
  import torch
- import torch.nn.functional as F
- import yaml
+ from typing import Any, Dict, List, Tuple, Union
 
- from evalscope.constants import DumpMode, OutputsStructure
  from evalscope.utils.logger import get_logger
 
  logger = get_logger()
@@ -37,101 +30,6 @@ def test_level_list():
      return TEST_LEVEL_LIST
 
 
- def jsonl_to_list(jsonl_file):
-     """
-     Read jsonl file to list.
-
-     Args:
-         jsonl_file: jsonl file path.
-
-     Returns:
-         list: list of lines. Each line is a dict.
-     """
-     res_list = []
-     with jsonl.open(jsonl_file, mode='r') as reader:
-         for line in reader.iter(type=dict, allow_none=True, skip_invalid=False):
-             res_list.append(line)
-     return res_list
-
-
- def jsonl_to_reader(jsonl_file):
-     """
-     Read jsonl file to reader object.
-
-     Args:
-         jsonl_file: jsonl file path.
-
-     Returns:
-         reader: jsonl reader object.
-     """
-     with jsonl.open(jsonl_file, mode='r') as reader:
-         return reader
-
-
- def jsonl_to_csv():
-     pass
-
-
- def dump_jsonl_data(data_list, jsonl_file, dump_mode=DumpMode.OVERWRITE):
-     """
-     Dump data to jsonl file.
-
-     Args:
-         data_list: data list to be dumped. [{'a': 'aaa'}, ...]
-         jsonl_file: jsonl file path.
-         dump_mode: dump mode. It can be 'overwrite' or 'append'.
-     """
-     if not jsonl_file:
-         raise ValueError('output file must be provided.')
-
-     jsonl_file = os.path.expanduser(jsonl_file)
-
-     if dump_mode == DumpMode.OVERWRITE:
-         dump_mode = 'w'
-     elif dump_mode == DumpMode.APPEND:
-         dump_mode = 'a'
-     with jsonl.open(jsonl_file, mode=dump_mode) as writer:
-         writer.write_all(data_list)
-     logger.info(f'Dump data to {jsonl_file} successfully.')
-
-
- def yaml_to_dict(yaml_file) -> dict:
-     """
-     Read yaml file to dict.
-     """
-     with open(yaml_file, 'r') as f:
-         try:
-             stream = yaml.safe_load(f)
-         except yaml.YAMLError as e:
-             logger.error(f'{e}')
-             raise e
-
-     return stream
-
-
- def dict_to_yaml(d: dict, yaml_file: str):
-     """
-     Dump dict to yaml file.
-     """
-     with open(yaml_file, 'w') as f:
-         yaml.dump(d, f, default_flow_style=False)
-     logger.info(f'Dump data to {yaml_file} successfully.')
-
-
- def json_to_dict(json_file) -> dict:
-     """
-     Read json file to dict.
-     """
-     with open(json_file, 'r') as f:
-         try:
-             stream = json.load(f)
-         except json.JSONDecodeError as e:
-             logger.error(f'{e}')
-             raise e
-
-     return stream
-
-
  def get_obj_from_cfg(eval_class_ref: Any, *args, **kwargs) -> Any:
      module_name, spliter, cls_name = eval_class_ref.partition(':')
 
@@ -148,25 +46,13 @@ def get_obj_from_cfg(eval_class_ref: Any, *args, **kwargs) -> Any:
      return functools.partial(obj_cls, *args, **kwargs)
 
 
- def markdown_table(header_l, data_l):
-     md_str = f'| {" | ".join(header_l)} |'
-     md_str += f'\n| {" | ".join(["---"] * len(header_l))} |'
-     for data in data_l:
-         if isinstance(data, str):
-             data = [data]
-         assert len(data) <= len(header_l)
-         tmp = data + [''] * (len(header_l) - len(data))
-         md_str += f'\n| {" | ".join(tmp)} |'
-     return md_str
-
-
  def random_seeded_choice(seed: Union[int, str, float], choices, **kwargs):
      """Random choice with a (potentially string) seed."""
      return random.Random(seed).choices(choices, k=1, **kwargs)[0]
 
 
- def gen_hash(name: str):
-     return hashlib.md5(name.encode(encoding='UTF-8')).hexdigest()
+ def gen_hash(name: str, bits: int = 32):
+     return hashlib.md5(name.encode(encoding='UTF-8')).hexdigest()[:bits]
 
 
  def dict_torch_dtype_to_str(d: Dict[str, Any]) -> dict:
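
Two changes in the hunks above are easy to miss. The jsonl/yaml/json helpers removed from this module appear to move to the new `evalscope/utils/io_utils.py` (+162 -0 in the file list), and `gen_hash` gains a `bits` parameter that truncates the MD5 hex digest (despite the name, it counts hex characters, not bits). A minimal sketch of the new behaviour, mirroring the definition in the diff:

```python
import hashlib


def gen_hash(name: str, bits: int = 32):
    # As added in 0.8.1: keep only the first `bits` characters of the 32-character
    # MD5 hex digest; the default of 32 reproduces the old full-digest behaviour.
    return hashlib.md5(name.encode(encoding='UTF-8')).hexdigest()[:bits]


print(gen_hash('evalscope'))          # full 32-character digest
print(gen_hash('evalscope', bits=8))  # first 8 characters only
```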
@@ -312,52 +198,6 @@ class ResponseParser:
          return ''
 
 
- def make_outputs_dir(root_dir: str, datasets: list, model_id: str, model_revision: str):
-     # model_revision = model_revision if model_revision is not None else 'none'
-     # now = datetime.datetime.now()
-     # format_time = now.strftime('%Y%m%d_%H%M%S')
-     # outputs_name = format_time + '_' + 'default' + '_' + model_id.replace('/', '_') + '_' + model_revision
-     # outputs_dir = os.path.join(work_dir, outputs_name)
-     # dataset_name = dataset_id.replace('/', '_')
-     # outputs_dir = os.path.join(work_dir, dataset_name)
-
-     if not model_id:
-         model_id = 'default'
-     model_id = model_id.replace('/', '_')
-
-     if not model_revision:
-         model_revision = 'default'
-
-     outputs_dir = os.path.join(root_dir, f"eval_{'-'.join(datasets)}_{model_id}_{model_revision}")
-
-     return outputs_dir
-
-
- def process_outputs_structure(outputs_dir: str, is_make: bool = True) -> dict:
-     logs_dir = os.path.join(outputs_dir, 'logs')
-     predictions_dir = os.path.join(outputs_dir, 'predictions')
-     reviews_dir = os.path.join(outputs_dir, 'reviews')
-     reports_dir = os.path.join(outputs_dir, 'reports')
-     configs_dir = os.path.join(outputs_dir, 'configs')
-
-     if is_make:
-         os.makedirs(outputs_dir, exist_ok=True)
-         os.makedirs(logs_dir, exist_ok=True)
-         os.makedirs(predictions_dir, exist_ok=True)
-         os.makedirs(reviews_dir, exist_ok=True)
-         os.makedirs(reports_dir, exist_ok=True)
-         os.makedirs(configs_dir, exist_ok=True)
-
-     outputs_structure = {
-         OutputsStructure.LOGS_DIR: logs_dir,
-         OutputsStructure.PREDICTIONS_DIR: predictions_dir,
-         OutputsStructure.REVIEWS_DIR: reviews_dir,
-         OutputsStructure.REPORTS_DIR: reports_dir,
-         OutputsStructure.CONFIGS_DIR: configs_dir,
-     }
-
-     return outputs_structure
-
 
  def import_module_util(import_path_prefix: str, module_name: str, members_to_import: list) -> dict:
      """
@@ -401,148 +241,6 @@ def normalize_score(score: Union[float, dict], keep_num: int = 4) -> Union[float
      return score
 
 
- def split_str_parts_by(text: str, delimiters: List[str]):
-     """Split the text field into parts.
-     Args:
-         text: A text to be split.
-         delimiters: The delimiters.
-     Returns:
-         The split text in list of dicts.
-     """
-     all_start_chars = [d[0] for d in delimiters]
-     all_length = [len(d) for d in delimiters]
-
-     text_list = []
-     last_words = ''
-
-     while len(text) > 0:
-         for char_idx, char in enumerate(text):
-             match_index = [idx for idx, start_char in enumerate(all_start_chars) if start_char == char]
-             is_delimiter = False
-             for index in match_index:
-                 if text[char_idx:char_idx + all_length[index]] == delimiters[index]:
-                     if last_words:
-                         if text_list:
-                             text_list[-1]['content'] = last_words
-                         else:
-                             text_list.append({'key': '', 'content': last_words})
-                     last_words = ''
-                     text_list.append({'key': delimiters[index]})
-                     text = text[char_idx + all_length[index]:]
-                     is_delimiter = True
-                     break
-             if not is_delimiter:
-                 last_words += char
-             else:
-                 break
-         if last_words == text:
-             text = ''
-
-     text_list[-1]['content'] = last_words
-     return text_list
-
-
- def calculate_loss_scale(response: str, use_loss_scale=False) -> Tuple[List[str], List[float]]:
-     """Calculate the loss scale by splitting the agent response.
-     This algorithm comes from paper: https://arxiv.org/pdf/2309.00986.pdf
-     Agent response format:
-     ```text
-     Thought: you should always think about what to do
-     Action: the action to take, should be one of the above tools[fire_recognition,
-         fire_alert, call_police, call_fireman]
-     Action Input: the input to the action
-     Observation: the result of the action
-     ... (this Thought/Action/Action Input/Observation can be repeated zero or more times)
-     Thought: I now know the final answer
-     Final Answer: the final answer to the original input question
-     ```
-     Args:
-         response: The response text
-         use_loss_scale: Use weighted loss. With this, some part of the loss will be enhanced to improve performance.
-     Returns:
-         A tuple of agent response parts and their weights.
-     """
-     if 'Action:' in response and 'Observation:' in response and use_loss_scale:
-         agent_keyword = ['Action:', 'Action Input:', 'Thought:', 'Final Answer:', 'Observation:']
-         agent_parts = split_str_parts_by(response, agent_keyword)
-         weights = []
-         agent_content = []
-         for c in agent_parts:
-             if c['key'] in ('Action:', 'Action Input:'):
-                 weights += [2.0]
-                 weights += [2.0]
-             elif c['key'] in ('Thought:', 'Final Answer:', ''):
-                 weights += [1.0]
-                 weights += [1.0]
-             elif c['key'] in ('Observation:', ):
-                 weights += [2.0]
-                 weights += [0.0]
-             agent_content.append(c['key'])
-             agent_content.append(c['content'])
-         return agent_content, weights
-     else:
-         return [response], [1.0]
-
-
- def get_bucket_sizes(max_length: int) -> List[int]:
-     return [max_length // 4 * (i + 1) for i in range(4)]
-
-
- def _get_closet_bucket(bucket_sizes, data_length):
-     """Select the one from bucket_sizes that is closest in distance to
-     data_length. This is required for TorchAcc.
-     """
-     cloest_length = sys.maxsize
-     for b in bucket_sizes:
-         if b == data_length or ((b < cloest_length) and (b > data_length)):
-             cloest_length = b
-
-     if cloest_length == sys.maxsize:
-         bucket_sizes.append(data_length)
-         cloest_length = data_length
-
-     return cloest_length
-
-
- def pad_and_split_batch(padding_to, input_ids, attention_mask, labels, loss_scale, max_length, tokenizer, rank,
-                         world_size):
-     if padding_to is None:
-         longest_len = input_ids.shape[-1]
-         bucket_sizes = get_bucket_sizes(max_length)
-         bucket_data_length = _get_closet_bucket(bucket_sizes, longest_len)
-         padding_length = bucket_data_length - input_ids.shape[1]
-         input_ids = F.pad(input_ids, (0, padding_length), 'constant', tokenizer.pad_token_id)
-         attention_mask = F.pad(attention_mask, (0, padding_length), 'constant', 0)
-         if loss_scale:
-             loss_scale = F.pad(loss_scale, (0, padding_length), 'constant', 0.)
-         labels = F.pad(labels, (0, padding_length), 'constant', -100)
-
-     # manully split the batch to different DP rank.
-     batch_size = input_ids.shape[0] // world_size
-     if batch_size > 0:
-         start = rank * batch_size
-         end = (rank + 1) * batch_size
-         input_ids = input_ids[start:end, :]
-         attention_mask = attention_mask[start:end, :]
-         labels = labels[start:end, :]
-         if loss_scale:
-             loss_scale = loss_scale[start:end, :]
-     return input_ids, attention_mask, labels, loss_scale
-
-
- def get_dist_setting() -> Tuple[int, int, int, int]:
-     """return rank, local_rank, world_size, local_world_size"""
-     rank = int(os.getenv('RANK', -1))
-     local_rank = int(os.getenv('LOCAL_RANK', -1))
-     world_size = int(os.getenv('WORLD_SIZE', 1))
-     local_world_size = int(os.getenv('LOCAL_WORLD_SIZE', 1))
-     return rank, local_rank, world_size, local_world_size
-
-
- def use_torchacc() -> bool:
-     return os.getenv('USE_TORCHACC', '0') == '1'
-
-
  def is_module_installed(module_name):
      try:
          importlib.import_module(module_name)
@@ -576,6 +274,7 @@ def get_valid_list(input_list, candidate_list):
 
  def get_latest_folder_path(work_dir):
      from datetime import datetime
+
      # Get all subdirectories in the work_dir
      folders = [f for f in os.listdir(work_dir) if os.path.isdir(os.path.join(work_dir, f))]
 
evalscope/version.py CHANGED
@@ -1,4 +1,4 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
 
- __version__ = '0.7.2'
- __release_datetime__ = '2024-12-04 12:00:00'
+ __version__ = '0.8.1'
+ __release_datetime__ = '2024-12-17 20:00:00'
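
If you need to confirm which release is installed, the fields changed above can be read directly from the module (a quick check, assuming a standard install):

```python
# __version__ and __release_datetime__ are defined in evalscope/version.py, as shown above.
from evalscope.version import __release_datetime__, __version__

print(__version__)           # expected: 0.8.1
print(__release_datetime__)  # expected: 2024-12-17 20:00:00
```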