evalscope 0.7.1__py3-none-any.whl → 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of evalscope might be problematic.
- evalscope/__init__.py +1 -1
- evalscope/arguments.py +73 -0
- evalscope/backend/base.py +5 -1
- evalscope/backend/opencompass/api_meta_template.py +8 -14
- evalscope/backend/opencompass/backend_manager.py +24 -15
- evalscope/backend/opencompass/tasks/eval_api.py +1 -6
- evalscope/backend/opencompass/tasks/eval_datasets.py +26 -28
- evalscope/backend/rag_eval/__init__.py +3 -3
- evalscope/backend/rag_eval/backend_manager.py +21 -25
- evalscope/backend/rag_eval/clip_benchmark/__init__.py +1 -1
- evalscope/backend/rag_eval/clip_benchmark/arguments.py +6 -6
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +62 -79
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +29 -43
- evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +20 -22
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +16 -23
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +14 -35
- evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +69 -90
- evalscope/backend/rag_eval/cmteb/__init__.py +3 -3
- evalscope/backend/rag_eval/cmteb/arguments.py +25 -27
- evalscope/backend/rag_eval/cmteb/base.py +22 -23
- evalscope/backend/rag_eval/cmteb/task_template.py +15 -17
- evalscope/backend/rag_eval/cmteb/tasks/Classification.py +98 -79
- evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +17 -22
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +17 -19
- evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +35 -29
- evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +18 -5
- evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +163 -163
- evalscope/backend/rag_eval/cmteb/tasks/STS.py +126 -104
- evalscope/backend/rag_eval/cmteb/tasks/__init__.py +33 -34
- evalscope/backend/rag_eval/ragas/__init__.py +2 -2
- evalscope/backend/rag_eval/ragas/arguments.py +3 -8
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +9 -9
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +7 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +8 -8
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +7 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +21 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +4 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -1
- evalscope/backend/rag_eval/ragas/task_template.py +10 -15
- evalscope/backend/rag_eval/ragas/tasks/__init__.py +1 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +45 -0
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +135 -0
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +17 -133
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +8 -18
- evalscope/backend/rag_eval/utils/clip.py +46 -50
- evalscope/backend/rag_eval/utils/embedding.py +12 -11
- evalscope/backend/rag_eval/utils/llm.py +8 -6
- evalscope/backend/rag_eval/utils/tools.py +12 -11
- evalscope/backend/vlm_eval_kit/__init__.py +1 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +7 -8
- evalscope/benchmarks/arc/__init__.py +3 -2
- evalscope/benchmarks/arc/ai2_arc.py +19 -16
- evalscope/benchmarks/arc/arc_adapter.py +32 -24
- evalscope/benchmarks/bbh/__init__.py +1 -2
- evalscope/benchmarks/bbh/bbh_adapter.py +28 -25
- evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/navigate.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/snarks.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +1 -1
- evalscope/benchmarks/benchmark.py +16 -16
- evalscope/benchmarks/ceval/__init__.py +3 -2
- evalscope/benchmarks/ceval/ceval_adapter.py +80 -69
- evalscope/benchmarks/ceval/ceval_exam.py +18 -31
- evalscope/benchmarks/cmmlu/__init__.py +3 -2
- evalscope/benchmarks/cmmlu/cmmlu.py +87 -92
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +109 -155
- evalscope/benchmarks/cmmlu/samples.jsonl +1 -1
- evalscope/benchmarks/competition_math/__init__.py +3 -2
- evalscope/benchmarks/competition_math/competition_math.py +7 -16
- evalscope/benchmarks/competition_math/competition_math_adapter.py +32 -34
- evalscope/benchmarks/data_adapter.py +24 -24
- evalscope/benchmarks/general_qa/__init__.py +3 -2
- evalscope/benchmarks/general_qa/general_qa_adapter.py +34 -38
- evalscope/benchmarks/gsm8k/__init__.py +1 -1
- evalscope/benchmarks/gsm8k/gsm8k.py +6 -12
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +26 -24
- evalscope/benchmarks/hellaswag/__init__.py +3 -2
- evalscope/benchmarks/hellaswag/hellaswag.py +15 -19
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +27 -23
- evalscope/benchmarks/humaneval/__init__.py +1 -1
- evalscope/benchmarks/humaneval/humaneval.py +15 -18
- evalscope/benchmarks/humaneval/humaneval_adapter.py +0 -1
- evalscope/benchmarks/mmlu/__init__.py +3 -2
- evalscope/benchmarks/mmlu/mmlu.py +15 -29
- evalscope/benchmarks/mmlu/mmlu_adapter.py +85 -77
- evalscope/benchmarks/race/__init__.py +3 -2
- evalscope/benchmarks/race/race.py +21 -35
- evalscope/benchmarks/race/race_adapter.py +32 -29
- evalscope/benchmarks/race/samples.jsonl +1 -1
- evalscope/benchmarks/trivia_qa/__init__.py +3 -2
- evalscope/benchmarks/trivia_qa/samples.jsonl +1 -1
- evalscope/benchmarks/trivia_qa/trivia_qa.py +19 -34
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +27 -22
- evalscope/benchmarks/truthful_qa/__init__.py +3 -2
- evalscope/benchmarks/truthful_qa/truthful_qa.py +25 -29
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +36 -37
- evalscope/cli/cli.py +6 -5
- evalscope/cli/start_eval.py +31 -0
- evalscope/cli/start_perf.py +0 -3
- evalscope/cli/start_server.py +27 -41
- evalscope/config.py +119 -95
- evalscope/constants.py +61 -29
- evalscope/evaluator/__init__.py +1 -0
- evalscope/evaluator/evaluator.py +96 -377
- evalscope/evaluator/humaneval_evaluator.py +158 -0
- evalscope/evaluator/rating_eval.py +12 -33
- evalscope/evaluator/reviewer/auto_reviewer.py +47 -76
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +10 -20
- evalscope/metrics/code_metric.py +3 -9
- evalscope/metrics/math_accuracy.py +3 -6
- evalscope/metrics/metrics.py +21 -21
- evalscope/metrics/rouge_metric.py +11 -25
- evalscope/models/__init__.py +1 -2
- evalscope/models/api/openai_api.py +40 -29
- evalscope/models/custom/__init__.py +0 -1
- evalscope/models/custom/custom_model.py +3 -3
- evalscope/models/dummy_chat_model.py +7 -8
- evalscope/models/model_adapter.py +89 -156
- evalscope/models/openai_model.py +20 -20
- evalscope/perf/arguments.py +15 -3
- evalscope/perf/benchmark.py +7 -9
- evalscope/perf/http_client.py +3 -8
- evalscope/perf/main.py +10 -0
- evalscope/perf/plugin/api/custom_api.py +1 -2
- evalscope/perf/plugin/api/dashscope_api.py +1 -2
- evalscope/perf/plugin/api/openai_api.py +3 -4
- evalscope/perf/plugin/datasets/base.py +1 -2
- evalscope/perf/plugin/datasets/flickr8k.py +1 -2
- evalscope/perf/plugin/datasets/longalpaca.py +1 -2
- evalscope/perf/plugin/datasets/openqa.py +1 -2
- evalscope/perf/utils/analysis_result.py +1 -2
- evalscope/perf/utils/benchmark_util.py +1 -2
- evalscope/perf/utils/db_util.py +11 -8
- evalscope/perf/utils/local_server.py +19 -13
- evalscope/registry/config/cfg_arena_zhihu.yaml +1 -1
- evalscope/registry/tasks/arc.yaml +2 -3
- evalscope/registry/tasks/bbh.yaml +3 -4
- evalscope/registry/tasks/bbh_mini.yaml +3 -4
- evalscope/registry/tasks/ceval.yaml +3 -3
- evalscope/registry/tasks/ceval_mini.yaml +3 -4
- evalscope/registry/tasks/cmmlu.yaml +3 -3
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +1 -1
- evalscope/registry/tasks/general_qa.yaml +1 -1
- evalscope/registry/tasks/gsm8k.yaml +2 -2
- evalscope/registry/tasks/mmlu.yaml +3 -3
- evalscope/registry/tasks/mmlu_mini.yaml +3 -3
- evalscope/run.py +184 -375
- evalscope/run_arena.py +20 -25
- evalscope/summarizer.py +16 -17
- evalscope/third_party/longbench_write/README.md +99 -42
- evalscope/third_party/longbench_write/default_task.json +1 -1
- evalscope/third_party/longbench_write/default_task.yaml +8 -7
- evalscope/third_party/longbench_write/eval.py +29 -28
- evalscope/third_party/longbench_write/infer.py +16 -104
- evalscope/third_party/longbench_write/longbench_write.py +5 -5
- evalscope/third_party/longbench_write/resources/judge.txt +1 -1
- evalscope/third_party/longbench_write/tools/data_etl.py +4 -5
- evalscope/third_party/longbench_write/utils.py +0 -1
- evalscope/third_party/toolbench_static/eval.py +14 -15
- evalscope/third_party/toolbench_static/infer.py +48 -69
- evalscope/third_party/toolbench_static/llm/swift_infer.py +4 -12
- evalscope/third_party/toolbench_static/requirements.txt +1 -1
- evalscope/third_party/toolbench_static/toolbench_static.py +3 -3
- evalscope/tools/combine_reports.py +25 -30
- evalscope/tools/rewrite_eval_results.py +14 -46
- evalscope/utils/__init__.py +0 -1
- evalscope/utils/arena_utils.py +18 -48
- evalscope/{perf/utils → utils}/chat_service.py +3 -4
- evalscope/utils/completion_parsers.py +3 -8
- evalscope/utils/logger.py +9 -7
- evalscope/utils/model_utils.py +11 -0
- evalscope/utils/utils.py +12 -138
- evalscope/version.py +2 -2
- {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/METADATA +125 -120
- evalscope-0.8.0.dist-info/RECORD +285 -0
- tests/cli/test_run.py +54 -15
- tests/perf/test_perf.py +4 -0
- tests/rag/test_clip_benchmark.py +38 -38
- tests/rag/test_mteb.py +3 -2
- tests/rag/test_ragas.py +5 -5
- tests/swift/test_run_swift_eval.py +2 -3
- tests/swift/test_run_swift_vlm_eval.py +2 -3
- tests/swift/test_run_swift_vlm_jugde_eval.py +2 -3
- evalscope/backend/rag_eval/ragas/metrics/__init__.py +0 -2
- evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py +0 -91
- evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py +0 -99
- evalscope/cache.py +0 -98
- evalscope/models/template.py +0 -1446
- evalscope/run_ms.py +0 -140
- evalscope/utils/task_cfg_parser.py +0 -10
- evalscope/utils/task_utils.py +0 -22
- evalscope-0.7.1.dist-info/RECORD +0 -286
- {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/LICENSE +0 -0
- {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/WHEEL +0 -0
- {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/top_level.txt +0 -0
evalscope/backend/rag_eval/ragas/tasks/testset_generation.py
@@ -1,6 +1,4 @@
-import asyncio
 import os
-
 import pandas as pd
 from ragas.embeddings import LangchainEmbeddingsWrapper
 from ragas.llms import LangchainLLMWrapper
@@ -9,117 +7,12 @@ from tqdm import tqdm
 from evalscope.backend.rag_eval import LLM, ChatOpenAI, EmbeddingModel
 from evalscope.backend.rag_eval.ragas.arguments import TestsetGenerationArguments
 from evalscope.utils.logger import get_logger
-from .
+from .build_distribution import default_query_distribution
+from .build_transform import default_transforms

 logger = get_logger()


-def get_transform(llm, embedding, language):
-    """
-    Creates and returns a default set of transforms for processing a knowledge graph.
-    """
-    from ragas.testset.transforms.engine import Parallel
-    from ragas.testset.transforms.extractors import (
-        EmbeddingExtractor,
-        HeadlinesExtractor,
-        SummaryExtractor,
-    )
-    from ragas.testset.transforms.extractors.llm_based import NERExtractor, ThemesExtractor
-    from ragas.testset.transforms.relationship_builders import (
-        CosineSimilarityBuilder,
-        OverlapScoreBuilder,
-    )
-    from ragas.testset.transforms.splitters import HeadlineSplitter
-    from ragas.testset.transforms.filters import CustomNodeFilter
-    from ragas.testset.graph import NodeType
-    from ragas.utils import num_tokens_from_string
-
-    def summary_filter(node):
-        return (node.type == NodeType.DOCUMENT and num_tokens_from_string(node.properties['page_content']) > 500)
-
-    summary_extractor = SummaryExtractor(llm=llm, filter_nodes=lambda node: summary_filter(node))
-    ner_extractor = NERExtractor(llm=llm, filter_nodes=lambda node: node.type == NodeType.CHUNK)
-    theme_extractor = ThemesExtractor(llm=llm)
-    headline_extractor = HeadlinesExtractor(llm=llm)
-
-    asyncio.run(
-        translate_prompts(
-            prompts=[
-                summary_extractor,
-                theme_extractor,
-                ner_extractor,
-                headline_extractor,
-            ],
-            target_lang=language,
-            llm=llm,
-            adapt_instruction=True,
-        ))
-
-    splitter = HeadlineSplitter(min_tokens=500)
-
-    summary_emb_extractor = EmbeddingExtractor(
-        embedding_model=embedding,
-        property_name='summary_embedding',
-        embed_property_name='summary',
-        filter_nodes=lambda node: summary_filter(node),
-    )
-
-    cosine_sim_builder = CosineSimilarityBuilder(
-        property_name='summary_embedding',
-        new_property_name='summary_similarity',
-        threshold=0.7,
-        filter_nodes=lambda node: summary_filter(node),
-    )
-
-    ner_overlap_sim = OverlapScoreBuilder(threshold=0.01, filter_nodes=lambda node: node.type == NodeType.CHUNK)
-
-    node_filter = CustomNodeFilter(llm=llm, filter_nodes=lambda node: node.type == NodeType.CHUNK)
-
-    transforms = [
-        headline_extractor,
-        splitter,
-        summary_extractor,
-        node_filter,
-        Parallel(summary_emb_extractor, theme_extractor, ner_extractor),
-        Parallel(cosine_sim_builder, ner_overlap_sim),
-    ]
-
-    return transforms
-
-
-def get_distribution(llm, distribution, language):
-    from ragas.testset.synthesizers.multi_hop import (
-        MultiHopAbstractQuerySynthesizer,
-        MultiHopSpecificQuerySynthesizer,
-    )
-    from ragas.testset.synthesizers.single_hop.specific import (
-        SingleHopSpecificQuerySynthesizer, )
-
-    single_hop = SingleHopSpecificQuerySynthesizer(llm=llm)
-    multi_hop_abs = MultiHopAbstractQuerySynthesizer(llm=llm)
-    multi_hop_spec = MultiHopSpecificQuerySynthesizer(llm=llm)
-
-    asyncio.run(
-        translate_prompts(
-            prompts=[
-                single_hop,
-                multi_hop_abs,
-                multi_hop_spec,
-            ],
-            target_lang=language,
-            llm=llm,
-            adapt_instruction=True,
-        ))
-
-    mapping = {
-        'simple': single_hop,
-        'multi_context': multi_hop_abs,
-        'reasoning': multi_hop_spec,
-    }
-
-    return [(mapping[key], distribution[key]) for key in mapping if key in distribution]
-
-
 def get_knowledge_graph(documents, transforms, local_file, run_config):
     from ragas.testset.graph import KnowledgeGraph, Node, NodeType
     from ragas.testset.transforms import apply_transforms
@@ -153,15 +46,9 @@ def get_knowledge_graph(documents, transforms, local_file, run_config):


 def get_persona(llm, kg, language):
-    from
-    from ragas.testset.persona import generate_personas_from_kg, PersonaGenerationPrompt
-    from ragas.testset.graph import Node
+    from ragas.testset.persona import PersonaGenerationPrompt, generate_personas_from_kg

-
-    if (node.type.name == 'DOCUMENT' and node.properties.get('summary_embedding') is not None):
-        return True
-    else:
-        return False
+    from evalscope.backend.rag_eval.ragas.prompts.persona_prompt import PersonaGenerationPromptZH

     if language == 'chinese':
         persona_prompt = PersonaGenerationPromptZH()
@@ -176,27 +63,21 @@ def get_persona(llm, kg, language):
     # adapt_instruction=True,
     # ))

-    return generate_personas_from_kg(
-        llm=llm,
-        kg=kg,
-        num_personas=3,
-        persona_generation_prompt=persona_prompt,
-        filter_fn=filter,
-    )
+    return generate_personas_from_kg(llm=llm, kg=kg, num_personas=3, persona_generation_prompt=persona_prompt)


 def load_data(file_path):
     from langchain_community.document_loaders import UnstructuredFileLoader

-    loader = UnstructuredFileLoader(file_path, mode='
+    loader = UnstructuredFileLoader(file_path, mode='single')
     data = loader.load()
     return data


 def generate_testset(args: TestsetGenerationArguments) -> None:

-    from ragas.testset import TestsetGenerator
     from ragas import RunConfig
+    from ragas.testset import TestsetGenerator

     # load data
     documents = load_data(args.docs)
@@ -208,23 +89,26 @@ def generate_testset(args: TestsetGenerationArguments) -> None:
     wrapped_llm = LangchainLLMWrapper(generator_llm)
     wrapped_embeddings = LangchainEmbeddingsWrapper(embeddings)

-    # Change resulting question type distribution
-    distributions = get_distribution(wrapped_llm, args.distribution, args.language)
-
-    run_config = RunConfig(timeout=600, max_retries=3, max_wait=120, max_workers=1, log_tenacity=True)
     # get transforms
-    transforms =
+    transforms = default_transforms(
+        documents,
         wrapped_llm,
         wrapped_embeddings,
         args.language,
     )

+    run_config = RunConfig(timeout=600, max_retries=10, max_wait=120, max_workers=1, log_tenacity=True)
     # get knowledge graph
     knowledge_graph = get_knowledge_graph(documents, transforms, args.knowledge_graph, run_config)
-
+    # get persona
     persona_list = get_persona(llm=wrapped_llm, kg=knowledge_graph, language=args.language)

-
+    # Change resulting question type distribution
+    distributions = default_query_distribution(wrapped_llm, knowledge_graph, args.language)
+
+    # generate testset
+    generator = TestsetGenerator(
+        llm=wrapped_llm, embedding_model=wrapped_embeddings, knowledge_graph=knowledge_graph, persona_list=persona_list)

     testset = generator.generate(
         testset_size=args.test_size,
evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py
@@ -1,11 +1,11 @@
-import os
 import asyncio
-
-from ragas.prompt import PromptMixin
+import os
 from ragas.llms import BaseRagasLLM
+from ragas.prompt import PromptMixin, PydanticPrompt
 from ragas.utils import RAGAS_SUPPORTED_LANGUAGE_CODES
-from
+from typing import List

+from evalscope.utils.logger import get_logger

 logger = get_logger()

@@ -17,9 +17,7 @@ async def translate_prompt(
     adapt_instruction: bool = False,
 ):
     if target_lang not in RAGAS_SUPPORTED_LANGUAGE_CODES:
-        logger.warning(
-            f'{target_lang} is not in supported language: {list(RAGAS_SUPPORTED_LANGUAGE_CODES)}'
-        )
+        logger.warning(f'{target_lang} is not in supported language: {list(RAGAS_SUPPORTED_LANGUAGE_CODES)}')
         return

     if not issubclass(type(prompt_user), PromptMixin):
@@ -28,9 +26,7 @@ async def translate_prompt(

     class_name = prompt_user.__class__.__name__
     current_dir = os.path.dirname(__file__)
-    prompt_dir = os.path.abspath(
-        os.path.join(current_dir, f'../prompts/{target_lang}/{class_name}')
-    )
+    prompt_dir = os.path.abspath(os.path.join(current_dir, f'../prompts/{target_lang}/{class_name}'))
     os.makedirs(prompt_dir, exist_ok=True)

     try:
@@ -43,8 +39,7 @@ async def translate_prompt(

         logger.info(f'Translating prompts to {target_lang}')
         adapted_prompts = await prompt_user.adapt_prompts(
-            language=target_lang, llm=llm, adapt_instruction=adapt_instruction
-        )
+            language=target_lang, llm=llm, adapt_instruction=adapt_instruction)
         prompt_user.set_prompts(**adapted_prompts)
         try:
             prompt_user.save_prompts(prompt_dir)
@@ -62,11 +57,6 @@ async def translate_prompts(
     adapt_instruction: bool = False,
 ):
     if target_lang and target_lang != 'english':
-        await asyncio.gather(
-            *[
-                translate_prompt(prompt, target_lang, llm, adapt_instruction)
-                for prompt in prompts
-            ]
-        )
+        await asyncio.gather(*[translate_prompt(prompt, target_lang, llm, adapt_instruction) for prompt in prompts])

     logger.info('Translate prompts finished')
evalscope/backend/rag_eval/utils/clip.py
@@ -1,33 +1,37 @@
 import os
 import torch
 import torch.nn.functional as F
-from
+from langchain_core.embeddings import Embeddings
 from PIL import Image
-from evalscope.backend.rag_eval.utils.tools import download_model, PIL_to_base64
 from transformers import AutoModel, AutoProcessor
-from
+from typing import List
+
+from evalscope.backend.rag_eval.utils.tools import PIL_to_base64, download_model
+from evalscope.constants import HubType


 class VisionModel:
+
     @staticmethod
     def load(**kw):
-        api_base = kw.get(
+        api_base = kw.get('api_base', None)
         if api_base:

             return VLMAPI(
-                model_name=kw.get(
+                model_name=kw.get('model_name', ''),
                 openai_api_base=api_base,
-                openai_api_key=kw.get(
-                prompt=kw.get(
+                openai_api_key=kw.get('api_key', 'EMPTY'),
+                prompt=kw.get('prompt', None),
             )
         else:
             return CLIPModel(**kw)


 class VLMAPI:
+
     def __init__(self, model_name, openai_api_base, openai_api_key, prompt=None):
-        from langchain_openai import ChatOpenAI
         from langchain_core.prompts import ChatPromptTemplate
+        from langchain_openai import ChatOpenAI

         self.model_name = model_name
         self.model = ChatOpenAI(
@@ -35,46 +39,45 @@ class VLMAPI:
             openai_api_base=openai_api_base,
             openai_api_key=openai_api_key,
         )
-        self.default_prompt = "Please describe this image in general. Directly provide the description, do not include prefix like 'This image depicts'"
-        self.prompt = ChatPromptTemplate.from_messages(
-
-
-
-
-
-
-
-
-
-
-        ]
-        )
+        self.default_prompt = "Please describe this image in general. Directly provide the description, do not include prefix like 'This image depicts'"  # noqa: E501
+        self.prompt = ChatPromptTemplate.from_messages([
+            ('system', prompt if prompt else self.default_prompt),
+            (
+                'user',
+                [{
+                    'type': 'image_url',
+                    'image_url': {
+                        'url': 'data:image/jpeg;base64,{image_data}'
+                    },
+                }],
+            ),
+        ])
         self.chain = self.prompt | self.model
         self.transform = PIL_to_base64

     def encode_image(self, images):
         captions = []
         for image in images:
-            response = self.chain.invoke({
+            response = self.chain.invoke({'image_data': image})
             captions.append(response.content)
         return captions


 class CLIPModel(Embeddings):
+
     def __init__(
         self,
         model_name: str,
-        revision: str =
-        hub=
-        device=
+        revision: str = 'master',
+        hub=HubType.MODELSCOPE,
+        device='cpu',
     ):
         self.device = device
         self.model_name = model_name
         self.revision = revision

         # Download the model if it doesn't exist locally
-        if not os.path.exists(model_name) and hub ==
+        if not os.path.exists(model_name) and hub == HubType.MODELSCOPE:
             model_name = download_model(self.model_name, self.revision)

         # Load the model and processor
@@ -85,9 +88,7 @@ class CLIPModel(Embeddings):

     def encode_text(self, batch_texts: List[str] | List[List[str]]):
         if isinstance(batch_texts[0], list):
-            batch_texts = [
-                text for _, texts in enumerate(batch_texts) for text in texts
-            ]
+            batch_texts = [text for _, texts in enumerate(batch_texts) for text in texts]
         # Ensure that the input texts are within the token limit
         max_length = self.tokenizer.model_max_length
         if not max_length or max_length > 0xFFFFFF:
@@ -97,7 +98,7 @@ class CLIPModel(Embeddings):
             max_length=max_length,
             padding=True,
             truncation=True,
-            return_tensors=
+            return_tensors='pt',
         )

         inputs = {k: v.to(self.device) for k, v in encoded_inputs.items()}
@@ -108,7 +109,7 @@ class CLIPModel(Embeddings):
         return text_features

     def encode_image(self, image):
-        batch_images = torch.stack([d[
+        batch_images = torch.stack([d['pixel_values'][0] for d in image])
         batch_images = batch_images.to(self.device)
         with torch.no_grad():
             image_features = self.model.get_image_features(batch_images)
@@ -126,24 +127,19 @@ class CLIPModel(Embeddings):
     def embed_image(self, uris: List[str]):
         # read image and transform
         images = [Image.open(image_path) for image_path in uris]
-        transformed_images = [
-
-
-
-        )
-            for image in images
-        ]
+        transformed_images = [self.transform(
+            image,
+            return_tensors='pt',
+        ) for image in images]
         image_features = self.encode_image(transformed_images)
         return image_features.cpu().numpy().tolist()


-if __name__ ==
-    model = CLIPModel(
-    model.embed_image(
-
-
-
-
-    )
-    model.encode_text(["我喜欢吃饭" * 1000])
-    print("done")
+if __name__ == '__main__':
+    model = CLIPModel('AI-ModelScope/chinese-clip-vit-large-patch14-336px')
+    model.embed_image([
+        'custom_eval/multimodal/images/AMNH.jpg',
+        'custom_eval/multimodal/images/AMNH.jpg',
+    ])
+    model.encode_text(['我喜欢吃饭' * 1000])
+    print('done')
evalscope/backend/rag_eval/utils/embedding.py
@@ -1,18 +1,21 @@
 import os
 import torch
-from
+from langchain_core.embeddings import Embeddings
 from sentence_transformers import models
-from sentence_transformers.SentenceTransformer import SentenceTransformer
 from sentence_transformers.cross_encoder import CrossEncoder
+from sentence_transformers.SentenceTransformer import SentenceTransformer
 from torch import Tensor
+from typing import Dict, List, Optional, Union
+
 from evalscope.backend.rag_eval.utils.tools import download_model
+from evalscope.constants import HubType
 from evalscope.utils.logger import get_logger
-from langchain_core.embeddings import Embeddings

 logger = get_logger()


 class BaseModel(Embeddings):
+
     def __init__(
         self,
         model_name_or_path: str,
@@ -83,9 +86,8 @@ class BaseModel(Embeddings):


 class SentenceTransformerModel(BaseModel):
-
-
-    ):
+
+    def __init__(self, model_name_or_path: str, pooling_mode: Optional[str] = None, **kwargs):
         super().__init__(model_name_or_path, **kwargs)

         if not pooling_mode:
@@ -104,9 +106,7 @@ class SentenceTransformerModel(BaseModel):
             word_embedding_model.get_word_embedding_dimension(),
             pooling_mode=pooling_mode,
         )
-        self.model = SentenceTransformer(
-            modules=[word_embedding_model, pooling_model],
-        )
+        self.model = SentenceTransformer(modules=[word_embedding_model, pooling_model], )

         self.model.max_seq_length = self.max_seq_length

@@ -130,6 +130,7 @@ class SentenceTransformerModel(BaseModel):


 class CrossEncoderModel(BaseModel):
+
     def __init__(self, model_name_or_path: str, **kwargs):
         super().__init__(model_name_or_path, **kwargs)
         self.model = CrossEncoder(
@@ -160,12 +161,12 @@ class EmbeddingModel:
     def load(
         model_name_or_path: str = '',
         is_cross_encoder: bool = False,
-        hub: str =
+        hub: str = HubType.MODELSCOPE,
         revision: Optional[str] = 'master',
         **kwargs,
     ):
         # If model path does not exist and hub is 'modelscope', download the model
-        if not os.path.exists(model_name_or_path) and hub ==
+        if not os.path.exists(model_name_or_path) and hub == HubType.MODELSCOPE:
             model_name_or_path = download_model(model_name_or_path, revision)

         # Return different model instances based on whether it is a cross-encoder and pooling mode
evalscope/backend/rag_eval/utils/llm.py
@@ -1,13 +1,16 @@
 import os
-from typing import Any, Dict, Iterator, List, Mapping, Optional
-from modelscope.utils.hf_util import GenerationConfig
 from langchain_core.callbacks.manager import CallbackManagerForLLMRun
 from langchain_core.language_models.llms import LLM as BaseLLM
-from evalscope.models.model_adapter import ChatGenerationModelAdapter
 from langchain_openai import ChatOpenAI
+from modelscope.utils.hf_util import GenerationConfig
+from typing import Any, Dict, Iterator, List, Mapping, Optional
+
+from evalscope.constants import DEFAULT_MODEL_REVISION
+from evalscope.models.model_adapter import ChatGenerationModelAdapter


 class LLM:
+
     @staticmethod
     def load(**kw):
         api_base = kw.get('api_base', None)
@@ -25,8 +28,8 @@ class LocalLLM(BaseLLM):
     """A custom LLM that loads a model from a given path and performs inference."""

     model_name_or_path: str
-    model_revision: str =
-    template_type: str =
+    model_revision: str = DEFAULT_MODEL_REVISION
+    template_type: Optional[str] = None
     model_name: Optional[str]
     model: Optional[ChatGenerationModelAdapter]
     generation_config: Optional[Dict]
@@ -37,7 +40,6 @@ class LocalLLM(BaseLLM):
         self.model = ChatGenerationModelAdapter(
             model_id=self.model_name_or_path,
             model_revision=self.model_revision,
-            template_type=self.template_type,
             generation_config=GenerationConfig(**self.generation_config) if self.generation_config else None,
         )

evalscope/backend/rag_eval/utils/tools.py
@@ -1,7 +1,8 @@
+import base64
 import io
 import os
-import base64
 from modelscope import snapshot_download
+
 from evalscope.utils.logger import get_logger

 logger = get_logger()
@@ -9,9 +10,9 @@ logger = get_logger()

 def PIL_to_bytes(image_format, **kwargs):
     OPTIONS = {
-
-
-
+        'webp': dict(format='webp', lossless=True),
+        'png': dict(format='png'),
+        'jpg': dict(format='jpeg'),
     }

     def transform(image):
@@ -24,18 +25,18 @@ def PIL_to_bytes(image_format, **kwargs):

 def PIL_to_base64(image, **kwargs):
     bytestream = io.BytesIO()
-    image.save(bytestream, format=
-    return base64.b64encode(bytestream.getvalue()).decode(
+    image.save(bytestream, format='jpeg')
+    return base64.b64encode(bytestream.getvalue()).decode('utf-8')


 def path_to_bytes(filepath):
-    with open(filepath,
+    with open(filepath, 'rb') as fp:
         return fp.read()


 def path_to_base64(filepath):
     file_content = path_to_bytes(filepath)
-    return base64.b64encode(file_content).decode(
+    return base64.b64encode(file_content).decode('utf-8')


 def ensure_dir(file_path):
@@ -44,19 +45,19 @@ def ensure_dir(file_path):

 def save_to_jsonl(df, file_path):
     ensure_dir(file_path)
-    df.to_json(file_path, orient=
+    df.to_json(file_path, orient='records', lines=True, force_ascii=False)


 def save_to_tsv(df, file_path):
     ensure_dir(file_path)
-    df.to_csv(file_path, sep=
+    df.to_csv(file_path, sep='\t', index=False)


 def download_model(model_id: str, revision: str):
     """
     default base dir: '~/.cache/modelscope/hub/model_id'
     """
-    logger.info(f
+    logger.info(f'Loading model {model_id} from modelscope')

     model_path = snapshot_download(model_id=model_id, revision=revision)

evalscope/backend/vlm_eval_kit/__init__.py
@@ -1 +1 @@
-from evalscope.backend.vlm_eval_kit.backend_manager import VLMEvalKitBackendManager
+from evalscope.backend.vlm_eval_kit.backend_manager import VLMEvalKitBackendManager
evalscope/backend/vlm_eval_kit/custom_dataset.py
@@ -1,32 +1,31 @@
-import os
 import numpy as np
+import os
 from vlmeval.dataset.image_base import ImageBaseDataset
 from vlmeval.dataset.image_vqa import CustomVQADataset
-from vlmeval.smp import
+from vlmeval.smp import d2df, dump, load
+

 class CustomDataset:

     def load_data(self, dataset):
         # customize the loading of the dataset
-        data_path = os.path.join(os.path.expanduser(
+        data_path = os.path.join(os.path.expanduser('~/LMUData'), f'{dataset}.tsv')
         return load(data_path)

-
     def build_prompt(self, line):
         msgs = ImageBaseDataset.build_prompt(self, line)
         # add a hint or custom instruction here
         msgs[-1]['value'] += '\nAnswer the question using a single word or phrase.'
         return msgs
-
-
+
     def evaluate(self, eval_file, **judge_kwargs):
         data = load(eval_file)
         assert 'answer' in data and 'prediction' in data
         data['prediction'] = [str(x) for x in data['prediction']]
         data['answer'] = [str(x).lower() for x in data['answer']]
-
+
         print(data)
-
+
         # ========compute the evaluation metrics as you need =========
         # exact match
         result = np.mean(data['answer'] == data['prediction'])
evalscope/benchmarks/arc/__init__.py
@@ -1,5 +1,6 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.

-from evalscope.benchmarks.arc.arc_adapter import
+from evalscope.benchmarks.arc.arc_adapter import DATASET_ID, SUBSET_LIST
+from evalscope.benchmarks.arc.arc_adapter import ARCAdapter
 from evalscope.benchmarks.arc.arc_adapter import ARCAdapter as DataAdapterClass
-from evalscope.models.model_adapter import MultiChoiceModelAdapter as ModelAdapterClass
+from evalscope.models.model_adapter import MultiChoiceModelAdapter as ModelAdapterClass  # noqa