evalscope 0.7.2__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release.


This version of evalscope might be problematic.

Files changed (233)
  1. evalscope/__init__.py +1 -1
  2. evalscope/arguments.py +73 -0
  3. evalscope/backend/base.py +5 -1
  4. evalscope/backend/opencompass/api_meta_template.py +8 -14
  5. evalscope/backend/opencompass/backend_manager.py +24 -15
  6. evalscope/backend/opencompass/tasks/eval_api.py +1 -6
  7. evalscope/backend/opencompass/tasks/eval_datasets.py +26 -28
  8. evalscope/backend/rag_eval/__init__.py +3 -3
  9. evalscope/backend/rag_eval/backend_manager.py +21 -25
  10. evalscope/backend/rag_eval/clip_benchmark/__init__.py +1 -1
  11. evalscope/backend/rag_eval/clip_benchmark/arguments.py +6 -6
  12. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +62 -79
  13. evalscope/backend/rag_eval/clip_benchmark/task_template.py +29 -43
  14. evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +20 -22
  15. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +16 -23
  16. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +14 -35
  17. evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +69 -90
  18. evalscope/backend/rag_eval/cmteb/__init__.py +3 -3
  19. evalscope/backend/rag_eval/cmteb/arguments.py +25 -27
  20. evalscope/backend/rag_eval/cmteb/base.py +22 -23
  21. evalscope/backend/rag_eval/cmteb/task_template.py +15 -17
  22. evalscope/backend/rag_eval/cmteb/tasks/Classification.py +98 -79
  23. evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +17 -22
  24. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +17 -19
  25. evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +35 -29
  26. evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +18 -5
  27. evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +163 -163
  28. evalscope/backend/rag_eval/cmteb/tasks/STS.py +126 -104
  29. evalscope/backend/rag_eval/cmteb/tasks/__init__.py +33 -34
  30. evalscope/backend/rag_eval/ragas/__init__.py +2 -2
  31. evalscope/backend/rag_eval/ragas/arguments.py +3 -8
  32. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +9 -9
  33. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +2 -2
  34. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +3 -3
  35. evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +5 -5
  36. evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +7 -0
  37. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +8 -8
  38. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +5 -5
  39. evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +7 -5
  40. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +2 -2
  41. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
  42. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
  43. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
  44. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
  45. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +2 -2
  46. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +5 -5
  47. evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +3 -3
  48. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +21 -4
  49. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +3 -3
  50. evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +4 -4
  51. evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +2 -2
  52. evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -1
  53. evalscope/backend/rag_eval/ragas/task_template.py +10 -15
  54. evalscope/backend/rag_eval/ragas/tasks/__init__.py +1 -1
  55. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +45 -0
  56. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +135 -0
  57. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +17 -133
  58. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +8 -18
  59. evalscope/backend/rag_eval/utils/clip.py +46 -50
  60. evalscope/backend/rag_eval/utils/embedding.py +12 -11
  61. evalscope/backend/rag_eval/utils/llm.py +8 -6
  62. evalscope/backend/rag_eval/utils/tools.py +12 -11
  63. evalscope/backend/vlm_eval_kit/__init__.py +1 -1
  64. evalscope/backend/vlm_eval_kit/custom_dataset.py +7 -8
  65. evalscope/benchmarks/arc/__init__.py +3 -2
  66. evalscope/benchmarks/arc/ai2_arc.py +19 -16
  67. evalscope/benchmarks/arc/arc_adapter.py +32 -24
  68. evalscope/benchmarks/bbh/__init__.py +1 -2
  69. evalscope/benchmarks/bbh/bbh_adapter.py +28 -25
  70. evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +1 -1
  71. evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +1 -1
  72. evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +1 -1
  73. evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +1 -1
  74. evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +1 -1
  75. evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +1 -1
  76. evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +1 -1
  77. evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +1 -1
  78. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +1 -1
  79. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +1 -1
  80. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +1 -1
  81. evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +1 -1
  82. evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +1 -1
  83. evalscope/benchmarks/bbh/cot_prompts/navigate.txt +1 -1
  84. evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +1 -1
  85. evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +1 -1
  86. evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +1 -1
  87. evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +1 -1
  88. evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +1 -1
  89. evalscope/benchmarks/bbh/cot_prompts/snarks.txt +1 -1
  90. evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +1 -1
  91. evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +1 -1
  92. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +1 -1
  93. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +1 -1
  94. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +1 -1
  95. evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +1 -1
  96. evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +1 -1
  97. evalscope/benchmarks/benchmark.py +16 -16
  98. evalscope/benchmarks/ceval/__init__.py +3 -2
  99. evalscope/benchmarks/ceval/ceval_adapter.py +80 -69
  100. evalscope/benchmarks/ceval/ceval_exam.py +18 -31
  101. evalscope/benchmarks/cmmlu/__init__.py +3 -2
  102. evalscope/benchmarks/cmmlu/cmmlu.py +87 -92
  103. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +109 -155
  104. evalscope/benchmarks/cmmlu/samples.jsonl +1 -1
  105. evalscope/benchmarks/competition_math/__init__.py +3 -2
  106. evalscope/benchmarks/competition_math/competition_math.py +7 -16
  107. evalscope/benchmarks/competition_math/competition_math_adapter.py +32 -34
  108. evalscope/benchmarks/data_adapter.py +24 -24
  109. evalscope/benchmarks/general_qa/__init__.py +3 -2
  110. evalscope/benchmarks/general_qa/general_qa_adapter.py +34 -38
  111. evalscope/benchmarks/gsm8k/__init__.py +1 -1
  112. evalscope/benchmarks/gsm8k/gsm8k.py +6 -12
  113. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +26 -24
  114. evalscope/benchmarks/hellaswag/__init__.py +3 -2
  115. evalscope/benchmarks/hellaswag/hellaswag.py +15 -19
  116. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +27 -23
  117. evalscope/benchmarks/humaneval/__init__.py +1 -1
  118. evalscope/benchmarks/humaneval/humaneval.py +15 -18
  119. evalscope/benchmarks/humaneval/humaneval_adapter.py +0 -1
  120. evalscope/benchmarks/mmlu/__init__.py +3 -2
  121. evalscope/benchmarks/mmlu/mmlu.py +15 -29
  122. evalscope/benchmarks/mmlu/mmlu_adapter.py +85 -77
  123. evalscope/benchmarks/race/__init__.py +3 -2
  124. evalscope/benchmarks/race/race.py +21 -35
  125. evalscope/benchmarks/race/race_adapter.py +32 -29
  126. evalscope/benchmarks/race/samples.jsonl +1 -1
  127. evalscope/benchmarks/trivia_qa/__init__.py +3 -2
  128. evalscope/benchmarks/trivia_qa/samples.jsonl +1 -1
  129. evalscope/benchmarks/trivia_qa/trivia_qa.py +19 -34
  130. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +27 -22
  131. evalscope/benchmarks/truthful_qa/__init__.py +3 -2
  132. evalscope/benchmarks/truthful_qa/truthful_qa.py +25 -29
  133. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +36 -37
  134. evalscope/cli/cli.py +6 -5
  135. evalscope/cli/start_eval.py +31 -0
  136. evalscope/cli/start_perf.py +0 -3
  137. evalscope/cli/start_server.py +27 -41
  138. evalscope/config.py +119 -95
  139. evalscope/constants.py +61 -29
  140. evalscope/evaluator/__init__.py +1 -0
  141. evalscope/evaluator/evaluator.py +96 -377
  142. evalscope/evaluator/humaneval_evaluator.py +158 -0
  143. evalscope/evaluator/rating_eval.py +12 -33
  144. evalscope/evaluator/reviewer/auto_reviewer.py +47 -76
  145. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +10 -20
  146. evalscope/metrics/code_metric.py +3 -9
  147. evalscope/metrics/math_accuracy.py +3 -6
  148. evalscope/metrics/metrics.py +21 -21
  149. evalscope/metrics/rouge_metric.py +11 -25
  150. evalscope/models/__init__.py +1 -2
  151. evalscope/models/api/openai_api.py +40 -29
  152. evalscope/models/custom/__init__.py +0 -1
  153. evalscope/models/custom/custom_model.py +3 -3
  154. evalscope/models/dummy_chat_model.py +7 -8
  155. evalscope/models/model_adapter.py +89 -156
  156. evalscope/models/openai_model.py +20 -20
  157. evalscope/perf/arguments.py +15 -3
  158. evalscope/perf/benchmark.py +7 -9
  159. evalscope/perf/http_client.py +3 -8
  160. evalscope/perf/main.py +10 -0
  161. evalscope/perf/plugin/api/custom_api.py +1 -2
  162. evalscope/perf/plugin/api/dashscope_api.py +1 -2
  163. evalscope/perf/plugin/api/openai_api.py +2 -3
  164. evalscope/perf/plugin/datasets/base.py +1 -2
  165. evalscope/perf/plugin/datasets/flickr8k.py +1 -2
  166. evalscope/perf/plugin/datasets/longalpaca.py +1 -2
  167. evalscope/perf/plugin/datasets/openqa.py +1 -2
  168. evalscope/perf/utils/analysis_result.py +1 -2
  169. evalscope/perf/utils/benchmark_util.py +1 -2
  170. evalscope/perf/utils/db_util.py +11 -8
  171. evalscope/perf/utils/local_server.py +19 -13
  172. evalscope/registry/config/cfg_arena_zhihu.yaml +1 -1
  173. evalscope/registry/tasks/arc.yaml +2 -3
  174. evalscope/registry/tasks/bbh.yaml +3 -4
  175. evalscope/registry/tasks/bbh_mini.yaml +3 -4
  176. evalscope/registry/tasks/ceval.yaml +3 -3
  177. evalscope/registry/tasks/ceval_mini.yaml +3 -4
  178. evalscope/registry/tasks/cmmlu.yaml +3 -3
  179. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +1 -1
  180. evalscope/registry/tasks/general_qa.yaml +1 -1
  181. evalscope/registry/tasks/gsm8k.yaml +2 -2
  182. evalscope/registry/tasks/mmlu.yaml +3 -3
  183. evalscope/registry/tasks/mmlu_mini.yaml +3 -3
  184. evalscope/run.py +184 -375
  185. evalscope/run_arena.py +20 -25
  186. evalscope/summarizer.py +16 -17
  187. evalscope/third_party/longbench_write/README.md +99 -42
  188. evalscope/third_party/longbench_write/default_task.json +1 -1
  189. evalscope/third_party/longbench_write/default_task.yaml +8 -7
  190. evalscope/third_party/longbench_write/eval.py +29 -28
  191. evalscope/third_party/longbench_write/infer.py +16 -104
  192. evalscope/third_party/longbench_write/longbench_write.py +5 -5
  193. evalscope/third_party/longbench_write/resources/judge.txt +1 -1
  194. evalscope/third_party/longbench_write/tools/data_etl.py +4 -5
  195. evalscope/third_party/longbench_write/utils.py +0 -1
  196. evalscope/third_party/toolbench_static/eval.py +14 -15
  197. evalscope/third_party/toolbench_static/infer.py +48 -69
  198. evalscope/third_party/toolbench_static/llm/swift_infer.py +4 -12
  199. evalscope/third_party/toolbench_static/requirements.txt +1 -1
  200. evalscope/third_party/toolbench_static/toolbench_static.py +3 -3
  201. evalscope/tools/combine_reports.py +25 -30
  202. evalscope/tools/rewrite_eval_results.py +14 -46
  203. evalscope/utils/__init__.py +0 -1
  204. evalscope/utils/arena_utils.py +18 -48
  205. evalscope/{perf/utils → utils}/chat_service.py +3 -4
  206. evalscope/utils/completion_parsers.py +3 -8
  207. evalscope/utils/logger.py +9 -7
  208. evalscope/utils/model_utils.py +11 -0
  209. evalscope/utils/utils.py +12 -138
  210. evalscope/version.py +2 -2
  211. {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/METADATA +123 -118
  212. evalscope-0.8.0.dist-info/RECORD +285 -0
  213. tests/cli/test_run.py +54 -15
  214. tests/perf/test_perf.py +4 -0
  215. tests/rag/test_clip_benchmark.py +38 -38
  216. tests/rag/test_mteb.py +3 -2
  217. tests/rag/test_ragas.py +5 -5
  218. tests/swift/test_run_swift_eval.py +2 -3
  219. tests/swift/test_run_swift_vlm_eval.py +2 -3
  220. tests/swift/test_run_swift_vlm_jugde_eval.py +2 -3
  221. evalscope/backend/rag_eval/ragas/metrics/__init__.py +0 -2
  222. evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py +0 -91
  223. evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py +0 -99
  224. evalscope/cache.py +0 -98
  225. evalscope/models/template.py +0 -1446
  226. evalscope/run_ms.py +0 -140
  227. evalscope/utils/task_cfg_parser.py +0 -10
  228. evalscope/utils/task_utils.py +0 -22
  229. evalscope-0.7.2.dist-info/RECORD +0 -286
  230. {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/LICENSE +0 -0
  231. {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/WHEEL +0 -0
  232. {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/entry_points.txt +0 -0
  233. {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/top_level.txt +0 -0

evalscope/backend/rag_eval/ragas/tasks/testset_generation.py

@@ -1,6 +1,4 @@
-import asyncio
 import os
-
 import pandas as pd
 from ragas.embeddings import LangchainEmbeddingsWrapper
 from ragas.llms import LangchainLLMWrapper
@@ -9,117 +7,12 @@ from tqdm import tqdm
 from evalscope.backend.rag_eval import LLM, ChatOpenAI, EmbeddingModel
 from evalscope.backend.rag_eval.ragas.arguments import TestsetGenerationArguments
 from evalscope.utils.logger import get_logger
-from .translate_prompt import translate_prompts
+from .build_distribution import default_query_distribution
+from .build_transform import default_transforms
 
 logger = get_logger()
 
 
-def get_transform(llm, embedding, language):
-    """
-    Creates and returns a default set of transforms for processing a knowledge graph.
-    """
-    from ragas.testset.transforms.engine import Parallel
-    from ragas.testset.transforms.extractors import (
-        EmbeddingExtractor,
-        HeadlinesExtractor,
-        SummaryExtractor,
-    )
-    from ragas.testset.transforms.extractors.llm_based import NERExtractor, ThemesExtractor
-    from ragas.testset.transforms.relationship_builders import (
-        CosineSimilarityBuilder,
-        OverlapScoreBuilder,
-    )
-    from ragas.testset.transforms.splitters import HeadlineSplitter
-    from ragas.testset.transforms.filters import CustomNodeFilter
-    from ragas.testset.graph import NodeType
-    from ragas.utils import num_tokens_from_string
-
-    def summary_filter(node):
-        return (node.type == NodeType.DOCUMENT and num_tokens_from_string(node.properties['page_content']) > 500)
-
-    summary_extractor = SummaryExtractor(llm=llm, filter_nodes=lambda node: summary_filter(node))
-    ner_extractor = NERExtractor(llm=llm, filter_nodes=lambda node: node.type == NodeType.CHUNK)
-    theme_extractor = ThemesExtractor(llm=llm)
-    headline_extractor = HeadlinesExtractor(llm=llm)
-
-    asyncio.run(
-        translate_prompts(
-            prompts=[
-                summary_extractor,
-                theme_extractor,
-                ner_extractor,
-                headline_extractor,
-            ],
-            target_lang=language,
-            llm=llm,
-            adapt_instruction=True,
-        ))
-
-    splitter = HeadlineSplitter(min_tokens=500)
-
-    summary_emb_extractor = EmbeddingExtractor(
-        embedding_model=embedding,
-        property_name='summary_embedding',
-        embed_property_name='summary',
-        filter_nodes=lambda node: summary_filter(node),
-    )
-
-    cosine_sim_builder = CosineSimilarityBuilder(
-        property_name='summary_embedding',
-        new_property_name='summary_similarity',
-        threshold=0.7,
-        filter_nodes=lambda node: summary_filter(node),
-    )
-
-    ner_overlap_sim = OverlapScoreBuilder(threshold=0.01, filter_nodes=lambda node: node.type == NodeType.CHUNK)
-
-    node_filter = CustomNodeFilter(llm=llm, filter_nodes=lambda node: node.type == NodeType.CHUNK)
-
-    transforms = [
-        headline_extractor,
-        splitter,
-        summary_extractor,
-        node_filter,
-        Parallel(summary_emb_extractor, theme_extractor, ner_extractor),
-        Parallel(cosine_sim_builder, ner_overlap_sim),
-    ]
-
-    return transforms
-
-
-def get_distribution(llm, distribution, language):
-    from ragas.testset.synthesizers.multi_hop import (
-        MultiHopAbstractQuerySynthesizer,
-        MultiHopSpecificQuerySynthesizer,
-    )
-    from ragas.testset.synthesizers.single_hop.specific import (
-        SingleHopSpecificQuerySynthesizer, )
-
-    single_hop = SingleHopSpecificQuerySynthesizer(llm=llm)
-    multi_hop_abs = MultiHopAbstractQuerySynthesizer(llm=llm)
-    multi_hop_spec = MultiHopSpecificQuerySynthesizer(llm=llm)
-
-    asyncio.run(
-        translate_prompts(
-            prompts=[
-                single_hop,
-                multi_hop_abs,
-                multi_hop_spec,
-            ],
-            target_lang=language,
-            llm=llm,
-            adapt_instruction=True,
-        ))
-
-    mapping = {
-        'simple': single_hop,
-        'multi_context': multi_hop_abs,
-        'reasoning': multi_hop_spec,
-    }
-
-    return [(mapping[key], distribution[key]) for key in mapping if key in distribution]
-
-
 def get_knowledge_graph(documents, transforms, local_file, run_config):
     from ragas.testset.graph import KnowledgeGraph, Node, NodeType
     from ragas.testset.transforms import apply_transforms
@@ -153,15 +46,9 @@ def get_knowledge_graph(documents, transforms, local_file, run_config):
 
 
 def get_persona(llm, kg, language):
-    from evalscope.backend.rag_eval.ragas.prompts.persona_prompt import PersonaGenerationPromptZH
-    from ragas.testset.persona import generate_personas_from_kg, PersonaGenerationPrompt
-    from ragas.testset.graph import Node
+    from ragas.testset.persona import PersonaGenerationPrompt, generate_personas_from_kg
 
-    def filter(node: Node) -> bool:
-        if (node.type.name == 'DOCUMENT' and node.properties.get('summary_embedding') is not None):
-            return True
-        else:
-            return False
+    from evalscope.backend.rag_eval.ragas.prompts.persona_prompt import PersonaGenerationPromptZH
 
     if language == 'chinese':
         persona_prompt = PersonaGenerationPromptZH()
@@ -176,27 +63,21 @@ def get_persona(llm, kg, language):
     # adapt_instruction=True,
    # ))
 
-    return generate_personas_from_kg(
-        llm=llm,
-        kg=kg,
-        num_personas=3,
-        persona_generation_prompt=persona_prompt,
-        filter_fn=filter,
-    )
+    return generate_personas_from_kg(llm=llm, kg=kg, num_personas=3, persona_generation_prompt=persona_prompt)
 
 
 def load_data(file_path):
     from langchain_community.document_loaders import UnstructuredFileLoader
 
-    loader = UnstructuredFileLoader(file_path, mode='elements')
+    loader = UnstructuredFileLoader(file_path, mode='single')
     data = loader.load()
     return data
 
 
 def generate_testset(args: TestsetGenerationArguments) -> None:
 
-    from ragas.testset import TestsetGenerator
     from ragas import RunConfig
+    from ragas.testset import TestsetGenerator
 
     # load data
     documents = load_data(args.docs)
@@ -208,23 +89,26 @@ def generate_testset(args: TestsetGenerationArguments) -> None:
     wrapped_llm = LangchainLLMWrapper(generator_llm)
     wrapped_embeddings = LangchainEmbeddingsWrapper(embeddings)
 
-    # Change resulting question type distribution
-    distributions = get_distribution(wrapped_llm, args.distribution, args.language)
-
-    run_config = RunConfig(timeout=600, max_retries=3, max_wait=120, max_workers=1, log_tenacity=True)
     # get transforms
-    transforms = get_transform(
+    transforms = default_transforms(
+        documents,
         wrapped_llm,
         wrapped_embeddings,
         args.language,
     )
 
+    run_config = RunConfig(timeout=600, max_retries=10, max_wait=120, max_workers=1, log_tenacity=True)
     # get knowledge graph
     knowledge_graph = get_knowledge_graph(documents, transforms, args.knowledge_graph, run_config)
-
+    # get persona
     persona_list = get_persona(llm=wrapped_llm, kg=knowledge_graph, language=args.language)
 
-    generator = TestsetGenerator(llm=wrapped_llm, knowledge_graph=knowledge_graph, persona_list=persona_list)
+    # Change resulting question type distribution
+    distributions = default_query_distribution(wrapped_llm, knowledge_graph, args.language)
+
+    # generate testset
+    generator = TestsetGenerator(
+        llm=wrapped_llm, embedding_model=wrapped_embeddings, knowledge_graph=knowledge_graph, persona_list=persona_list)
 
     testset = generator.generate(
         testset_size=args.test_size,
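
For orientation, here is a hedged sketch of how the rewritten generate_testset() body reads once the hunks above are applied. It is reconstructed from this diff, not copied from the released 0.8.0 source; wrapped_llm and wrapped_embeddings come from the unchanged setup lines earlier in the same function.

    # Hedged sketch of the 0.8.0 flow, assembled from the hunks above.
    documents = load_data(args.docs)  # UnstructuredFileLoader(..., mode='single')
    transforms = default_transforms(documents, wrapped_llm, wrapped_embeddings, args.language)
    run_config = RunConfig(timeout=600, max_retries=10, max_wait=120, max_workers=1, log_tenacity=True)
    knowledge_graph = get_knowledge_graph(documents, transforms, args.knowledge_graph, run_config)
    persona_list = get_persona(llm=wrapped_llm, kg=knowledge_graph, language=args.language)
    distributions = default_query_distribution(wrapped_llm, knowledge_graph, args.language)
    generator = TestsetGenerator(
        llm=wrapped_llm,
        embedding_model=wrapped_embeddings,
        knowledge_graph=knowledge_graph,
        persona_list=persona_list,
    )
    # the remaining generate() keyword arguments are truncated in the hunk above
    testset = generator.generate(testset_size=args.test_size)

The default_transforms and default_query_distribution helpers are imported from the new build_transform.py and build_distribution.py modules listed in the files-changed table.
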

evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py

@@ -1,11 +1,11 @@
-import os
 import asyncio
-from typing import List
-from ragas.prompt import PromptMixin
+import os
 from ragas.llms import BaseRagasLLM
+from ragas.prompt import PromptMixin, PydanticPrompt
 from ragas.utils import RAGAS_SUPPORTED_LANGUAGE_CODES
-from evalscope.utils.logger import get_logger
+from typing import List
 
+from evalscope.utils.logger import get_logger
 
 logger = get_logger()
 
@@ -17,9 +17,7 @@ async def translate_prompt(
     adapt_instruction: bool = False,
 ):
     if target_lang not in RAGAS_SUPPORTED_LANGUAGE_CODES:
-        logger.warning(
-            f'{target_lang} is not in supported language: {list(RAGAS_SUPPORTED_LANGUAGE_CODES)}'
-        )
+        logger.warning(f'{target_lang} is not in supported language: {list(RAGAS_SUPPORTED_LANGUAGE_CODES)}')
         return
 
     if not issubclass(type(prompt_user), PromptMixin):
@@ -28,9 +26,7 @@ async def translate_prompt(
 
     class_name = prompt_user.__class__.__name__
     current_dir = os.path.dirname(__file__)
-    prompt_dir = os.path.abspath(
-        os.path.join(current_dir, f'../prompts/{target_lang}/{class_name}')
-    )
+    prompt_dir = os.path.abspath(os.path.join(current_dir, f'../prompts/{target_lang}/{class_name}'))
     os.makedirs(prompt_dir, exist_ok=True)
 
     try:
@@ -43,8 +39,7 @@
 
     logger.info(f'Translating prompts to {target_lang}')
     adapted_prompts = await prompt_user.adapt_prompts(
-        language=target_lang, llm=llm, adapt_instruction=adapt_instruction
-    )
+        language=target_lang, llm=llm, adapt_instruction=adapt_instruction)
     prompt_user.set_prompts(**adapted_prompts)
     try:
         prompt_user.save_prompts(prompt_dir)
@@ -62,11 +57,6 @@
     adapt_instruction: bool = False,
 ):
     if target_lang and target_lang != 'english':
-        await asyncio.gather(
-            *[
-                translate_prompt(prompt, target_lang, llm, adapt_instruction)
-                for prompt in prompts
-            ]
-        )
+        await asyncio.gather(*[translate_prompt(prompt, target_lang, llm, adapt_instruction) for prompt in prompts])
 
     logger.info('Translate prompts finished')

evalscope/backend/rag_eval/utils/clip.py

@@ -1,33 +1,37 @@
 import os
 import torch
 import torch.nn.functional as F
-from typing import List
+from langchain_core.embeddings import Embeddings
 from PIL import Image
-from evalscope.backend.rag_eval.utils.tools import download_model, PIL_to_base64
 from transformers import AutoModel, AutoProcessor
-from langchain_core.embeddings import Embeddings
+from typing import List
+
+from evalscope.backend.rag_eval.utils.tools import PIL_to_base64, download_model
+from evalscope.constants import HubType
 
 
 class VisionModel:
+
     @staticmethod
     def load(**kw):
-        api_base = kw.get("api_base", None)
+        api_base = kw.get('api_base', None)
         if api_base:
 
             return VLMAPI(
-                model_name=kw.get("model_name", ""),
+                model_name=kw.get('model_name', ''),
                 openai_api_base=api_base,
-                openai_api_key=kw.get("api_key", "EMPTY"),
-                prompt=kw.get("prompt", None),
+                openai_api_key=kw.get('api_key', 'EMPTY'),
+                prompt=kw.get('prompt', None),
             )
         else:
             return CLIPModel(**kw)
 
 
 class VLMAPI:
+
     def __init__(self, model_name, openai_api_base, openai_api_key, prompt=None):
-        from langchain_openai import ChatOpenAI
         from langchain_core.prompts import ChatPromptTemplate
+        from langchain_openai import ChatOpenAI
 
         self.model_name = model_name
         self.model = ChatOpenAI(
@@ -35,46 +39,45 @@ class VLMAPI:
             openai_api_base=openai_api_base,
             openai_api_key=openai_api_key,
         )
-        self.default_prompt = "Please describe this image in general. Directly provide the description, do not include prefix like 'This image depicts'"
-        self.prompt = ChatPromptTemplate.from_messages(
-            [
-                ("system", prompt if prompt else self.default_prompt),
-                (
-                    "user",
-                    [
-                        {
-                            "type": "image_url",
-                            "image_url": {"url": "data:image/jpeg;base64,{image_data}"},
-                        }
-                    ],
-                ),
-            ]
-        )
+        self.default_prompt = "Please describe this image in general. Directly provide the description, do not include prefix like 'This image depicts'"  # noqa: E501
+        self.prompt = ChatPromptTemplate.from_messages([
+            ('system', prompt if prompt else self.default_prompt),
+            (
+                'user',
+                [{
+                    'type': 'image_url',
+                    'image_url': {
+                        'url': 'data:image/jpeg;base64,{image_data}'
+                    },
+                }],
+            ),
+        ])
         self.chain = self.prompt | self.model
         self.transform = PIL_to_base64
 
     def encode_image(self, images):
         captions = []
         for image in images:
-            response = self.chain.invoke({"image_data": image})
+            response = self.chain.invoke({'image_data': image})
             captions.append(response.content)
         return captions
 
 
 class CLIPModel(Embeddings):
+
     def __init__(
         self,
         model_name: str,
-        revision: str = "master",
-        hub="modelscope",
-        device="cpu",
+        revision: str = 'master',
+        hub=HubType.MODELSCOPE,
+        device='cpu',
     ):
         self.device = device
         self.model_name = model_name
         self.revision = revision
 
         # Download the model if it doesn't exist locally
-        if not os.path.exists(model_name) and hub == "modelscope":
+        if not os.path.exists(model_name) and hub == HubType.MODELSCOPE:
            model_name = download_model(self.model_name, self.revision)
 
         # Load the model and processor
@@ -85,9 +88,7 @@ class CLIPModel(Embeddings):
 
     def encode_text(self, batch_texts: List[str] | List[List[str]]):
         if isinstance(batch_texts[0], list):
-            batch_texts = [
-                text for _, texts in enumerate(batch_texts) for text in texts
-            ]
+            batch_texts = [text for _, texts in enumerate(batch_texts) for text in texts]
         # Ensure that the input texts are within the token limit
         max_length = self.tokenizer.model_max_length
         if not max_length or max_length > 0xFFFFFF:
@@ -97,7 +98,7 @@
             max_length=max_length,
             padding=True,
             truncation=True,
-            return_tensors="pt",
+            return_tensors='pt',
         )
 
         inputs = {k: v.to(self.device) for k, v in encoded_inputs.items()}
@@ -108,7 +109,7 @@
         return text_features
 
     def encode_image(self, image):
-        batch_images = torch.stack([d["pixel_values"][0] for d in image])
+        batch_images = torch.stack([d['pixel_values'][0] for d in image])
         batch_images = batch_images.to(self.device)
         with torch.no_grad():
             image_features = self.model.get_image_features(batch_images)
@@ -126,24 +127,19 @@
     def embed_image(self, uris: List[str]):
         # read image and transform
         images = [Image.open(image_path) for image_path in uris]
-        transformed_images = [
-            self.transform(
-                image,
-                return_tensors="pt",
-            )
-            for image in images
-        ]
+        transformed_images = [self.transform(
+            image,
+            return_tensors='pt',
+        ) for image in images]
         image_features = self.encode_image(transformed_images)
         return image_features.cpu().numpy().tolist()
 
 
-if __name__ == "__main__":
-    model = CLIPModel("AI-ModelScope/chinese-clip-vit-large-patch14-336px")
-    model.embed_image(
-        [
-            "custom_eval/multimodal/images/AMNH.jpg",
-            "custom_eval/multimodal/images/AMNH.jpg",
-        ]
-    )
-    model.encode_text(["我喜欢吃饭" * 1000])
-    print("done")
+if __name__ == '__main__':
+    model = CLIPModel('AI-ModelScope/chinese-clip-vit-large-patch14-336px')
+    model.embed_image([
+        'custom_eval/multimodal/images/AMNH.jpg',
+        'custom_eval/multimodal/images/AMNH.jpg',
+    ])
+    model.encode_text(['我喜欢吃饭' * 1000])
+    print('done')

evalscope/backend/rag_eval/utils/embedding.py

@@ -1,18 +1,21 @@
 import os
 import torch
-from typing import List, Optional, Union, Dict
+from langchain_core.embeddings import Embeddings
 from sentence_transformers import models
-from sentence_transformers.SentenceTransformer import SentenceTransformer
 from sentence_transformers.cross_encoder import CrossEncoder
+from sentence_transformers.SentenceTransformer import SentenceTransformer
 from torch import Tensor
+from typing import Dict, List, Optional, Union
+
 from evalscope.backend.rag_eval.utils.tools import download_model
+from evalscope.constants import HubType
 from evalscope.utils.logger import get_logger
-from langchain_core.embeddings import Embeddings
 
 logger = get_logger()
 
 
 class BaseModel(Embeddings):
+
     def __init__(
         self,
         model_name_or_path: str,
@@ -83,9 +86,8 @@
 
 
 class SentenceTransformerModel(BaseModel):
-    def __init__(
-        self, model_name_or_path: str, pooling_mode: Optional[str] = None, **kwargs
-    ):
+
+    def __init__(self, model_name_or_path: str, pooling_mode: Optional[str] = None, **kwargs):
         super().__init__(model_name_or_path, **kwargs)
 
         if not pooling_mode:
@@ -104,9 +106,7 @@ class SentenceTransformerModel(BaseModel):
             word_embedding_model.get_word_embedding_dimension(),
             pooling_mode=pooling_mode,
         )
-        self.model = SentenceTransformer(
-            modules=[word_embedding_model, pooling_model],
-        )
+        self.model = SentenceTransformer(modules=[word_embedding_model, pooling_model], )
 
         self.model.max_seq_length = self.max_seq_length
 
@@ -130,6 +130,7 @@
 
 
 class CrossEncoderModel(BaseModel):
+
     def __init__(self, model_name_or_path: str, **kwargs):
         super().__init__(model_name_or_path, **kwargs)
         self.model = CrossEncoder(
@@ -160,12 +161,12 @@ class EmbeddingModel:
     def load(
         model_name_or_path: str = '',
         is_cross_encoder: bool = False,
-        hub: str = 'modelscope',
+        hub: str = HubType.MODELSCOPE,
         revision: Optional[str] = 'master',
         **kwargs,
     ):
         # If model path does not exist and hub is 'modelscope', download the model
-        if not os.path.exists(model_name_or_path) and hub == 'modelscope':
+        if not os.path.exists(model_name_or_path) and hub == HubType.MODELSCOPE:
            model_name_or_path = download_model(model_name_or_path, revision)
 
         # Return different model instances based on whether it is a cross-encoder and pooling mode

evalscope/backend/rag_eval/utils/llm.py

@@ -1,13 +1,16 @@
 import os
-from typing import Any, Dict, Iterator, List, Mapping, Optional
-from modelscope.utils.hf_util import GenerationConfig
 from langchain_core.callbacks.manager import CallbackManagerForLLMRun
 from langchain_core.language_models.llms import LLM as BaseLLM
-from evalscope.models.model_adapter import ChatGenerationModelAdapter
 from langchain_openai import ChatOpenAI
+from modelscope.utils.hf_util import GenerationConfig
+from typing import Any, Dict, Iterator, List, Mapping, Optional
+
+from evalscope.constants import DEFAULT_MODEL_REVISION
+from evalscope.models.model_adapter import ChatGenerationModelAdapter
 
 
 class LLM:
+
     @staticmethod
     def load(**kw):
         api_base = kw.get('api_base', None)
@@ -25,8 +28,8 @@ class LocalLLM(BaseLLM):
     """A custom LLM that loads a model from a given path and performs inference."""
 
     model_name_or_path: str
-    model_revision: str = 'master'
-    template_type: str = 'default'
+    model_revision: str = DEFAULT_MODEL_REVISION
+    template_type: Optional[str] = None
     model_name: Optional[str]
     model: Optional[ChatGenerationModelAdapter]
     generation_config: Optional[Dict]
@@ -37,7 +40,6 @@
         self.model = ChatGenerationModelAdapter(
             model_id=self.model_name_or_path,
             model_revision=self.model_revision,
-            template_type=self.template_type,
             generation_config=GenerationConfig(**self.generation_config) if self.generation_config else None,
         )
 

evalscope/backend/rag_eval/utils/tools.py

@@ -1,7 +1,8 @@
+import base64
 import io
 import os
-import base64
 from modelscope import snapshot_download
+
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
@@ -9,9 +10,9 @@ logger = get_logger()
 
 def PIL_to_bytes(image_format, **kwargs):
     OPTIONS = {
-        "webp": dict(format="webp", lossless=True),
-        "png": dict(format="png"),
-        "jpg": dict(format="jpeg"),
+        'webp': dict(format='webp', lossless=True),
+        'png': dict(format='png'),
+        'jpg': dict(format='jpeg'),
     }
 
     def transform(image):
@@ -24,18 +25,18 @@ def PIL_to_bytes(image_format, **kwargs):
 
 def PIL_to_base64(image, **kwargs):
     bytestream = io.BytesIO()
-    image.save(bytestream, format="jpeg")
-    return base64.b64encode(bytestream.getvalue()).decode("utf-8")
+    image.save(bytestream, format='jpeg')
+    return base64.b64encode(bytestream.getvalue()).decode('utf-8')
 
 
 def path_to_bytes(filepath):
-    with open(filepath, "rb") as fp:
+    with open(filepath, 'rb') as fp:
        return fp.read()
 
 
 def path_to_base64(filepath):
     file_content = path_to_bytes(filepath)
-    return base64.b64encode(file_content).decode("utf-8")
+    return base64.b64encode(file_content).decode('utf-8')
 
 
 def ensure_dir(file_path):
@@ -44,19 +45,19 @@ def ensure_dir(file_path):
 
 def save_to_jsonl(df, file_path):
     ensure_dir(file_path)
-    df.to_json(file_path, orient="records", lines=True, force_ascii=False)
+    df.to_json(file_path, orient='records', lines=True, force_ascii=False)
 
 
 def save_to_tsv(df, file_path):
     ensure_dir(file_path)
-    df.to_csv(file_path, sep="\t", index=False)
+    df.to_csv(file_path, sep='\t', index=False)
 
 
 def download_model(model_id: str, revision: str):
     """
     default base dir: '~/.cache/modelscope/hub/model_id'
     """
-    logger.info(f"Loading model {model_id} from modelscope")
+    logger.info(f'Loading model {model_id} from modelscope')
 
     model_path = snapshot_download(model_id=model_id, revision=revision)
 

evalscope/backend/vlm_eval_kit/__init__.py

@@ -1 +1 @@
-from evalscope.backend.vlm_eval_kit.backend_manager import VLMEvalKitBackendManager
+from evalscope.backend.vlm_eval_kit.backend_manager import VLMEvalKitBackendManager

evalscope/backend/vlm_eval_kit/custom_dataset.py

@@ -1,32 +1,31 @@
-import os
 import numpy as np
+import os
 from vlmeval.dataset.image_base import ImageBaseDataset
 from vlmeval.dataset.image_vqa import CustomVQADataset
-from vlmeval.smp import load, dump, d2df
+from vlmeval.smp import d2df, dump, load
+
 
 class CustomDataset:
 
     def load_data(self, dataset):
         # customize the loading of the dataset
-        data_path = os.path.join(os.path.expanduser("~/LMUData"), f'{dataset}.tsv')
+        data_path = os.path.join(os.path.expanduser('~/LMUData'), f'{dataset}.tsv')
         return load(data_path)
 
-
     def build_prompt(self, line):
         msgs = ImageBaseDataset.build_prompt(self, line)
         # add a hint or custom instruction here
         msgs[-1]['value'] += '\nAnswer the question using a single word or phrase.'
         return msgs
-
-
+
     def evaluate(self, eval_file, **judge_kwargs):
         data = load(eval_file)
         assert 'answer' in data and 'prediction' in data
         data['prediction'] = [str(x) for x in data['prediction']]
         data['answer'] = [str(x).lower() for x in data['answer']]
-
+
         print(data)
-
+
         # ========compute the evaluation metrics as you need =========
         # exact match
         result = np.mean(data['answer'] == data['prediction'])

evalscope/benchmarks/arc/__init__.py

@@ -1,5 +1,6 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
-from evalscope.benchmarks.arc.arc_adapter import ARCAdapter, DATASET_ID, SUBSET_LIST
+from evalscope.benchmarks.arc.arc_adapter import DATASET_ID, SUBSET_LIST
+from evalscope.benchmarks.arc.arc_adapter import ARCAdapter
 from evalscope.benchmarks.arc.arc_adapter import ARCAdapter as DataAdapterClass
-from evalscope.models.model_adapter import MultiChoiceModelAdapter as ModelAdapterClass # noqa
+from evalscope.models.model_adapter import MultiChoiceModelAdapter as ModelAdapterClass  # noqa