evalscope 0.6.0rc0__tar.gz → 0.6.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of evalscope might be problematic.

Files changed (222)
  1. {evalscope-0.6.0rc0 → evalscope-0.6.1}/PKG-INFO +8 -7
  2. {evalscope-0.6.0rc0 → evalscope-0.6.1}/README.md +6 -5
  3. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/backend/opencompass/tasks/eval_datasets.py +1 -1
  4. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +120 -100
  5. evalscope-0.6.1/evalscope/backend/rag_eval/utils/clip.py +149 -0
  6. evalscope-0.6.1/evalscope/backend/rag_eval/utils/embedding.py +183 -0
  7. evalscope-0.6.1/evalscope/backend/rag_eval/utils/llm.py +72 -0
  8. evalscope-0.6.1/evalscope/backend/rag_eval/utils/tools.py +63 -0
  9. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
  10. evalscope-0.6.1/evalscope/preprocess/tokenizers/__init__.py +0 -0
  11. evalscope-0.6.1/evalscope/version.py +4 -0
  12. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope.egg-info/PKG-INFO +8 -7
  13. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope.egg-info/SOURCES.txt +5 -0
  14. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope.egg-info/requires.txt +2 -2
  15. evalscope-0.6.0rc0/evalscope/version.py +0 -4
  16. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/__init__.py +0 -0
  17. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/backend/__init__.py +0 -0
  18. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/backend/base.py +0 -0
  19. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/backend/opencompass/__init__.py +0 -0
  20. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/backend/opencompass/api_meta_template.py +0 -0
  21. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/backend/opencompass/backend_manager.py +0 -0
  22. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/backend/opencompass/tasks/__init__.py +0 -0
  23. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/backend/opencompass/tasks/eval_api.py +0 -0
  24. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/backend/rag_eval/__init__.py +0 -0
  25. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/backend/rag_eval/backend_manager.py +0 -0
  26. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/backend/rag_eval/clip_benchmark/__init__.py +0 -0
  27. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/backend/rag_eval/clip_benchmark/arguments.py +0 -0
  28. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +0 -0
  29. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/backend/rag_eval/clip_benchmark/task_template.py +0 -0
  30. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/backend/rag_eval/clip_benchmark/tasks/__init__.py +0 -0
  31. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +0 -0
  32. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -0
  33. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +0 -0
  34. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/backend/rag_eval/cmteb/__init__.py +0 -0
  35. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/backend/rag_eval/cmteb/arguments.py +0 -0
  36. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/backend/rag_eval/cmteb/base.py +0 -0
  37. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/backend/rag_eval/cmteb/task_template.py +0 -0
  38. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/backend/rag_eval/cmteb/tasks/Classification.py +0 -0
  39. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +0 -0
  40. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +0 -0
  41. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +0 -0
  42. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +0 -0
  43. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +0 -0
  44. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/backend/rag_eval/cmteb/tasks/STS.py +0 -0
  45. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/backend/rag_eval/cmteb/tasks/__init__.py +0 -0
  46. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/backend/rag_eval/ragas/__init__.py +0 -0
  47. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/backend/rag_eval/ragas/arguments.py +0 -0
  48. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/backend/rag_eval/ragas/metrics/__init__.py +0 -0
  49. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py +0 -0
  50. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py +0 -0
  51. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/backend/rag_eval/ragas/task_template.py +0 -0
  52. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/backend/rag_eval/ragas/tasks/__init__.py +0 -0
  53. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +0 -0
  54. {evalscope-0.6.0rc0/evalscope/perf → evalscope-0.6.1/evalscope/backend/rag_eval/utils}/__init__.py +0 -0
  55. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/backend/vlm_eval_kit/__init__.py +0 -0
  56. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/backend/vlm_eval_kit/backend_manager.py +0 -0
  57. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -0
  58. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/benchmarks/__init__.py +0 -0
  59. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/benchmarks/arc/__init__.py +0 -0
  60. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/benchmarks/arc/ai2_arc.py +0 -0
  61. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/benchmarks/arc/arc_adapter.py +0 -0
  62. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/benchmarks/bbh/__init__.py +0 -0
  63. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/benchmarks/bbh/bbh_adapter.py +0 -0
  64. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +0 -0
  65. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +0 -0
  66. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +0 -0
  67. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +0 -0
  68. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +0 -0
  69. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +0 -0
  70. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +0 -0
  71. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +0 -0
  72. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +0 -0
  73. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +0 -0
  74. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +0 -0
  75. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +0 -0
  76. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +0 -0
  77. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/benchmarks/bbh/cot_prompts/navigate.txt +0 -0
  78. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +0 -0
  79. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +0 -0
  80. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +0 -0
  81. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +0 -0
  82. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +0 -0
  83. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/benchmarks/bbh/cot_prompts/snarks.txt +0 -0
  84. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +0 -0
  85. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +0 -0
  86. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +0 -0
  87. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +0 -0
  88. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +0 -0
  89. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +0 -0
  90. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +0 -0
  91. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/benchmarks/benchmark.py +0 -0
  92. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/benchmarks/ceval/__init__.py +0 -0
  93. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/benchmarks/ceval/ceval_adapter.py +0 -0
  94. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/benchmarks/ceval/ceval_exam.py +0 -0
  95. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/benchmarks/cmmlu/__init__.py +0 -0
  96. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/benchmarks/cmmlu/cmmlu.py +0 -0
  97. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/benchmarks/cmmlu/cmmlu_adapter.py +0 -0
  98. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/benchmarks/competition_math/__init__.py +0 -0
  99. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/benchmarks/competition_math/competition_math.py +0 -0
  100. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/benchmarks/competition_math/competition_math_adapter.py +0 -0
  101. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/benchmarks/data_adapter.py +0 -0
  102. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/benchmarks/general_qa/__init__.py +0 -0
  103. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/benchmarks/general_qa/general_qa_adapter.py +0 -0
  104. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/benchmarks/gsm8k/__init__.py +0 -0
  105. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/benchmarks/gsm8k/gsm8k.py +0 -0
  106. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/benchmarks/gsm8k/gsm8k_adapter.py +0 -0
  107. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/benchmarks/hellaswag/__init__.py +0 -0
  108. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/benchmarks/hellaswag/hellaswag.py +0 -0
  109. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/benchmarks/hellaswag/hellaswag_adapter.py +0 -0
  110. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/benchmarks/humaneval/__init__.py +0 -0
  111. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/benchmarks/humaneval/humaneval.py +0 -0
  112. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/benchmarks/humaneval/humaneval_adapter.py +0 -0
  113. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/benchmarks/mmlu/__init__.py +0 -0
  114. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/benchmarks/mmlu/mmlu.py +0 -0
  115. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/benchmarks/mmlu/mmlu_adapter.py +0 -0
  116. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/benchmarks/race/__init__.py +0 -0
  117. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/benchmarks/race/race.py +0 -0
  118. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/benchmarks/race/race_adapter.py +0 -0
  119. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/benchmarks/trivia_qa/__init__.py +0 -0
  120. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -0
  121. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +0 -0
  122. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/benchmarks/truthful_qa/__init__.py +0 -0
  123. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -0
  124. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +0 -0
  125. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/cache.py +0 -0
  126. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/cli/__init__.py +0 -0
  127. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/cli/base.py +0 -0
  128. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/cli/cli.py +0 -0
  129. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/cli/start_perf.py +0 -0
  130. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/cli/start_server.py +0 -0
  131. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/config.py +0 -0
  132. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/constants.py +0 -0
  133. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/evaluator/__init__.py +0 -0
  134. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/evaluator/evaluator.py +0 -0
  135. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/evaluator/rating_eval.py +0 -0
  136. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/evaluator/reviewer/__init__.py +0 -0
  137. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/evaluator/reviewer/auto_reviewer.py +0 -0
  138. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/metrics/__init__.py +0 -0
  139. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/metrics/bundled_rouge_score/__init__.py +0 -0
  140. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/metrics/code_metric.py +0 -0
  141. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/metrics/math_accuracy.py +0 -0
  142. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/metrics/metrics.py +0 -0
  143. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/metrics/rouge_metric.py +0 -0
  144. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/models/__init__.py +0 -0
  145. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/models/api/__init__.py +0 -0
  146. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/models/api/openai_api.py +0 -0
  147. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/models/custom/__init__.py +0 -0
  148. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/models/custom/custom_model.py +0 -0
  149. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/models/dummy_chat_model.py +0 -0
  150. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/models/model.py +0 -0
  151. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/models/model_adapter.py +0 -0
  152. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/models/openai_model.py +0 -0
  153. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/models/template.py +0 -0
  154. {evalscope-0.6.0rc0/evalscope/perf/datasets → evalscope-0.6.1/evalscope/perf}/__init__.py +0 -0
  155. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/perf/_logging.py +0 -0
  156. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/perf/api_plugin_base.py +0 -0
  157. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/perf/custom_api.py +0 -0
  158. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/perf/dashscope_api.py +0 -0
  159. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/perf/dataset_plugin_base.py +0 -0
  160. {evalscope-0.6.0rc0/evalscope/preprocess/tokenizers → evalscope-0.6.1/evalscope/perf/datasets}/__init__.py +0 -0
  161. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/perf/datasets/line_by_line.py +0 -0
  162. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/perf/datasets/longalpaca_12k.py +0 -0
  163. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/perf/datasets/openqa.py +0 -0
  164. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/perf/how_to_analysis_result.py +0 -0
  165. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/perf/http_client.py +0 -0
  166. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/perf/openai_api.py +0 -0
  167. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/perf/plugin_registry.py +0 -0
  168. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/perf/query_parameters.py +0 -0
  169. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/perf/server_sent_event.py +0 -0
  170. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/preprocess/__init__.py +0 -0
  171. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/preprocess/tokenizers/gpt2_tokenizer.py +0 -0
  172. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/registry/__init__.py +0 -0
  173. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/registry/tasks/arc.yaml +0 -0
  174. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/registry/tasks/bbh.yaml +0 -0
  175. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/registry/tasks/bbh_mini.yaml +0 -0
  176. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/registry/tasks/ceval.yaml +0 -0
  177. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/registry/tasks/ceval_mini.yaml +0 -0
  178. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/registry/tasks/cmmlu.yaml +0 -0
  179. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -0
  180. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/registry/tasks/general_qa.yaml +0 -0
  181. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/registry/tasks/gsm8k.yaml +0 -0
  182. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/registry/tasks/mmlu.yaml +0 -0
  183. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/registry/tasks/mmlu_mini.yaml +0 -0
  184. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/run.py +0 -0
  185. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/run_arena.py +0 -0
  186. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/run_ms.py +0 -0
  187. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/summarizer.py +0 -0
  188. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/third_party/__init__.py +0 -0
  189. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/third_party/longbench_write/__init__.py +0 -0
  190. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/third_party/longbench_write/eval.py +0 -0
  191. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/third_party/longbench_write/infer.py +0 -0
  192. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/third_party/longbench_write/longbench_write.py +0 -0
  193. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/third_party/longbench_write/resources/__init__.py +0 -0
  194. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/third_party/longbench_write/resources/judge.txt +0 -0
  195. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/third_party/longbench_write/resources/longbench_write.jsonl +0 -0
  196. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/third_party/longbench_write/resources/longbench_write_en.jsonl +0 -0
  197. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/third_party/longbench_write/resources/longwrite_ruler.jsonl +0 -0
  198. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/third_party/longbench_write/tools/__init__.py +0 -0
  199. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/third_party/longbench_write/tools/data_etl.py +0 -0
  200. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/third_party/longbench_write/utils.py +0 -0
  201. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/third_party/toolbench_static/__init__.py +0 -0
  202. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/third_party/toolbench_static/eval.py +0 -0
  203. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/third_party/toolbench_static/infer.py +0 -0
  204. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/third_party/toolbench_static/llm/__init__.py +0 -0
  205. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -0
  206. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/third_party/toolbench_static/toolbench_static.py +0 -0
  207. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/tools/__init__.py +0 -0
  208. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/tools/combine_reports.py +0 -0
  209. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/tools/gen_mmlu_subject_mapping.py +0 -0
  210. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/tools/rewrite_eval_results.py +0 -0
  211. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/utils/__init__.py +0 -0
  212. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/utils/arena_utils.py +0 -0
  213. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/utils/completion_parsers.py +0 -0
  214. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/utils/logger.py +0 -0
  215. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/utils/task_cfg_parser.py +0 -0
  216. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/utils/task_utils.py +0 -0
  217. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/utils/utils.py +0 -0
  218. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope.egg-info/dependency_links.txt +0 -0
  219. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope.egg-info/entry_points.txt +0 -0
  220. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope.egg-info/not-zip-safe +0 -0
  221. {evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope.egg-info/top_level.txt +0 -0
  222. {evalscope-0.6.0rc0 → evalscope-0.6.1}/setup.cfg +0 -0

{evalscope-0.6.0rc0 → evalscope-0.6.1}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: evalscope
-Version: 0.6.0rc0
+Version: 0.6.1
 Summary: EvalScope: Lightweight LLMs Evaluation Framework
 Home-page: https://github.com/modelscope/evalscope
 Author: ModelScope team
@@ -53,7 +53,7 @@ Provides-Extra: vlmeval
 Requires-Dist: ms-vlmeval>=0.0.5; extra == "vlmeval"
 Provides-Extra: rag
 Requires-Dist: mteb==1.19.4; extra == "rag"
-Requires-Dist: ragas==0.2.3; extra == "rag"
+Requires-Dist: ragas==0.2.5; extra == "rag"
 Requires-Dist: webdataset>0.2.0; extra == "rag"
 Provides-Extra: inner
 Requires-Dist: absl-py; extra == "inner"
@@ -118,7 +118,7 @@ Requires-Dist: rouge-chinese; extra == "all"
 Requires-Dist: ms-opencompass>=0.1.3; extra == "all"
 Requires-Dist: ms-vlmeval>=0.0.5; extra == "all"
 Requires-Dist: mteb==1.19.4; extra == "all"
-Requires-Dist: ragas==0.2.3; extra == "all"
+Requires-Dist: ragas==0.2.5; extra == "all"
 Requires-Dist: webdataset>0.2.0; extra == "all"
 
 
@@ -140,6 +140,7 @@ Requires-Dist: webdataset>0.2.0; extra == "all"
   <a href="https://evalscope.readthedocs.io/en/latest/">📖 Documents</a>
 <p>
 
+> ⭐ If you like this project, please click the "Star" button at the top right to support us. Your support is our motivation to keep going!
 
 ## 📋 Table of Contents
 - [Introduction](#introduction)
@@ -165,7 +166,7 @@ EvalScope is the official model evaluation and performance benchmarking framewor
 The architecture includes the following modules:
 1. **Model Adapter**: The model adapter is used to convert the outputs of specific models into the format required by the framework, supporting both API call models and locally run models.
 2. **Data Adapter**: The data adapter is responsible for converting and processing input data to meet various evaluation needs and formats.
-3. **Evaluation Backend**:
+3. **Evaluation Backend**:
    - **Native**: EvalScope’s own **default evaluation framework**, supporting various evaluation modes, including single model evaluation, arena mode, baseline model comparison mode, etc.
    - **OpenCompass**: Supports [OpenCompass](https://github.com/open-compass/opencompass) as the evaluation backend, providing advanced encapsulation and task simplification, allowing you to submit tasks for evaluation more easily.
    - **VLMEvalKit**: Supports [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) as the evaluation backend, enabling easy initiation of multi-modal evaluation tasks, supporting various multi-modal models and datasets.
@@ -252,7 +253,7 @@ You can execute this command from any directory:
 python -m evalscope.run \
  --model qwen/Qwen2-0.5B-Instruct \
  --template-type qwen \
- --datasets arc
+ --datasets arc
 ```
 
 #### Install from source
@@ -359,13 +360,13 @@ EvalScope supports using third-party evaluation frameworks to initiate evaluatio
 EvalScope supports custom dataset evaluation. For detailed information, please refer to the Custom Dataset Evaluation [📖User Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/custom_dataset.html)
 
 ## Offline Evaluation
-You can use local dataset to evaluate the model without internet connection.
+You can use local dataset to evaluate the model without internet connection.
 
 Refer to: Offline Evaluation [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/offline_evaluation.html)
 
 
 ## Arena Mode
-The Arena mode allows multiple candidate models to be evaluated through pairwise battles, and can choose to use the AI Enhanced Auto-Reviewer (AAR) automatic evaluation process or manual evaluation to obtain the evaluation report.
+The Arena mode allows multiple candidate models to be evaluated through pairwise battles, and can choose to use the AI Enhanced Auto-Reviewer (AAR) automatic evaluation process or manual evaluation to obtain the evaluation report.
 
 Refer to: Arena Mode [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/arena.html)
 

{evalscope-0.6.0rc0 → evalscope-0.6.1}/README.md
@@ -17,6 +17,7 @@
   <a href="https://evalscope.readthedocs.io/en/latest/">📖 Documents</a>
 <p>
 
+> ⭐ If you like this project, please click the "Star" button at the top right to support us. Your support is our motivation to keep going!
 
 ## 📋 Table of Contents
 - [Introduction](#introduction)
@@ -42,7 +43,7 @@ EvalScope is the official model evaluation and performance benchmarking framewor
 The architecture includes the following modules:
 1. **Model Adapter**: The model adapter is used to convert the outputs of specific models into the format required by the framework, supporting both API call models and locally run models.
 2. **Data Adapter**: The data adapter is responsible for converting and processing input data to meet various evaluation needs and formats.
-3. **Evaluation Backend**:
+3. **Evaluation Backend**:
    - **Native**: EvalScope’s own **default evaluation framework**, supporting various evaluation modes, including single model evaluation, arena mode, baseline model comparison mode, etc.
    - **OpenCompass**: Supports [OpenCompass](https://github.com/open-compass/opencompass) as the evaluation backend, providing advanced encapsulation and task simplification, allowing you to submit tasks for evaluation more easily.
    - **VLMEvalKit**: Supports [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) as the evaluation backend, enabling easy initiation of multi-modal evaluation tasks, supporting various multi-modal models and datasets.
@@ -129,7 +130,7 @@ You can execute this command from any directory:
 python -m evalscope.run \
  --model qwen/Qwen2-0.5B-Instruct \
  --template-type qwen \
- --datasets arc
+ --datasets arc
 ```
 
 #### Install from source
@@ -236,13 +237,13 @@ EvalScope supports using third-party evaluation frameworks to initiate evaluatio
 EvalScope supports custom dataset evaluation. For detailed information, please refer to the Custom Dataset Evaluation [📖User Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/custom_dataset.html)
 
 ## Offline Evaluation
-You can use local dataset to evaluate the model without internet connection.
+You can use local dataset to evaluate the model without internet connection.
 
 Refer to: Offline Evaluation [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/offline_evaluation.html)
 
 
 ## Arena Mode
-The Arena mode allows multiple candidate models to be evaluated through pairwise battles, and can choose to use the AI Enhanced Auto-Reviewer (AAR) automatic evaluation process or manual evaluation to obtain the evaluation report.
+The Arena mode allows multiple candidate models to be evaluated through pairwise battles, and can choose to use the AI Enhanced Auto-Reviewer (AAR) automatic evaluation process or manual evaluation to obtain the evaluation report.
 
 Refer to: Arena Mode [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/arena.html)
 
@@ -270,4 +271,4 @@ Refer to : Model Serving Performance Evaluation [📖 User Guide](https://evalsc
 
 ## Star History
 
-[![Star History Chart](https://api.star-history.com/svg?repos=modelscope/evalscope&type=Date)](https://star-history.com/#modelscope/evalscope&Date)
+[![Star History Chart](https://api.star-history.com/svg?repos=modelscope/evalscope&type=Date)](https://star-history.com/#modelscope/evalscope&Date)
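
The PKG-INFO and README changes above amount to the version bump (0.6.0rc0 → 0.6.1), the `ragas` pin moving from 0.2.3 to 0.2.5 in the `rag` and `all` extras, a "Star us" callout, and trailing-whitespace cleanups. A minimal sketch, using only the standard library, for confirming which build and pin ended up in your environment after upgrading:

```python
# Hedged sketch: inspect the installed evalscope metadata after an upgrade.
# Distribution and requirement names come from the metadata diff above; nothing else is assumed.
from importlib.metadata import requires, version

print(version("evalscope"))  # expect "0.6.1"
print([req for req in (requires("evalscope") or []) if req.startswith("ragas")])
# expect a pin like: ragas==0.2.5; extra == "rag"
```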

{evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/backend/opencompass/tasks/eval_datasets.py
@@ -51,12 +51,12 @@ with read_base():
     from opencompass.configs.datasets.triviaqa.triviaqa_gen_2121ce import triviaqa_datasets
     from opencompass.configs.datasets.cmb.cmb_gen_dfb5c4 import cmb_datasets
     from opencompass.configs.datasets.cmmlu.cmmlu_gen_c13365 import cmmlu_datasets
-    from opencompass.configs.datasets.bbh.bbh_gen_5b92b0 import bbh_datasets
 
     # Note: to be supported
     # from opencompass.configs.datasets.flores.flores_gen_806ede import flores_datasets
     # from opencompass.configs.datasets.TheoremQA.TheoremQA_5shot_gen_6f0af8 import TheoremQA_datasets
     # from opencompass.configs.datasets.commonsenseqa.commonsenseqa_gen_c946f2 import commonsenseqa_datasets
+    # from opencompass.configs.datasets.bbh.bbh_gen_5b92b0 import bbh_datasets
 
 
 datasets = []
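
The only change in this OpenCompass task config is that the `bbh` dataset import moves into the commented "to be supported" block, so BBH drops out of the aggregated dataset list. As a hedged sketch of the aggregation pattern such configs commonly use right after `datasets = []` (the exact statement in this file is not shown in the diff), the imported `*_datasets` lists are concatenated into a single `datasets` list:

```python
# Hedged sketch of a common OpenCompass-config aggregation pattern, not necessarily
# the literal code in eval_datasets.py: collect every imported *_datasets list.
datasets = []
for name, value in list(locals().items()):
    if name.endswith('_datasets'):
        datasets += value
```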

{evalscope-0.6.0rc0 → evalscope-0.6.1}/evalscope/backend/rag_eval/ragas/tasks/testset_generation.py
@@ -1,15 +1,15 @@
-import os
 import asyncio
+import os
+
 import pandas as pd
-from tqdm import tqdm
-from ragas.llms import LangchainLLMWrapper
 from ragas.embeddings import LangchainEmbeddingsWrapper
-from .translate_prompt import translate_prompts
-from evalscope.utils.logger import get_logger
-from evalscope.backend.rag_eval.ragas.arguments import TestsetGenerationArguments
-from evalscope.backend.rag_eval import EmbeddingModel, LLM, ChatOpenAI
+from ragas.llms import LangchainLLMWrapper
+from tqdm import tqdm
 
-os.environ['DO_NOT_TRACK'] = 'true'
+from evalscope.backend.rag_eval import LLM, ChatOpenAI, EmbeddingModel
+from evalscope.backend.rag_eval.ragas.arguments import TestsetGenerationArguments
+from evalscope.utils.logger import get_logger
+from .translate_prompt import translate_prompts
 
 logger = get_logger()
 
@@ -17,116 +17,110 @@ logger = get_logger()
 def get_transform(llm, embedding, language):
     """
     Creates and returns a default set of transforms for processing a knowledge graph.
-
-    This function defines a series of transformation steps to be applied to a
-    knowledge graph, including extracting summaries, keyphrases, titles,
-    headlines, and embeddings, as well as building similarity relationships
-    between nodes.
-
-    The transforms are applied in the following order:
-    1. Parallel extraction of summaries and headlines
-    2. Embedding of summaries for document nodes
-    3. Splitting of headlines
-    4. Parallel extraction of embeddings, keyphrases, and titles
-    5. Building cosine similarity relationships between nodes
-    6. Building cosine similarity relationships between summaries
-
-    Returns
-    -------
-    Transforms
-        A list of transformation steps to be applied to the knowledge graph.
-
     """
     from ragas.testset.transforms.engine import Parallel
     from ragas.testset.transforms.extractors import (
         EmbeddingExtractor,
         HeadlinesExtractor,
-        KeyphrasesExtractor,
         SummaryExtractor,
-        TitleExtractor,
     )
-    from ragas.testset.transforms.relationship_builders.cosine import (
+    from ragas.testset.transforms.extractors.llm_based import NERExtractor, ThemesExtractor
+    from ragas.testset.transforms.relationship_builders import (
         CosineSimilarityBuilder,
-        SummaryCosineSimilarityBuilder,
+        OverlapScoreBuilder,
     )
     from ragas.testset.transforms.splitters import HeadlineSplitter
+    from ragas.testset.transforms.filters import CustomNodeFilter
     from ragas.testset.graph import NodeType
+    from ragas.utils import num_tokens_from_string
+
+    def summary_filter(node):
+        return (node.type == NodeType.DOCUMENT and num_tokens_from_string(node.properties['page_content']) > 500)
 
-    # define the transforms
-    summary_extractor = SummaryExtractor(llm=llm)
-    keyphrase_extractor = KeyphrasesExtractor(llm=llm)
-    title_extractor = TitleExtractor(llm=llm)
+    summary_extractor = SummaryExtractor(llm=llm, filter_nodes=lambda node: summary_filter(node))
+    ner_extractor = NERExtractor(llm=llm, filter_nodes=lambda node: node.type == NodeType.CHUNK)
+    theme_extractor = ThemesExtractor(llm=llm)
     headline_extractor = HeadlinesExtractor(llm=llm)
 
     asyncio.run(
         translate_prompts(
             prompts=[
                 summary_extractor,
-                keyphrase_extractor,
-                title_extractor,
+                theme_extractor,
+                ner_extractor,
                 headline_extractor,
             ],
             target_lang=language,
             llm=llm,
             adapt_instruction=True,
-        )
-    )
+        ))
+
+    splitter = HeadlineSplitter(min_tokens=500)
 
-    embedding_extractor = EmbeddingExtractor(embedding_model=embedding)
-    headline_splitter = HeadlineSplitter()
-    cosine_sim_builder = CosineSimilarityBuilder(threshold=0.8)
-    summary_embedder = EmbeddingExtractor(
-        name='summary_embedder',
-        filter_nodes=lambda node: True if node.type == NodeType.DOCUMENT else False,
+    summary_emb_extractor = EmbeddingExtractor(
+        embedding_model=embedding,
         property_name='summary_embedding',
         embed_property_name='summary',
-        embedding_model=embedding,
+        filter_nodes=lambda node: summary_filter(node),
    )
-    summary_cosine_sim_builder = SummaryCosineSimilarityBuilder(threshold=0.6)
 
-    # specify the transforms and their order to be applied
+    cosine_sim_builder = CosineSimilarityBuilder(
+        property_name='summary_embedding',
+        new_property_name='summary_similarity',
+        threshold=0.7,
+        filter_nodes=lambda node: summary_filter(node),
+    )
+
+    ner_overlap_sim = OverlapScoreBuilder(threshold=0.01, filter_nodes=lambda node: node.type == NodeType.CHUNK)
+
+    node_filter = CustomNodeFilter(llm=llm, filter_nodes=lambda node: node.type == NodeType.CHUNK)
+
     transforms = [
-        Parallel(summary_extractor, headline_extractor),
-        summary_embedder,
-        headline_splitter,
-        Parallel(embedding_extractor, keyphrase_extractor, title_extractor),
-        cosine_sim_builder,
-        summary_cosine_sim_builder,
+        headline_extractor,
+        splitter,
+        summary_extractor,
+        node_filter,
+        Parallel(summary_emb_extractor, theme_extractor, ner_extractor),
+        Parallel(cosine_sim_builder, ner_overlap_sim),
     ]
+
     return transforms
 
 
 def get_distribution(llm, distribution, language):
-    from ragas.testset.synthesizers.abstract_query import (
-        AbstractQuerySynthesizer,
-        ComparativeAbstractQuerySynthesizer,
+    from ragas.testset.synthesizers.multi_hop import (
+        MultiHopAbstractQuerySynthesizer,
+        MultiHopSpecificQuerySynthesizer,
     )
-    from ragas.testset.synthesizers.specific_query import SpecificQuerySynthesizer
+    from ragas.testset.synthesizers.single_hop.specific import (
+        SingleHopSpecificQuerySynthesizer, )
 
-    abstract = AbstractQuerySynthesizer(llm=llm)
-    comparative = ComparativeAbstractQuerySynthesizer(llm=llm)
-    specific = SpecificQuerySynthesizer(llm=llm)
+    single_hop = SingleHopSpecificQuerySynthesizer(llm=llm)
+    multi_hop_abs = MultiHopAbstractQuerySynthesizer(llm=llm)
+    multi_hop_spec = MultiHopSpecificQuerySynthesizer(llm=llm)
 
     asyncio.run(
         translate_prompts(
             prompts=[
-                abstract,
-                comparative,
-                specific,
+                single_hop,
+                multi_hop_abs,
+                multi_hop_spec,
             ],
             target_lang=language,
             llm=llm,
            adapt_instruction=True,
-        )
-    )
-    return [
-        (abstract, distribution['simple']),
-        (comparative, distribution['multi_context']),
-        (specific, distribution['reasoning']),
-    ]
+        ))
+
+    mapping = {
+        'simple': single_hop,
+        'multi_context': multi_hop_abs,
+        'reasoning': multi_hop_spec,
+    }
+
+    return [(mapping[key], distribution[key]) for key in mapping if key in distribution]
 
 
-def get_knowledge_graph(documents, transforms, local_file):
+def get_knowledge_graph(documents, transforms, local_file, run_config):
     from ragas.testset.graph import KnowledgeGraph, Node, NodeType
     from ragas.testset.transforms import apply_transforms
 
@@ -148,7 +142,7 @@ def get_knowledge_graph(documents, transforms, local_file):
     kg = KnowledgeGraph(nodes=nodes)
 
     # apply transforms and update the knowledge graph
-    apply_transforms(kg, transforms)
+    apply_transforms(kg, transforms, run_config=run_config)
 
     # save the knowledge graph
     output_path = os.path.dirname(local_file)
@@ -158,6 +152,39 @@ def get_knowledge_graph(documents, transforms, local_file):
     return kg
 
 
+def get_persona(llm, kg, language):
+    from evalscope.backend.rag_eval.ragas.prompts.persona_prompt import PersonaGenerationPromptZH
+    from ragas.testset.persona import generate_personas_from_kg, PersonaGenerationPrompt
+    from ragas.testset.graph import Node
+
+    def filter(node: Node) -> bool:
+        if (node.type.name == 'DOCUMENT' and node.properties.get('summary_embedding') is not None):
+            return True
+        else:
+            return False
+
+    if language == 'chinese':
+        persona_prompt = PersonaGenerationPromptZH()
+    else:
+        persona_prompt = PersonaGenerationPrompt()
+    # NOTE: can't translate this yet
+    # asyncio.run(
+    #     translate_prompts(
+    #         prompts=[persona_prompt],
+    #         target_lang=language,
+    #         llm=llm,
+    #         adapt_instruction=True,
+    #     ))
+
+    return generate_personas_from_kg(
+        llm=llm,
+        kg=kg,
+        num_personas=3,
+        persona_generation_prompt=persona_prompt,
+        filter_fn=filter,
+    )
+
+
 def load_data(file_path):
     from langchain_community.document_loaders import UnstructuredFileLoader
 
@@ -178,32 +205,31 @@ def generate_testset(args: TestsetGenerationArguments) -> None:
     generator_llm = LLM.load(**args.generator_llm)
     embeddings = EmbeddingModel.load(**args.embeddings)
 
+    wrapped_llm = LangchainLLMWrapper(generator_llm)
+    wrapped_embeddings = LangchainEmbeddingsWrapper(embeddings)
+
     # Change resulting question type distribution
-    distributions = get_distribution(
-        LangchainLLMWrapper(generator_llm), args.distribution, args.language
-    )
+    distributions = get_distribution(wrapped_llm, args.distribution, args.language)
 
+    run_config = RunConfig(timeout=600, max_retries=3, max_wait=120, max_workers=1, log_tenacity=True)
     # get transforms
     transforms = get_transform(
-        LangchainLLMWrapper(generator_llm),
-        LangchainEmbeddingsWrapper(embeddings),
+        wrapped_llm,
+        wrapped_embeddings,
         args.language,
     )
 
     # get knowledge graph
-    knowledge_graph = get_knowledge_graph(documents, transforms, args.knowledge_graph)
+    knowledge_graph = get_knowledge_graph(documents, transforms, args.knowledge_graph, run_config)
 
-    generator = TestsetGenerator.from_langchain(
-        generator_llm, embeddings, knowledge_graph
-    )
+    persona_list = get_persona(llm=wrapped_llm, kg=knowledge_graph, language=args.language)
+
+    generator = TestsetGenerator(llm=wrapped_llm, knowledge_graph=knowledge_graph, persona_list=persona_list)
 
-    runconfig = RunConfig(
-        timeout=600, max_retries=3, max_wait=120, max_workers=1, log_tenacity=True
-    )
     testset = generator.generate(
         testset_size=args.test_size,
         query_distribution=distributions,
-        run_config=runconfig,
+        run_config=run_config,
         with_debugging_logs=True,
         raise_exceptions=True,
     )
@@ -212,9 +238,7 @@ def generate_testset(args: TestsetGenerationArguments) -> None:
     testset_df = testset.to_pandas()
     output_path = os.path.dirname(args.output_file)
     os.makedirs(output_path, exist_ok=True)
-    testset_df.to_json(
-        args.output_file, indent=4, index=False, orient='records', force_ascii=False
-    )
+    testset_df.to_json(args.output_file, indent=4, index=False, orient='records', force_ascii=False)
 
     # get answer
     testset_with_answer = get_answer(testset_df, generator_llm, args.language)
@@ -243,21 +267,17 @@ Answer:
         contexts = '\n'.join(row['reference_contexts'])
 
         # Combine question and contexts as input for the LLM
-        input_text = template.format(
-            language=language, question=question, contexts=contexts
-        )
+        input_text = template.format(language=language, question=question, contexts=contexts)
 
         # Generate the answer using the generator LLM
         answer = generator_llm.invoke(input_text)
         if isinstance(generator_llm, ChatOpenAI):
             answer = answer.content
-        items.append(
-            {
-                'user_input': question,
-                'retrieved_contexts': row['reference_contexts'],
-                'response': answer,
-                'reference': row['reference'],
-            }
-        )
+        items.append({
+            'user_input': question,
+            'retrieved_contexts': row['reference_contexts'],
+            'response': answer,
+            'reference': row['reference'],
+        })
 
     return pd.DataFrame.from_dict(items)
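
The testset generation pipeline is ported to the ragas 0.2.5 APIs: keyphrase/title extraction gives way to NER and theme extraction with node filters, the abstract/comparative/specific synthesizers are replaced by single-hop and multi-hop synthesizers, personas are generated from the knowledge graph, and a single `RunConfig` is now shared by `apply_transforms` and `generator.generate`. The sketch below illustrates how the existing `distribution` argument maps onto the new synthesizers; the weights are illustrative, not values shipped with the package.

```python
# Hedged illustration of the mapping implemented by get_distribution() above.
# The keys keep their old names; only the synthesizer classes behind them change.
distribution = {
    'simple': 0.5,          # -> SingleHopSpecificQuerySynthesizer
    'multi_context': 0.25,  # -> MultiHopAbstractQuerySynthesizer
    'reasoning': 0.25,      # -> MultiHopSpecificQuerySynthesizer
}
# get_distribution(wrapped_llm, distribution, 'english') returns the matching
# (synthesizer, weight) pairs, skipping any key absent from `distribution`.
```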

evalscope-0.6.1/evalscope/backend/rag_eval/utils/clip.py (new file)
@@ -0,0 +1,149 @@
+import os
+import torch
+import torch.nn.functional as F
+from typing import List
+from PIL import Image
+from evalscope.backend.rag_eval.utils.tools import download_model, PIL_to_base64
+from transformers import AutoModel, AutoProcessor
+from langchain_core.embeddings import Embeddings
+
+
+class VisionModel:
+    @staticmethod
+    def load(**kw):
+        api_base = kw.get("api_base", None)
+        if api_base:
+
+            return VLMAPI(
+                model_name=kw.get("model_name", ""),
+                openai_api_base=api_base,
+                openai_api_key=kw.get("api_key", "EMPTY"),
+                prompt=kw.get("prompt", None),
+            )
+        else:
+            return CLIPModel(**kw)
+
+
+class VLMAPI:
+    def __init__(self, model_name, openai_api_base, openai_api_key, prompt=None):
+        from langchain_openai import ChatOpenAI
+        from langchain_core.prompts import ChatPromptTemplate
+
+        self.model_name = model_name
+        self.model = ChatOpenAI(
+            model_name=model_name,
+            openai_api_base=openai_api_base,
+            openai_api_key=openai_api_key,
+        )
+        self.default_prompt = "Please describe this image in general. Directly provide the description, do not include prefix like 'This image depicts'"
+        self.prompt = ChatPromptTemplate.from_messages(
+            [
+                ("system", prompt if prompt else self.default_prompt),
+                (
+                    "user",
+                    [
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": "data:image/jpeg;base64,{image_data}"},
+                        }
+                    ],
+                ),
+            ]
+        )
+        self.chain = self.prompt | self.model
+        self.transform = PIL_to_base64
+
+    def encode_image(self, images):
+        captions = []
+        for image in images:
+            response = self.chain.invoke({"image_data": image})
+            captions.append(response.content)
+        return captions
+
+
+class CLIPModel(Embeddings):
+    def __init__(
+        self,
+        model_name: str,
+        revision: str = "master",
+        hub="modelscope",
+        device="cpu",
+    ):
+        self.device = device
+        self.model_name = model_name
+        self.revision = revision
+
+        # Download the model if it doesn't exist locally
+        if not os.path.exists(model_name) and hub == "modelscope":
+            model_name = download_model(self.model_name, self.revision)
+
+        # Load the model and processor
+        self.model = AutoModel.from_pretrained(model_name).to(self.device)
+        self.processor = AutoProcessor.from_pretrained(model_name)
+        self.transform = self.processor.image_processor
+        self.tokenizer = self.processor.tokenizer
+
+    def encode_text(self, batch_texts: List[str] | List[List[str]]):
+        if isinstance(batch_texts[0], list):
+            batch_texts = [
+                text for _, texts in enumerate(batch_texts) for text in texts
+            ]
+        # Ensure that the input texts are within the token limit
+        max_length = self.tokenizer.model_max_length
+        if not max_length or max_length > 0xFFFFFF:
+            max_length = 512
+        encoded_inputs = self.tokenizer(
+            text=batch_texts,
+            max_length=max_length,
+            padding=True,
+            truncation=True,
+            return_tensors="pt",
+        )
+
+        inputs = {k: v.to(self.device) for k, v in encoded_inputs.items()}
+
+        with torch.no_grad():
+            text_features = self.model.get_text_features(**inputs)
+            text_features = F.normalize(text_features, p=2, dim=-1)
+        return text_features
+
+    def encode_image(self, image):
+        batch_images = torch.stack([d["pixel_values"][0] for d in image])
+        batch_images = batch_images.to(self.device)
+        with torch.no_grad():
+            image_features = self.model.get_image_features(batch_images)
+            image_features = F.normalize(image_features, p=2, dim=-1)
+        return image_features
+
+    def embed_documents(self, texts):
+        text_features = self.encode_text(texts)
+        return text_features.cpu().numpy().tolist()
+
+    def embed_query(self, text):
+        text_features = self.encode_text([text])
+        return text_features.cpu().numpy().tolist()[0]
+
+    def embed_image(self, uris: List[str]):
+        # read image and transform
+        images = [Image.open(image_path) for image_path in uris]
+        transformed_images = [
+            self.transform(
+                image,
+                return_tensors="pt",
+            )
+            for image in images
+        ]
+        image_features = self.encode_image(transformed_images)
+        return image_features.cpu().numpy().tolist()
+
+
+if __name__ == "__main__":
+    model = CLIPModel("AI-ModelScope/chinese-clip-vit-large-patch14-336px")
+    model.embed_image(
+        [
+            "custom_eval/multimodal/images/AMNH.jpg",
+            "custom_eval/multimodal/images/AMNH.jpg",
+        ]
+    )
+    model.encode_text(["我喜欢吃饭" * 1000])
+    print("done")
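
The new `clip.py` module introduces a `VisionModel.load()` dispatcher: when an `api_base` is given it returns the `VLMAPI` wrapper, which captions images through an OpenAI-compatible vision endpoint; otherwise it returns `CLIPModel`, a LangChain `Embeddings` implementation backed by a local or ModelScope CLIP checkpoint. A hedged usage sketch; the model names, image path, and endpoint URL are placeholders rather than defaults shipped with the package:

```python
# Hedged usage sketch for the VisionModel.load() dispatcher defined above; model
# names, the image path, and the endpoint URL are illustrative placeholders.
from evalscope.backend.rag_eval.utils.clip import VisionModel

# Local CLIP path: downloads from ModelScope when model_name is not a local directory.
clip = VisionModel.load(model_name="AI-ModelScope/chinese-clip-vit-large-patch14-336px")
text_vecs = clip.embed_documents(["a photo of a cat", "a photo of a dog"])
image_vecs = clip.embed_image(["custom_eval/multimodal/images/AMNH.jpg"])

# API path: passing api_base returns the VLMAPI captioning wrapper instead;
# its encode_image() expects base64-encoded images (see PIL_to_base64 in utils/tools.py).
vlm = VisionModel.load(model_name="qwen-vl-plus", api_base="http://localhost:8000/v1")
```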