PyPI - evalscope - Versions diffs - 0.7.2__py3-none-any.whl → 0.8.0__py3-none-any.whl - Mend

evalscope 0.7.2__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of evalscope might be problematic. Click here for more details.

Files changed (233)

evalscope/__init__.py +1 -1
evalscope/arguments.py +73 -0
evalscope/backend/base.py +5 -1
evalscope/backend/opencompass/api_meta_template.py +8 -14
evalscope/backend/opencompass/backend_manager.py +24 -15
evalscope/backend/opencompass/tasks/eval_api.py +1 -6
evalscope/backend/opencompass/tasks/eval_datasets.py +26 -28
evalscope/backend/rag_eval/__init__.py +3 -3
evalscope/backend/rag_eval/backend_manager.py +21 -25
evalscope/backend/rag_eval/clip_benchmark/__init__.py +1 -1
evalscope/backend/rag_eval/clip_benchmark/arguments.py +6 -6
evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +62 -79
evalscope/backend/rag_eval/clip_benchmark/task_template.py +29 -43
evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +20 -22
evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +16 -23
evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +14 -35
evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +69 -90
evalscope/backend/rag_eval/cmteb/__init__.py +3 -3
evalscope/backend/rag_eval/cmteb/arguments.py +25 -27
evalscope/backend/rag_eval/cmteb/base.py +22 -23
evalscope/backend/rag_eval/cmteb/task_template.py +15 -17
evalscope/backend/rag_eval/cmteb/tasks/Classification.py +98 -79
evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +17 -22
evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +17 -19
evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +35 -29
evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +18 -5
evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +163 -163
evalscope/backend/rag_eval/cmteb/tasks/STS.py +126 -104
evalscope/backend/rag_eval/cmteb/tasks/__init__.py +33 -34
evalscope/backend/rag_eval/ragas/__init__.py +2 -2
evalscope/backend/rag_eval/ragas/arguments.py +3 -8
evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +9 -9
evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +2 -2
evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +3 -3
evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +5 -5
evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +7 -0
evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +8 -8
evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +5 -5
evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +7 -5
evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +2 -2
evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +2 -2
evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +5 -5
evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +3 -3
evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +21 -4
evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +3 -3
evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +4 -4
evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +2 -2
evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -1
evalscope/backend/rag_eval/ragas/task_template.py +10 -15
evalscope/backend/rag_eval/ragas/tasks/__init__.py +1 -1
evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +45 -0
evalscope/backend/rag_eval/ragas/tasks/build_transform.py +135 -0
evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +17 -133
evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +8 -18
evalscope/backend/rag_eval/utils/clip.py +46 -50
evalscope/backend/rag_eval/utils/embedding.py +12 -11
evalscope/backend/rag_eval/utils/llm.py +8 -6
evalscope/backend/rag_eval/utils/tools.py +12 -11
evalscope/backend/vlm_eval_kit/__init__.py +1 -1
evalscope/backend/vlm_eval_kit/custom_dataset.py +7 -8
evalscope/benchmarks/arc/__init__.py +3 -2
evalscope/benchmarks/arc/ai2_arc.py +19 -16
evalscope/benchmarks/arc/arc_adapter.py +32 -24
evalscope/benchmarks/bbh/__init__.py +1 -2
evalscope/benchmarks/bbh/bbh_adapter.py +28 -25
evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +1 -1
evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +1 -1
evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +1 -1
evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +1 -1
evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +1 -1
evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +1 -1
evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +1 -1
evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +1 -1
evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +1 -1
evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +1 -1
evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +1 -1
evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +1 -1
evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +1 -1
evalscope/benchmarks/bbh/cot_prompts/navigate.txt +1 -1
evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +1 -1
evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +1 -1
evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +1 -1
evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +1 -1
evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +1 -1
evalscope/benchmarks/bbh/cot_prompts/snarks.txt +1 -1
evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +1 -1
evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +1 -1
evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +1 -1
evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +1 -1
evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +1 -1
evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +1 -1
evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +1 -1
evalscope/benchmarks/benchmark.py +16 -16
evalscope/benchmarks/ceval/__init__.py +3 -2
evalscope/benchmarks/ceval/ceval_adapter.py +80 -69
evalscope/benchmarks/ceval/ceval_exam.py +18 -31
evalscope/benchmarks/cmmlu/__init__.py +3 -2
evalscope/benchmarks/cmmlu/cmmlu.py +87 -92
evalscope/benchmarks/cmmlu/cmmlu_adapter.py +109 -155
evalscope/benchmarks/cmmlu/samples.jsonl +1 -1
evalscope/benchmarks/competition_math/__init__.py +3 -2
evalscope/benchmarks/competition_math/competition_math.py +7 -16
evalscope/benchmarks/competition_math/competition_math_adapter.py +32 -34
evalscope/benchmarks/data_adapter.py +24 -24
evalscope/benchmarks/general_qa/__init__.py +3 -2
evalscope/benchmarks/general_qa/general_qa_adapter.py +34 -38
evalscope/benchmarks/gsm8k/__init__.py +1 -1
evalscope/benchmarks/gsm8k/gsm8k.py +6 -12
evalscope/benchmarks/gsm8k/gsm8k_adapter.py +26 -24
evalscope/benchmarks/hellaswag/__init__.py +3 -2
evalscope/benchmarks/hellaswag/hellaswag.py +15 -19
evalscope/benchmarks/hellaswag/hellaswag_adapter.py +27 -23
evalscope/benchmarks/humaneval/__init__.py +1 -1
evalscope/benchmarks/humaneval/humaneval.py +15 -18
evalscope/benchmarks/humaneval/humaneval_adapter.py +0 -1
evalscope/benchmarks/mmlu/__init__.py +3 -2
evalscope/benchmarks/mmlu/mmlu.py +15 -29
evalscope/benchmarks/mmlu/mmlu_adapter.py +85 -77
evalscope/benchmarks/race/__init__.py +3 -2
evalscope/benchmarks/race/race.py +21 -35
evalscope/benchmarks/race/race_adapter.py +32 -29
evalscope/benchmarks/race/samples.jsonl +1 -1
evalscope/benchmarks/trivia_qa/__init__.py +3 -2
evalscope/benchmarks/trivia_qa/samples.jsonl +1 -1
evalscope/benchmarks/trivia_qa/trivia_qa.py +19 -34
evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +27 -22
evalscope/benchmarks/truthful_qa/__init__.py +3 -2
evalscope/benchmarks/truthful_qa/truthful_qa.py +25 -29
evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +36 -37
evalscope/cli/cli.py +6 -5
evalscope/cli/start_eval.py +31 -0
evalscope/cli/start_perf.py +0 -3
evalscope/cli/start_server.py +27 -41
evalscope/config.py +119 -95
evalscope/constants.py +61 -29
evalscope/evaluator/__init__.py +1 -0
evalscope/evaluator/evaluator.py +96 -377
evalscope/evaluator/humaneval_evaluator.py +158 -0
evalscope/evaluator/rating_eval.py +12 -33
evalscope/evaluator/reviewer/auto_reviewer.py +47 -76
evalscope/metrics/bundled_rouge_score/rouge_scorer.py +10 -20
evalscope/metrics/code_metric.py +3 -9
evalscope/metrics/math_accuracy.py +3 -6
evalscope/metrics/metrics.py +21 -21
evalscope/metrics/rouge_metric.py +11 -25
evalscope/models/__init__.py +1 -2
evalscope/models/api/openai_api.py +40 -29
evalscope/models/custom/__init__.py +0 -1
evalscope/models/custom/custom_model.py +3 -3
evalscope/models/dummy_chat_model.py +7 -8
evalscope/models/model_adapter.py +89 -156
evalscope/models/openai_model.py +20 -20
evalscope/perf/arguments.py +15 -3
evalscope/perf/benchmark.py +7 -9
evalscope/perf/http_client.py +3 -8
evalscope/perf/main.py +10 -0
evalscope/perf/plugin/api/custom_api.py +1 -2
evalscope/perf/plugin/api/dashscope_api.py +1 -2
evalscope/perf/plugin/api/openai_api.py +2 -3
evalscope/perf/plugin/datasets/base.py +1 -2
evalscope/perf/plugin/datasets/flickr8k.py +1 -2
evalscope/perf/plugin/datasets/longalpaca.py +1 -2
evalscope/perf/plugin/datasets/openqa.py +1 -2
evalscope/perf/utils/analysis_result.py +1 -2
evalscope/perf/utils/benchmark_util.py +1 -2
evalscope/perf/utils/db_util.py +11 -8
evalscope/perf/utils/local_server.py +19 -13
evalscope/registry/config/cfg_arena_zhihu.yaml +1 -1
evalscope/registry/tasks/arc.yaml +2 -3
evalscope/registry/tasks/bbh.yaml +3 -4
evalscope/registry/tasks/bbh_mini.yaml +3 -4
evalscope/registry/tasks/ceval.yaml +3 -3
evalscope/registry/tasks/ceval_mini.yaml +3 -4
evalscope/registry/tasks/cmmlu.yaml +3 -3
evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +1 -1
evalscope/registry/tasks/general_qa.yaml +1 -1
evalscope/registry/tasks/gsm8k.yaml +2 -2
evalscope/registry/tasks/mmlu.yaml +3 -3
evalscope/registry/tasks/mmlu_mini.yaml +3 -3
evalscope/run.py +184 -375
evalscope/run_arena.py +20 -25
evalscope/summarizer.py +16 -17
evalscope/third_party/longbench_write/README.md +99 -42
evalscope/third_party/longbench_write/default_task.json +1 -1
evalscope/third_party/longbench_write/default_task.yaml +8 -7
evalscope/third_party/longbench_write/eval.py +29 -28
evalscope/third_party/longbench_write/infer.py +16 -104
evalscope/third_party/longbench_write/longbench_write.py +5 -5
evalscope/third_party/longbench_write/resources/judge.txt +1 -1
evalscope/third_party/longbench_write/tools/data_etl.py +4 -5
evalscope/third_party/longbench_write/utils.py +0 -1
evalscope/third_party/toolbench_static/eval.py +14 -15
evalscope/third_party/toolbench_static/infer.py +48 -69
evalscope/third_party/toolbench_static/llm/swift_infer.py +4 -12
evalscope/third_party/toolbench_static/requirements.txt +1 -1
evalscope/third_party/toolbench_static/toolbench_static.py +3 -3
evalscope/tools/combine_reports.py +25 -30
evalscope/tools/rewrite_eval_results.py +14 -46
evalscope/utils/__init__.py +0 -1
evalscope/utils/arena_utils.py +18 -48
evalscope/{perf/utils → utils}/chat_service.py +3 -4
evalscope/utils/completion_parsers.py +3 -8
evalscope/utils/logger.py +9 -7
evalscope/utils/model_utils.py +11 -0
evalscope/utils/utils.py +12 -138
evalscope/version.py +2 -2
{evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/METADATA +123 -118
evalscope-0.8.0.dist-info/RECORD +285 -0
tests/cli/test_run.py +54 -15
tests/perf/test_perf.py +4 -0
tests/rag/test_clip_benchmark.py +38 -38
tests/rag/test_mteb.py +3 -2
tests/rag/test_ragas.py +5 -5
tests/swift/test_run_swift_eval.py +2 -3
tests/swift/test_run_swift_vlm_eval.py +2 -3
tests/swift/test_run_swift_vlm_jugde_eval.py +2 -3
evalscope/backend/rag_eval/ragas/metrics/__init__.py +0 -2
evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py +0 -91
evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py +0 -99
evalscope/cache.py +0 -98
evalscope/models/template.py +0 -1446
evalscope/run_ms.py +0 -140
evalscope/utils/task_cfg_parser.py +0 -10
evalscope/utils/task_utils.py +0 -22
evalscope-0.7.2.dist-info/RECORD +0 -286
{evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/LICENSE +0 -0
{evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/WHEEL +0 -0
{evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/entry_points.txt +0 -0
{evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/top_level.txt +0 -0

evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json CHANGED Viewed

@@ -1,15 +1,15 @@
 {
-  "ragas_version": "0.2.5",
-  "original_hash": -4418228766304805311,
+  "ragas_version": "0.2.7",
+  "original_hash": 5296785184599215999,
   "language": "chinese",
   "instruction": "您的任务是根据给定的上下文判断一系列陈述的真实性。对于每个陈述，如果可以根据上下文直接推断出该陈述，则必须返回判决为1；如果不能根据上下文直接推断出该陈述，则返回判决为0。",
   "examples": [
     {
       "input": {
-        "context": "约翰是XYZ大学的学生。他正在攻读计算机科学学位。本学期他选修了几门课程，包括数据结构、算法和数据库管理。约翰是一个勤奋的学生，花费大量时间学习和完成作业。他经常在图书馆待到很晚以完成他的项目。",
+        "context": "约翰是XYZ大学的学生。他正在攻读计算机科学学位。本学期他注册了几门课程，包括数据结构、算法和数据库管理。约翰是一个勤奋的学生，花费大量时间学习和完成作业。他经常在图书馆待到很晚以完成他的项目。",
         "statements": [
           "约翰主修生物学。",
-          "约翰正在学习人工智能课程。",
+          "约翰正在学习一门人工智能课程。",
           "约翰是一个勤奋的学生。",
           "约翰有一份兼职工作。"
         ]
@@ -22,18 +22,18 @@
             "verdict": 0
           },
           {
-            "statement": "约翰正在学习人工智能课程。",
-            "reason": "上下文提到约翰目前选修的课程，并未提到人工智能。因此，无法推断出约翰正在学习人工智能课程。",
+            "statement": "约翰正在学习一门人工智能课程。",
+            "reason": "上下文中提到约翰目前注册的课程，并未提到人工智能。因此，不能推断出约翰正在学习人工智能课程。",
             "verdict": 0
           },
           {
             "statement": "约翰是一个勤奋的学生。",
-            "reason": "上下文指出他花费大量时间学习和完成作业。此外，还提到他经常在图书馆待到很晚以完成他的项目，这表明了他的勤奋。",
+            "reason": "上下文中提到他花费大量时间学习和完成作业。此外，还提到他经常在图书馆待到很晚以完成他的项目，这意味着他很勤奋。",
             "verdict": 1
           },
           {
             "statement": "约翰有一份兼职工作。",
-            "reason": "上下文中没有提供关于约翰有兼职工作的信息。",
+            "reason": "上下文中没有给出约翰有兼职工作的信息。",
             "verdict": 0
           }
         ]

evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json CHANGED Viewed

@@ -1,16 +1,16 @@
 {
-  "ragas_version": "0.2.5",
-  "original_hash": 8370494081602031492,
+  "ragas_version": "0.2.7",
+  "original_hash": -8546983388246528139,
   "language": "chinese",
   "instruction": "给定一个问题、一个答案和答案中的句子，分析在“句子”下给出的每个句子的复杂性，并将每个句子分解为一个或多个完全可理解的陈述，同时确保每个陈述中不使用代词。将输出格式化为JSON。",
   "examples": [
     {
       "input": {
         "question": "阿尔伯特·爱因斯坦是谁，他最出名的是什么？",
-        "answer": "他是一位出生于德国的理论物理学家，被广泛认为是有史以来最伟大和最有影响力的物理学家之一。他最出名的是发展了相对论，他还对量子力学理论的发展做出了重要贡献。",
+        "answer": "他是一位出生于德国的理论物理学家，被广泛认为是有史以来最伟大和最有影响力的物理学家之一。他最出名的是发展了相对论，他还为量子力学理论的发展做出了重要贡献。",
         "sentences": {
           "0": "他是一位出生于德国的理论物理学家，被广泛认为是有史以来最伟大和最有影响力的物理学家之一。",
-          "1": "他最出名的是发展了相对论，他还对量子力学理论的发展做出了重要贡献。"
+          "1": "他最出名的是发展了相对论，他还为量子力学理论的发展做出了重要贡献。"
         }
       },
       "output": {
@@ -26,7 +26,7 @@
             "sentence_index": 1,
             "simpler_statements": [
               "阿尔伯特·爱因斯坦最出名的是发展了相对论。",
-              "阿尔伯特·爱因斯坦还对量子力学理论的发展做出了重要贡献。"
+              "阿尔伯特·爱因斯坦还为量子力学理论的发展做出了重要贡献。"
             ]
           }
         ]

evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json CHANGED Viewed

@@ -1,20 +1,22 @@
 {
-  "ragas_version": "0.2.5",
-  "original_hash": 5488893487931336269,
+  "ragas_version": "0.2.7",
+  "original_hash": 7972318980248949928,
   "language": "chinese",
-  "instruction": "从给定文本中仅提取第2级和第3级标题。",
+  "instruction": "从给定文本中提取最重要的max_num标题，这些标题可用于将文本分成独立的部分。重点关注第2级和第3级标题。",
   "examples": [
     {
       "input": {
-        "text": "                介绍\n                主题概述...\n\n                主要概念\n                核心思想的解释...\n\n                详细分析\n                分析的技术和方法...\n\n                小节：专业技术\n                专业技术的进一步细节...\n\n                未来方向\n                对即将到来的趋势的见解...\n\n                小节：研究的下一步\n                新研究领域的讨论...\n\n                结论\n                最后的评论和总结。\n                "
+        "text": "                介绍\n                主题概述...\n\n                主要概念\n                核心思想的解释...\n\n                详细分析\n                分析的技术和方法...\n\n                小节：专业技术\n                专业技术的进一步细节...\n\n                未来方向\n                对即将到来的趋势的见解...\n\n                小节：研究的下一步\n                新研究领域的讨论...\n\n                结论\n                最后的评论和总结。\n                ",
+        "max_num": 6
       },
       "output": {
         "headlines": [
+          "介绍",
           "主要概念",
           "详细分析",
           "小节：专业技术",
           "未来方向",
-          "小节：研究的下一步"
+          "结论"
         ]
       }
     }

evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
-  "ragas_version": "0.2.5",
-  "original_hash": -5063505674847369221,
+  "ragas_version": "0.2.7",
+  "original_hash": 5035835898922847346,
   "language": "chinese",
   "instruction": "通过将来自至少两个不同列表的概念配对来形成组合。\n**说明：**\n- 查看每个节点的概念。\n- 确定可以逻辑连接或对比的概念。\n- 形成涉及来自不同节点的概念的组合。\n- 每个组合应至少包括来自两个或多个节点的一个概念。\n- 清晰简洁地列出组合。\n- 不要重复相同的组合。",
   "examples": [

evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json CHANGED Viewed

@@ -1,7 +1,30 @@
 {
-  "ragas_version": "0.2.5",
-  "original_hash": -1698100170803872933,
+  "ragas_version": "0.2.7",
+  "original_hash": 5691378570114822729,
   "language": "chinese",
-  "instruction": "根据指定的条件（角色、主题、风格、长度）和提供的上下文生成查询和答案。确保答案完全忠实于上下文，仅使用直接来自提供节点的信息。### 指令：\n1. **生成查询**：根据上下文、角色、主题、风格和长度，创建一个符合角色视角并反映主题的问题。\n2. **生成答案**：仅使用提供的上下文内容，创建一个忠实且详细的答案。不要包含任何不在或无法从给定上下文中推断的信息。\n### 示例输出：\n\n",
-  "examples": []
+  "instruction": "根据指定的条件（角色、主题、风格、长度）和提供的上下文生成一个多跳查询和答案。主题代表从上下文中提取或生成的一组短语，这些短语突出显示了所选上下文在创建多跳查询时的适用性。确保查询明确包含这些主题。### 指导：\n1. **生成多跳查询**：使用提供的上下文片段和主题形成一个需要结合多个片段信息的查询（例如，`<1-hop>` 和 `<2-hop>`）。确保查询明确包含一个或多个主题，并反映其与上下文的相关性。\n2. **生成答案**：仅使用提供的上下文中的内容来创建查询的详细和忠实的答案。避免添加不直接存在或无法从给定上下文推断的信息。\n3. **多跳上下文标签**：\n   - 每个上下文片段标记为 `<1-hop>`、`<2-hop>` 等。\n   - 确保查询使用至少两个片段的信息并有意义地连接它们。",
+  "examples": [
+    {
+      "input": {
+        "persona": {
+          "name": "历史学家",
+          "role_description": "专注于主要科学里程碑及其全球影响。"
+        },
+        "themes": [
+          "相对论",
+          "实验验证"
+        ],
+        "query_style": "正式",
+        "query_length": "中等",
+        "context": [
+          "<1-hop> 阿尔伯特·爱因斯坦发展了相对论，引入了时空的概念。",
+          "<2-hop> 在1919年的日食期间，光线被重力弯曲的现象得到了证实，支持了爱因斯坦的理论。"
+        ]
+      },
+      "output": {
+        "query": "在1919年日食期间，相对论的实验验证是如何实现的？",
+        "answer": "在1919年日食期间，通过确认光线被重力弯曲，实现了相对论的实验验证，这支持了爱因斯坦在理论中提出的时空概念。"
+      }
+    }
+  ]
 }

evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
-  "ragas_version": "0.2.5",
-  "original_hash": 2334929353739018813,
+  "ragas_version": "0.2.7",
+  "original_hash": 4608101540215877909,
   "language": "chinese",
   "instruction": "给定一个主题和角色列表，根据角色描述将每个角色与相关主题关联起来。",
   "examples": [

evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json CHANGED Viewed

@@ -1,7 +1,30 @@
 {
-  "ragas_version": "0.2.5",
-  "original_hash": -1698100170803872933,
+  "ragas_version": "0.2.7",
+  "original_hash": 5691378570114822729,
   "language": "chinese",
-  "instruction": "根据指定的条件（角色、主题、风格、长度）和提供的上下文生成查询和答案。确保答案完全忠实于上下文，仅使用直接来自提供节点的信息。### 指令：\n1. **生成查询**：根据上下文、角色、主题、风格和长度，创建一个与角色视角一致并反映主题的问题。\n2. **生成答案**：仅使用提供的上下文内容，创建一个忠实且详细的答案。不要包含任何不在或无法从给定上下文中推断的信息。\n### 示例输出：\n\n",
-  "examples": []
+  "instruction": "根据指定的条件（角色、主题、风格、长度）和提供的上下文生成一个多跳查询和答案。主题代表从上下文中提取或生成的一组短语，这些短语突出显示了所选上下文适合多跳查询创建的适用性。确保查询明确包含这些主题。### 指导：\n1. **生成多跳查询**：使用提供的上下文片段和主题形成一个需要结合多个片段信息的查询（例如，`<1-hop>` 和 `<2-hop>`）。确保查询明确包含一个或多个主题，并反映其与上下文的相关性。\n2. **生成答案**：仅使用提供的上下文中的内容来创建对查询的详细和忠实的答案。避免添加不直接存在或无法从给定上下文推断的信息。\n3. **多跳上下文标签**：\n   - 每个上下文片段标记为 `<1-hop>`、`<2-hop>` 等。\n   - 确保查询使用至少两个片段的信息并有意义地连接它们。",
+  "examples": [
+    {
+      "input": {
+        "persona": {
+          "name": "历史学家",
+          "role_description": "专注于重大的科学里程碑及其全球影响。"
+        },
+        "themes": [
+          "相对论",
+          "实验验证"
+        ],
+        "query_style": "正式",
+        "query_length": "中等",
+        "context": [
+          "<1-hop> 阿尔伯特·爱因斯坦发展了相对论，引入了时空的概念。",
+          "<2-hop> 在1919年的日全食期间，光线被重力弯曲的现象得到了证实，支持了爱因斯坦的理论。"
+        ]
+      },
+      "output": {
+        "query": "在1919年的日全食期间，相对论的实验验证是如何实现的？",
+        "answer": "在1919年的日全食期间，通过确认光线被重力弯曲的现象，实现了相对论的实验验证，这支持了爱因斯坦在理论中提出的时空概念。"
+      }
+    }
+  ]
 }

evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
-  "ragas_version": "0.2.5",
-  "original_hash": 2334929353739018813,
+  "ragas_version": "0.2.7",
+  "original_hash": 4608101540215877909,
   "language": "chinese",
   "instruction": "给定一个主题和角色列表，根据角色描述将每个角色与相关主题关联起来。",
   "examples": [

evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
-  "ragas_version": "0.2.5",
-  "original_hash": -2189588237940965149,
+  "ragas_version": "0.2.7",
+  "original_hash": -1903496084584659501,
   "language": "chinese",
   "instruction": "请说明给定的信息是否得到视觉和文本上下文信息的支持。您需要回答“是”或“否”。如果任何图像和文本上下文支持该信息，请回答“是”。",
   "examples": [

evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
-  "ragas_version": "0.2.5",
-  "original_hash": -7302860412443151372,
+  "ragas_version": "0.2.7",
+  "original_hash": -2067734205359291291,
   "language": "chinese",
   "instruction": "\n您的任务是评估查询的响应是否与提供的图像和文本上下文信息一致。\n您有两个选项可以回答。要么是 True / False。\n如果查询的响应与上下文信息一致，则回答 - True，否则为 False。\n",
   "examples": [
@@ -9,7 +9,7 @@
         "user_input": "传统玛格丽塔披萨的主要成分是什么？",
         "response": "玛格丽塔披萨的主要成分是番茄、马苏里拉奶酪和新鲜罗勒。",
         "retrieved_contexts": [
-          "传统的玛格丽塔披萨由薄薄的饼皮组成。",
+          "传统的玛格丽塔披萨由薄饼皮组成。",
           "主要的配料包括番茄、马苏里拉奶酪、新鲜罗勒、盐和橄榄油。",
           "它是最简单和最经典的披萨类型之一。"
         ]
@@ -20,11 +20,11 @@
     },
     {
       "input": {
-        "user_input": "谁在2021年奥斯卡颁奖典礼上获得了最佳男演员奖？",
+        "user_input": "谁在2021年获得了奥斯卡最佳男演员奖？",
         "response": "2021年的最佳男演员奖由莱昂纳多·迪卡普里奥获得。",
         "retrieved_contexts": [
           "第93届奥斯卡颁奖典礼于2021年举行。",
-          "安东尼·霍普金斯凭借在《困在时间里的父亲》中的角色赢得了最佳男演员奖。",
+          "安东尼·霍普金斯凭借在《困在时间里的父亲》中的角色获得了最佳男演员奖。",
           "由于COVID-19的限制，这次活动具有独特性。"
         ]
       },

evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json CHANGED Viewed

@@ -1,8 +1,8 @@
 {
-  "ragas_version": "0.2.5",
-  "original_hash": -7036736759899743798,
+  "ragas_version": "0.2.7",
+  "original_hash": -677862064343016555,
   "language": "chinese",
-  "instruction": "从给定文本中提取命名实体，限制输出为最重要的实体。确保实体数量不超过指定的最大值。",
+  "instruction": "从给定文本中提取命名实体，限制输出为顶级实体。确保实体数量不超过指定的最大值。",
   "examples": [
     {
       "input": {

evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json CHANGED Viewed

@@ -1,7 +1,24 @@
 {
-  "ragas_version": "0.2.5",
-  "original_hash": -1422723613754983378,
+  "ragas_version": "0.2.7",
+  "original_hash": 3079700511467088808,
   "language": "chinese",
-  "instruction": "根据指定的条件（角色、术语、风格、长度）和提供的上下文生成查询和答案。确保答案完全忠实于上下文，仅使用直接来自提供上下文的信息。### 指令：\n1. **生成查询**：根据上下文、角色、术语、风格和长度，创建一个与角色视角一致并包含术语的问题。\n2. **生成答案**：仅使用提供的上下文中的内容，构建对查询的详细答案。不要添加上下文中未包含或无法推断的信息。\n### 示例输出：\n\n",
-  "examples": []
+  "instruction": "根据指定的条件（角色、术语、风格、长度）和提供的上下文生成一个单跳查询和答案。确保答案完全忠实于上下文，仅使用提供的上下文中的信息。### 指导：\n1. **生成查询**：根据上下文、角色、术语、风格和长度，创建一个与角色视角一致并包含术语的问题。\n2. **生成答案**：仅使用提供的上下文中的内容，构建对查询的详细答案。不要添加上下文中未包含或无法推断的信息。\n",
+  "examples": [
+    {
+      "input": {
+        "persona": {
+          "name": "软件工程师",
+          "role_description": "专注于编码最佳实践和系统设计。"
+        },
+        "term": "微服务",
+        "query_style": "正式",
+        "query_length": "中等",
+        "context": "微服务是一种架构风格，其中应用程序被构建为一组松散耦合的服务。每个服务都是细粒度的，并专注于单一功能。"
+      },
+      "output": {
+        "query": "微服务在软件架构中的目的是什么？",
+        "answer": "微服务旨在将应用程序结构化为一组松散耦合的服务，每个服务专注于单一功能。"
+      }
+    }
+  ]
 }

evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
-  "ragas_version": "0.2.5",
-  "original_hash": 2334929353739018813,
+  "ragas_version": "0.2.7",
+  "original_hash": 4608101540215877909,
   "language": "chinese",
   "instruction": "给定一个主题和角色列表，根据角色描述将每个角色与相关主题关联起来。",
   "examples": [
@@ -17,7 +17,7 @@
             "role_description": "专注于包容性和员工支持。"
           },
           {
-            "name": "远程团队领导",
+            "name": "远程团队负责人",
             "role_description": "管理远程团队沟通。"
           }
         ]

evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json CHANGED Viewed

@@ -1,12 +1,12 @@
 {
-  "ragas_version": "0.2.5",
-  "original_hash": -5467318232123540806,
+  "ragas_version": "0.2.7",
+  "original_hash": -2203889341293275650,
   "language": "chinese",
-  "instruction": "将给定文本总结为少于10个句子。",
+  "instruction": "将给定文本总结为不超过10个句子。",
   "examples": [
     {
       "input": {
-        "text": "人工智能\n\n人工智能正在通过自动化以前需要人类智能的任务来改变各个行业。从医疗到金融，人工智能正在被用来快速准确地分析大量数据。这项技术还推动了自动驾驶汽车和个性化推荐等领域的创新。"
+        "text": "人工智能\n\n人工智能正在通过自动化以前需要人类智能的任务来改变各个行业。从医疗保健到金融，人工智能正在被用来快速准确地分析大量数据。这项技术还推动了自动驾驶汽车和个性化推荐等领域的创新。"
       },
       "output": {
         "text": "人工智能通过自动化任务、分析数据和推动自动驾驶汽车和个性化推荐等创新，正在革新各个行业。"

evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
-  "ragas_version": "0.2.5",
-  "original_hash": 2452110859551524285,
+  "ragas_version": "0.2.7",
+  "original_hash": -7344189172470926110,
   "language": "chinese",
   "instruction": "从给定的文本中提取主要主题和概念。",
   "examples": [

evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py CHANGED Viewed

@@ -1,5 +1,4 @@
 import typing as t
 from pydantic import BaseModel
 from ragas.prompt import PydanticPrompt, StringIO
 from ragas.testset.persona import Persona

evalscope/backend/rag_eval/ragas/task_template.py CHANGED Viewed

@@ -1,7 +1,8 @@
-import os
 import asyncio
+import os
 from datasets import Dataset
-from evalscope.backend.rag_eval import EmbeddingModel, LLM
+from evalscope.backend.rag_eval import LLM, EmbeddingModel
 from evalscope.backend.rag_eval.ragas.tasks.translate_prompt import translate_prompts
 from evalscope.utils.logger import get_logger
 from .arguments import EvaluationArguments
@@ -9,13 +10,11 @@ from .arguments import EvaluationArguments
 logger = get_logger()
-def rag_eval(
-    args: EvaluationArguments,
-) -> None:
+def rag_eval(args: EvaluationArguments, ) -> None:
-    from ragas import evaluate, RunConfig
-    from ragas.llms import LangchainLLMWrapper
     import importlib
+    from ragas import RunConfig, evaluate
+    from ragas.llms import LangchainLLMWrapper
     def dynamic_import(*function_names):
         functions = []
@@ -27,9 +26,6 @@ def rag_eval(
     llm = LLM.load(**args.critic_llm)
     embedding = EmbeddingModel.load(**args.embeddings)
-    # load dataset
-    dataset = Dataset.from_json(args.testset_file)
     # load metrics
     metrics = dynamic_import(*args.metrics)
     asyncio.run(
@@ -38,8 +34,9 @@ def rag_eval(
             target_lang=args.language,
             llm=LangchainLLMWrapper(llm),
             adapt_instruction=True,
-        )
-    )
+        ))
+    # load dataset
+    dataset = Dataset.from_json(args.testset_file)
     # evaluate
     runconfig = RunConfig(timeout=600, max_retries=2, max_wait=60, max_workers=1)
@@ -54,8 +51,6 @@ def rag_eval(
     logger.info(score_df)
     output_path = args.testset_file.replace('.json', '_score.json')
-    score_df.to_json(
-        output_path, indent=4, index=False, orient='records', force_ascii=False
-    )
+    score_df.to_json(output_path, indent=4, index=False, orient='records', force_ascii=False)
     logger.info(f'Eval score saved to {output_path}')

evalscope/backend/rag_eval/ragas/tasks/__init__.py CHANGED Viewed

@@ -1,2 +1,2 @@
 from evalscope.backend.rag_eval.ragas.tasks.testset_generation import generate_testset
-from evalscope.backend.rag_eval.ragas.tasks.translate_prompt import translate_prompts
+from evalscope.backend.rag_eval.ragas.tasks.translate_prompt import translate_prompts

evalscope/backend/rag_eval/ragas/tasks/build_distribution.py ADDED Viewed

@@ -0,0 +1,45 @@
+import asyncio
+from ragas.llms import BaseRagasLLM
+from ragas.testset.graph import KnowledgeGraph
+from ragas.testset.synthesizers.multi_hop import MultiHopAbstractQuerySynthesizer, MultiHopSpecificQuerySynthesizer
+from ragas.testset.synthesizers.single_hop.specific import SingleHopSpecificQuerySynthesizer
+from .translate_prompt import translate_prompts
+def default_query_distribution(llm: BaseRagasLLM, kg: KnowledgeGraph, language: str):
+    """
+    Generates a distribution of query synthesizers, optionally tailored to a
+        specific knowledge graph (KG) and translated into a given language.
+    """
+    single_hop = SingleHopSpecificQuerySynthesizer(llm=llm)
+    multi_hop_abs = MultiHopAbstractQuerySynthesizer(llm=llm)
+    multi_hop_spec = MultiHopSpecificQuerySynthesizer(llm=llm)
+    asyncio.run(
+        translate_prompts(
+            prompts=[
+                single_hop,
+                multi_hop_abs,
+                multi_hop_spec,
+            ],
+            target_lang=language,
+            llm=llm,
+            adapt_instruction=True,
+        ))
+    default_queries = [
+        single_hop,
+        multi_hop_abs,
+        multi_hop_spec,
+    ]
+    if kg is not None:
+        available_queries = []
+        for query in default_queries:
+            if query.get_node_clusters(kg):
+                available_queries.append(query)
+    else:
+        available_queries = default_queries
+    return [(query, 1 / len(available_queries)) for query in available_queries]

evalscope/backend/rag_eval/ragas/tasks/build_transform.py ADDED Viewed

@@ -0,0 +1,135 @@
+import asyncio
+from langchain_core.documents import Document
+from ragas.embeddings import BaseRagasEmbeddings
+from ragas.llms import BaseRagasLLM
+from ragas.testset.graph import NodeType
+from ragas.testset.transforms.engine import Parallel
+from ragas.testset.transforms.extractors import EmbeddingExtractor, HeadlinesExtractor, SummaryExtractor
+from ragas.testset.transforms.extractors.llm_based import NERExtractor, ThemesExtractor
+from ragas.testset.transforms.filters import CustomNodeFilter
+from ragas.testset.transforms.relationship_builders import CosineSimilarityBuilder, OverlapScoreBuilder
+from ragas.testset.transforms.splitters import HeadlineSplitter
+from ragas.utils import num_tokens_from_string
+from typing import List
+from .translate_prompt import translate_prompts
+def default_transforms(
+    documents: List[Document],
+    llm: BaseRagasLLM,
+    embedding_model: BaseRagasEmbeddings,
+    language: str,
+):
+    """
+    Creates and returns a default set of transforms for processing a knowledge graph.
+    This function defines a series of transformation steps to be applied to a
+    knowledge graph, including extracting summaries, keyphrases, titles,
+    headlines, and embeddings, as well as building similarity relationships
+    between nodes.
+    """
+    def count_doc_length_bins(documents, bin_ranges):
+        data = [num_tokens_from_string(doc.page_content) for doc in documents]
+        bins = {f'{start}-{end}': 0 for start, end in bin_ranges}
+        for num in data:
+            for start, end in bin_ranges:
+                if start <= num <= end:
+                    bins[f'{start}-{end}'] += 1
+                    break  # Move to the next number once it’s placed in a bin
+        return bins
+    def filter_doc_with_num_tokens(node, min_num_tokens=500):
+        return (node.type == NodeType.DOCUMENT
+                and num_tokens_from_string(node.properties['page_content']) > min_num_tokens)
+    def filter_docs(node):
+        return node.type == NodeType.DOCUMENT
+    def filter_chunks(node):
+        return node.type == NodeType.CHUNK
+    bin_ranges = [(0, 100), (101, 500), (501, 100000)]
+    result = count_doc_length_bins(documents, bin_ranges)
+    result = {k: v / len(documents) for k, v in result.items()}
+    if result['501-100000'] >= 0.25:
+        headline_extractor = HeadlinesExtractor(llm=llm, filter_nodes=lambda node: filter_doc_with_num_tokens(node))
+        splitter = HeadlineSplitter(min_tokens=500)
+        summary_extractor = SummaryExtractor(llm=llm, filter_nodes=lambda node: filter_doc_with_num_tokens(node))
+        theme_extractor = ThemesExtractor(llm=llm, filter_nodes=lambda node: filter_chunks(node))
+        ner_extractor = NERExtractor(llm=llm, filter_nodes=lambda node: filter_chunks(node))
+        summary_emb_extractor = EmbeddingExtractor(
+            embedding_model=embedding_model,
+            property_name='summary_embedding',
+            embed_property_name='summary',
+            filter_nodes=lambda node: filter_doc_with_num_tokens(node),
+        )
+        cosine_sim_builder = CosineSimilarityBuilder(
+            property_name='summary_embedding',
+            new_property_name='summary_similarity',
+            threshold=0.7,
+            filter_nodes=lambda node: filter_doc_with_num_tokens(node),
+        )
+        ner_overlap_sim = OverlapScoreBuilder(threshold=0.01, filter_nodes=lambda node: filter_chunks(node))
+        node_filter = CustomNodeFilter(llm=llm, filter_nodes=lambda node: filter_chunks(node))
+        # translate prompts
+        asyncio.run(
+            translate_prompts(
+                prompts=[headline_extractor, summary_extractor, theme_extractor, ner_extractor, node_filter],
+                target_lang=language,
+                llm=llm,
+                adapt_instruction=True,
+            ))
+        transforms = [
+            headline_extractor,
+            splitter,
+            summary_extractor,
+            node_filter,
+            Parallel(summary_emb_extractor, theme_extractor, ner_extractor),
+            Parallel(cosine_sim_builder, ner_overlap_sim),
+        ]
+    elif result['101-500'] >= 0.25:
+        summary_extractor = SummaryExtractor(llm=llm, filter_nodes=lambda node: filter_doc_with_num_tokens(node, 100))
+        summary_emb_extractor = EmbeddingExtractor(
+            embedding_model=embedding_model,
+            property_name='summary_embedding',
+            embed_property_name='summary',
+            filter_nodes=lambda node: filter_doc_with_num_tokens(node, 100),
+        )
+        ner_extractor = NERExtractor(llm=llm)
+        ner_overlap_sim = OverlapScoreBuilder(threshold=0.01)
+        theme_extractor = ThemesExtractor(llm=llm, filter_nodes=lambda node: filter_docs(node))
+        node_filter = CustomNodeFilter(llm=llm)
+        # translate prompts
+        asyncio.run(
+            translate_prompts(
+                prompts=[summary_extractor, theme_extractor, ner_extractor, node_filter],
+                target_lang=language,
+                llm=llm,
+                adapt_instruction=True,
+            ))
+        transforms = [
+            summary_extractor,
+            node_filter,
+            Parallel(summary_emb_extractor, theme_extractor, ner_extractor),
+            ner_overlap_sim,
+        ]
+    else:
+        raise ValueError('Documents appears to be too short (ie 100 tokens or less). Please provide longer documents.')
+    return transforms

evalscope 0.7.2__py3-none-any.whl → 0.8.0__py3-none-any.whl

Potentially problematic release.

evalscope 0.7.2py3-none-any.whl → 0.8.0py3-none-any.whl