evalscope 0.8.2__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106)
  1. evalscope/__init__.py +2 -0
  2. evalscope/arguments.py +11 -3
  3. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -1
  4. evalscope/backend/rag_eval/utils/llm.py +1 -1
  5. evalscope/benchmarks/__init__.py +20 -1
  6. evalscope/benchmarks/arc/__init__.py +0 -5
  7. evalscope/benchmarks/arc/arc_adapter.py +24 -102
  8. evalscope/benchmarks/bbh/__init__.py +0 -4
  9. evalscope/benchmarks/bbh/bbh_adapter.py +20 -90
  10. evalscope/benchmarks/benchmark.py +70 -59
  11. evalscope/benchmarks/ceval/__init__.py +0 -5
  12. evalscope/benchmarks/ceval/ceval_adapter.py +24 -125
  13. evalscope/benchmarks/cmmlu/__init__.py +0 -5
  14. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +22 -117
  15. evalscope/benchmarks/competition_math/__init__.py +0 -5
  16. evalscope/benchmarks/competition_math/competition_math_adapter.py +29 -371
  17. evalscope/benchmarks/data_adapter.py +115 -87
  18. evalscope/benchmarks/general_qa/__init__.py +0 -5
  19. evalscope/benchmarks/general_qa/general_qa_adapter.py +23 -79
  20. evalscope/benchmarks/gsm8k/__init__.py +0 -4
  21. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +21 -101
  22. evalscope/benchmarks/hellaswag/__init__.py +0 -5
  23. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +32 -99
  24. evalscope/benchmarks/humaneval/__init__.py +0 -4
  25. evalscope/benchmarks/humaneval/humaneval_adapter.py +18 -120
  26. evalscope/benchmarks/ifeval/__init__.py +0 -0
  27. evalscope/benchmarks/ifeval/ifeval_adapter.py +57 -0
  28. evalscope/benchmarks/ifeval/instructions.py +1478 -0
  29. evalscope/benchmarks/ifeval/instructions_registry.py +188 -0
  30. evalscope/benchmarks/ifeval/instructions_util.py +1670 -0
  31. evalscope/benchmarks/ifeval/utils.py +134 -0
  32. evalscope/benchmarks/iquiz/__init__.py +0 -0
  33. evalscope/benchmarks/iquiz/iquiz_adapter.py +63 -0
  34. evalscope/benchmarks/mmlu/__init__.py +0 -5
  35. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -130
  36. evalscope/benchmarks/mmlu_pro/__init__.py +0 -0
  37. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +110 -0
  38. evalscope/benchmarks/race/__init__.py +0 -5
  39. evalscope/benchmarks/race/race_adapter.py +26 -123
  40. evalscope/benchmarks/trivia_qa/__init__.py +0 -5
  41. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +23 -99
  42. evalscope/benchmarks/truthful_qa/__init__.py +0 -5
  43. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +29 -88
  44. evalscope/cli/cli.py +2 -0
  45. evalscope/cli/start_app.py +29 -0
  46. evalscope/collections/__init__.py +3 -0
  47. evalscope/collections/evaluator.py +198 -0
  48. evalscope/collections/sampler.py +138 -0
  49. evalscope/collections/schema.py +126 -0
  50. evalscope/config.py +7 -5
  51. evalscope/constants.py +9 -26
  52. evalscope/evaluator/evaluator.py +87 -121
  53. evalscope/evaluator/reviewer/auto_reviewer.py +12 -4
  54. evalscope/metrics/__init__.py +3 -0
  55. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
  56. evalscope/metrics/math_accuracy.py +193 -50
  57. evalscope/metrics/metrics.py +18 -6
  58. evalscope/metrics/named_metrics.py +17 -0
  59. evalscope/metrics/rouge_metric.py +13 -8
  60. evalscope/models/__init__.py +14 -1
  61. evalscope/models/base_adapter.py +52 -0
  62. evalscope/models/chat_adapter.py +138 -0
  63. evalscope/models/choice_adapter.py +211 -0
  64. evalscope/models/custom_adapter.py +67 -0
  65. evalscope/models/local_model.py +74 -0
  66. evalscope/models/model.py +141 -0
  67. evalscope/models/server_adapter.py +111 -0
  68. evalscope/perf/__init__.py +1 -0
  69. evalscope/perf/main.py +0 -1
  70. evalscope/perf/plugin/api/custom_api.py +1 -1
  71. evalscope/perf/plugin/api/openai_api.py +1 -1
  72. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  73. evalscope/perf/plugin/datasets/longalpaca.py +1 -1
  74. evalscope/report/__init__.py +5 -0
  75. evalscope/report/app.py +506 -0
  76. evalscope/report/combinator.py +73 -0
  77. evalscope/report/generator.py +80 -0
  78. evalscope/report/utils.py +133 -0
  79. evalscope/run.py +48 -72
  80. evalscope/run_arena.py +1 -1
  81. evalscope/summarizer.py +1 -1
  82. evalscope/utils/__init__.py +1 -1
  83. evalscope/utils/chat_service.py +5 -4
  84. evalscope/utils/io_utils.py +8 -0
  85. evalscope/utils/logger.py +5 -0
  86. evalscope/utils/model_utils.py +15 -2
  87. evalscope/utils/utils.py +3 -25
  88. evalscope/version.py +2 -2
  89. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/METADATA +115 -21
  90. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/RECORD +99 -78
  91. tests/cli/test_collection.py +57 -0
  92. tests/cli/test_run.py +52 -1
  93. tests/rag/test_mteb.py +3 -2
  94. evalscope/models/api/__init__.py +0 -3
  95. evalscope/models/dummy_chat_model.py +0 -49
  96. evalscope/models/model_adapter.py +0 -525
  97. evalscope/models/openai_model.py +0 -103
  98. evalscope/tools/__init__.py +0 -1
  99. evalscope/tools/combine_reports.py +0 -133
  100. evalscope/tools/gen_mmlu_subject_mapping.py +0 -90
  101. /evalscope/{tools/rewrite_eval_results.py → models/custom/dummy_model.py} +0 -0
  102. /evalscope/{models/api → third_party/longbench_write/tools}/openai_api.py +0 -0
  103. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/LICENSE +0 -0
  104. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/WHEEL +0 -0
  105. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/entry_points.txt +0 -0
  106. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: evalscope
- Version: 0.8.2
+ Version: 0.10.0
  Summary: EvalScope: Lightweight LLMs Evaluation Framework
  Home-page: https://github.com/modelscope/evalscope
  Author: ModelScope team
@@ -92,6 +92,11 @@ Requires-Dist: numpy; extra == "all"
  Requires-Dist: sse-starlette; extra == "all"
  Requires-Dist: transformers; extra == "all"
  Requires-Dist: unicorn; extra == "all"
+ Requires-Dist: gradio>=5.4.0; extra == "all"
+ Requires-Dist: plotly>=5.23.0; extra == "all"
+ Provides-Extra: app
+ Requires-Dist: gradio>=5.4.0; extra == "app"
+ Requires-Dist: plotly>=5.23.0; extra == "app"
  Provides-Extra: inner
  Requires-Dist: absl-py; extra == "inner"
  Requires-Dist: accelerate; extra == "inner"
@@ -160,14 +165,16 @@ Requires-Dist: ms-vlmeval>=0.0.9; extra == "vlmeval"
  > ⭐ If you like this project, please click the "Star" button at the top right to support us. Your support is our motivation to keep going!

  ## 📋 Contents
- - [Introduction](#introduction)
- - [News](#News)
- - [Installation](#installation)
- - [Quick Start](#quick-start)
+ - [Introduction](#-introduction)
+ - [News](#-news)
+ - [Installation](#️-installation)
+ - [Quick Start](#-quick-start)
  - [Evaluation Backend](#evaluation-backend)
- - [Custom Dataset Evaluation](#custom-dataset-evaluation)
- - [Model Serving Performance Evaluation](#Model-Serving-Performance-Evaluation)
- - [Arena Mode](#arena-mode)
+ - [Custom Dataset Evaluation](#️-custom-dataset-evaluation)
+ - [Model Serving Performance Evaluation](#-model-serving-performance-evaluation)
+ - [Arena Mode](#-arena-mode)
+ - [Contribution](#️-contribution)
+ - [Roadmap](#-roadmap)


  ## 📝 Introduction
@@ -208,11 +215,17 @@ Please scan the QR code below to join our community groups:


  ## 🎉 News
+ - 🔥 **[2025.01.20]** Support for visualizing evaluation results, including single model evaluation results and multi-model comparison, refer to the [📖 Visualizing Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visulization.html) for more details; Added [`iquiz`](https://modelscope.cn/datasets/AI-ModelScope/IQuiz/summary) evaluation example, evaluating the IQ and EQ of the model.
+ - 🔥 **[2025.01.07]** Native backend: Support for model API evaluation is now available. Refer to the [📖 Model API Evaluation Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html#api) for more details. Additionally, support for the `ifeval` evaluation benchmark has been added.
+ - 🔥🔥 **[2024.12.31]** Support for adding benchmark evaluations, refer to the [📖 Benchmark Evaluation Addition Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/add_benchmark.html); support for custom mixed dataset evaluations, allowing for more comprehensive model evaluations with less data, refer to the [📖 Mixed Dataset Evaluation Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/collection/index.html).
  - 🔥 **[2024.12.13]** Model evaluation optimization: no need to pass the `--template-type` parameter anymore; supports starting evaluation with `evalscope eval --args`. Refer to the [📖 User Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html) for more details.
  - 🔥 **[2024.11.26]** The model inference service performance evaluator has been completely refactored: it now supports local inference service startup and Speed Benchmark; asynchronous call error handling has been optimized. For more details, refer to the [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/index.html).
  - 🔥 **[2024.10.31]** The best practice for evaluating Multimodal-RAG has been updated, please check the [📖 Blog](https://evalscope.readthedocs.io/zh-cn/latest/blog/RAG/multimodal_RAG.html#multimodal-rag) for more details.
  - 🔥 **[2024.10.23]** Supports multimodal RAG evaluation, including the assessment of image-text retrieval using [CLIP_Benchmark](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/clip_benchmark.html), and extends [RAGAS](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/ragas.html) to support end-to-end multimodal metrics evaluation.
  - 🔥 **[2024.10.8]** Support for RAG evaluation, including independent evaluation of embedding models and rerankers using [MTEB/CMTEB](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/mteb.html), as well as end-to-end evaluation using [RAGAS](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/ragas.html).
+
+ <details><summary>More</summary>
+
  - 🔥 **[2024.09.18]** Our documentation has been updated to include a blog module, featuring some technical research and discussions related to evaluations. We invite you to [📖 read it](https://evalscope.readthedocs.io/en/refact_readme/blog/index.html).
  - 🔥 **[2024.09.12]** Support for LongWriter evaluation, which supports 10,000+ word generation. You can use the benchmark [LongBench-Write](evalscope/third_party/longbench_write/README.md) to measure the long output quality as well as the output length.
  - 🔥 **[2024.08.30]** Support for custom dataset evaluations, including text datasets and multimodal image-text datasets.
@@ -224,7 +237,7 @@ Please scan the QR code below to join our community groups:
  - 🔥 **[2024.06.13]** EvalScope seamlessly integrates with the fine-tuning framework SWIFT, providing full-chain support from LLM training to evaluation.
  - 🔥 **[2024.06.13]** Integrated the Agent evaluation dataset ToolBench.

-
+ </details>

  ## 🛠️ Installation
  ### Method 1: Install Using pip
@@ -368,15 +381,85 @@ run_task(task_cfg="config.json")
  - `--limit`: Maximum amount of evaluation data for each dataset. If not specified, it defaults to evaluating all data. Can be used for quick validation

  ### Output Results
+ ```text
+ +-----------------------+----------------+-----------------+-----------------+---------------+-------+---------+
+ | Model Name | Dataset Name | Metric Name | Category Name | Subset Name | Num | Score |
+ +=======================+================+=================+=================+===============+=======+=========+
+ | Qwen2.5-0.5B-Instruct | gsm8k | AverageAccuracy | default | main | 5 | 0.4 |
+ +-----------------------+----------------+-----------------+-----------------+---------------+-------+---------+
+ | Qwen2.5-0.5B-Instruct | ai2_arc | AverageAccuracy | default | ARC-Easy | 5 | 0.8 |
+ +-----------------------+----------------+-----------------+-----------------+---------------+-------+---------+
+ | Qwen2.5-0.5B-Instruct | ai2_arc | AverageAccuracy | default | ARC-Challenge | 5 | 0.4 |
+ +-----------------------+----------------+-----------------+-----------------+---------------+-------+---------+
  ```
- +-----------------------+-------------------+-----------------+
- | Model | ai2_arc | gsm8k |
- +=======================+===================+=================+
- | Qwen2.5-0.5B-Instruct | (ai2_arc/acc) 0.6 | (gsm8k/acc) 0.6 |
- +-----------------------+-------------------+-----------------+
+
+ ## 📈 Visualization of Evaluation Results
+
+ 1. Install the dependencies required for visualization, including gradio, plotly, etc.
+ ```bash
+ pip install 'evalscope[app]'
+ ```
+
+ 2. Start the Visualization Service
+
+ Run the following command to start the visualization service.
+ ```bash
+ evalscope app
  ```
+ You can access the visualization service in the browser if the following output appears.
+ ```text
+ * Running on local URL: http://127.0.0.1:7861
+
+ To create a public link, set `share=True` in `launch()`.
+ ```
+
+ <table>
+ <tr>
+ <td style="text-align: center;">
+ <img src="docs/zh/get_started/images/setting.png" alt="Setting" style="width: 100%;" />
+ <p>Setting Interface</p>
+ </td>
+ <td style="text-align: center;">
+ <img src="docs/zh/get_started/images/model_compare.png" alt="Model Compare" style="width: 100%;" />
+ <p>Model Comparison</p>
+ </td>
+ </tr>
+ <tr>
+ <td style="text-align: center;">
+ <img src="docs/zh/get_started/images/report_overview.png" alt="Report Overview" style="width: 100%;" />
+ <p>Report Overview</p>
+ </td>
+ <td style="text-align: center;">
+ <img src="docs/zh/get_started/images/report_details.png" alt="Report Details" style="width: 100%;" />
+ <p>Report Details</p>
+ </td>
+ </tr>
+ </table>
+
+ For more details, refer to: [📖 Visualization of Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visulization.html)
+
+ ## 🌐 Evaluation of Specified Model API
+
+ Specify the model API service address (api_url) and API Key (api_key) to evaluate the deployed model API service. In this case, the `eval-type` parameter must be specified as `service`, for example:
+
+ For example, to launch a model service using [vLLM](https://github.com/vllm-project/vllm):
+
+ ```shell
+ export VLLM_USE_MODELSCOPE=True && python -m vllm.entrypoints.openai.api_server --model Qwen/Qwen2.5-0.5B-Instruct --served-model-name qwen2.5 --trust_remote_code --port 8801
+ ```
+ Then, you can use the following command to evaluate the model API service:
+ ```shell
+ evalscope eval \
+  --model qwen2.5 \
+  --api-url http://127.0.0.1:8801/v1/chat/completions \
+  --api-key EMPTY \
+  --eval-type service \
+  --datasets gsm8k \
+  --limit 10
+ ```
+
+ ## ⚙️ Custom Parameter Evaluation

- ## ⚙️ Complex Evaluation
  For more customized evaluations, such as customizing model parameters or dataset parameters, you can use the following command. The evaluation startup method is the same as simple evaluation. Below shows how to start the evaluation using the `eval` command:

  ```shell
@@ -414,7 +497,7 @@ EvalScope supports using third-party evaluation frameworks to initiate evaluatio
  - **ThirdParty**: Third-party evaluation tasks, such as [ToolBench](https://evalscope.readthedocs.io/en/latest/third_party/toolbench.html) and [LongBench-Write](https://evalscope.readthedocs.io/en/latest/third_party/longwriter.html).


- ## Model Serving Performance Evaluation
+ ## 📈 Model Serving Performance Evaluation
  A stress testing tool focused on large language models, which can be customized to support various dataset formats and different API protocol formats.

  Reference: Performance Testing [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/index.html)
@@ -439,19 +522,32 @@ Speed Benchmark Results:
  +---------------+-----------------+----------------+
  ```

- ## Custom Dataset Evaluation
+ ## 🖊️ Custom Dataset Evaluation
  EvalScope supports custom dataset evaluation. For detailed information, please refer to the Custom Dataset Evaluation [📖User Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/custom_dataset/index.html)


- ## Arena Mode
+ ## 🏟️ Arena Mode
  The Arena mode allows multiple candidate models to be evaluated through pairwise battles, and can choose to use the AI Enhanced Auto-Reviewer (AAR) automatic evaluation process or manual evaluation to obtain the evaluation report.

  Refer to: Arena Mode [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/arena.html)

+ ## 👷‍♂️ Contribution

+ EvalScope, as the official evaluation tool of [ModelScope](https://modelscope.cn), is continuously optimizing its benchmark evaluation features! We invite you to refer to the [Contribution Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/add_benchmark.html) to easily add your own evaluation benchmarks and share your contributions with the community. Let’s work together to support the growth of EvalScope and make our tools even better! Join us now!

+ <a href="https://github.com/modelscope/evalscope/graphs/contributors" target="_blank">
+ <table>
+ <tr>
+ <th colspan="2">
+ <br><img src="https://contrib.rocks/image?repo=modelscope/evalscope"><br><br>
+ </th>
+ </tr>
+ </table>
+ </a>

- ## TO-DO List
+ ## 🔜 Roadmap
+ - [ ] Support for better evaluation report visualization
+ - [x] Support for mixed evaluations across multiple datasets
  - [x] RAG evaluation
  - [x] VLM evaluation
  - [x] Agents evaluation
@@ -462,8 +558,6 @@ Refer to: Arena Mode [📖 User Guide](https://evalscope.readthedocs.io/en/lates
  - [ ] GAIA
  - [ ] GPQA
  - [x] MBPP
- - [ ] Auto-reviewer
- - [ ] Qwen-max


  ## Star History
@@ -1,11 +1,11 @@
- evalscope/__init__.py,sha256=RY0EjssSquqqsysRobElYm9Ix6E41uTXeaeh7lI7kqs,106
- evalscope/arguments.py,sha256=nozBnog45l77jxTFH_lyyJkj04ER3yyIpICepc2tC1Y,3783
- evalscope/config.py,sha256=_4IRpoAssdHEg75UKPKVw6FVaCu2NaP2aOMA5DRsuGU,8444
- evalscope/constants.py,sha256=M5qJ8b7kp-RF52IwBjx5EMjeuiH1e1jdollCsbIT-c4,3753
- evalscope/run.py,sha256=s_qE1ukrt4HBfRVAPJjC1XiqD9k7rSH7lX8yysyf5do,7279
- evalscope/run_arena.py,sha256=6nc_S8KL7B3V4SsnpIexfvczHN9kQwHR9R1GXb2sqgI,8586
- evalscope/summarizer.py,sha256=FgdYz7LlNs5XpDMlj2ULkVQGIg5XVeeWdWJ1_OMweq0,5882
- evalscope/version.py,sha256=uvEbCM3fC0oZ2Rt82Q0oErXsM-iYBNxJtPPLXPwscAU,118
+ evalscope/__init__.py,sha256=XZYDn3ShhM_48je5qQgwymtSdpTt8zYEnNfanYnpBdA,181
+ evalscope/arguments.py,sha256=v6IyhjgBACDkapnZYi6DeBI1aZxRVA-mx7KR1j72lYs,4493
+ evalscope/config.py,sha256=4klkNziKT4r8a4Z1imkiY16-S8iER1BYPMOG4nJg9lU,8571
+ evalscope/constants.py,sha256=bkcDVbB4Pr1Qxz83qefcWjEetVGiHTcx3m84WX14ASI,3330
+ evalscope/run.py,sha256=KKZBy2hr8_BscE0ZR1rN9U7iPc1eZYeeInfXe3EY7lA,5718
+ evalscope/run_arena.py,sha256=WXPCT0L-b_KvLBQ9KnrVW6y8icdDcqVhaXjTZMpS8k8,8572
+ evalscope/summarizer.py,sha256=Wnt8Y61PasOkigo8zMpi1qdFzfETjfnDtCDDQ6VwgDw,5867
+ evalscope/version.py,sha256=59oai-Z2lJog2HCNhMbBxRg4D3vkwPK5sfffmDSPntE,119
  evalscope/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  evalscope/backend/base.py,sha256=qYu8Shokrtrx-N6T_BAJk_6OCpovUBYuN0p3wngt-dw,1030
  evalscope/backend/opencompass/__init__.py,sha256=UP_TW5KBq6V_Nvqkeb7PGvGGX3rVYussT43npwCwDgE,135
@@ -22,7 +22,7 @@ evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py,sha256=anuIhRk9OC8y
  evalscope/backend/rag_eval/clip_benchmark/task_template.py,sha256=2NQRvlYY2SOzvOOj9WRLyxvRlyj8CAcgbQqgsv-Xjgw,3929
  evalscope/backend/rag_eval/clip_benchmark/tasks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py,sha256=CQnWZZTQ0FOzDtmGv7OF0W4Cv4g6u4_LQ93koDu1pes,2556
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py,sha256=L0WYiy3Rgar0uMZRI-kz1qCEuUaFXwcsVj0CACG13ms,7439
+ evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py,sha256=NwpxNECN7NFgtlVdKY7vet5m-gAmIp8MJYka0eexWu0,7424
  evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py,sha256=t0Uq7W0sPDBJS1rqp70KgSfeRQ3c7u8YeGhj5Yiu6rk,5646
  evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py,sha256=rZY-TulG-Cb8b6GTBxqTDYQ_4Ois3kbgKhuunZq8Ato,8407
  evalscope/backend/rag_eval/clip_benchmark/utils/webdatasets.txt,sha256=eiiAaxhS48b5rVLy5O9VvFfV2AfxY86ITu_iqT7ZLkQ,649
@@ -50,19 +50,19 @@ evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py,sha256=aP8U9zjIDl26X_
  evalscope/backend/rag_eval/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  evalscope/backend/rag_eval/utils/clip.py,sha256=GLHhPCac2AH35AvRLvVqePA1gIMAewHTFmCJCDZzvqU,5015
  evalscope/backend/rag_eval/utils/embedding.py,sha256=x9HAEfZSSAnT2Tdbf-9a5UmBVagCr__ay5A2nMCPMpg,6258
- evalscope/backend/rag_eval/utils/llm.py,sha256=619eP8pXUcwIBaktBrGNA17j53j9jfg_1JeFDYzMCIE,2582
+ evalscope/backend/rag_eval/utils/llm.py,sha256=IaNgdQBnURAmtpK5UPDqfCNrtV_J3wu0s4JWQqKedHA,2568
  evalscope/backend/rag_eval/utils/tools.py,sha256=FU7tNu-8y8V_o_kArFVTTLM_GzL12KBNeXiwQw5SpJA,1529
  evalscope/backend/vlm_eval_kit/__init__.py,sha256=R-GuBm8dAwvDF73XHaGpPSjlt7Y4tycyy-FJgzLdjeY,84
  evalscope/backend/vlm_eval_kit/backend_manager.py,sha256=ZB0wYzPBPyIM0zjfp5C71GcjAgKxKoWxF7RrhuDQBYM,5931
  evalscope/backend/vlm_eval_kit/custom_dataset.py,sha256=D-8Sr0tZblf-3ZR5T0lOggbgFRuy5wq9Nm02GUpNyAs,1620
- evalscope/benchmarks/__init__.py,sha256=6TKP35wfKf7R_h870fsEtcIlIAgomKOcukNL9M-5I1Y,162
- evalscope/benchmarks/benchmark.py,sha256=DnLgr__CzE4DICK3u3ZMeFY0sVktefmYh2Yql2swEhg,1796
- evalscope/benchmarks/data_adapter.py,sha256=hSW-tyTXxUPS_FnsMYAxxw9e4N7jS5eLiBHgCFAQNeo,10287
- evalscope/benchmarks/arc/__init__.py,sha256=9GBWGArac-s9igD8lnoEEKnpSQYNaHA8fVKonLimkrQ,360
+ evalscope/benchmarks/__init__.py,sha256=b_SWdV1ZyOqFiwc_9lIjKrIvK1rwnF2cCIF7XN9CN8E,932
+ evalscope/benchmarks/benchmark.py,sha256=SFDjyxd4t4KEcLBP82zE_KCJ_wXuv8J3XFzIR4M9fFI,2419
+ evalscope/benchmarks/data_adapter.py,sha256=Aaspp5dR1aINXAopm0y7LHeMwJbmYXfy5bNm9DpagRo,12051
+ evalscope/benchmarks/arc/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
  evalscope/benchmarks/arc/ai2_arc.py,sha256=WtL4Z_ulcCU2KfptWTjTm75T2I2rVGd9aDBBB76P14w,5697
- evalscope/benchmarks/arc/arc_adapter.py,sha256=3q74hZB9G3X0-pQPzBk_a8wZIedmIlDHZBb4aUaBGRA,9197
- evalscope/benchmarks/bbh/__init__.py,sha256=PcIMfTe4h5m-efBhnYQt6J-6O0qHFHGfuosRhk1Lhfo,303
- evalscope/benchmarks/bbh/bbh_adapter.py,sha256=UeNEEea5jqT7sYLpGGzvnxDdy6SrffM8H7gnVRpfGTw,10699
+ evalscope/benchmarks/arc/arc_adapter.py,sha256=TdDB3lazJNdUt2bBo1G7zaOAN6YkKXdcgMui1ygQj3Y,6591
+ evalscope/benchmarks/bbh/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
+ evalscope/benchmarks/bbh/bbh_adapter.py,sha256=pkgIEr_4QyzngUcs0j4oOscFljGoYZcCAS861Afnt_0,8316
  evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt,sha256=xnzlaIRyeGlogG49v8nt4vpJO40J06ev4yc8cv0VSRY,1781
  evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt,sha256=sfo-2iOeVzB0OGgd7NSQFELTGDTsr2DQ3u-g0ivI-sM,3653
  evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt,sha256=UJBsc3Mwz8TZngdWH_NFlhhNbLhNHK6FvW9FHcS8H5g,1167
@@ -90,90 +90,108 @@ evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt
  evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt,sha256=Su_-fICm9LxGpAkQlRbUZKvet_wPqTK-5jQo_VqJxQI,2604
  evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt,sha256=s_x6u5MLeKpuAHZj3GNQqY1I8vWqQIfJasOp9XcM7Ck,2945
  evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt,sha256=qfTZafCzNiz9ULBaDlfy_LISL617NyH5Nc0-nO0K0LE,2164
- evalscope/benchmarks/ceval/__init__.py,sha256=vBN_OgmcvKglYIu96nRoT2wD8FDdM3cRoTB-dqlmbLg,393
- evalscope/benchmarks/ceval/ceval_adapter.py,sha256=1J_WquXRPw-pRHBiYn7ZxRVSjjvWDqRUJLa8nvT1vYk,15050
+ evalscope/benchmarks/ceval/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
+ evalscope/benchmarks/ceval/ceval_adapter.py,sha256=2PvM5cvviyVNeFGnz-ymYVhEyPoea52OL_dg7dwVzQQ,11429
  evalscope/benchmarks/ceval/ceval_exam.py,sha256=ngOvb6Fymt7iPWIb2fzrUVpqmUT2VBoqh7X_IH8Bcsc,4824
  evalscope/benchmarks/ceval/samples.jsonl,sha256=dyWhGAdt4eq6Amgu2Ykx8RevUJVFtbhGFSTbDAeUgHc,448
- evalscope/benchmarks/cmmlu/__init__.py,sha256=9M_Lo5-ePaD6hWG-Y-_i-U79yTOKadtHPG7zFvekwN4,393
+ evalscope/benchmarks/cmmlu/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
  evalscope/benchmarks/cmmlu/cmmlu.py,sha256=Y59NIGUFzJEztJbkehZsG4Cz0J_v9Cyju6xazHMYIcA,5022
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py,sha256=malBAKNtDbfJ-kJoQUQTYYQ18MTJST63bgcsLiiktlw,13956
+ evalscope/benchmarks/cmmlu/cmmlu_adapter.py,sha256=O6FIsJDgg4OiHZSafaDq7jZ2gubWumPMhkdVb8WN-D8,10526
  evalscope/benchmarks/cmmlu/samples.jsonl,sha256=FXbyPQSDorKBGSD0lnOzioZmFjG07lIL87FRDRaMPSY,1722
- evalscope/benchmarks/competition_math/__init__.py,sha256=CDK03RXT-X21WcIAlkrCs0rCSiHe-yTY0nwM6-l75nI,465
+ evalscope/benchmarks/competition_math/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
  evalscope/benchmarks/competition_math/competition_math.py,sha256=Cehyokift7oDKjc8TdmfblZ6mMc39wQWtqqbUi34QLc,2629
- evalscope/benchmarks/competition_math/competition_math_adapter.py,sha256=cHWJ6LLIWvftFXjGrOidMlZ1RGUFxPgDjs4wmBPSm1Y,18862
- evalscope/benchmarks/general_qa/__init__.py,sha256=N2t-ehNrl9eVAarlSgJvRapm9yOjhfCWhNPPfcUUy-s,409
- evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=cSW0Mq9__-gh-tVoVXD9Rk6h3h2iZW-Fu3RQ16haJhQ,5878
- evalscope/benchmarks/gsm8k/__init__.py,sha256=CtcG_QM8m5zmvMs2N53d7kcm4_hIgsO2qYPyx-71aLw,313
+ evalscope/benchmarks/competition_math/competition_math_adapter.py,sha256=ns2WPbqkR52rRKo244WoAeAO9VOESEl_sHCPhym2DnM,6768
+ evalscope/benchmarks/general_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
+ evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=1MQXl3Wf_Dnzn7_7BSTu7RT6BOfhhiVyAnqECawxyfM,3899
+ evalscope/benchmarks/gsm8k/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
  evalscope/benchmarks/gsm8k/gsm8k.py,sha256=ZDN5lfeZyc_pkTDVY0voC_zUExHE1ZoEgEaTvt5hpXg,4233
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py,sha256=KBI9t5F7XW1Cs44QUA7ultkfsXxLyucH9zNYe-jOQQk,13866
- evalscope/benchmarks/hellaswag/__init__.py,sha256=cY1kluaTqC7AvyzwlQYc3BF_kB3LD1gOpg6i7RDr0cI,415
+ evalscope/benchmarks/gsm8k/gsm8k_adapter.py,sha256=9DuNos8xCOVFOUSJ04LAoBRVPbtqgR4XmOVk6r8ADU8,11114
+ evalscope/benchmarks/hellaswag/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
  evalscope/benchmarks/hellaswag/hellaswag.py,sha256=5_c9WbaS1LIdvgXzqEcvjAEtKi2V2Yn0YtszPlFqhXI,4610
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py,sha256=IIesSMPw1Yya4-LjqJt1QVkpOx8RGKwBYTQtmc0VfaQ,8495
- evalscope/benchmarks/humaneval/__init__.py,sha256=lqSlAf1-8Nzhc1j89sj6yAcaLt9pGhqu15M84bmzamc,333
+ evalscope/benchmarks/hellaswag/hellaswag_adapter.py,sha256=p7Nu-1B2mgbjfth1IhkMSWEC0TxOtD6tp_bOWeeRjts,6332
+ evalscope/benchmarks/humaneval/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
  evalscope/benchmarks/humaneval/humaneval.py,sha256=2Exsg6u8FEu0buADY2tETJluSM8tWacvX06nykKKLSE,3395
- evalscope/benchmarks/humaneval/humaneval_adapter.py,sha256=VAO7siedusq9z3b1J3ztFE4XDopYKqmwe2n-Numg7HY,9149
- evalscope/benchmarks/mmlu/__init__.py,sha256=OGiN1J80WDM72y242o7diYT9Rl-jkVEqTNntCl8Vt4M,385
+ evalscope/benchmarks/humaneval/humaneval_adapter.py,sha256=mjWkJqeRM1JVlrLXaCz1qscneLhYySZt8cgdXZSmJWY,5215
+ evalscope/benchmarks/ifeval/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ evalscope/benchmarks/ifeval/ifeval_adapter.py,sha256=JwJoXfAiawx9Rey1MsEtwCdo7QMl_wxOjspiWAuJFko,2074
+ evalscope/benchmarks/ifeval/instructions.py,sha256=8mV4f9H1vE8tEnbF1k8uVoDjzJL2tt7lCu2JQaqJelw,56247
+ evalscope/benchmarks/ifeval/instructions_registry.py,sha256=tVUmhuSwnOidLtI8onOAw_gpJ6bi8FL07GiX19hSuo8,7288
+ evalscope/benchmarks/ifeval/instructions_util.py,sha256=vkemXeylJMmgW8LgfQe4cSy2OF-oH_NcSZtzyZDURW4,25780
+ evalscope/benchmarks/ifeval/utils.py,sha256=TKrM1m2qDCUauahogItDdICf4mDk0OjasSxgnxjt2KY,4517
+ evalscope/benchmarks/iquiz/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ evalscope/benchmarks/iquiz/iquiz_adapter.py,sha256=gByj-11KGRTQk2wF1UwNACl8i1svBAEDaj-KJm1XEmw,2387
+ evalscope/benchmarks/mmlu/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
  evalscope/benchmarks/mmlu/mmlu.py,sha256=sA8AC0bN7iURrSazqkY31s_reNVbDZSUCB-NCTQsVeI,5042
- evalscope/benchmarks/mmlu/mmlu_adapter.py,sha256=8T-fN_Az0gWOyME9nHl3MvcD144TjWknFKcEOMHppAI,15494
+ evalscope/benchmarks/mmlu/mmlu_adapter.py,sha256=-ONQW0EPAPXFPIpH_Y6zRE-t9j5dT7yABgAU8wxIH4M,11829
  evalscope/benchmarks/mmlu/samples.jsonl,sha256=f5Y2vwbEvNtpE7vrl9BHoJzsdceI4vUAo1frexYyX2o,1345
- evalscope/benchmarks/race/__init__.py,sha256=HVda-CB-Q-N8RbwiVLADXYNY6VLUH-frJ8VCc3jm0Mk,385
+ evalscope/benchmarks/mmlu_pro/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py,sha256=9Mg7AKb2YL7aCilsXNA5_f1JmETfXQd1kOvLkGcKFEA,4372
+ evalscope/benchmarks/race/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
  evalscope/benchmarks/race/race.py,sha256=TtFC3opqEA6q8AQIAFQRGx07FjD9z7iW8wmtxeO61nU,3608
- evalscope/benchmarks/race/race_adapter.py,sha256=WgnWYSctc3VtWm2FAeVDTlxR2hwXsF2tala7n66f5mw,9841
+ evalscope/benchmarks/race/race_adapter.py,sha256=9uyQLDA9kVKGu0XhwcBoMyxcgUh3jqWXRO5DahRqUpg,6678
  evalscope/benchmarks/race/samples.jsonl,sha256=bhSktBgU6axYQCClRtQ7nN8D1x815AU8xMAIG1oflG0,1243
- evalscope/benchmarks/trivia_qa/__init__.py,sha256=eLMVC6tfwty5HqrQuGyWeAF2IhRNajWoO1SkLVemQj4,409
+ evalscope/benchmarks/trivia_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
  evalscope/benchmarks/trivia_qa/samples.jsonl,sha256=1isBD62PGhCiNbzQa-GFrHHL4XLHIkojWfgSvn7ktf8,3445
  evalscope/benchmarks/trivia_qa/trivia_qa.py,sha256=eekxaXppMLb5tCQqNLOw2MaWlYDhI2IicPzRsTHqb5A,3070
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py,sha256=YFatCVNM7I0YUttBznQMohmMkm3qxJpCSVxf6o_sgHk,7663
- evalscope/benchmarks/truthful_qa/__init__.py,sha256=EZOaHn13NS3ddHpS62ija8jz71SxOOsqcQRVg69e_Ho,429
+ evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py,sha256=e-jrcCvl8fbPzWCOYKq_sbl4XCulsPzAECGtvTPE-rM,5106
+ evalscope/benchmarks/truthful_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
  evalscope/benchmarks/truthful_qa/truthful_qa.py,sha256=A4abSL6WMRcXez8flxsHy-0ZFyRg-moq9rTeOA1TalY,6909
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py,sha256=XFnZXQJpHEe_xP_HImPHa8qrwojywnWAgeSaJAYB0oU,14916
+ evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py,sha256=tCVO0RTD_S7z1ky7su5z67dnpgbsEtcH5j0vCpfvUV8,12908
  evalscope/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
  evalscope/cli/base.py,sha256=m1DFlF16L0Lyrn0YNuFj8ByGjVJIoI0jKzAoodIXjRk,404
- evalscope/cli/cli.py,sha256=yNL3ZeolBc-cVr5D4GByGZWKrmpKIK-48R6wXOXO7Y0,641
+ evalscope/cli/cli.py,sha256=w_dtXljur9s5lmTn6LbbFL_viTPQB1WAEzhYcId09Og,729
+ evalscope/cli/start_app.py,sha256=icLwBq5yHVmJ4C9y-sYq_o_rPvCT-oO-F2r7RlegHv0,706
  evalscope/cli/start_eval.py,sha256=2lyD2WSQ0DnP6T31VvTimQ-6POnwxeEP9GLPFnT7Tfo,767
  evalscope/cli/start_perf.py,sha256=lEHJBSpzNsO4KGlWfQc-EfZGXq1M_FpOwtRxRdb4fso,813
  evalscope/cli/start_server.py,sha256=DQRIfbsHaOAsVcLGF6iRyJnxmd5Sf_tgytpJNfiWCeE,3662
+ evalscope/collections/__init__.py,sha256=hd68Qf-ryeDsz5Pu-Dh83M5V5RE5mhLsG-vc55n5n0o,228
+ evalscope/collections/evaluator.py,sha256=_XaLn_cSKvAW96aNwaaPbrBDPl9qn0VrsTjID_y7SpM,8910
+ evalscope/collections/sampler.py,sha256=6Tp0jN7bJQqG-7AQ2UDPDur6O5aC_nl0N-OV9HfuE9Q,4769
+ evalscope/collections/schema.py,sha256=Ns47HXt7Ym4sPdPyxStxALHUid2cW7kWhqvw_jK_p-4,4172
  evalscope/evaluator/__init__.py,sha256=S6MU1O_iiNAaKxNIhO9MEmdW-BSNf_YH2l6NQ9lxVNo,103
- evalscope/evaluator/evaluator.py,sha256=wrTWyvyD1eqSvsZRwDRV1UVBxXv7y-2A29UCD9F-5qI,18412
+ evalscope/evaluator/evaluator.py,sha256=0IOuWQ4KgWuMisNmFqh4-id3d1Kkbkf4JW-6hVz7tqU,16638
  evalscope/evaluator/rating_eval.py,sha256=uo0uj9z_TDsxdYlT8WIfNZhFLAfRkW9zn_wlu-F72O0,5575
  evalscope/evaluator/reviewer/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
- evalscope/evaluator/reviewer/auto_reviewer.py,sha256=nL8k-i92L1iMwjPOnNxzQyZICfukZKJul4ZBvOWkHGw,16414
- evalscope/metrics/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
+ evalscope/evaluator/reviewer/auto_reviewer.py,sha256=PDz1Nt_qq0oGZBBwek2-M8VBUNLkOkmYObzR8gO7nEc,16624
+ evalscope/metrics/__init__.py,sha256=yzuZjXufrPqVhzNTNaJLJwhs7-Sgb-iNG0I3BdOX7Tg,291
  evalscope/metrics/code_metric.py,sha256=EXE2BZAc9JJT_cpd6eCb0Jo9wwtnzXzHBZxmLzG5Jpw,3300
- evalscope/metrics/math_accuracy.py,sha256=WqLfACuIeVFrX4q6_c2exnTLn2t10-rjv6sfxcqJJ14,1965
- evalscope/metrics/metrics.py,sha256=9Qj2KuSmaLOPhpGdBfiKGKVTIxHCuk0CPKI2b6L1zb8,12589
- evalscope/metrics/rouge_metric.py,sha256=oB-rBgMnavZSyOiAefg--OXdGfffKrET5bUmrx3nmx0,4408
+ evalscope/metrics/math_accuracy.py,sha256=a0L_YT70bsJYn5_POICJyj6ZVFbHek1ly6j_ssV9Xsc,5585
+ evalscope/metrics/metrics.py,sha256=H02Hhj9Me2qzUjSzdV57i5Gj6xP_w5kbuPcuPpejlI0,12860
+ evalscope/metrics/named_metrics.py,sha256=j-y-d5EJ4FJzOxlIKobKIMUNu--nzAIIc2j0TvDfFb0,574
+ evalscope/metrics/rouge_metric.py,sha256=zhIUqenSuxnORR9tamLQBGjFwP91Zei2UiLtcOyseVM,4639
  evalscope/metrics/bundled_rouge_score/__init__.py,sha256=PwbTdk8168FwDJe_l8XIqDuBgZQooDsP31vj7di05Fs,650
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py,sha256=9YdE38duhBFsmFLkY7HXDCQqUNavB5Hh3kaB4WTjAII,11971
+ evalscope/metrics/bundled_rouge_score/rouge_scorer.py,sha256=Kq6AObenmLVQ5tN3NgN042a6mgRFQmRO21-ohd9mSa8,11972
  evalscope/metrics/resources/gpt2-zhcn3-v4.bpe,sha256=J_K-oSt9usFEw87Av6j7ETn3J48EsmFuY5_iVvY6xjc,524464
  evalscope/metrics/resources/gpt2-zhcn3-v4.json,sha256=WkM4J_FDPPNQwYi0kj5sM5SVjk2_6bci7tqf8dV9p_U,1289341
- evalscope/models/__init__.py,sha256=b-jXJ2Cj6dH8notAU7lvCVKbGrcEaf8Gfr5w79qNHAk,111
- evalscope/models/dummy_chat_model.py,sha256=aG3yolnnIN_-gsfF9FsyjyGMewQteEnUfOxTGScROSE,1272
- evalscope/models/model.py,sha256=ZzzVzZHVzuzdt5F1r-rEBT44ZfW9B7R1spsrV-T8nSw,3020
- evalscope/models/model_adapter.py,sha256=5jzDXpFp24ZZ25tjpIMJeDTz-lDSD_EHp040gJOZACc,19007
- evalscope/models/openai_model.py,sha256=-tPBu6v0Ogf_flmG88tFuu66QNKrOyxv3AjYwVtuR44,3313
- evalscope/models/api/__init__.py,sha256=0c75K78O1KaV02BqqtEp-hhtSSClXLawb8E0c2iqN_A,105
- evalscope/models/api/openai_api.py,sha256=PiIvvDYJkn041SJkLoroXwl1B8TtwpB7licVfqNSeuQ,8168
+ evalscope/models/__init__.py,sha256=pafIEbJq_2DrYjQbgI0SNVxywNYOxvqwk7Dr1P7KEwk,923
+ evalscope/models/base_adapter.py,sha256=fT3i8c9jRmz_VBcUYMMmXrlCM6JWcixPdgak5yT6Wkw,2177
+ evalscope/models/chat_adapter.py,sha256=P6CE0JqWDsE7afNfU_wicdisHLfc46Rw3rwTA0sEGQQ,5398
+ evalscope/models/choice_adapter.py,sha256=Zb-UUFpF2tpMGuGH_wFleMxpSb__-SuN1cMF7yj25aI,7661
+ evalscope/models/custom_adapter.py,sha256=uj4kbBCwhrXjvSq9f6HgTJ5yJ9FJpvs1k5-9Ekm9RmA,2272
+ evalscope/models/local_model.py,sha256=EBclVq5tqUFNOZebRlNnZSvzwtSun7FsZRf2tx0cMt0,2486
+ evalscope/models/model.py,sha256=diu4TE1ZFWdynTxsl4DejTNsLdwjxoyj2nsKR-Y8EZE,7343
+ evalscope/models/server_adapter.py,sha256=VGk_nTwkLWO7Ln7lV_KSaIBzlSRZzyIs_bWDeJ_pOho,4469
  evalscope/models/custom/__init__.py,sha256=wb6f_Bi39s5sj-VO7EXRDXB2WhyFb49BUtEMk77ksNQ,102
  evalscope/models/custom/custom_model.py,sha256=rBQLAuPEw_OPUtRSCEmxEfpcA8jPj8bAdsmtKs4ygus,1566
- evalscope/perf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ evalscope/models/custom/dummy_model.py,sha256=ODD6pt9FvZq_a54oYsehBDslRKHOsk9zsC9iAZvi5Yg,2020
+ evalscope/perf/__init__.py,sha256=rgSXzxIJ67yB_SLUdl4ljem2-ilB-Gw3640f4KWLO1k,51
  evalscope/perf/arguments.py,sha256=8KiD4u51B_twEaIiI0_kw4Jknk3YG4S6XN-vgvutChA,9233
  evalscope/perf/benchmark.py,sha256=qNgDNseW8N0beuAB_4-JVtTdHs7ZaJEHK5XnkMU9vRU,9618
  evalscope/perf/http_client.py,sha256=TfnQT9OaBlUCpGwi4ifSJBaaGsn3P2KVBPMGuw-Rqkk,7073
- evalscope/perf/main.py,sha256=Qg99KhGUjnVAMkNofbDsvMGFxijewH8ri3QoW1y1U7U,1292
+ evalscope/perf/main.py,sha256=SUMz8S2XPL8JaSL1-vy8qkrb34d5vp6DfQdwIGOUXTk,1277
  evalscope/perf/plugin/__init__.py,sha256=1sl5s-csrwKb_LVTnpF3HqArz06TRD5LYJ0hpqvokUA,85
  evalscope/perf/plugin/registry.py,sha256=w1IAt6GDdluzSYK5i-yrntvx3_EvIIqJamEL0xZv3zA,1323
  evalscope/perf/plugin/api/__init__.py,sha256=Ckzbq4CkSMVQTedQcDHCYlRd6FTwQAElt2mHB-VXJac,195
  evalscope/perf/plugin/api/base.py,sha256=B_H04qKx7eRTn155rnDrbTYur7PK1mvxfQKYcqYbndU,2118
- evalscope/perf/plugin/api/custom_api.py,sha256=IplmkCu8v9yQrY5CeqBEQDWdOfOp3vRkiDYUcvhw2yY,3775
+ evalscope/perf/plugin/api/custom_api.py,sha256=ay1AGi4y2opjwyRl0J0A54-vLB-pBj3QBFkzog0KA-g,3787
  evalscope/perf/plugin/api/dashscope_api.py,sha256=V5fwn-p_fLH0dWKzhN9TvYSHRgla4INfXC4NDaIjoQ8,3825
- evalscope/perf/plugin/api/openai_api.py,sha256=WV2EUIl1PTg-Dj7HMSxJrAE7OUxJZqQmZLJZLHffcJo,6805
+ evalscope/perf/plugin/api/openai_api.py,sha256=JxQGlzAbM7MBWcr3MvWiAg6E4lqdQLfkk1qK0vUWvn8,6817
  evalscope/perf/plugin/datasets/__init__.py,sha256=9mz2TnVHhxbEKAS9pLbKMQuIoShNlZpGiRo9e2RQLUs,490
  evalscope/perf/plugin/datasets/base.py,sha256=Z-INWueeYjfEZhP4lbTlBMVwIa6BcXZKWx-w7Pop3mA,1786
  evalscope/perf/plugin/datasets/custom.py,sha256=_GSC5yR90_BjcRjdJqrJT2vHQAzskz5XxYOxngUM2Pg,815
- evalscope/perf/plugin/datasets/flickr8k.py,sha256=CGYtmRw71-ycJIObAHm2gmmJl_1MXPJOwmHV-0WS8DY,1581
+ evalscope/perf/plugin/datasets/flickr8k.py,sha256=UzAIFIO0m5inWOkWM1mO6wfV2HOuXAqiTxCJ4b0SiZM,1589
  evalscope/perf/plugin/datasets/line_by_line.py,sha256=IKVZMpKei6XW9DTm9VEssWHE96i1lTqMf0621dA_img,836
- evalscope/perf/plugin/datasets/longalpaca.py,sha256=Yx5nxHGkmD4lJOJ-jcyqm2ZsGAxotJc77jUCkO1z0a4,1164
+ evalscope/perf/plugin/datasets/longalpaca.py,sha256=2aENqCly_DX1dyNcurYsLFJIvXYFph6jWm7z7XETvMk,1176
  evalscope/perf/plugin/datasets/openqa.py,sha256=2pv7yyPSFYTjPhvAGBsHl0eQO8gt7Wk1CaKcfTi3Tnc,1394
  evalscope/perf/plugin/datasets/speed_benchmark.py,sha256=ef8MXhT6756y6LsXSpYeWjmwswu2hRXe2BOVS2_OgVM,1968
  evalscope/perf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -203,6 +221,11 @@ evalscope/registry/tasks/general_qa.yaml,sha256=S3kdlrazWX2VAX2PMhNtBnFZVSnUKBNi
  evalscope/registry/tasks/gsm8k.yaml,sha256=M2I7otwOSy0usD8yG8d6QziASQlKdhKLflRHMG0LXiM,729
  evalscope/registry/tasks/mmlu.yaml,sha256=cJcMH1Cvgo9PlYoTmeGx2bcZayysltaa6ehK57dDkvo,726
  evalscope/registry/tasks/mmlu_mini.yaml,sha256=K8ouHh7ve5ZsbkqRtV3Jl-DF01YFPuObfwEdACJA4Pk,778
+ evalscope/report/__init__.py,sha256=0Wes3ot2hy9s-WwZaBztst8qkNrXkOF-Hwa1WW1e8lY,260
+ evalscope/report/app.py,sha256=rqjKgo7BFow4cA-vN9GaihQCd2m55ndHgUkWVr4Koyk,19470
+ evalscope/report/combinator.py,sha256=bi6nvTbMrzraZ8kUZ6mIMikk8-qEIVYUhdaH4RE1Tg8,2653
+ evalscope/report/generator.py,sha256=2DULY9W8QCUxdtyfNjo8XAP_YxI1LgR95jknK__kYPU,3600
+ evalscope/report/utils.py,sha256=DRlbjbqHEmM8rGlA4pwtlHFhOZtyUzcqiS-mejfIDkU,4584
  evalscope/third_party/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
  evalscope/third_party/longbench_write/README.md,sha256=1yLKeSVIcihpoc4KXr8NpK86JbcLssCPx76aOKdPbYI,5431
  evalscope/third_party/longbench_write/__init__.py,sha256=GNbBDc7HAh_V2Hfy5HhND_u7z6OI79czoBlP8lX4PVo,126
@@ -219,6 +242,7 @@ evalscope/third_party/longbench_write/resources/longbench_write_en.jsonl,sha256=
  evalscope/third_party/longbench_write/resources/longwrite_ruler.jsonl,sha256=odTr8N8PoWAFZ2kdEcmlLeMDfEo3KXDtLo9S8oieCmI,5718
  evalscope/third_party/longbench_write/tools/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
  evalscope/third_party/longbench_write/tools/data_etl.py,sha256=T7a-4PwZg5alZQh-oTi1zjMxjGmVVZYVwSR9-diZlF8,5971
+ evalscope/third_party/longbench_write/tools/openai_api.py,sha256=PiIvvDYJkn041SJkLoroXwl1B8TtwpB7licVfqNSeuQ,8168
  evalscope/third_party/toolbench_static/README.md,sha256=Osdnt0_K-setbmYwDPCPRp2LXxamGp2mE8KsOByPPOY,3944
  evalscope/third_party/toolbench_static/__init__.py,sha256=BO936RxwodHr4OEpV6W3S_keC91OfOd41_msIJ2d0fs,128
  evalscope/third_party/toolbench_static/config_default.json,sha256=KrUzeHL2DNiM5FwY7cH3KZlxTwELCQZ6e39nilfUi0M,368
@@ -229,27 +253,24 @@ evalscope/third_party/toolbench_static/requirements.txt,sha256=OW91Z8hfzh7yQUYgP
  evalscope/third_party/toolbench_static/toolbench_static.py,sha256=ABb9Gy09zMt30tY50AZGxSZ46k3NVEsvuDj6xlLOjeA,1966
  evalscope/third_party/toolbench_static/llm/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
  evalscope/third_party/toolbench_static/llm/swift_infer.py,sha256=usmVelh0ogBlCtSUL0dqp89w2mAqH1Ptv9MURVoGrc8,1209
- evalscope/tools/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
- evalscope/tools/combine_reports.py,sha256=JFf3P_GJLPdlSqpv30D8ioPb7dup3tOTktsELmsKXLI,4900
- evalscope/tools/gen_mmlu_subject_mapping.py,sha256=CUmRdReEU7QfMyprh9I56KmHoRww_zUda_JuyxmCL1A,3277
- evalscope/tools/rewrite_eval_results.py,sha256=ODD6pt9FvZq_a54oYsehBDslRKHOsk9zsC9iAZvi5Yg,2020
- evalscope/utils/__init__.py,sha256=ZOri8VHx8LpJBJS90uw8h0Z7gPhtxhjWlBPWuuZgoRE,121
+ evalscope/utils/__init__.py,sha256=jLVoGryuqUh4Km9QWWQBzpqkcVNRK0MbwNaSgckqdiU,139
  evalscope/utils/arena_utils.py,sha256=Gf8VpH4C_oF2Abif_QeL0rAP6tvTzsc0gglpdNkUE48,7155
- evalscope/utils/chat_service.py,sha256=VdNPXdFSf-4zxe0Ht74LBcdRNbpb9vzVi86HDEqfXHc,8647
+ evalscope/utils/chat_service.py,sha256=Kh3hEUW_HF158a0QqHbWepHIHRQFJgUM-jCDAcQ_maw,8674
  evalscope/utils/completion_parsers.py,sha256=YWHkLkSfURTcUjNNlCL6PPDICd4F2Ns9figgPN4C97c,2933
- evalscope/utils/io_utils.py,sha256=MnEi4llOYtXK81bUQ_XE_WP5qIsVrJ4MlKmWMH9vzFs,3993
- evalscope/utils/logger.py,sha256=4OGlkBsut_wzq-1UcM2DKQKdKs1FRNYGHw538TGvypU,3440
- evalscope/utils/model_utils.py,sha256=zMS1YRu4CzU4CVLZS6e_lgfHIDBqv3YBTJbPF1R2M90,443
- evalscope/utils/utils.py,sha256=lZl5lt4WqjoY5SEfsum8Sc-s_c9GSlmIZlkTAQkMnjE,10485
+ evalscope/utils/io_utils.py,sha256=vm6uJBBqx4fc7jsHGbwNQ6Hbx7XYhjT1Q2dQ7aHjDD0,4172
+ evalscope/utils/logger.py,sha256=49F2WDi1g_o8aW8Z29wOt9YHE9LDqkHIgb-d8TVybJY,3635
+ evalscope/utils/model_utils.py,sha256=PK7pKNY8ovtGZHNRvDpZ-d8zBHMOkxd6fRVkM8VF06I,736
+ evalscope/utils/utils.py,sha256=a6a2vDDxqlj7nY8xynkKkWs_ZPXEU2UMwvxp0JEpHjg,9686
  tests/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
  tests/test_run_all.py,sha256=YcMTlWoFpvWY8jevWyIf2G_tz8hgDD1cAwSvmyZt96M,429
  tests/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
- tests/cli/test_run.py,sha256=pMZvI3b0Vs-UFfciDoPwCYFAaYJzocQjxEaMLFTxYSo,4289
+ tests/cli/test_collection.py,sha256=gx3GySIAPNaLUSf3D3Q3V0WZc21BPdNthIbECHQN0TI,3026
+ tests/cli/test_run.py,sha256=aywruYPPweMEHaBOynf0G3liKBKMH_H_e4Znq2PcaR4,5821
  tests/perf/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
  tests/perf/test_perf.py,sha256=iB8Mg565SfwPsObdAByHYfZNqN71kUtPW7ucmyiOWo8,3025
  tests/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  tests/rag/test_clip_benchmark.py,sha256=Ar8Br2CoAFYT2h4zCv_JKMKCGJKbKGYZgNwJ410ZaoU,2597
- tests/rag/test_mteb.py,sha256=CaEJ0f1M06Z90c72FQb9z23IC_KZtkURWsc_oRMgQn8,4609
+ tests/rag/test_mteb.py,sha256=t64FXE-ZsOCLiRJrw-dIDIhKd1OXiaglXaeERs0lOh4,4643
  tests/rag/test_ragas.py,sha256=N_mUBIyxdQ1REzjkoI2sBNluKLLmKatLc3VY1o9uPck,3947
  tests/swift/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
  tests/swift/test_run_swift_eval.py,sha256=JKG-0BwTxkbg-XeiXxujPqnVIM3f2EFaJ_9a7p_R4dk,5748
@@ -257,9 +278,9 @@ tests/swift/test_run_swift_vlm_eval.py,sha256=C8DftjewnZaerQWfERI70bU3sQLWQ-ejZU
  tests/swift/test_run_swift_vlm_jugde_eval.py,sha256=THZEXUOSqm9rWslwJHmZyh-Ytv5c_QKpgRW5J2s_69E,6017
  tests/vlm/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
  tests/vlm/test_vlmeval.py,sha256=nzWXjw49SlxXgDnYS9N5JSFtcUp8xPOW2YNNzupvtt4,1806
- evalscope-0.8.2.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
- evalscope-0.8.2.dist-info/METADATA,sha256=Fk1p0gh2RycQ7yOBj7fMYym7G-SYj8sL-IZX8cgGxVQ,23709
- evalscope-0.8.2.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
- evalscope-0.8.2.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
- evalscope-0.8.2.dist-info/top_level.txt,sha256=Yv0iprOqZQ4rfUO-AWJp7Ni6m0Twxny1yvZwO-8hUDM,16
- evalscope-0.8.2.dist-info/RECORD,,
+ evalscope-0.10.0.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
+ evalscope-0.10.0.dist-info/METADATA,sha256=BwbHLPw5NELgkYNQ90wn_iUoDyUQfQD2WSHRD5XkYcM,28975
+ evalscope-0.10.0.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+ evalscope-0.10.0.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
+ evalscope-0.10.0.dist-info/top_level.txt,sha256=Yv0iprOqZQ4rfUO-AWJp7Ni6m0Twxny1yvZwO-8hUDM,16
+ evalscope-0.10.0.dist-info/RECORD,,
@@ -0,0 +1,57 @@
+ import json
+ import unittest
+
+ from evalscope.collections import CollectionSchema, DatasetInfo, WeightedSampler
+ from evalscope.constants import EvalType
+ from evalscope.utils.io_utils import dump_jsonl_data
+ from evalscope.utils.utils import test_level_list
+
+
+ class TestCollection(unittest.TestCase):
+     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+     def test_create_collection(self):
+         schema = CollectionSchema(name='math&reasoning', datasets=[
+             CollectionSchema(name='math', datasets=[
+                 CollectionSchema(name='generation', datasets=[
+                     DatasetInfo(name='gsm8k', weight=1, task_type='math', tags=['en', 'math']),
+                     DatasetInfo(name='competition_math', weight=1, task_type='math', tags=['en', 'math']),
+                 ]),
+                 CollectionSchema(name='multiple_choice', datasets=[
+                     DatasetInfo(name='cmmlu', weight=2, task_type='math', tags=['zh', 'math'], args={'subset_list': ['college_mathematics', 'high_school_mathematics']}),
+                     DatasetInfo(name='ceval', weight=3, task_type='math', tags=['zh', 'math'], args={'subset_list': ['advanced_mathematics', 'high_school_mathematics', 'discrete_mathematics', 'middle_school_mathematics']}),
+                 ]),
+             ]),
+             CollectionSchema(name='reasoning', datasets=[
+                 DatasetInfo(name='arc', weight=1, task_type='reasoning', tags=['en', 'reasoning']),
+                 DatasetInfo(name='ceval', weight=1, task_type='reasoning', tags=['zh', 'reasoning'], args={'subset_list': ['logic']}),
+                 DatasetInfo(name='race', weight=1, task_type='reasoning', tags=['en', 'reasoning']),
+             ]),
+         ])
+         print(schema.to_dict())
+         print(schema.flatten())
+         schema.dump_json('outputs/schema_test.json')
+
+
+     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+     def test_generate_data(self):
+         schema = CollectionSchema.from_dict(json.load(open('outputs/schema_test.json', 'r')))
+         print(schema.to_dict())
+         mixed_data = WeightedSampler(schema).sample(100)
+         dump_jsonl_data(mixed_data, 'outputs/mixed_data_test.jsonl')
+
+     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+     def test_evaluate_collection(self):
+         from evalscope import TaskConfig, run_task
+
+         task_cfg = TaskConfig(
+             model='Qwen2.5-7B-Instruct',
+             api_url='http://127.0.0.1:8801/v1/chat/completions',
+             api_key='EMPTY',
+             eval_type=EvalType.SERVICE,
+             datasets=['data_collection'],
+             dataset_args={'data_collection': {
+                 'local_path': 'outputs/mixed_data_test.jsonl'
+                 # 'local_path': 'outputs/weighted_mixed_data.jsonl'
+             }},
+         )
+         run_task(task_cfg=task_cfg)
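
The new `evalscope.collections` module and the `data_collection` dataset type exercised by `tests/cli/test_collection.py` above form a single schema → sample → evaluate pipeline. The following is a minimal consolidated sketch of that pipeline under the same assumptions as the test (an OpenAI-compatible endpoint on `127.0.0.1:8801`); the schema contents, served model name, and output path are illustrative, not part of the package.

```python
# Illustrative sketch of the 0.10.0 mixed-collection workflow; the schema,
# output path, endpoint, and model name below are assumptions for the example.
from evalscope import TaskConfig, run_task
from evalscope.collections import CollectionSchema, DatasetInfo, WeightedSampler
from evalscope.constants import EvalType
from evalscope.utils.io_utils import dump_jsonl_data

# Weights control how much of the sampled mixture comes from each dataset.
schema = CollectionSchema(name='reasoning', datasets=[
    DatasetInfo(name='arc', weight=1, task_type='reasoning', tags=['en']),
    DatasetInfo(name='race', weight=1, task_type='reasoning', tags=['en']),
])

# Draw a 100-item weighted mixture and persist it as JSONL.
mixed_data = WeightedSampler(schema).sample(100)
dump_jsonl_data(mixed_data, 'outputs/mixed_data.jsonl')

# Evaluate the mixture against a served model via the data_collection dataset.
run_task(task_cfg=TaskConfig(
    model='qwen2.5',  # assumed served-model-name of the local endpoint
    api_url='http://127.0.0.1:8801/v1/chat/completions',
    api_key='EMPTY',
    eval_type=EvalType.SERVICE,
    datasets=['data_collection'],
    dataset_args={'data_collection': {'local_path': 'outputs/mixed_data.jsonl'}},
))
```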