psi-bench 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- psi_bench-0.1.0/PKG-INFO +150 -0
- psi_bench-0.1.0/README.md +112 -0
- psi_bench-0.1.0/psi_bench/__init__.py +0 -0
- psi_bench-0.1.0/psi_bench/cli.py +242 -0
- psi_bench-0.1.0/psi_bench/download_data.py +133 -0
- psi_bench-0.1.0/psi_bench/inference.py +277 -0
- psi_bench-0.1.0/psi_bench/llm_judge_eval.py +325 -0
- psi_bench-0.1.0/psi_bench/prompts.py +507 -0
- psi_bench-0.1.0/psi_bench/utils.py +393 -0
- psi_bench-0.1.0/psi_bench.egg-info/PKG-INFO +150 -0
- psi_bench-0.1.0/psi_bench.egg-info/SOURCES.txt +15 -0
- psi_bench-0.1.0/psi_bench.egg-info/dependency_links.txt +1 -0
- psi_bench-0.1.0/psi_bench.egg-info/entry_points.txt +2 -0
- psi_bench-0.1.0/psi_bench.egg-info/requires.txt +17 -0
- psi_bench-0.1.0/psi_bench.egg-info/top_level.txt +1 -0
- psi_bench-0.1.0/pyproject.toml +67 -0
- psi_bench-0.1.0/setup.cfg +4 -0
psi_bench-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: psi-bench
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Ψ-Bench: Evaluating Persona-Sensitive Influencing in Persuasive Dialogues
|
|
5
|
+
Author-email: Peixuan Han <ph16@illinois.edu>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Repository, https://github.com/Hanpx20/Psi-Bench
|
|
8
|
+
Project-URL: Documentation, https://github.com/Hanpx20/Psi-Bench#readme
|
|
9
|
+
Project-URL: Bug Tracker, https://github.com/Hanpx20/Psi-Bench/issues
|
|
10
|
+
Keywords: benchmark,persuasion,personalization,LLM,evaluation
|
|
11
|
+
Classifier: Development Status :: 3 - Alpha
|
|
12
|
+
Classifier: Intended Audience :: Science/Research
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
20
|
+
Requires-Python: >=3.8
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
Requires-Dist: openai>=1.0.0
|
|
23
|
+
Requires-Dist: transformers>=4.30.0
|
|
24
|
+
Requires-Dist: torch>=2.0.0
|
|
25
|
+
Requires-Dist: numpy>=1.24.0
|
|
26
|
+
Requires-Dist: pandas>=1.5.0
|
|
27
|
+
Requires-Dist: sentence-transformers>=2.2.0
|
|
28
|
+
Requires-Dist: scikit-learn>=1.2.0
|
|
29
|
+
Requires-Dist: scipy>=1.10.0
|
|
30
|
+
Requires-Dist: tqdm>=4.65.0
|
|
31
|
+
Requires-Dist: nltk>=3.8.0
|
|
32
|
+
Requires-Dist: json-repair>=0.17.0
|
|
33
|
+
Provides-Extra: dev
|
|
34
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
35
|
+
Requires-Dist: black>=22.0; extra == "dev"
|
|
36
|
+
Requires-Dist: flake8>=4.0; extra == "dev"
|
|
37
|
+
Requires-Dist: isort>=5.0; extra == "dev"
|
|
38
|
+
|
|
39
|
+
<div align="center">
|
|
40
|
+
<img src="figs/title_fig.jpg" width="100%">
|
|
41
|
+
</div>
|
|
42
|
+
|
|
43
|
+
<!-- <h1 align="center">Ψ-Bench: Evaluating Persona-Sensitive Influencing in Persuasive Dialogues</h1> -->
|
|
44
|
+
<h2 align="center">Peixuan Han, Hongyi Du, Jiayu Liu, Yihang Sun, Yutong Liu, Jiaxuan You</h2>
|
|
45
|
+
|
|
46
|
+
<p align="center">
|
|
47
|
+
<a href="https://pypi.org/project/psi-bench">
|
|
48
|
+
<img src="https://img.shields.io/pypi/v/psi-bench?logo=pypi&logoColor=white" />
|
|
49
|
+
</a>
|
|
50
|
+
·
|
|
51
|
+
<a href="https://arxiv.org/pdf/2606.02754">
|
|
52
|
+
<img src="https://img.shields.io/badge/arXiv-2606.02754-b31b1b?logo=arxiv" />
|
|
53
|
+
</a>
|
|
54
|
+
</p>
|
|
55
|
+
|
|
56
|
+
## Introduction
|
|
57
|
+
**Ψ-Bench (Psi-Bench)** is a benchmark for assessing LLMs' ability to influence realistic users through conversation. We design three real-world interaction scenarios involving persuasion in Psi-Bench and endow simulated clients with personal characteristics via explicit user profiles derived from dialogue histories.
|
|
58
|
+
|
|
59
|
+

|
|
60
|
+
|
|
61
|
+
## Benchmarking Results
|
|
62
|
+
|
|
63
|
+
Try Psi-Bench to see if your LLM is a personalized expert!
|
|
64
|
+
|
|
65
|
+

|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
## Get Started
|
|
69
|
+
|
|
70
|
+
Before using Psi-Bench, you need to configure your API access through environment variables. Psi-Bench uses DeepSeek-v3.2 as the client and the judge. Since the official DeepSeek API no longer supports this model, we recommend accessing it via the [Volcano Engine platform](https://console.volcengine.com/ark) (the model identifier is deepseek-v3-2-251201). Other LLMs such as GPT-4o and DeepSeek-v4 can also serve as clients and judges; however, be cautious when comparing scores produced by different judges.
|
|
71
|
+
|
|
72
|
+
For example:
|
|
73
|
+
```bash
|
|
74
|
+
export CLIENT_BASE_URL=https://ark.cn-beijing.volces.com/api/v3
|
|
75
|
+
export CLIENT_API_KEY=sk-...
|
|
76
|
+
export JUDGE_BASE_URL=https://ark.cn-beijing.volces.com/api/v3
|
|
77
|
+
export JUDGE_API_KEY=sk-...
|
|
78
|
+
export PERSUADER_BASE_URL=... # If you want to evaluate API-based persuader models
|
|
79
|
+
export PERSUADER_API_KEY=sk-...
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
### Download the Package
|
|
83
|
+
This is the easiest way of using Psi-bench.
|
|
84
|
+
```bash
|
|
85
|
+
pip install psi-bench
|
|
86
|
+
psi-bench download all # data will be saved in ./data
|
|
87
|
+
|
|
88
|
+
# Run evaluation with local persuader model, results will be saved in ./eval
|
|
89
|
+
CUDA_VISIBLE_DEVICES=0 psi-bench eval all \
|
|
90
|
+
--tested_model Qwen/Qwen3-8B \
|
|
91
|
+
--persuader_local \
|
|
92
|
+
--client_model deepseek-v3-2-251201 \
|
|
93
|
+
--judge_model deepseek-v3-2-251201
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
### Clone the Repository
|
|
97
|
+
If you wish to develop using Psi-bench or evaluate in more advanced settings (Oracle, profile analyzer, ...), you can clone the Git repo. Below are some examples:
|
|
98
|
+
|
|
99
|
+
```bash
|
|
100
|
+
git clone https://github.com/Hanpx20/Psi-Bench
|
|
101
|
+
cd Psi-Bench
|
|
102
|
+
|
|
103
|
+
# Basic evaluation with local persuader
|
|
104
|
+
CUDA_VISIBLE_DEVICES=0 bash eval.sh all \
|
|
105
|
+
--tested_model Qwen/Qwen3-8B \
|
|
106
|
+
--client_model deepseek-v3-2-251201 \
|
|
107
|
+
--judge_model deepseek-v3-2-251201 \
|
|
108
|
+
--persuader_local
|
|
109
|
+
|
|
110
|
+
# Inference with oracle setting (client profile provided)
|
|
111
|
+
CUDA_VISIBLE_DEVICES=0 bash eval.sh all \
|
|
112
|
+
--tested_model Qwen/Qwen3-8B \
|
|
113
|
+
--client_model deepseek-v3-2-251201 \
|
|
114
|
+
--judge_model deepseek-v3-2-251201 \
|
|
115
|
+
--persuader_local \
|
|
116
|
+
--test_oracle
|
|
117
|
+
|
|
118
|
+
# Inference with profile analyzer (client profile predicted by an LLM)
|
|
119
|
+
CUDA_VISIBLE_DEVICES=0 python psi_bench/inference.py \
|
|
120
|
+
--client_model deepseek-v3-2-251201 \
|
|
121
|
+
--task request \
|
|
122
|
+
--conv_file data/request/queries.json \
|
|
123
|
+
--persona_file data/request/persona_profile.json \
|
|
124
|
+
--persuader_model Qwen/Qwen3-8B \
|
|
125
|
+
--persuader_local \
|
|
126
|
+
--profile_mode infer \
|
|
127
|
+
--persona_infer_model deepseek-v3.2 \
|
|
128
|
+
--output eval/test.json
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
### Notations in the Repo
|
|
132
|
+
- CMV, counsel, and request correspond to the "Viewpoint Debate", "Psychological Consultation", and "Everyday Request" scenarios, respectively.
|
|
133
|
+
|
|
134
|
+
- `size` is set to 500 by default, as the first 500 queries in CMV are the test set; the other two scenarios only have 90 and 100 queries in total.
|
|
135
|
+
|
|
136
|
+
- The LLM judge returns 4 metrics. Of these, "general_conversation_quality", "personalized_response", and "persuasion_effect" are the ones reported in the paper; "personality_perception" is mainly for investigation purposes.
|
|
137
|
+
|
|
138
|
+
## Cite this paper
|
|
139
|
+
If you find this repo or the paper useful, please cite:
|
|
140
|
+
```
|
|
141
|
+
@article{han2026psibench,
|
|
142
|
+
title={$\Psi$-Bench: Evaluating Persona-Sensitive Influencing in Persuasive Dialogues},
|
|
143
|
+
author={Peixuan Han and Hongyi Du and Jiayu Liu and Yihang Sun and Yutong Liu and Jiaxuan You},
|
|
144
|
+
journal={arXiv preprint arXiv:2606.02754},
|
|
145
|
+
year={2026},
|
|
146
|
+
url={https://arxiv.org/abs/2606.02754},
|
|
147
|
+
}
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
Reach out to [Peixuan Han](mailto:ph16@illinois.edu) for any questions.
|
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
<div align="center">
|
|
2
|
+
<img src="figs/title_fig.jpg" width="100%">
|
|
3
|
+
</div>
|
|
4
|
+
|
|
5
|
+
<!-- <h1 align="center">Ψ-Bench: Evaluating Persona-Sensitive Influencing in Persuasive Dialogues</h1> -->
|
|
6
|
+
<h2 align="center">Peixuan Han, Hongyi Du, Jiayu Liu, Yihang Sun, Yutong Liu, Jiaxuan You</h2>
|
|
7
|
+
|
|
8
|
+
<p align="center">
|
|
9
|
+
<a href="https://pypi.org/project/psi-bench">
|
|
10
|
+
<img src="https://img.shields.io/pypi/v/psi-bench?logo=pypi&logoColor=white" />
|
|
11
|
+
</a>
|
|
12
|
+
·
|
|
13
|
+
<a href="https://arxiv.org/pdf/2606.02754">
|
|
14
|
+
<img src="https://img.shields.io/badge/arXiv-2606.02754-b31b1b?logo=arxiv" />
|
|
15
|
+
</a>
|
|
16
|
+
</p>
|
|
17
|
+
|
|
18
|
+
## Introduction
|
|
19
|
+
**Ψ-Bench (Psi-Bench)** is a benchmark for assessing LLMs' ability to influence realistic users through conversation. We design three real-world interaction scenarios involving persuasion in Psi-Bench and endow simulated clients with personal characteristics via explicit user profiles derived from dialogue histories.
|
|
20
|
+
|
|
21
|
+

|
|
22
|
+
|
|
23
|
+
## Benchmarking Results
|
|
24
|
+
|
|
25
|
+
Try Psi-Bench to see if your LLM is a personalized expert!
|
|
26
|
+
|
|
27
|
+

|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
## Get Started
|
|
31
|
+
|
|
32
|
+
Before using Psi-Bench, you need to configure your API access through environment variables. Psi-Bench uses DeepSeek-v3.2 as the client and the judge. Since the official DeepSeek API no longer supports this model, we recommend accessing it via the [Volcano Engine platform](https://console.volcengine.com/ark) (the model identifier is deepseek-v3-2-251201). Other LLMs such as GPT-4o and DeepSeek-v4 can also serve as clients and judges; however, be cautious when comparing scores produced by different judges.
|
|
33
|
+
|
|
34
|
+
For example:
|
|
35
|
+
```bash
|
|
36
|
+
export CLIENT_BASE_URL=https://ark.cn-beijing.volces.com/api/v3
|
|
37
|
+
export CLIENT_API_KEY=sk-...
|
|
38
|
+
export JUDGE_BASE_URL=https://ark.cn-beijing.volces.com/api/v3
|
|
39
|
+
export JUDGE_API_KEY=sk-...
|
|
40
|
+
export PERSUADER_BASE_URL=... # If you want to evaluate API-based persuader models
|
|
41
|
+
export PERSUADER_API_KEY=sk-...
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
### Download the Package
|
|
45
|
+
This is the easiest way of using Psi-bench.
|
|
46
|
+
```bash
|
|
47
|
+
pip install psi-bench
|
|
48
|
+
psi-bench download all # data will be saved in ./data
|
|
49
|
+
|
|
50
|
+
# Run evaluation with local persuader model, results will be saved in ./eval
|
|
51
|
+
CUDA_VISIBLE_DEVICES=0 psi-bench eval all \
|
|
52
|
+
--tested_model Qwen/Qwen3-8B \
|
|
53
|
+
--persuader_local \
|
|
54
|
+
--client_model deepseek-v3-2-251201 \
|
|
55
|
+
--judge_model deepseek-v3-2-251201
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
### Clone the Repository
|
|
59
|
+
If you wish to develop using Psi-bench or evaluate in more advanced settings (Oracle, profile analyzer, ...), you can clone the Git repo. Below are some examples:
|
|
60
|
+
|
|
61
|
+
```bash
|
|
62
|
+
git clone https://github.com/Hanpx20/Psi-Bench
|
|
63
|
+
cd Psi-Bench
|
|
64
|
+
|
|
65
|
+
# Basic evaluation with local persuader
|
|
66
|
+
CUDA_VISIBLE_DEVICES=0 bash eval.sh all \
|
|
67
|
+
--tested_model Qwen/Qwen3-8B \
|
|
68
|
+
--client_model deepseek-v3-2-251201 \
|
|
69
|
+
--judge_model deepseek-v3-2-251201 \
|
|
70
|
+
--persuader_local
|
|
71
|
+
|
|
72
|
+
# Inference with oracle setting (client profile provided)
|
|
73
|
+
CUDA_VISIBLE_DEVICES=0 bash eval.sh all \
|
|
74
|
+
--tested_model Qwen/Qwen3-8B \
|
|
75
|
+
--client_model deepseek-v3-2-251201 \
|
|
76
|
+
--judge_model deepseek-v3-2-251201 \
|
|
77
|
+
--persuader_local \
|
|
78
|
+
--test_oracle
|
|
79
|
+
|
|
80
|
+
# Inference with profile analyzer (client profile predicted by an LLM)
|
|
81
|
+
CUDA_VISIBLE_DEVICES=0 python psi_bench/inference.py \
|
|
82
|
+
--client_model deepseek-v3-2-251201 \
|
|
83
|
+
--task request \
|
|
84
|
+
--conv_file data/request/queries.json \
|
|
85
|
+
--persona_file data/request/persona_profile.json \
|
|
86
|
+
--persuader_model Qwen/Qwen3-8B \
|
|
87
|
+
--persuader_local \
|
|
88
|
+
--profile_mode infer \
|
|
89
|
+
--persona_infer_model deepseek-v3.2 \
|
|
90
|
+
--output eval/test.json
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
### Notations in the Repo
|
|
94
|
+
- CMV, counsel, and request correspond to the "Viewpoint Debate", "Psychological Consultation", and "Everyday Request" scenarios, respectively.
|
|
95
|
+
|
|
96
|
+
- `size` is set to 500 by default, as the first 500 queries in CMV are the test set; the other two scenarios only have 90 and 100 queries in total.
|
|
97
|
+
|
|
98
|
+
- The LLM judge returns 4 metrics. Of these, "general_conversation_quality", "personalized_response", and "persuasion_effect" are the ones reported in the paper; "personality_perception" is mainly for investigation purposes.
|
|
99
|
+
|
|
100
|
+
## Cite this paper
|
|
101
|
+
If you find this repo or the paper useful, please cite:
|
|
102
|
+
```
|
|
103
|
+
@article{han2026psibench,
|
|
104
|
+
title={$\Psi$-Bench: Evaluating Persona-Sensitive Influencing in Persuasive Dialogues},
|
|
105
|
+
author={Peixuan Han and Hongyi Du and Jiayu Liu and Yihang Sun and Yutong Liu and Jiaxuan You},
|
|
106
|
+
journal={arXiv preprint arXiv:2606.02754},
|
|
107
|
+
year={2026},
|
|
108
|
+
url={https://arxiv.org/abs/2606.02754},
|
|
109
|
+
}
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
Reach out to [Peixuan Han](mailto:ph16@illinois.edu) for any questions.
|
|
File without changes
|
|
@@ -0,0 +1,242 @@
|
|
|
1
|
+
#!/usr/bin/env python
|
|
2
|
+
"""
|
|
3
|
+
psi_bench CLI - Command line interface for running evaluations
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import argparse
|
|
7
|
+
import sys
|
|
8
|
+
import subprocess
|
|
9
|
+
import os
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
|
|
12
|
+
def print_help():
    """Print the usage text for the psi_bench command line interface.

    The text is written to stdout and describes the ``eval`` and ``download``
    commands, their options, and a few example invocations. Nothing is
    returned.
    """
    # NOTE: the defaults quoted below mirror the argparse defaults declared in
    # main(); keep the two in sync when changing either.
    help_text = """
psi_bench Evaluation CLI

USAGE:
    psi_bench [COMMAND] [OPTIONS]

COMMANDS:
    eval        - Run evaluations on benchmark datasets
    download    - Download benchmark datasets
    help        - Show this help message

EVAL USAGE:
    psi_bench eval [TASK] [OPTIONS]

TASKS:
    cmv         - Evaluate on CMV (Change My View) dataset
    counsel     - Evaluate on Counsel dataset
    request     - Evaluate on Request dataset
    all         - Run all three evaluations

OPTIONS:
    --tested_model MODEL      (required) Model to test
    --client_model MODEL      Client model (default: deepseek-v3-2-251201)
    --judge_model MODEL       Judge model (default: deepseek-v3-2-251201)
    --n_turns N               Number of turns (default: 3)
    --size M                  Number of conversations to evaluate (default: 500 for cmv, full for other scenarios)
    --test_oracle             Test oracle mode
    --persuader_local         Use local persuader model
    --inference_parallel N    Parallel workers (default: 16)

DOWNLOAD USAGE:
    psi_bench download [DATASET]

DATASETS:
    cmv         - Change My View dataset
    counsel     - Counsel dataset
    request     - Request dataset
    all         - All datasets

OPTIONS:
    --output PATH             Output directory (default: ./data)

EXAMPLES:
    # Evaluate model on all datasets
    psi_bench eval all --tested_model gpt-4

    # Evaluate on specific dataset with local model
    psi_bench eval cmv --tested_model Qwen/Qwen3-8B --persuader_local

    # Download all datasets
    psi_bench download all

For more details, see:
    - API Key Setup: https://github.com/Hanpx20/Psi-Bench#get-started
    - User Guide: https://github.com/Hanpx20/Psi-Bench#readme
"""
    print(help_text)
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def run_eval_inference(task, tested_model, client_model, judge_model, n_turns, size, test_oracle,
                       persuader_local, inference_parallel):
    """Run inference and then LLM-judge evaluation for a single task.

    Two child Python processes are launched one after the other:
    ``inference.py`` produces the conversations and ``llm_judge_eval.py``
    scores them. Input data is read from ``data/<task>/`` and results are
    written under ``eval/<task>/``, both relative to the current working
    directory.

    Args:
        task: One of ``"cmv"``, ``"counsel"`` or ``"request"``.
        tested_model: Persuader model name or path; its last path component
            names the output files.
        client_model: Model passed to inference as ``--client_model``.
        judge_model: Model passed to the judge as ``--judge_model``.
        n_turns: Value forwarded as ``--n_turns``.
        size: Value forwarded as ``--size``.
        test_oracle: When true, adds ``--profile_mode oracle`` and an
            ``_oracle`` suffix to the output file names.
        persuader_local: When true, adds ``--persuader_local``.
        inference_parallel: Value forwarded as ``--inference_parallel`` to
            both steps.

    Returns:
        0 on success, 1 for an unknown task, otherwise the non-zero exit code
        of the step that failed.
    """
    if task not in ("cmv", "counsel", "request"):
        print(f"Error: Unknown task '{task}'")
        return 1

    project_root = Path(__file__).resolve().parent.parent
    package_dir = project_root / "psi_bench"
    data_root = Path.cwd()

    persona_file = str(data_root / "data" / task / "persona_profile.json")
    conv_file = str(data_root / "data" / task / "queries.json")

    # Strip trailing slashes first: a local model directory given as
    # "models/foo/" would otherwise yield an empty name and ".json" outputs.
    model_name = tested_model.rstrip("/").split("/")[-1]
    run_name = f"{model_name}{'_oracle' if test_oracle else ''}"
    generated_convs = str(data_root / "eval" / task / "convs" / f"{run_name}.json")
    judge_output = str(data_root / "eval" / task / f"{run_name}_judge.json")

    # Build inference.py command
    inference_cmd = [
        sys.executable,
        str(package_dir / "inference.py"),
        "--client_model", client_model,
        "--persona_file", persona_file,
        "--conv_file", conv_file,
        "--persuader_model", tested_model,
        "--output", generated_convs,
        "--size", str(size),
        "--n_turns", str(n_turns),
        "--inference_parallel", str(inference_parallel),
    ]
    if test_oracle:
        inference_cmd.extend(["--profile_mode", "oracle"])
    if persuader_local:
        inference_cmd.append("--persuader_local")

    returncode = _run_eval_step(
        f"Running {task.upper()} Inference",
        f"Error: Inference failed for {task}",
        inference_cmd,
        project_root,
    )
    if returncode != 0:
        return returncode

    # Build llm_judge_eval.py command; it scores the conversations that the
    # inference step just wrote.
    judge_cmd = [
        sys.executable,
        str(package_dir / "llm_judge_eval.py"),
        "--judge_model", judge_model,
        "--conv_file", generated_convs,
        "--persona_file", persona_file,
        "--output", judge_output,
        "--inference_parallel", str(inference_parallel),
    ]

    return _run_eval_step(
        f"Running {task.upper()} Judge Evaluation",
        f"Error: Judge evaluation failed for {task}",
        judge_cmd,
        project_root,
    )


def _run_eval_step(title, failure_message, cmd, cwd):
    """Print a banner and the command, run it in ``cwd``, and return its exit code."""
    print(f"\n{'='*60}")
    print(title)
    print(f"{'='*60}")
    print(f"Command: {' '.join(cmd)}\n")

    result = subprocess.run(cmd, cwd=cwd)
    if result.returncode != 0:
        print(failure_message)
    return result.returncode
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def main():
    """Main CLI entry point.

    Parses ``sys.argv`` and dispatches to the ``eval``, ``download`` or
    ``help`` command.

    Returns:
        int: Process exit status. 0 on success or when help is shown, 1 for
        usage errors, otherwise whatever the failing evaluation step or the
        download command returned.
    """
    parser = argparse.ArgumentParser(
        description="psi_bench: Evaluating Persona-Sensitive Influencing in Persuasive Dialogues",
        add_help=False,  # -h/--help is handled below so the custom help text is shown
    )

    parser.add_argument('command', nargs='?', help='Command to run (eval, download, help)')
    parser.add_argument('task', nargs='?', help='Task for eval command (cmv, counsel, request, all)')

    # Eval options
    parser.add_argument('--tested_model', type=str, help='Model to test')
    parser.add_argument('--client_model', type=str, default='deepseek-v3-2-251201', help='Client model')
    parser.add_argument('--judge_model', type=str, default='deepseek-v3-2-251201', help='Judge model')
    parser.add_argument('--n_turns', type=int, default=3, help='Number of turns')
    parser.add_argument('--size', type=int, default=500, help='Number of conversations to evaluate')
    parser.add_argument('--test_oracle', action='store_true', help='Test oracle mode')
    parser.add_argument('--persuader_local', action='store_true', help='Use local persuader model')
    parser.add_argument('--inference_parallel', type=int, default=16, help='Number of parallel workers')

    # Download options
    parser.add_argument('--output', type=str, default='./data', help='Output directory')

    parser.add_argument('--help', '-h', action='store_true', help='Show help message')

    # Unknown arguments are collected rather than rejected so that they can be
    # handed on to the download sub-command.
    args, remaining = parser.parse_known_args()

    # Handle help
    if args.help or not args.command or args.command in ['help', '-h', '--help']:
        print_help()
        return 0

    if args.command == 'eval':
        # Validate eval command
        if not args.task:
            print("Error: eval command requires a task (cmv, counsel, request, or all)")
            print("\nUsage: psi_bench eval [cmv|counsel|request|all] [OPTIONS]")
            return 1

        if not args.tested_model:
            print("Error: --tested_model is required")
            return 1

        if args.task not in ['cmv', 'counsel', 'request', 'all']:
            print(f"Error: Unknown task '{args.task}'")
            return 1

        if remaining:
            # A misspelled option would otherwise be dropped without notice
            # and the run would quietly use the default value.
            print(f"Warning: ignoring unrecognized arguments: {' '.join(remaining)}")

        # Run evaluation(s)
        tasks = ['cmv', 'counsel', 'request'] if args.task == 'all' else [args.task]

        # Results are written below the directory the command was started from.
        data_root = Path.cwd()

        for task in tasks:
            # Ensure output directories exist
            (data_root / f"eval/{task}/convs").mkdir(parents=True, exist_ok=True)

            result = run_eval_inference(
                task=task,
                tested_model=args.tested_model,
                client_model=args.client_model,
                judge_model=args.judge_model,
                n_turns=args.n_turns,
                size=args.size,
                test_oracle=args.test_oracle,
                persuader_local=args.persuader_local,
                inference_parallel=args.inference_parallel
            )
            if result != 0:
                # Stop at the first failing task and propagate its exit code.
                return result

        print(f"\n{'='*60}")
        print("✓ All evaluations completed successfully!")
        print(f"{'='*60}\n")
        return 0

    if args.command == 'download':
        from . import download_data

        # --output was consumed by the parser above, so it is absent from
        # `remaining` and must be forwarded explicitly; otherwise the download
        # always lands in ./data. Anything in `remaining` (for example "-o DIR")
        # comes later on the command line and therefore takes precedence.
        forwarded = [args.task] if args.task else []
        forwarded += ['--output', args.output]
        sys.argv = ['psi_bench-download'] + forwarded + remaining
        return download_data.main()

    print(f"Error: Unknown command '{args.command}'")
    print_help()
    return 1
|
|
240
|
+
|
|
241
|
+
if __name__ == '__main__':
    # Executed only when the file is run as a script; main()'s return value
    # becomes the process exit status.
    sys.exit(main())
|
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
#!/usr/bin/env python
|
|
2
|
+
"""
|
|
3
|
+
Data download script for psi_bench
|
|
4
|
+
Downloads datasets from GitHub or Hugging Face
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import os
|
|
8
|
+
import sys
|
|
9
|
+
import json
|
|
10
|
+
import argparse
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
import urllib.request
|
|
13
|
+
import tarfile
|
|
14
|
+
import zipfile
|
|
15
|
+
|
|
16
|
+
def download_file(url, output_path, chunk_size=8192):
    """Download ``url`` to ``output_path`` while printing progress to stdout.

    The data is first written to ``<output_path>.part`` and moved into place
    only once the transfer has finished, so an interrupted download never
    leaves a truncated file at ``output_path`` and never overwrites a file
    from an earlier successful run.

    Args:
        url: Address to fetch; anything ``urllib.request.urlopen`` accepts.
        output_path: Destination file. Missing parent directories are created.
        chunk_size: Number of bytes requested per read.

    Returns:
        True when the file was downloaded completely, False otherwise (the
        error is printed, not raised).
    """
    output_path = Path(output_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    partial_path = output_path.with_name(output_path.name + ".part")

    try:
        print(f"Downloading from: {url}")
        with urllib.request.urlopen(url) as response:
            # 0 means the server did not announce a size; progress is then
            # not reported and the completeness check below is skipped.
            total_size = int(response.headers.get('content-length', 0))
            downloaded = 0

            with open(partial_path, 'wb') as f:
                while True:
                    chunk = response.read(chunk_size)
                    if not chunk:
                        break
                    f.write(chunk)
                    downloaded += len(chunk)

                    if total_size > 0:
                        percent = (downloaded / total_size) * 100
                        print(f"  Progress: {percent:.1f}% ({downloaded}/{total_size} bytes)", end='\r')

        if total_size > 0 and downloaded != total_size:
            raise OSError(f"incomplete download: received {downloaded} of {total_size} bytes")

        # Replace the destination in a single step now that the data is complete.
        partial_path.replace(output_path)
        print(f"\n✅ Downloaded to: {output_path}")
        return True
    except Exception as e:
        # Best effort by design: report the problem and let the caller decide.
        print(f"❌ Download failed: {e}")
        try:
            partial_path.unlink()
        except OSError:
            pass  # nothing was written yet, or the leftover cannot be removed
        return False
|
|
44
|
+
|
|
45
|
+
def extract_archive(archive_path, extract_to):
    """Extract a ``.tar.gz`` or ``.zip`` archive into ``extract_to``.

    The archive comes from the network, so tar members are not trusted: an
    entry that would be written outside ``extract_to`` (for example
    ``../../x``) aborts the extraction instead of being unpacked.

    Args:
        archive_path: Path of the archive; the format is chosen from its
            file name suffix.
        extract_to: Target directory, created if it does not exist.

    Returns:
        True on success. False when the format is unsupported or extraction
        fails (the error is printed, not raised).
    """
    extract_to = Path(extract_to)
    extract_to.mkdir(parents=True, exist_ok=True)
    archive_name = str(archive_path)

    try:
        if archive_name.endswith('.tar.gz'):
            print("Extracting tar.gz file...")
            with tarfile.open(archive_path, 'r:gz') as tar:
                if hasattr(tarfile, 'data_filter'):
                    # Interpreters with extraction filters: "data" rejects
                    # absolute paths, parent-directory escapes and unsafe links.
                    tar.extractall(extract_to, filter='data')
                else:
                    # Older interpreters: validate every member ourselves first.
                    _reject_unsafe_tar_members(tar, extract_to)
                    tar.extractall(extract_to)
        elif archive_name.endswith('.zip'):
            print("Extracting zip file...")
            with zipfile.ZipFile(archive_path, 'r') as zf:
                zf.extractall(extract_to)
        else:
            print("❌ Unsupported archive format")
            return False

        print(f"✅ Extracted to: {extract_to}")
        return True
    except Exception as e:
        # Best effort by design: report the problem and let the caller decide.
        print(f"❌ Extraction failed: {e}")
        return False


def _reject_unsafe_tar_members(tar, destination):
    """Raise ValueError if a member of ``tar`` would end up outside ``destination``."""
    root = Path(destination).resolve()

    def escapes(path):
        resolved = path.resolve()
        return resolved != root and root not in resolved.parents

    for member in tar.getmembers():
        target = root / member.name
        if escapes(target):
            raise ValueError(f"unsafe path in archive: {member.name}")
        if member.issym() and escapes(target.parent / member.linkname):
            raise ValueError(f"unsafe link in archive: {member.name}")
        if member.islnk() and escapes(root / member.linkname):
            raise ValueError(f"unsafe link in archive: {member.name}")
|
|
68
|
+
|
|
69
|
+
def download_from_github(dataset_name, output_dir="./data"):
    """Fetch one dataset archive from the GitHub release and unpack it.

    The archive is downloaded into ``output_dir``, extracted there, and then
    deleted. The release must have the data archives attached.

    Args:
        dataset_name: ``"cmv"``, ``"counsel"``, ``"request"`` or ``"all"``.
        output_dir: Directory receiving the archive and its contents.

    Returns:
        True when the dataset was downloaded and extracted; False when the
        name is unknown or either step failed.
    """
    release_root = "https://github.com/Hanpx20/psi_bench/releases/download"

    # One archive per dataset, all attached to the same release tag.
    archive_urls = {
        key: f"{release_root}/v0.1.0/psi_bench-data-{key}.tar.gz"
        for key in ("cmv", "counsel", "request", "all")
    }

    url = archive_urls.get(dataset_name)
    if url is None:
        print(f"Available datasets: {', '.join(archive_urls.keys())}")
        return False

    local_archive = Path(output_dir) / f"psi_bench-data-{dataset_name}.tar.gz"

    # Download, then extract; extraction is skipped when the download failed.
    if not (download_file(url, local_archive) and extract_archive(local_archive, output_dir)):
        return False

    # Cleanup: the archive is no longer needed once its contents are unpacked.
    local_archive.unlink()
    print(f"✅ {dataset_name} dataset ready in {output_dir}/")
    return True
|
|
102
|
+
|
|
103
|
+
def main():
    """Command line entry point of the dataset downloader.

    Reads the dataset name and the optional output directory from
    ``sys.argv``. Without a dataset name the usage text and a hint are
    printed.

    Returns:
        0 when the requested dataset was downloaded, 1 otherwise.
    """
    cli = argparse.ArgumentParser(description="Download psi_bench datasets from GitHub")
    cli.add_argument(
        "dataset",
        nargs="?",
        choices=["cmv", "counsel", "request", "all"],
        help="Dataset to download (or 'all' for complete dataset)",
    )
    cli.add_argument(
        "--output", "-o",
        default="./data",
        help="Output directory (default: ./data)",
    )
    options = cli.parse_args()

    if options.dataset:
        print("📥 psi_bench Data Download (from GitHub Hanpx20/psi_bench)\n")
        if download_from_github(options.dataset, options.output):
            return 0
        return 1

    # No dataset given: show the usage instead of guessing one.
    cli.print_help()
    print("\n💡 Hint: Run 'psi_bench download all' to get all datasets")
    return 1
|
|
131
|
+
|
|
132
|
+
if __name__ == "__main__":
    # Executed only when the file is run as a script; main()'s return value
    # becomes the process exit status.
    sys.exit(main())
|