llmdep 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- llmdep-0.2.0/LICENSE +21 -0
- llmdep-0.2.0/MANIFEST.in +33 -0
- llmdep-0.2.0/PKG-INFO +233 -0
- llmdep-0.2.0/README.md +206 -0
- llmdep-0.2.0/llm/BaseModel.py +40 -0
- llmdep-0.2.0/llm/__init__.py +1 -0
- llmdep-0.2.0/llmdep.egg-info/PKG-INFO +233 -0
- llmdep-0.2.0/llmdep.egg-info/SOURCES.txt +85 -0
- llmdep-0.2.0/llmdep.egg-info/dependency_links.txt +1 -0
- llmdep-0.2.0/llmdep.egg-info/entry_points.txt +3 -0
- llmdep-0.2.0/llmdep.egg-info/requires.txt +18 -0
- llmdep-0.2.0/llmdep.egg-info/top_level.txt +5 -0
- llmdep-0.2.0/llmdep_cli.py +1384 -0
- llmdep-0.2.0/llmdep_frontend/__init__.py +0 -0
- llmdep-0.2.0/llmdep_frontend/index.html +163 -0
- llmdep-0.2.0/llmdep_frontend/main.js +356 -0
- llmdep-0.2.0/llmdep_frontend/server.py +416 -0
- llmdep-0.2.0/llmdep_frontend/styles.css +472 -0
- llmdep-0.2.0/mep_client/API/__init__.py +29 -0
- llmdep-0.2.0/mep_client/API/auto_discover.py +141 -0
- llmdep-0.2.0/mep_client/API/clear_api.py +172 -0
- llmdep-0.2.0/mep_client/API/evaluation_api.py +395 -0
- llmdep-0.2.0/mep_client/API/main.py +81 -0
- llmdep-0.2.0/mep_client/__init__.py +0 -0
- llmdep-0.2.0/mep_client/benchmark_remote.py +124 -0
- llmdep-0.2.0/mep_client/config.py +268 -0
- llmdep-0.2.0/mep_client/run/__init__.py +0 -0
- llmdep-0.2.0/mep_client/run/all.json +3 -0
- llmdep-0.2.0/mep_client/run/generate_and_evaluate.py +113 -0
- llmdep-0.2.0/mep_client/run/generate_and_evaluate_all.py +128 -0
- llmdep-0.2.0/mep_client/run/run.py +98 -0
- llmdep-0.2.0/mep_client/run/utils/__init__.py +1 -0
- llmdep-0.2.0/mep_client/run/utils/benchmark_remote.py +1 -0
- llmdep-0.2.0/mep_client/run/utils/generate_user_answers.py +670 -0
- llmdep-0.2.0/mep_client/run/utils/server_data_access.py +199 -0
- llmdep-0.2.0/method_server/__init__.py +0 -0
- llmdep-0.2.0/method_server/eval/icl_evaluator/__init__.py +8 -0
- llmdep-0.2.0/method_server/eval/icl_evaluator/icl_aucroc_evaluator.py +40 -0
- llmdep-0.2.0/method_server/eval/icl_evaluator/icl_base_evaluator.py +10 -0
- llmdep-0.2.0/method_server/eval/icl_evaluator/icl_em_evaluator.py +82 -0
- llmdep-0.2.0/method_server/eval/icl_evaluator/icl_f1_evaluator.py +50 -0
- llmdep-0.2.0/method_server/eval/icl_evaluator/icl_hf_evaluator.py +321 -0
- llmdep-0.2.0/method_server/eval/icl_evaluator/icl_jieba_rouge_evaluator.py +41 -0
- llmdep-0.2.0/method_server/eval/icl_evaluator/icl_matchAcc_evaluator.py +41 -0
- llmdep-0.2.0/method_server/eval/metrics/accuracy/README.md +119 -0
- llmdep-0.2.0/method_server/eval/metrics/accuracy/accuracy.py +106 -0
- llmdep-0.2.0/method_server/eval/metrics/accuracy/app.py +6 -0
- llmdep-0.2.0/method_server/eval/metrics/accuracy/requirements.txt +2 -0
- llmdep-0.2.0/method_server/eval/metrics/bleu/README.md +160 -0
- llmdep-0.2.0/method_server/eval/metrics/bleu/app.py +6 -0
- llmdep-0.2.0/method_server/eval/metrics/bleu/bleu.py +133 -0
- llmdep-0.2.0/method_server/eval/metrics/bleu/nmt_bleu.py +112 -0
- llmdep-0.2.0/method_server/eval/metrics/bleu/requirements.txt +1 -0
- llmdep-0.2.0/method_server/eval/metrics/bleu/tokenizer_13a.py +100 -0
- llmdep-0.2.0/method_server/eval/metrics/exact_match/README.md +119 -0
- llmdep-0.2.0/method_server/eval/metrics/exact_match/app.py +6 -0
- llmdep-0.2.0/method_server/eval/metrics/exact_match/exact_match.py +136 -0
- llmdep-0.2.0/method_server/eval/metrics/exact_match/requirements.txt +1 -0
- llmdep-0.2.0/method_server/eval/metrics/f1/README.md +137 -0
- llmdep-0.2.0/method_server/eval/metrics/f1/app.py +6 -0
- llmdep-0.2.0/method_server/eval/metrics/f1/f1.py +130 -0
- llmdep-0.2.0/method_server/eval/metrics/f1/requirements.txt +2 -0
- llmdep-0.2.0/method_server/eval/metrics/precision/README.md +142 -0
- llmdep-0.2.0/method_server/eval/metrics/precision/app.py +6 -0
- llmdep-0.2.0/method_server/eval/metrics/precision/precision.py +145 -0
- llmdep-0.2.0/method_server/eval/metrics/precision/requirements.txt +2 -0
- llmdep-0.2.0/method_server/eval/metrics/recall/README.md +132 -0
- llmdep-0.2.0/method_server/eval/metrics/recall/app.py +6 -0
- llmdep-0.2.0/method_server/eval/metrics/recall/recall.py +135 -0
- llmdep-0.2.0/method_server/eval/metrics/recall/requirements.txt +2 -0
- llmdep-0.2.0/method_server/eval/metrics/roc_auc/README.md +204 -0
- llmdep-0.2.0/method_server/eval/metrics/roc_auc/app.py +6 -0
- llmdep-0.2.0/method_server/eval/metrics/roc_auc/requirements.txt +2 -0
- llmdep-0.2.0/method_server/eval/metrics/roc_auc/roc_auc.py +191 -0
- llmdep-0.2.0/method_server/eval/metrics/rouge/README.md +160 -0
- llmdep-0.2.0/method_server/eval/metrics/rouge/app.py +6 -0
- llmdep-0.2.0/method_server/eval/metrics/rouge/requirements.txt +4 -0
- llmdep-0.2.0/method_server/eval/metrics/rouge/rouge.py +158 -0
- llmdep-0.2.0/method_server/eval/metrics/sacrebleu/README.md +119 -0
- llmdep-0.2.0/method_server/eval/metrics/sacrebleu/app.py +11 -0
- llmdep-0.2.0/method_server/eval/metrics/sacrebleu/requirements.txt +2 -0
- llmdep-0.2.0/method_server/eval/metrics/sacrebleu/sacrebleu.py +178 -0
- llmdep-0.2.0/method_server/eval/utils/text_postprocessors.py +148 -0
- llmdep-0.2.0/pyproject.toml +3 -0
- llmdep-0.2.0/setup.cfg +66 -0
- llmdep-0.2.0/setup.py +2 -0
llmdep-0.2.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Anonymous
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
llmdep-0.2.0/MANIFEST.in
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
include README.md
|
|
2
|
+
include LICENSE
|
|
3
|
+
include setup.py
|
|
4
|
+
include setup.cfg
|
|
5
|
+
include MANIFEST.in
|
|
6
|
+
|
|
7
|
+
recursive-include mep_client *.py *.json
|
|
8
|
+
recursive-exclude mep_client/final_results *
|
|
9
|
+
recursive-exclude mep_client/log *
|
|
10
|
+
|
|
11
|
+
include llm/BaseModel.py
|
|
12
|
+
include llm/__init__.py
|
|
13
|
+
include method_server/*.py
|
|
14
|
+
recursive-include llmdep_frontend *.py *.html *.js *.css
|
|
15
|
+
|
|
16
|
+
recursive-include method_server/eval *.py *.txt *.md
|
|
17
|
+
recursive-include dep_server *.py *.md
|
|
18
|
+
|
|
19
|
+
recursive-exclude method_server *.csv *.jsonl *.parquet *.arrow *.pkl *.pt *.pth *.safetensors
|
|
20
|
+
recursive-exclude method_server/*/cache *
|
|
21
|
+
recursive-exclude method_server/*/data *
|
|
22
|
+
|
|
23
|
+
global-exclude config_backup.py
|
|
24
|
+
global-exclude __pycache__
|
|
25
|
+
global-exclude *.pyc
|
|
26
|
+
global-exclude *.pyo
|
|
27
|
+
global-exclude *.pyd
|
|
28
|
+
global-exclude *.parquet
|
|
29
|
+
global-exclude *.arrow
|
|
30
|
+
global-exclude .gitignore
|
|
31
|
+
global-exclude .git
|
|
32
|
+
global-exclude .idea
|
|
33
|
+
global-exclude *.egg-info
|
llmdep-0.2.0/PKG-INFO
ADDED
|
@@ -0,0 +1,233 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: llmdep
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: Distributed LLM Evaluation Benchmark Server
|
|
5
|
+
Home-page: https://github.com/tjunlp-lab/DEP
|
|
6
|
+
Author: TJUNLP
|
|
7
|
+
License: MIT
|
|
8
|
+
Platform: UNKNOWN
|
|
9
|
+
Classifier: Development Status :: 3 - Alpha
|
|
10
|
+
Classifier: Intended Audience :: Developers
|
|
11
|
+
Classifier: Intended Audience :: Science/Research
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Operating System :: OS Independent
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Classifier: Topic :: Scientific/Engineering
|
|
20
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
21
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
22
|
+
Requires-Python: >=3.9
|
|
23
|
+
Description-Content-Type: text/markdown
|
|
24
|
+
License-File: LICENSE
|
|
25
|
+
|
|
26
|
+
# llmdep - Distributed LLM Evaluation Benchmark Server
|
|
27
|
+
|
|
28
|
+
[Python 3.9+](https://www.python.org/)
|
|
29
|
+
[License: MIT](LICENSE)
|
|
30
|
+
|
|
31
|
+
A distributed benchmark server for evaluating Large Language Models (LLMs) across multiple benchmarks.
|
|
32
|
+
|
|
33
|
+
## Features
|
|
34
|
+
|
|
35
|
+
- **Distributed Evaluation**: Support concurrent worker processes
|
|
36
|
+
- **Multiple Benchmarks**: ARC, ETHICS, MMLU-Pro, BiPaR, and more
|
|
37
|
+
- **Rate Limiting**: Token bucket based rate limiting for API calls
|
|
38
|
+
- **Batch Processing**: Process multiple model-benchmark pairs in batch
|
|
39
|
+
- **Resume Support**: Continue interrupted tasks with `--run_id`
|
|
40
|
+
|
|
41
|
+
## Installation
|
|
42
|
+
|
|
43
|
+
```bash
|
|
44
|
+
pip install llmdep
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
Or install from source:
|
|
48
|
+
|
|
49
|
+
```bash
|
|
50
|
+
pip install -e .
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
## Quick Start
|
|
54
|
+
|
|
55
|
+
### Configure Benchmark Workspace
|
|
56
|
+
|
|
57
|
+
Benchmarks should live outside the installed pip package. Configure a benchmark root once:
|
|
58
|
+
|
|
59
|
+
```bash
|
|
60
|
+
llmdep config set dataset_root /data/eval_datasets
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
The directory should contain benchmark folders directly:
|
|
64
|
+
|
|
65
|
+
```text
|
|
66
|
+
/data/eval_datasets/
|
|
67
|
+
├── BiPaR/
|
|
68
|
+
│ ├── dataset_card.json
|
|
69
|
+
│ ├── methods/
|
|
70
|
+
│ ├── data/
|
|
71
|
+
│ └── cache/
|
|
72
|
+
└── gsm8k/
|
|
73
|
+
├── dataset_card.json
|
|
74
|
+
├── methods/
|
|
75
|
+
├── data/
|
|
76
|
+
└── cache/
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
Check the active benchmark root:
|
|
80
|
+
|
|
81
|
+
```bash
|
|
82
|
+
llmdep list benchmark
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
Configure a model root once as well:
|
|
86
|
+
|
|
87
|
+
```bash
|
|
88
|
+
llmdep config set model_root /data/llm_models
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
The directory should contain only user model folders directly. `BaseModel` is not placed here; it remains inside the installed `llmdep` package.
|
|
92
|
+
|
|
93
|
+
```text
|
|
94
|
+
/data/llm_models/
|
|
95
|
+
└── qwen-plus/
|
|
96
|
+
├── model.py
|
|
97
|
+
└── model_card.json
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
Each `model.py` should expose a `Model` class with `call_with_prompt(...)`. Import the base class from the installed package:
|
|
101
|
+
|
|
102
|
+
```python
|
|
103
|
+
from llm.BaseModel import BaseModel
|
|
104
|
+
|
|
105
|
+
class Model(BaseModel):
|
|
106
|
+
def call_with_prompt(self, input_data):
|
|
107
|
+
...
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
### Single Task
|
|
111
|
+
|
|
112
|
+
```bash
|
|
113
|
+
# Generate answers
|
|
114
|
+
llmdep gen_eval -b ARC -m qwen-plus -w 10
|
|
115
|
+
|
|
116
|
+
# Generate answers and evaluate
|
|
117
|
+
llmdep gen_eval -b ARC -m qwen-plus -w 10 --evaluate
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
### Batch Processing
|
|
121
|
+
|
|
122
|
+
Create a task configuration file `tasks.json`:
|
|
123
|
+
|
|
124
|
+
```json
|
|
125
|
+
[
|
|
126
|
+
{"model": "qwen-plus", "benchmark": "ARC_parquet"},
|
|
127
|
+
{"model": "deepseek", "benchmark": "ethics"}
|
|
128
|
+
]
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
Then run:
|
|
132
|
+
|
|
133
|
+
```bash
|
|
134
|
+
llmdep batch --config tasks.json -w 10
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
### Resume Interrupted Task
|
|
138
|
+
|
|
139
|
+
```bash
|
|
140
|
+
llmdep gen_eval --run_id 20260210-164048-322281
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
## Commands
|
|
144
|
+
|
|
145
|
+
### gen_eval
|
|
146
|
+
|
|
147
|
+
Generate model answers and optionally evaluate.
|
|
148
|
+
|
|
149
|
+
```bash
|
|
150
|
+
llmdep gen_eval [options]
|
|
151
|
+
|
|
152
|
+
Options:
|
|
153
|
+
-b, --benchmark Benchmark name (e.g., ARC, ethics)
|
|
154
|
+
-m, --model Model name (e.g., qwen-plus, deepseek)
|
|
155
|
+
-w, --worker_nums Number of concurrent workers (default: 2)
|
|
156
|
+
--batch_size Batch size (default: 1)
|
|
157
|
+
--rate_limit Token bucket fill rate (default: 5.0)
|
|
158
|
+
--bucket_capacity Token bucket capacity (default: 10.0)
|
|
159
|
+
--limited_test Quick test with first 10 questions, or pass N
|
|
160
|
+
--evaluate Run evaluation after generation
|
|
161
|
+
--run_id Resume from run_id
|
|
162
|
+
```
|
|
163
|
+
|
|
164
|
+
### evaluate
|
|
165
|
+
|
|
166
|
+
Run evaluation on existing results.
|
|
167
|
+
|
|
168
|
+
```bash
|
|
169
|
+
llmdep evaluate -b ARC -r 20260210-164048-322281_qwen-plus
|
|
170
|
+
```
|
|
171
|
+
|
|
172
|
+
### list
|
|
173
|
+
|
|
174
|
+
List available benchmarks or models.
|
|
175
|
+
|
|
176
|
+
```bash
|
|
177
|
+
llmdep list benchmark
|
|
178
|
+
llmdep list model
|
|
179
|
+
```
|
|
180
|
+
|
|
181
|
+
### dashboard
|
|
182
|
+
|
|
183
|
+
Start the local web control panel.
|
|
184
|
+
|
|
185
|
+
```bash
|
|
186
|
+
llmdep dashboard
|
|
187
|
+
llmdep dashboard --host 127.0.0.1 --port 8765
|
|
188
|
+
```
|
|
189
|
+
|
|
190
|
+
### batch
|
|
191
|
+
|
|
192
|
+
Batch process tasks from config file.
|
|
193
|
+
|
|
194
|
+
```bash
|
|
195
|
+
llmdep batch --config tasks.json
|
|
196
|
+
```
|
|
197
|
+
|
|
198
|
+
## Project Structure
|
|
199
|
+
|
|
200
|
+
```
|
|
201
|
+
llmdep/
|
|
202
|
+
├── llm/ # Packaged BaseModel compatibility module
|
|
203
|
+
├── mep_client/ # Client for generating answers
|
|
204
|
+
│ ├── run/
|
|
205
|
+
│ └── API/
|
|
206
|
+
├── method_server/ # Benchmark implementations
|
|
207
|
+
│ ├── ARC_parquet/
|
|
208
|
+
│ ├── ethics/
|
|
209
|
+
│ ├── BiPaR/
|
|
210
|
+
│ └── ...
|
|
211
|
+
└── llmdep_cli.py # CLI entry point
|
|
212
|
+
```
|
|
213
|
+
|
|
214
|
+
## Adding New Models
|
|
215
|
+
|
|
216
|
+
1. Configure `model_root`: `llmdep config set model_root /data/llm_models`
|
|
217
|
+
2. Create folder under `/data/llm_models/<your_model>/`
|
|
218
|
+
3. Implement `model.py` inheriting from `BaseModel`
|
|
219
|
+
4. Create `model_card.json`
|
|
220
|
+
|
|
221
|
+
## Adding New Benchmarks
|
|
222
|
+
|
|
223
|
+
1. Configure `dataset_root`: `llmdep config set dataset_root /data/eval_datasets`
|
|
224
|
+
2. Create folder under `/data/eval_datasets/<benchmark_name>/`
|
|
225
|
+
3. Implement `prepare_ques.py` - Prepare questions and golden answers
|
|
226
|
+
4. Implement `eval_script.py` - Run evaluation
|
|
227
|
+
5. Create `dataset_card.json`
|
|
228
|
+
|
|
229
|
+
## License
|
|
230
|
+
|
|
231
|
+
MIT License
|
|
232
|
+
|
|
233
|
+
|
llmdep-0.2.0/README.md
ADDED
|
@@ -0,0 +1,206 @@
|
|
|
1
|
+
# llmdep - Distributed LLM Evaluation Benchmark Server
|
|
2
|
+
|
|
3
|
+
[Python 3.9+](https://www.python.org/)
|
|
4
|
+
[License: MIT](LICENSE)
|
|
5
|
+
|
|
6
|
+
A distributed benchmark server for evaluating Large Language Models (LLMs) across multiple benchmarks.
|
|
7
|
+
|
|
8
|
+
## Features
|
|
9
|
+
|
|
10
|
+
- **Distributed Evaluation**: Support concurrent worker processes
|
|
11
|
+
- **Multiple Benchmarks**: ARC, ETHICS, MMLU-Pro, BiPaR, and more
|
|
12
|
+
- **Rate Limiting**: Token bucket based rate limiting for API calls
|
|
13
|
+
- **Batch Processing**: Process multiple model-benchmark pairs in batch
|
|
14
|
+
- **Resume Support**: Continue interrupted tasks with `--run_id`
|
|
15
|
+
|
|
16
|
+
## Installation
|
|
17
|
+
|
|
18
|
+
```bash
|
|
19
|
+
pip install llmdep
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
Or install from source:
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
pip install -e .
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
## Quick Start
|
|
29
|
+
|
|
30
|
+
### Configure Benchmark Workspace
|
|
31
|
+
|
|
32
|
+
Benchmarks should live outside the installed pip package. Configure a benchmark root once:
|
|
33
|
+
|
|
34
|
+
```bash
|
|
35
|
+
llmdep config set dataset_root /data/eval_datasets
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
The directory should contain benchmark folders directly:
|
|
39
|
+
|
|
40
|
+
```text
|
|
41
|
+
/data/eval_datasets/
|
|
42
|
+
├── BiPaR/
|
|
43
|
+
│ ├── dataset_card.json
|
|
44
|
+
│ ├── methods/
|
|
45
|
+
│ ├── data/
|
|
46
|
+
│ └── cache/
|
|
47
|
+
└── gsm8k/
|
|
48
|
+
├── dataset_card.json
|
|
49
|
+
├── methods/
|
|
50
|
+
├── data/
|
|
51
|
+
└── cache/
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
Check the active benchmark root:
|
|
55
|
+
|
|
56
|
+
```bash
|
|
57
|
+
llmdep list benchmark
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
Configure a model root once as well:
|
|
61
|
+
|
|
62
|
+
```bash
|
|
63
|
+
llmdep config set model_root /data/llm_models
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
The directory should contain only user model folders directly. `BaseModel` is not placed here; it remains inside the installed `llmdep` package.
|
|
67
|
+
|
|
68
|
+
```text
|
|
69
|
+
/data/llm_models/
|
|
70
|
+
└── qwen-plus/
|
|
71
|
+
├── model.py
|
|
72
|
+
└── model_card.json
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
Each `model.py` should expose a `Model` class with `call_with_prompt(...)`. Import the base class from the installed package:
|
|
76
|
+
|
|
77
|
+
```python
|
|
78
|
+
from llm.BaseModel import BaseModel
|
|
79
|
+
|
|
80
|
+
class Model(BaseModel):
|
|
81
|
+
def call_with_prompt(self, input_data):
|
|
82
|
+
...
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
### Single Task
|
|
86
|
+
|
|
87
|
+
```bash
|
|
88
|
+
# Generate answers
|
|
89
|
+
llmdep gen_eval -b ARC -m qwen-plus -w 10
|
|
90
|
+
|
|
91
|
+
# Generate answers and evaluate
|
|
92
|
+
llmdep gen_eval -b ARC -m qwen-plus -w 10 --evaluate
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
### Batch Processing
|
|
96
|
+
|
|
97
|
+
Create a task configuration file `tasks.json`:
|
|
98
|
+
|
|
99
|
+
```json
|
|
100
|
+
[
|
|
101
|
+
{"model": "qwen-plus", "benchmark": "ARC_parquet"},
|
|
102
|
+
{"model": "deepseek", "benchmark": "ethics"}
|
|
103
|
+
]
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
Then run:
|
|
107
|
+
|
|
108
|
+
```bash
|
|
109
|
+
llmdep batch --config tasks.json -w 10
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
### Resume Interrupted Task
|
|
113
|
+
|
|
114
|
+
```bash
|
|
115
|
+
llmdep gen_eval --run_id 20260210-164048-322281
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
## Commands
|
|
119
|
+
|
|
120
|
+
### gen_eval
|
|
121
|
+
|
|
122
|
+
Generate model answers and optionally evaluate.
|
|
123
|
+
|
|
124
|
+
```bash
|
|
125
|
+
llmdep gen_eval [options]
|
|
126
|
+
|
|
127
|
+
Options:
|
|
128
|
+
-b, --benchmark Benchmark name (e.g., ARC, ethics)
|
|
129
|
+
-m, --model Model name (e.g., qwen-plus, deepseek)
|
|
130
|
+
-w, --worker_nums Number of concurrent workers (default: 2)
|
|
131
|
+
--batch_size Batch size (default: 1)
|
|
132
|
+
--rate_limit Token bucket fill rate (default: 5.0)
|
|
133
|
+
--bucket_capacity Token bucket capacity (default: 10.0)
|
|
134
|
+
--limited_test Quick test with first 10 questions, or pass N
|
|
135
|
+
--evaluate Run evaluation after generation
|
|
136
|
+
--run_id Resume from run_id
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
### evaluate
|
|
140
|
+
|
|
141
|
+
Run evaluation on existing results.
|
|
142
|
+
|
|
143
|
+
```bash
|
|
144
|
+
llmdep evaluate -b ARC -r 20260210-164048-322281_qwen-plus
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
### list
|
|
148
|
+
|
|
149
|
+
List available benchmarks or models.
|
|
150
|
+
|
|
151
|
+
```bash
|
|
152
|
+
llmdep list benchmark
|
|
153
|
+
llmdep list model
|
|
154
|
+
```
|
|
155
|
+
|
|
156
|
+
### dashboard
|
|
157
|
+
|
|
158
|
+
Start the local web control panel.
|
|
159
|
+
|
|
160
|
+
```bash
|
|
161
|
+
llmdep dashboard
|
|
162
|
+
llmdep dashboard --host 127.0.0.1 --port 8765
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
### batch
|
|
166
|
+
|
|
167
|
+
Batch process tasks from config file.
|
|
168
|
+
|
|
169
|
+
```bash
|
|
170
|
+
llmdep batch --config tasks.json
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
## Project Structure
|
|
174
|
+
|
|
175
|
+
```
|
|
176
|
+
llmdep/
|
|
177
|
+
├── llm/ # Packaged BaseModel compatibility module
|
|
178
|
+
├── mep_client/ # Client for generating answers
|
|
179
|
+
│ ├── run/
|
|
180
|
+
│ └── API/
|
|
181
|
+
├── method_server/ # Benchmark implementations
|
|
182
|
+
│ ├── ARC_parquet/
|
|
183
|
+
│ ├── ethics/
|
|
184
|
+
│ ├── BiPaR/
|
|
185
|
+
│ └── ...
|
|
186
|
+
└── llmdep_cli.py # CLI entry point
|
|
187
|
+
```
|
|
188
|
+
|
|
189
|
+
## Adding New Models
|
|
190
|
+
|
|
191
|
+
1. Configure `model_root`: `llmdep config set model_root /data/llm_models`
|
|
192
|
+
2. Create folder under `/data/llm_models/<your_model>/`
|
|
193
|
+
3. Implement `model.py` inheriting from `BaseModel`
|
|
194
|
+
4. Create `model_card.json`
|
|
195
|
+
|
|
196
|
+
## Adding New Benchmarks
|
|
197
|
+
|
|
198
|
+
1. Configure `dataset_root`: `llmdep config set dataset_root /data/eval_datasets`
|
|
199
|
+
2. Create folder under `/data/eval_datasets/<benchmark_name>/`
|
|
200
|
+
3. Implement `prepare_ques.py` - Prepare questions and golden answers
|
|
201
|
+
4. Implement `eval_script.py` - Run evaluation
|
|
202
|
+
5. Create `dataset_card.json`
|
|
203
|
+
|
|
204
|
+
## License
|
|
205
|
+
|
|
206
|
+
MIT License
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import logging
|
|
3
|
+
import datetime
|
|
4
|
+
|
|
5
|
+
class BaseModel:
    """Base class for model wrappers.

    Constructing an instance stores the run arguments, derives a ``run_tag``
    string for the run, and attaches two file-backed loggers
    (``generation_logging`` and ``error_logging``) that write to
    ``<parent of this module's directory>/mep_client/log/<run_tag>/``.
    """

    def __init__(self, args):
        """Record settings read from *args* and set up the run's loggers.

        Args:
            args: Object that may carry the optional attributes
                ``worker_nums`` (default ``1``), ``model_name`` (falls back
                to ``modelname``, then ``None``) and ``run_id``.
        """
        self.args = args
        self.worker_num = getattr(args, 'worker_nums', 1)
        legacy_name = getattr(args, 'modelname', None)
        self.model_name = getattr(args, 'model_name', legacy_name)

        # A truthy run_id becomes the tag prefix; otherwise the current
        # local time (second resolution) is used instead.
        prefix = getattr(args, 'run_id', None)
        label = self.model_name or 'model'
        if not prefix:
            prefix = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
        self.run_tag = f"{prefix}_{label}"

        self._set_logger()

    def _set_logger(self):
        """Create the generation/error loggers for the current ``run_tag``.

        The log directory is created if missing. Handlers already attached
        to either logger are closed and removed before fresh append-mode
        UTF-8 file handlers (``generation.log`` and ``error.log``) are
        added. Both loggers are set to ``INFO``.
        """
        tag = self.run_tag
        self.generation_logging = logging.getLogger(f"generation_logging_{tag}")
        self.error_logging = logging.getLogger(f"error_logging_{tag}")
        run_loggers = (self.generation_logging, self.error_logging)

        module_dir = os.path.dirname(__file__)
        package_parent = os.path.abspath(os.path.join(module_dir, ".."))
        target_dir = os.path.join(package_parent, 'mep_client', 'log', tag)
        os.makedirs(target_dir, exist_ok=True)

        # logging.getLogger returns the same object for the same name, so
        # a logger may still hold handlers from an earlier instance.
        for run_logger in run_loggers:
            for stale in list(run_logger.handlers):
                stale.close()
                run_logger.removeHandler(stale)

        # Open both files before configuring anything, one per logger, in
        # the same order as run_loggers.
        file_handlers = [
            logging.FileHandler(
                os.path.join(target_dir, log_name), mode='a', encoding='utf-8'
            )
            for log_name in ('generation.log', 'error.log')
        ]

        layout = logging.Formatter('\n%(asctime)s %(levelname)s\n%(message)s')
        for file_handler in file_handlers:
            file_handler.setFormatter(layout)
        for run_logger in run_loggers:
            run_logger.setLevel(logging.INFO)
        for run_logger, file_handler in zip(run_loggers, file_handlers):
            run_logger.addHandler(file_handler)

    def list_prompts_to_questions(self, prompts):
        """Wrap each prompt in a single-message chat conversation.

        Args:
            prompts: Iterable of prompt values.

        Returns:
            A list with one entry per prompt, each entry being a
            one-element list ``[{"role": "user", "content": prompt}]``.
        """
        questions = []
        for prompt in prompts:
            questions.append([{"role": "user", "content": prompt}])
        return questions
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from .BaseModel import BaseModel
|