llmdep 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86) hide show
  1. llmdep-0.2.0/LICENSE +21 -0
  2. llmdep-0.2.0/MANIFEST.in +33 -0
  3. llmdep-0.2.0/PKG-INFO +233 -0
  4. llmdep-0.2.0/README.md +206 -0
  5. llmdep-0.2.0/llm/BaseModel.py +40 -0
  6. llmdep-0.2.0/llm/__init__.py +1 -0
  7. llmdep-0.2.0/llmdep.egg-info/PKG-INFO +233 -0
  8. llmdep-0.2.0/llmdep.egg-info/SOURCES.txt +85 -0
  9. llmdep-0.2.0/llmdep.egg-info/dependency_links.txt +1 -0
  10. llmdep-0.2.0/llmdep.egg-info/entry_points.txt +3 -0
  11. llmdep-0.2.0/llmdep.egg-info/requires.txt +18 -0
  12. llmdep-0.2.0/llmdep.egg-info/top_level.txt +5 -0
  13. llmdep-0.2.0/llmdep_cli.py +1384 -0
  14. llmdep-0.2.0/llmdep_frontend/__init__.py +0 -0
  15. llmdep-0.2.0/llmdep_frontend/index.html +163 -0
  16. llmdep-0.2.0/llmdep_frontend/main.js +356 -0
  17. llmdep-0.2.0/llmdep_frontend/server.py +416 -0
  18. llmdep-0.2.0/llmdep_frontend/styles.css +472 -0
  19. llmdep-0.2.0/mep_client/API/__init__.py +29 -0
  20. llmdep-0.2.0/mep_client/API/auto_discover.py +141 -0
  21. llmdep-0.2.0/mep_client/API/clear_api.py +172 -0
  22. llmdep-0.2.0/mep_client/API/evaluation_api.py +395 -0
  23. llmdep-0.2.0/mep_client/API/main.py +81 -0
  24. llmdep-0.2.0/mep_client/__init__.py +0 -0
  25. llmdep-0.2.0/mep_client/benchmark_remote.py +124 -0
  26. llmdep-0.2.0/mep_client/config.py +268 -0
  27. llmdep-0.2.0/mep_client/run/__init__.py +0 -0
  28. llmdep-0.2.0/mep_client/run/all.json +3 -0
  29. llmdep-0.2.0/mep_client/run/generate_and_evaluate.py +113 -0
  30. llmdep-0.2.0/mep_client/run/generate_and_evaluate_all.py +128 -0
  31. llmdep-0.2.0/mep_client/run/run.py +98 -0
  32. llmdep-0.2.0/mep_client/run/utils/__init__.py +1 -0
  33. llmdep-0.2.0/mep_client/run/utils/benchmark_remote.py +1 -0
  34. llmdep-0.2.0/mep_client/run/utils/generate_user_answers.py +670 -0
  35. llmdep-0.2.0/mep_client/run/utils/server_data_access.py +199 -0
  36. llmdep-0.2.0/method_server/__init__.py +0 -0
  37. llmdep-0.2.0/method_server/eval/icl_evaluator/__init__.py +8 -0
  38. llmdep-0.2.0/method_server/eval/icl_evaluator/icl_aucroc_evaluator.py +40 -0
  39. llmdep-0.2.0/method_server/eval/icl_evaluator/icl_base_evaluator.py +10 -0
  40. llmdep-0.2.0/method_server/eval/icl_evaluator/icl_em_evaluator.py +82 -0
  41. llmdep-0.2.0/method_server/eval/icl_evaluator/icl_f1_evaluator.py +50 -0
  42. llmdep-0.2.0/method_server/eval/icl_evaluator/icl_hf_evaluator.py +321 -0
  43. llmdep-0.2.0/method_server/eval/icl_evaluator/icl_jieba_rouge_evaluator.py +41 -0
  44. llmdep-0.2.0/method_server/eval/icl_evaluator/icl_matchAcc_evaluator.py +41 -0
  45. llmdep-0.2.0/method_server/eval/metrics/accuracy/README.md +119 -0
  46. llmdep-0.2.0/method_server/eval/metrics/accuracy/accuracy.py +106 -0
  47. llmdep-0.2.0/method_server/eval/metrics/accuracy/app.py +6 -0
  48. llmdep-0.2.0/method_server/eval/metrics/accuracy/requirements.txt +2 -0
  49. llmdep-0.2.0/method_server/eval/metrics/bleu/README.md +160 -0
  50. llmdep-0.2.0/method_server/eval/metrics/bleu/app.py +6 -0
  51. llmdep-0.2.0/method_server/eval/metrics/bleu/bleu.py +133 -0
  52. llmdep-0.2.0/method_server/eval/metrics/bleu/nmt_bleu.py +112 -0
  53. llmdep-0.2.0/method_server/eval/metrics/bleu/requirements.txt +1 -0
  54. llmdep-0.2.0/method_server/eval/metrics/bleu/tokenizer_13a.py +100 -0
  55. llmdep-0.2.0/method_server/eval/metrics/exact_match/README.md +119 -0
  56. llmdep-0.2.0/method_server/eval/metrics/exact_match/app.py +6 -0
  57. llmdep-0.2.0/method_server/eval/metrics/exact_match/exact_match.py +136 -0
  58. llmdep-0.2.0/method_server/eval/metrics/exact_match/requirements.txt +1 -0
  59. llmdep-0.2.0/method_server/eval/metrics/f1/README.md +137 -0
  60. llmdep-0.2.0/method_server/eval/metrics/f1/app.py +6 -0
  61. llmdep-0.2.0/method_server/eval/metrics/f1/f1.py +130 -0
  62. llmdep-0.2.0/method_server/eval/metrics/f1/requirements.txt +2 -0
  63. llmdep-0.2.0/method_server/eval/metrics/precision/README.md +142 -0
  64. llmdep-0.2.0/method_server/eval/metrics/precision/app.py +6 -0
  65. llmdep-0.2.0/method_server/eval/metrics/precision/precision.py +145 -0
  66. llmdep-0.2.0/method_server/eval/metrics/precision/requirements.txt +2 -0
  67. llmdep-0.2.0/method_server/eval/metrics/recall/README.md +132 -0
  68. llmdep-0.2.0/method_server/eval/metrics/recall/app.py +6 -0
  69. llmdep-0.2.0/method_server/eval/metrics/recall/recall.py +135 -0
  70. llmdep-0.2.0/method_server/eval/metrics/recall/requirements.txt +2 -0
  71. llmdep-0.2.0/method_server/eval/metrics/roc_auc/README.md +204 -0
  72. llmdep-0.2.0/method_server/eval/metrics/roc_auc/app.py +6 -0
  73. llmdep-0.2.0/method_server/eval/metrics/roc_auc/requirements.txt +2 -0
  74. llmdep-0.2.0/method_server/eval/metrics/roc_auc/roc_auc.py +191 -0
  75. llmdep-0.2.0/method_server/eval/metrics/rouge/README.md +160 -0
  76. llmdep-0.2.0/method_server/eval/metrics/rouge/app.py +6 -0
  77. llmdep-0.2.0/method_server/eval/metrics/rouge/requirements.txt +4 -0
  78. llmdep-0.2.0/method_server/eval/metrics/rouge/rouge.py +158 -0
  79. llmdep-0.2.0/method_server/eval/metrics/sacrebleu/README.md +119 -0
  80. llmdep-0.2.0/method_server/eval/metrics/sacrebleu/app.py +11 -0
  81. llmdep-0.2.0/method_server/eval/metrics/sacrebleu/requirements.txt +2 -0
  82. llmdep-0.2.0/method_server/eval/metrics/sacrebleu/sacrebleu.py +178 -0
  83. llmdep-0.2.0/method_server/eval/utils/text_postprocessors.py +148 -0
  84. llmdep-0.2.0/pyproject.toml +3 -0
  85. llmdep-0.2.0/setup.cfg +66 -0
  86. llmdep-0.2.0/setup.py +2 -0
llmdep-0.2.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Anonymous
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,33 @@
1
+ include README.md
2
+ include LICENSE
3
+ include setup.py
4
+ include setup.cfg
5
+ include MANIFEST.in
6
+
7
+ recursive-include mep_client *.py *.json
8
+ recursive-exclude mep_client/final_results *
9
+ recursive-exclude mep_client/log *
10
+
11
+ include llm/BaseModel.py
12
+ include llm/__init__.py
13
+ include method_server/*.py
14
+ recursive-include llmdep_frontend *.py *.html *.js *.css
15
+
16
+ recursive-include method_server/eval *.py *.txt *.md
17
+ recursive-include dep_server *.py *.md
18
+
19
+ recursive-exclude method_server *.csv *.jsonl *.parquet *.arrow *.pkl *.pt *.pth *.safetensors
20
+ recursive-exclude method_server/*/cache *
21
+ recursive-exclude method_server/*/data *
22
+
23
+ global-exclude config_backup.py
24
+ global-exclude __pycache__
25
+ global-exclude *.pyc
26
+ global-exclude *.pyo
27
+ global-exclude *.pyd
28
+ global-exclude *.parquet
29
+ global-exclude *.arrow
30
+ global-exclude .gitignore
31
+ global-exclude .git
32
+ global-exclude .idea
33
+ global-exclude *.egg-info
llmdep-0.2.0/PKG-INFO ADDED
@@ -0,0 +1,233 @@
1
+ Metadata-Version: 2.1
2
+ Name: llmdep
3
+ Version: 0.2.0
4
+ Summary: Distributed LLM Evaluation Benchmark Server
5
+ Home-page: https://github.com/tjunlp-lab/DEP
6
+ Author: TJUNLP
7
+ License: MIT
8
+ Platform: UNKNOWN
9
+ Classifier: Development Status :: 3 - Alpha
10
+ Classifier: Intended Audience :: Developers
11
+ Classifier: Intended Audience :: Science/Research
12
+ Classifier: License :: OSI Approved :: MIT License
13
+ Classifier: Operating System :: OS Independent
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.9
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Topic :: Scientific/Engineering
20
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
21
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
22
+ Requires-Python: >=3.9
23
+ Description-Content-Type: text/markdown
24
+ License-File: LICENSE
25
+
26
+ # llmdep - Distributed LLM Evaluation Benchmark Server
27
+
28
+ [![Python](https://img.shields.io/badge/python-3.9+-blue.svg)](https://www.python.org/)
29
+ [![License](https://img.shields.io/badge/license-MIT-green.svg)](LICENSE)
30
+
31
+ A distributed benchmark server for evaluating Large Language Models (LLMs) across multiple benchmarks.
32
+
33
+ ## Features
34
+
35
+ - **Distributed Evaluation**: Supports concurrent worker processes
36
+ - **Multiple Benchmarks**: ARC, ETHICS, MMLU-Pro, BiPaR, and more
37
+ - **Rate Limiting**: Token bucket based rate limiting for API calls
38
+ - **Batch Processing**: Process multiple model-benchmark pairs in batch
39
+ - **Resume Support**: Continue interrupted tasks with `--run_id`
40
+
41
+ ## Installation
42
+
43
+ ```bash
44
+ pip install llmdep
45
+ ```
46
+
47
+ Or install from source:
48
+
49
+ ```bash
50
+ pip install -e .
51
+ ```
52
+
53
+ ## Quick Start
54
+
55
+ ### Configure Benchmark Workspace
56
+
57
+ Benchmarks should live outside the installed pip package. Configure a benchmark root once:
58
+
59
+ ```bash
60
+ llmdep config set dataset_root /data/eval_datasets
61
+ ```
62
+
63
+ The directory should contain benchmark folders directly:
64
+
65
+ ```text
66
+ /data/eval_datasets/
67
+ ├── BiPaR/
68
+ │ ├── dataset_card.json
69
+ │ ├── methods/
70
+ │ ├── data/
71
+ │ └── cache/
72
+ └── gsm8k/
73
+ ├── dataset_card.json
74
+ ├── methods/
75
+ ├── data/
76
+ └── cache/
77
+ ```
78
+
79
+ Check the active benchmark root:
80
+
81
+ ```bash
82
+ llmdep list benchmark
83
+ ```
84
+
85
+ Configure a model root once as well:
86
+
87
+ ```bash
88
+ llmdep config set model_root /data/llm_models
89
+ ```
90
+
91
+ The directory should contain only user model folders directly. `BaseModel` is not placed here; it remains inside the installed `llmdep` package.
92
+
93
+ ```text
94
+ /data/llm_models/
95
+ └── qwen-plus/
96
+ ├── model.py
97
+ └── model_card.json
98
+ ```
99
+
100
+ Each `model.py` should expose a `Model` class with `call_with_prompt(...)`. Import the base class from the installed package:
101
+
102
+ ```python
103
+ from llm.BaseModel import BaseModel
104
+
105
+ class Model(BaseModel):
106
+ def call_with_prompt(self, input_data):
107
+ ...
108
+ ```
109
+
110
+ ### Single Task
111
+
112
+ ```bash
113
+ # Generate answers
114
+ llmdep gen_eval -b ARC -m qwen-plus -w 10
115
+
116
+ # Generate answers and evaluate
117
+ llmdep gen_eval -b ARC -m qwen-plus -w 10 --evaluate
118
+ ```
119
+
120
+ ### Batch Processing
121
+
122
+ Create a task configuration file `tasks.json`:
123
+
124
+ ```json
125
+ [
126
+ {"model": "qwen-plus", "benchmark": "ARC_parquet"},
127
+ {"model": "deepseek", "benchmark": "ethics"}
128
+ ]
129
+ ```
130
+
131
+ Then run:
132
+
133
+ ```bash
134
+ llmdep batch --config tasks.json -w 10
135
+ ```
136
+
137
+ ### Resume Interrupted Task
138
+
139
+ ```bash
140
+ llmdep gen_eval --run_id 20260210-164048-322281
141
+ ```
142
+
143
+ ## Commands
144
+
145
+ ### gen_eval
146
+
147
+ Generate model answers and optionally evaluate.
148
+
149
+ ```bash
150
+ llmdep gen_eval [options]
151
+
152
+ Options:
153
+ -b, --benchmark Benchmark name (e.g., ARC, ethics)
154
+ -m, --model Model name (e.g., qwen-plus, deepseek)
155
+ -w, --worker_nums Number of concurrent workers (default: 2)
156
+ --batch_size Batch size (default: 1)
157
+ --rate_limit Token bucket fill rate (default: 5.0)
158
+ --bucket_capacity Token bucket capacity (default: 10.0)
159
+ --limited_test Quick test with first 10 questions, or pass N
160
+ --evaluate Run evaluation after generation
161
+ --run_id Resume from run_id
162
+ ```
163
+
164
+ ### evaluate
165
+
166
+ Run evaluation on existing results.
167
+
168
+ ```bash
169
+ llmdep evaluate -b ARC -r 20260210-164048-322281_qwen-plus
170
+ ```
171
+
172
+ ### list
173
+
174
+ List available benchmarks or models.
175
+
176
+ ```bash
177
+ llmdep list benchmark
178
+ llmdep list model
179
+ ```
180
+
181
+ ### dashboard
182
+
183
+ Start the local web control panel.
184
+
185
+ ```bash
186
+ llmdep dashboard
187
+ llmdep dashboard --host 127.0.0.1 --port 8765
188
+ ```
189
+
190
+ ### batch
191
+
192
+ Batch process tasks from config file.
193
+
194
+ ```bash
195
+ llmdep batch --config tasks.json
196
+ ```
197
+
198
+ ## Project Structure
199
+
200
+ ```
201
+ llmdep/
202
+ ├── llm/ # Packaged BaseModel compatibility module
203
+ ├── mep_client/ # Client for generating answers
204
+ │ ├── run/
205
+ │ └── API/
206
+ ├── method_server/ # Benchmark implementations
207
+ │ ├── ARC_parquet/
208
+ │ ├── ethics/
209
+ │ ├── BiPaR/
210
+ │ └── ...
211
+ └── llmdep_cli.py # CLI entry point
212
+ ```
213
+
214
+ ## Adding New Models
215
+
216
+ 1. Configure `model_root`: `llmdep config set model_root /data/llm_models`
217
+ 2. Create folder under `/data/llm_models/<your_model>/`
218
+ 3. Implement `model.py` inheriting from `BaseModel`
219
+ 4. Create `model_card.json`
220
+
221
+ ## Adding New Benchmarks
222
+
223
+ 1. Configure `dataset_root`: `llmdep config set dataset_root /data/eval_datasets`
224
+ 2. Create folder under `/data/eval_datasets/<benchmark_name>/`
225
+ 3. Implement `prepare_ques.py` - Prepare questions and golden answers
226
+ 4. Implement `eval_script.py` - Run evaluation
227
+ 5. Create `dataset_card.json`
228
+
229
+ ## License
230
+
231
+ MIT License
232
+
233
+
llmdep-0.2.0/README.md ADDED
@@ -0,0 +1,206 @@
1
+ # llmdep - Distributed LLM Evaluation Benchmark Server
2
+
3
+ [![Python](https://img.shields.io/badge/python-3.9+-blue.svg)](https://www.python.org/)
4
+ [![License](https://img.shields.io/badge/license-MIT-green.svg)](LICENSE)
5
+
6
+ A distributed benchmark server for evaluating Large Language Models (LLMs) across multiple benchmarks.
7
+
8
+ ## Features
9
+
10
+ - **Distributed Evaluation**: Supports concurrent worker processes
11
+ - **Multiple Benchmarks**: ARC, ETHICS, MMLU-Pro, BiPaR, and more
12
+ - **Rate Limiting**: Token bucket based rate limiting for API calls
13
+ - **Batch Processing**: Process multiple model-benchmark pairs in batch
14
+ - **Resume Support**: Continue interrupted tasks with `--run_id`
15
+
16
+ ## Installation
17
+
18
+ ```bash
19
+ pip install llmdep
20
+ ```
21
+
22
+ Or install from source:
23
+
24
+ ```bash
25
+ pip install -e .
26
+ ```
27
+
28
+ ## Quick Start
29
+
30
+ ### Configure Benchmark Workspace
31
+
32
+ Benchmarks should live outside the installed pip package. Configure a benchmark root once:
33
+
34
+ ```bash
35
+ llmdep config set dataset_root /data/eval_datasets
36
+ ```
37
+
38
+ The directory should contain benchmark folders directly:
39
+
40
+ ```text
41
+ /data/eval_datasets/
42
+ ├── BiPaR/
43
+ │ ├── dataset_card.json
44
+ │ ├── methods/
45
+ │ ├── data/
46
+ │ └── cache/
47
+ └── gsm8k/
48
+ ├── dataset_card.json
49
+ ├── methods/
50
+ ├── data/
51
+ └── cache/
52
+ ```
53
+
54
+ Check the active benchmark root:
55
+
56
+ ```bash
57
+ llmdep list benchmark
58
+ ```
59
+
60
+ Configure a model root once as well:
61
+
62
+ ```bash
63
+ llmdep config set model_root /data/llm_models
64
+ ```
65
+
66
+ The directory should contain only user model folders directly. `BaseModel` is not placed here; it remains inside the installed `llmdep` package.
67
+
68
+ ```text
69
+ /data/llm_models/
70
+ └── qwen-plus/
71
+ ├── model.py
72
+ └── model_card.json
73
+ ```
74
+
75
+ Each `model.py` should expose a `Model` class with `call_with_prompt(...)`. Import the base class from the installed package:
76
+
77
+ ```python
78
+ from llm.BaseModel import BaseModel
79
+
80
+ class Model(BaseModel):
81
+ def call_with_prompt(self, input_data):
82
+ ...
83
+ ```
84
+
85
+ ### Single Task
86
+
87
+ ```bash
88
+ # Generate answers
89
+ llmdep gen_eval -b ARC -m qwen-plus -w 10
90
+
91
+ # Generate answers and evaluate
92
+ llmdep gen_eval -b ARC -m qwen-plus -w 10 --evaluate
93
+ ```
94
+
95
+ ### Batch Processing
96
+
97
+ Create a task configuration file `tasks.json`:
98
+
99
+ ```json
100
+ [
101
+ {"model": "qwen-plus", "benchmark": "ARC_parquet"},
102
+ {"model": "deepseek", "benchmark": "ethics"}
103
+ ]
104
+ ```
105
+
106
+ Then run:
107
+
108
+ ```bash
109
+ llmdep batch --config tasks.json -w 10
110
+ ```
111
+
112
+ ### Resume Interrupted Task
113
+
114
+ ```bash
115
+ llmdep gen_eval --run_id 20260210-164048-322281
116
+ ```
117
+
118
+ ## Commands
119
+
120
+ ### gen_eval
121
+
122
+ Generate model answers and optionally evaluate.
123
+
124
+ ```bash
125
+ llmdep gen_eval [options]
126
+
127
+ Options:
128
+ -b, --benchmark Benchmark name (e.g., ARC, ethics)
129
+ -m, --model Model name (e.g., qwen-plus, deepseek)
130
+ -w, --worker_nums Number of concurrent workers (default: 2)
131
+ --batch_size Batch size (default: 1)
132
+ --rate_limit Token bucket fill rate (default: 5.0)
133
+ --bucket_capacity Token bucket capacity (default: 10.0)
134
+ --limited_test Quick test with first 10 questions, or pass N
135
+ --evaluate Run evaluation after generation
136
+ --run_id Resume from run_id
137
+ ```
138
+
139
+ ### evaluate
140
+
141
+ Run evaluation on existing results.
142
+
143
+ ```bash
144
+ llmdep evaluate -b ARC -r 20260210-164048-322281_qwen-plus
145
+ ```
146
+
147
+ ### list
148
+
149
+ List available benchmarks or models.
150
+
151
+ ```bash
152
+ llmdep list benchmark
153
+ llmdep list model
154
+ ```
155
+
156
+ ### dashboard
157
+
158
+ Start the local web control panel.
159
+
160
+ ```bash
161
+ llmdep dashboard
162
+ llmdep dashboard --host 127.0.0.1 --port 8765
163
+ ```
164
+
165
+ ### batch
166
+
167
+ Batch process tasks from config file.
168
+
169
+ ```bash
170
+ llmdep batch --config tasks.json
171
+ ```
172
+
173
+ ## Project Structure
174
+
175
+ ```
176
+ llmdep/
177
+ ├── llm/ # Packaged BaseModel compatibility module
178
+ ├── mep_client/ # Client for generating answers
179
+ │ ├── run/
180
+ │ └── API/
181
+ ├── method_server/ # Benchmark implementations
182
+ │ ├── ARC_parquet/
183
+ │ ├── ethics/
184
+ │ ├── BiPaR/
185
+ │ └── ...
186
+ └── llmdep_cli.py # CLI entry point
187
+ ```
188
+
189
+ ## Adding New Models
190
+
191
+ 1. Configure `model_root`: `llmdep config set model_root /data/llm_models`
192
+ 2. Create folder under `/data/llm_models/<your_model>/`
193
+ 3. Implement `model.py` inheriting from `BaseModel`
194
+ 4. Create `model_card.json`
195
+
196
+ ## Adding New Benchmarks
197
+
198
+ 1. Configure `dataset_root`: `llmdep config set dataset_root /data/eval_datasets`
199
+ 2. Create folder under `/data/eval_datasets/<benchmark_name>/`
200
+ 3. Implement `prepare_ques.py` - Prepare questions and golden answers
201
+ 4. Implement `eval_script.py` - Run evaluation
202
+ 5. Create `dataset_card.json`
203
+
204
+ ## License
205
+
206
+ MIT License
@@ -0,0 +1,40 @@
1
+ import os
2
+ import logging
3
+ import datetime
4
+
5
class BaseModel:
    """Common base for model wrappers.

    Stores the run arguments, derives a ``run_tag`` identifying the run, and
    attaches per-run file loggers for generation output and errors.
    """

    def __init__(self, args):
        """Record run settings from *args* and set up the run's loggers.

        Reads these optional attributes from *args*:
        ``worker_nums`` (default 1), ``model_name`` (falling back to
        ``modelname``, then ``None``) and ``run_id``.
        """
        self.args = args
        self.worker_num = getattr(args, 'worker_nums', 1)
        self.model_name = getattr(args, 'model_name', getattr(args, 'modelname', None))

        label = self.model_name or 'model'
        # Use the caller-supplied run id when present; otherwise fall back to
        # a second-resolution timestamp.
        prefix = getattr(args, 'run_id', None)
        if not prefix:
            prefix = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
        self.run_tag = f"{prefix}_{label}"

        self._set_logger()

    def _set_logger(self):
        """Create the generation/error loggers writing under the run's log directory.

        Logs go to ``<parent of this module's directory>/mep_client/log/<run_tag>/``
        as ``generation.log`` and ``error.log``. Handlers already attached to
        either logger are closed and removed first, so calling this again for
        the same ``run_tag`` does not stack handlers.
        """
        tag = self.run_tag
        self.generation_logging = logging.getLogger(f"generation_logging_{tag}")
        self.error_logging = logging.getLogger(f"error_logging_{tag}")
        run_loggers = (self.generation_logging, self.error_logging)

        package_parent = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
        log_dir = os.path.join(package_parent, 'mep_client', 'log', tag)
        os.makedirs(log_dir, exist_ok=True)

        # Drop handlers left over from an earlier setup with the same tag.
        for run_logger in run_loggers:
            for stale in list(run_logger.handlers):
                stale.close()
                run_logger.removeHandler(stale)

        # Open both log files before attaching either handler.
        file_handlers = [
            logging.FileHandler(os.path.join(log_dir, file_name), mode='a', encoding='utf-8')
            for file_name in ('generation.log', 'error.log')
        ]

        layout = logging.Formatter('\n%(asctime)s %(levelname)s\n%(message)s')
        for file_handler in file_handlers:
            file_handler.setFormatter(layout)
        for run_logger in run_loggers:
            run_logger.setLevel(logging.INFO)
        for run_logger, file_handler in zip(run_loggers, file_handlers):
            run_logger.addHandler(file_handler)

    def list_prompts_to_questions(self, prompts):
        """Wrap each prompt in a single-message chat list with role ``"user"``."""
        questions = []
        for prompt in prompts:
            questions.append([{"role": "user", "content": prompt}])
        return questions
@@ -0,0 +1 @@
1
+ from .BaseModel import BaseModel