astron-eval 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +119 -0
- package/bin/astron-eval.mjs +111 -0
- package/package.json +24 -0
- package/skills/astron-eval/SKILL.md +60 -0
- package/skills/model-evaluation/SKILL.md +180 -0
- package/skills/model-evaluation/assets/dimensions//345/206/205/345/256/271/347/233/270/345/205/263/346/200/247/347/273/264/345/272/246.json +20 -0
- package/skills/model-evaluation/assets/dimensions//345/206/205/345/256/271/347/262/276/347/241/256/347/273/264/345/272/246.json +19 -0
- package/skills/model-evaluation/assets/dimensions//345/207/206/347/241/256/346/200/247/347/273/264/345/272/246-/344/270/252/346/200/247/345/214/226/350/247/204/345/210/222.json +20 -0
- package/skills/model-evaluation/assets/dimensions//345/207/206/347/241/256/346/200/247/347/273/264/345/272/246-/344/277/241/346/201/257/345/210/206/346/236/220.json +20 -0
- package/skills/model-evaluation/assets/dimensions//345/207/206/347/241/256/346/200/247/347/273/264/345/272/246-/346/227/205/346/270/270/345/207/272/350/241/214.json +20 -0
- package/skills/model-evaluation/assets/dimensions//345/207/206/347/241/256/346/200/247/347/273/264/345/272/246.json +20 -0
- package/skills/model-evaluation/assets/dimensions//345/210/233/346/204/217/346/200/247-/345/220/270/345/274/225/346/200/247/347/273/264/345/272/246.json +21 -0
- package/skills/model-evaluation/assets/dimensions//345/210/233/346/226/260/346/200/247/347/273/264/345/272/246.json +20 -0
- package/skills/model-evaluation/assets/dimensions//345/256/214/346/225/264/346/200/247/347/273/264/345/272/246-/344/277/241/346/201/257/345/210/206/346/236/220.json +20 -0
- package/skills/model-evaluation/assets/dimensions//345/256/214/346/225/264/346/200/247/347/273/264/345/272/246.json +20 -0
- package/skills/model-evaluation/assets/dimensions//345/275/242/345/274/217/347/233/270/345/205/263/346/200/247/347/273/264/345/272/246.json +20 -0
- package/skills/model-evaluation/assets/dimensions//345/277/240/350/257/232/345/272/246/347/273/264/345/272/246.json +20 -0
- package/skills/model-evaluation/assets/dimensions//346/214/207/344/273/244/351/201/265/345/276/252/347/273/264/345/272/246.json +20 -0
- package/skills/model-evaluation/assets/dimensions//346/226/207/346/234/254/345/267/256/345/274/202/345/272/246-TER/347/273/264/345/272/246.json +20 -0
- package/skills/model-evaluation/assets/dimensions//346/234/211/346/225/210/346/200/247/347/273/264/345/272/246-/344/270/252/346/200/247/345/214/226/350/247/204/345/210/222.json +20 -0
- package/skills/model-evaluation/assets/dimensions//346/234/211/346/225/210/346/200/247/347/273/264/345/272/246-/344/277/241/346/201/257/345/210/206/346/236/220.json +20 -0
- package/skills/model-evaluation/assets/dimensions//346/234/211/346/225/210/346/200/247/347/273/264/345/272/246-/346/265/201/347/250/213/350/207/252/345/212/250/345/214/226.json +20 -0
- package/skills/model-evaluation/assets/dimensions//346/234/211/346/225/210/346/200/247/347/273/264/345/272/246.json +21 -0
- package/skills/model-evaluation/assets/dimensions//346/240/270/345/277/203/345/205/203/347/264/240/347/273/264/345/272/246.json +20 -0
- package/skills/model-evaluation/assets/dimensions//346/240/274/345/274/217/351/201/265/345/276/252/347/273/264/345/272/246.json +19 -0
- package/skills/model-evaluation/assets/dimensions//347/211/271/350/211/262/344/272/256/347/202/271/347/273/264/345/272/246.json +20 -0
- package/skills/model-evaluation/assets/dimensions//347/224/250/344/276/213/347/272/247/350/257/204/346/265/213/347/273/264/345/272/246/346/250/241/346/235/277.json +25 -0
- package/skills/model-evaluation/assets/dimensions//347/233/270/344/274/274/345/272/246-BERTScore/347/273/264/345/272/246.json +20 -0
- package/skills/model-evaluation/assets/dimensions//347/233/270/344/274/274/345/272/246-Cosine/347/273/264/345/272/246.json +20 -0
- package/skills/model-evaluation/assets/dimensions//347/233/270/344/274/274/345/272/246-ROUGE/347/273/264/345/272/246.json +20 -0
- package/skills/model-evaluation/assets/dimensions//347/233/270/345/205/263/346/200/247/347/273/264/345/272/246-/344/270/252/346/200/247/345/214/226/350/247/204/345/210/222.json +20 -0
- package/skills/model-evaluation/assets/dimensions//347/233/270/345/205/263/346/200/247/347/273/264/345/272/246.json +21 -0
- package/skills/model-evaluation/assets/dimensions//347/262/276/347/241/256/346/200/247-BLUE/347/273/264/345/272/246.json +20 -0
- package/skills/model-evaluation/assets/dimensions//347/262/276/347/241/256/346/200/247-COMET/347/273/264/345/272/246.json +20 -0
- package/skills/model-evaluation/assets/dimensions//351/200/273/350/276/221/345/220/210/347/220/206/346/200/247/347/273/264/345/272/246.json +20 -0
- package/skills/model-evaluation/assets/dimensions//351/200/273/350/276/221/350/277/236/350/264/257/346/200/247/347/273/264/345/272/246-/344/270/252/346/200/247/345/214/226/350/247/204/345/210/222.json +20 -0
- package/skills/model-evaluation/assets/dimensions//351/200/273/350/276/221/350/277/236/350/264/257/346/200/247/347/273/264/345/272/246-/344/277/241/346/201/257/345/210/206/346/236/220.json +20 -0
- package/skills/model-evaluation/assets/dimensions//351/200/273/350/276/221/350/277/236/350/264/257/346/200/247/347/273/264/345/272/246-/346/265/201/347/250/213/350/207/252/345/212/250/345/214/226.json +20 -0
- package/skills/model-evaluation/assets/dimensions//351/200/273/350/276/221/350/277/236/350/264/257/346/200/247/347/273/264/345/272/246.json +21 -0
- package/skills/model-evaluation/assets/eval-judge.json +11 -0
- package/skills/model-evaluation/assets/experts/business-process-automation.json +71 -0
- package/skills/model-evaluation/assets/experts/content-generation.json +75 -0
- package/skills/model-evaluation/assets/experts/content-match.json +37 -0
- package/skills/model-evaluation/assets/experts/information-analysis.json +87 -0
- package/skills/model-evaluation/assets/experts/marketing-digital-human.json +27 -0
- package/skills/model-evaluation/assets/experts/personalized-planning.json +87 -0
- package/skills/model-evaluation/assets/experts/text-translation.json +103 -0
- package/skills/model-evaluation/assets/experts/tourism-travel.json +119 -0
- package/skills/model-evaluation/assets/templates/custom-dimension.template.json +30 -0
- package/skills/model-evaluation/eval-build.md +281 -0
- package/skills/model-evaluation/eval-execute.md +196 -0
- package/skills/model-evaluation/eval-init.md +237 -0
- package/skills/model-evaluation/processes/dimension-process.md +207 -0
- package/skills/model-evaluation/processes/evalset-create-process.md +184 -0
- package/skills/model-evaluation/processes/evalset-parse-process.md +171 -0
- package/skills/model-evaluation/processes/evalset-supplement-process.md +136 -0
- package/skills/model-evaluation/processes/keypoint-process.md +148 -0
- package/skills/model-evaluation/processes/python-env-process.md +113 -0
- package/skills/model-evaluation/references//344/270/255/351/227/264/344/272/247/347/211/251/350/257/264/346/230/216.md +340 -0
- package/skills/model-evaluation/references//345/206/205/347/275/256/346/250/241/346/235/277/350/257/264/346/230/216.md +149 -0
- package/skills/model-evaluation/references//350/204/232/346/234/254/345/256/232/344/271/211.md +274 -0
- package/skills/model-evaluation/references//350/256/244/350/257/201/346/234/215/345/212/241/346/216/245/345/217/243/350/257/264/346/230/216.md +271 -0
- package/skills/model-evaluation/references//350/257/204/346/265/213/346/234/215/345/212/241/346/216/245/345/217/243/350/257/264/346/230/216.md +455 -0
- package/skills/model-evaluation/references//350/257/204/346/265/213/347/273/264/345/272/246/350/257/264/346/230/216.md +171 -0
- package/skills/model-evaluation/scripts/cfg/eval-auth.cfg +16 -0
- package/skills/model-evaluation/scripts/cfg/eval-server.cfg +1 -0
- package/skills/model-evaluation/scripts/clients/__init__.py +33 -0
- package/skills/model-evaluation/scripts/clients/api_client.py +97 -0
- package/skills/model-evaluation/scripts/clients/auth_client.py +96 -0
- package/skills/model-evaluation/scripts/clients/http_client.py +199 -0
- package/skills/model-evaluation/scripts/clients/oauth_callback.py +397 -0
- package/skills/model-evaluation/scripts/clients/token_manager.py +53 -0
- package/skills/model-evaluation/scripts/eval_auth.py +588 -0
- package/skills/model-evaluation/scripts/eval_dimension.py +240 -0
- package/skills/model-evaluation/scripts/eval_set.py +410 -0
- package/skills/model-evaluation/scripts/eval_task.py +324 -0
- package/skills/model-evaluation/scripts/files/__init__.py +38 -0
- package/skills/model-evaluation/scripts/files/file_utils.py +330 -0
- package/skills/model-evaluation/scripts/files/streaming.py +245 -0
- package/skills/model-evaluation/scripts/utils/__init__.py +128 -0
- package/skills/model-evaluation/scripts/utils/constants.py +101 -0
- package/skills/model-evaluation/scripts/utils/datetime_utils.py +60 -0
- package/skills/model-evaluation/scripts/utils/errors.py +244 -0
- package/skills/model-evaluation/scripts/utils/keypoint_prompts.py +73 -0
- package/skills/skill-driven-eval/SKILL.md +456 -0
- package/skills/skill-driven-eval/agents/grader.md +144 -0
- package/skills/skill-driven-eval/eval-viewer/__init__.py +1 -0
- package/skills/skill-driven-eval/eval-viewer/generate_report.py +485 -0
- package/skills/skill-driven-eval/eval-viewer/viewer.html +767 -0
- package/skills/skill-driven-eval/references/schemas.md +282 -0
- package/skills/skill-driven-eval/scripts/__init__.py +1 -0
- package/skills/skill-driven-eval/scripts/__main__.py +70 -0
- package/skills/skill-driven-eval/scripts/aggregate_results.py +681 -0
- package/skills/skill-driven-eval/scripts/extract_transcript.py +294 -0
- package/skills/skill-driven-eval/scripts/test_aggregate.py +244 -0
|
@@ -0,0 +1,340 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: intermediate-files
|
|
3
|
+
description: Use when needing to understand the format and fields of intermediate files generated during evaluation workflow
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# 中间产物说明
|
|
7
|
+
|
|
8
|
+
本文档定义评测流程中产生的中间文件格式和字段。
|
|
9
|
+
|
|
10
|
+
---
|
|
11
|
+
|
|
12
|
+
## 快速参考
|
|
13
|
+
|
|
14
|
+
| 阶段 | 文件名 | 说明 |
|
|
15
|
+
|------|--------|------|
|
|
16
|
+
| 初始化 | `auth.json` | 鉴权Token缓存 |
|
|
17
|
+
| | `state.json` | OAuth授权状态 |
|
|
18
|
+
| | `env.cfg` | 环境配置缓存 |
|
|
19
|
+
| 构建 | `eval-dimension.json` | 评测维度配置 |
|
|
20
|
+
| | `eval-judge.json` | 评委模型配置 |
|
|
21
|
+
| | `evalset-structure.json` | 评测集结构分析 |
|
|
22
|
+
| | `evalset-fields-mapping.json` | 字段映射配置 |
|
|
23
|
+
| | `evalset-prepared.{ext}` | 待标准化评测集(流程产物) |
|
|
24
|
+
| | `evalset-standard.jsonl` | 标准化评测集 |
|
|
25
|
+
| | `evalset-meta.json` | 评测集元信息 |
|
|
26
|
+
| 执行 | `evaltask-meta.json` | 任务元信息 |
|
|
27
|
+
| | `evaltask-result.json` | 评测结果JSON |
|
|
28
|
+
| | `evaltask-result.md` | 评测报告Markdown |
|
|
29
|
+
| | `eval-progress.json` | 任务进度信息 |
|
|
30
|
+
|
|
31
|
+
---
|
|
32
|
+
|
|
33
|
+
## 1. 初始化阶段文件
|
|
34
|
+
|
|
35
|
+
### auth.json
|
|
36
|
+
|
|
37
|
+
**路径**:`{work-dir}/.eval/auth.json`
|
|
38
|
+
|
|
39
|
+
| 字段 | 类型 | 说明 |
|
|
40
|
+
|------|------|------|
|
|
41
|
+
| `access_token` | `string` | 访问令牌 |
|
|
42
|
+
| `expires_in` | `int` | 有效时长(秒) |
|
|
43
|
+
| `created_at` | `string` | 创建时间(ISO 8601) |
|
|
44
|
+
| `expires_at` | `string` | 过期时间(ISO 8601) |
|
|
45
|
+
|
|
46
|
+
### state.json
|
|
47
|
+
|
|
48
|
+
**路径**:`{work-dir}/.eval/state.json`
|
|
49
|
+
|
|
50
|
+
| 字段 | 类型 | 说明 |
|
|
51
|
+
|------|------|------|
|
|
52
|
+
| `state_token` | `string` | 授权状态标识(UUID) |
|
|
53
|
+
|
|
54
|
+
### env.cfg
|
|
55
|
+
|
|
56
|
+
**路径**:`{work-dir}/.eval/env.cfg`
|
|
57
|
+
|
|
58
|
+
| 字段 | 说明 |
|
|
59
|
+
|------|------|
|
|
60
|
+
| `python_cmd` | Python命令(`python` 或 `python3`) |
|
|
61
|
+
| `deps_required` | 必需依赖列表 |
|
|
62
|
+
| `deps_optional` | 可选依赖列表 |
|
|
63
|
+
| `created_at` | 创建时间 |
|
|
64
|
+
|
|
65
|
+
---
|
|
66
|
+
|
|
67
|
+
## 2. 构建阶段文件
|
|
68
|
+
|
|
69
|
+
### 评测集文件关系
|
|
70
|
+
|
|
71
|
+
```
|
|
72
|
+
评测集来源(任务4步骤1)
|
|
73
|
+
│
|
|
74
|
+
├─► 流程3:解析评测集 ──► evalset-prepared.{ext}
|
|
75
|
+
│ │
|
|
76
|
+
│ ▼
|
|
77
|
+
├─► 流程2:补充答案 ──► evalset-prepared.jsonl
|
|
78
|
+
│ │
|
|
79
|
+
│ ▼
|
|
80
|
+
├─► 流程1:生成评测集 ──► evalset-prepared.jsonl
|
|
81
|
+
│ │
|
|
82
|
+
└──────────────────────────────┘
|
|
83
|
+
│
|
|
84
|
+
▼
|
|
85
|
+
任务4步骤2:标准化转换
|
|
86
|
+
│
|
|
87
|
+
▼
|
|
88
|
+
evalset-standard.jsonl
|
|
89
|
+
│
|
|
90
|
+
▼
|
|
91
|
+
任务4步骤3:上传评测集
|
|
92
|
+
│
|
|
93
|
+
▼
|
|
94
|
+
evalset-meta.json
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
**文件生成流程**:
|
|
98
|
+
1. **流程1(生成评测集)**:AI生成 → 预览确认 → 字段映射 → 任务4步骤2
|
|
99
|
+
2. **流程2(补充答案)**:获取文件 → 验证 → AI补充答案 → 预览确认 → 字段映射 → 任务4步骤2
|
|
100
|
+
3. **流程3(解析评测集)**:下载/复制 → 解析字段映射 → 用户确认 → 任务4步骤2
|
|
101
|
+
|
|
102
|
+
---
|
|
103
|
+
|
|
104
|
+
### eval-dimension.json
|
|
105
|
+
|
|
106
|
+
**路径**:`{work-dir}/.eval/{session-id}/eval-dimension.json`
|
|
107
|
+
|
|
108
|
+
字段定义见 [评测维度说明.md](./评测维度说明.md) 第2-3节。
|
|
109
|
+
|
|
110
|
+
### eval-judge.json
|
|
111
|
+
|
|
112
|
+
**路径**:`{work-dir}/.eval/{session-id}/eval-judge.json`
|
|
113
|
+
|
|
114
|
+
| 字段 | 类型 | 必填 | 说明 |
|
|
115
|
+
|------|------|------|------|
|
|
116
|
+
| `models` | `array` | 是 | LLM评委列表 |
|
|
117
|
+
|
|
118
|
+
**models[].LLMJudge**:
|
|
119
|
+
|
|
120
|
+
| 字段 | 类型 | 必填 | 说明 |
|
|
121
|
+
|------|------|------|------|
|
|
122
|
+
| `id` | `string` | 是 | 模型标识(用于填充judge_id) |
|
|
123
|
+
| `name` | `string` | 否 | 模型名称 |
|
|
124
|
+
| `type` | `string` | 否 | 类型 |
|
|
125
|
+
| `concurrency` | `int` | 是 | 并发数 |
|
|
126
|
+
|
|
127
|
+
> **注意**:`api_key` 和 `api_url` 为空是正常现象,仅在使用自定义评委时需要配置。
|
|
128
|
+
|
|
129
|
+
### evalset-structure.json
|
|
130
|
+
|
|
131
|
+
**路径**:`{work-dir}/.eval/{session-id}/evalset/evalset-structure.json`
|
|
132
|
+
|
|
133
|
+
**说明**:由 `eval_set.py analysis` 命令生成的评测集结构文件,用于字段映射生成。
|
|
134
|
+
|
|
135
|
+
| 字段 | 类型 | 说明 |
|
|
136
|
+
|------|------|------|
|
|
137
|
+
| `file` | `string` | 文件路径 |
|
|
138
|
+
| `format` | `string` | 文件格式(json/jsonl/csv/xlsx) |
|
|
139
|
+
| `total_rows` | `int` | 总行数 |
|
|
140
|
+
| `fields` | `object` | 字段信息,key 为字段名,value 包含 type 字段 |
|
|
141
|
+
|
|
142
|
+
**示例**:
|
|
143
|
+
|
|
144
|
+
```json
|
|
145
|
+
{
|
|
146
|
+
"file": "dataset/sample-evalset.jsonl",
|
|
147
|
+
"format": "jsonl",
|
|
148
|
+
"total_rows": 100,
|
|
149
|
+
"fields": {
|
|
150
|
+
"question": {"type": "str"},
|
|
151
|
+
"answer": {"type": "str"},
|
|
152
|
+
"model": {"type": "str"},
|
|
153
|
+
"id": {"type": "int"}
|
|
154
|
+
}
|
|
155
|
+
}
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
### evalset-standard.jsonl
|
|
159
|
+
|
|
160
|
+
**路径**:`{work-dir}/.eval/{session-id}/evalset/evalset-standard.jsonl`
|
|
161
|
+
|
|
162
|
+
| 字段 | 类型 | 必填 | 说明 |
|
|
163
|
+
|------|------|------|------|
|
|
164
|
+
| `question` | `string` | 是 | 评测问题 |
|
|
165
|
+
| `answer` | `string` | 是 | 模型回答 |
|
|
166
|
+
| `model` | `string` | 是 | 模型标识(默认 `default`) |
|
|
167
|
+
| `case_id` | `string` | 是 | 用例标识(见下方说明) |
|
|
168
|
+
| `system` | `string` | 否 | 系统提示词 |
|
|
169
|
+
| `context` | `string` | 否 | 上下文信息 |
|
|
170
|
+
| `category` | `string` | 否 | 分类标签 |
|
|
171
|
+
| `reference` | `string` | 否 | 参考答案 |
|
|
172
|
+
| `keypoint` | `string` | 否 | 评测点列表(JSON字符串数组) |
|
|
173
|
+
|
|
174
|
+
**case_id 字段说明**:
|
|
175
|
+
|
|
176
|
+
`case_id` 用于唯一标识评测问题,在多模型横评场景下有重要作用。
|
|
177
|
+
|
|
178
|
+
| 场景 | case_id 生成规则 |
|
|
179
|
+
|------|-----------------|
|
|
180
|
+
| 源数据有 case_id 字段 | 直接使用源数据的值 |
|
|
181
|
+
| 源数据无 case_id 字段 | 根据 question 字段分组,相同问题生成相同 case_id |
|
|
182
|
+
|
|
183
|
+
**多模型横评示例**:
|
|
184
|
+
|
|
185
|
+
假设评测两个模型(A和B)对两个问题的回答:
|
|
186
|
+
|
|
187
|
+
| seq_id | case_id | model | question | answer |
|
|
188
|
+
|--------|---------|-------|----------|--------|
|
|
189
|
+
| 0 | case-0001 | A | 亚运会在哪个城市举行? | 杭州 |
|
|
190
|
+
| 1 | case-0001 | B | 亚运会在哪个城市举行? | 杭州 |
|
|
191
|
+
| 2 | case-0002 | A | 上海科技馆由哪几个主要展馆组成? | 五个展馆 |
|
|
192
|
+
| 3 | case-0002 | B | 上海科技馆由哪几个主要展馆组成? | 三个展馆 |
|
|
193
|
+
|
|
194
|
+
- `case_id`:问题的唯一标识,同一问题的不同模型回答共享相同 case_id
|
|
195
|
+
- `seq_id`:行的唯一标识(内部序号)
|
|
196
|
+
|
|
197
|
+
### evalset-prepared.jsonl / evalset-prepared.{ext}
|
|
198
|
+
|
|
199
|
+
**路径**:`{work-dir}/.eval/{session-id}/evalset/evalset-prepared.{ext}`
|
|
200
|
+
|
|
201
|
+
**说明**:各流程产出的待标准化评测集文件,统一由任务4步骤2进行标准化转换。
|
|
202
|
+
|
|
203
|
+
| 来源流程 | 文件名 | 说明 |
|
|
204
|
+
|----------|--------|------|
|
|
205
|
+
| 流程1:生成评测集 | `evalset-prepared.jsonl` | AI 生成的评测集,包含问题和答案 |
|
|
206
|
+
| 流程2:补充答案 | `evalset-prepared.jsonl` | 用户问题 + AI 补充答案 |
|
|
207
|
+
| 流程3:解析评测集 | `evalset-prepared.{ext}` | 用户提供的原始评测集(保留原格式) |
|
|
208
|
+
|
|
209
|
+
**字段说明**(流程1/2产物):
|
|
210
|
+
|
|
211
|
+
| 字段 | 类型 | 必填 | 说明 |
|
|
212
|
+
|------|------|------|------|
|
|
213
|
+
| `question` | `string` | 是 | 问题内容 |
|
|
214
|
+
| `answer` | `string` | 是 | 答案内容 |
|
|
215
|
+
| `model` | `string` | 是 | 模型标识(`generated` 或 `user-provided`) |
|
|
216
|
+
| `case_id` | `string` | 是 | 用例标识 |
|
|
217
|
+
| `category` | `string` | 否 | 分类标签 |
|
|
218
|
+
| `reference` | `string` | 否 | 参考信息 |
|
|
219
|
+
|
|
220
|
+
**流程1示例(AI生成)**:
|
|
221
|
+
|
|
222
|
+
```jsonl
|
|
223
|
+
{"case_id": "case-0001", "model": "generated", "question": "请为我推荐一个适合周末短途旅行的目的地。", "answer": "根据您的需求,我推荐苏州、杭州等周边城市...", "category": "旅游推荐"}
|
|
224
|
+
{"case_id": "case-0002", "model": "generated", "question": "如何规划一个三口之家的日本七日游?", "answer": "以下是为您规划的七日游行程...", "category": "行程规划"}
|
|
225
|
+
```
|
|
226
|
+
|
|
227
|
+
**流程2示例(补充答案)**:
|
|
228
|
+
|
|
229
|
+
```jsonl
|
|
230
|
+
{"case_id": "case-001", "model": "user-provided", "question": "什么是大语言模型?", "answer": "大语言模型(Large Language Model,LLM)是一种基于深度学习的自然语言处理模型..."}
|
|
231
|
+
{"case_id": "case-002", "model": "user-provided", "question": "如何提高代码质量?", "answer": "提高代码质量可以从以下几个方面入手:1. 遵循编码规范...", "category": "编程"}
|
|
232
|
+
```
|
|
233
|
+
|
|
234
|
+
### evalset-meta.json
|
|
235
|
+
|
|
236
|
+
**路径**:`{work-dir}/.eval/{session-id}/evalset/evalset-meta.json`
|
|
237
|
+
|
|
238
|
+
| 字段 | 类型 | 说明 |
|
|
239
|
+
|------|------|------|
|
|
240
|
+
| `dataset` | `string` | 评测集标识(evalset_id) |
|
|
241
|
+
| `dataset_size` | `int` | 评测集大小 |
|
|
242
|
+
|
|
243
|
+
### evalset-fields-mapping.json
|
|
244
|
+
|
|
245
|
+
**路径**:`{work-dir}/.eval/{session-id}/evalset/evalset-fields-mapping.json`
|
|
246
|
+
|
|
247
|
+
**格式**:
|
|
248
|
+
|
|
249
|
+
```json
|
|
250
|
+
{
|
|
251
|
+
"question": {"source_field": "question", "default": null},
|
|
252
|
+
"answer": {"source_field": "answer", "default": null},
|
|
253
|
+
"model": {"source_field": null, "default": "deepseek-r1"},
|
|
254
|
+
"case_id": {"source_field": "id", "default": null}
|
|
255
|
+
}
|
|
256
|
+
```
|
|
257
|
+
|
|
258
|
+
| 字段 | 类型 | 说明 |
|
|
259
|
+
|------|------|------|
|
|
260
|
+
| `{目标字段}.source_field` | `string` | 源数据中的字段名,为 null 表示无对应字段 |
|
|
261
|
+
| `{目标字段}.default` | `string` | 默认值,当源数据无对应字段时使用 |
|
|
262
|
+
|
|
263
|
+
**处理规则**:
|
|
264
|
+
|
|
265
|
+
| source_field | default | 处理方式 |
|
|
266
|
+
|--------------|---------|----------|
|
|
267
|
+
| 有值且源数据有该字段 | - | 使用源数据值 |
|
|
268
|
+
| 有值但源数据无该字段 | 有值 | 使用 default 值 |
|
|
269
|
+
| null | 有值 | 使用 default 值 |
|
|
270
|
+
| null | null | 使用内置默认值(model 为 `default`,case_id 自动生成) |
|
|
271
|
+
|
|
272
|
+
> **注意**:`case_id` 字段不使用 `default` 值。有 `source_field` 时使用源数据值,无则根据 question 分组自动生成。
|
|
273
|
+
|
|
274
|
+
**必填字段**:question、answer、model、case_id
|
|
275
|
+
|
|
276
|
+
**可选字段**:system、context、category、reference、keypoint
|
|
277
|
+
|
|
278
|
+
**字段匹配规则**:见 [evalset-parse-process.md](../processes/evalset-parse-process.md#32-匹配字段)。
|
|
279
|
+
|
|
280
|
+
---
|
|
281
|
+
|
|
282
|
+
## 3. 执行阶段文件
|
|
283
|
+
|
|
284
|
+
### evaltask-meta.json
|
|
285
|
+
|
|
286
|
+
**路径**:`{work-dir}/.eval/{session-id}/evaltask/evaltask-meta.json`
|
|
287
|
+
|
|
288
|
+
| 字段 | 类型 | 必填 | 说明 |
|
|
289
|
+
|------|------|------|------|
|
|
290
|
+
| `task_id` | `string` | 是 | 评测任务标识 |
|
|
291
|
+
| `evalset_id` | `string` | 否 | 关联的评测集标识 |
|
|
292
|
+
|
|
293
|
+
### evaltask-result.json
|
|
294
|
+
|
|
295
|
+
**路径**:`{work-dir}/.eval/{session-id}/evaltask/evaltask-result.json`
|
|
296
|
+
|
|
297
|
+
| 字段 | 类型 | 说明 |
|
|
298
|
+
|------|------|------|
|
|
299
|
+
| `result` | `array` | 评测结果原始数据 |
|
|
300
|
+
| `metric` | `object` | 评测指标数据 |
|
|
301
|
+
| `usage` | `object` | Token使用统计 |
|
|
302
|
+
| `summary` | `Summary` | 评测摘要(需解析展示) |
|
|
303
|
+
|
|
304
|
+
**Summary 对象**:
|
|
305
|
+
|
|
306
|
+
| 字段 | 类型 | 说明 |
|
|
307
|
+
|------|------|------|
|
|
308
|
+
| `type` | `string` | 报告类型 |
|
|
309
|
+
| `meta` | `object` | 报告元信息 |
|
|
310
|
+
| `content` | `array` | 报告详情列表 |
|
|
311
|
+
|
|
312
|
+
### eval-progress.json
|
|
313
|
+
|
|
314
|
+
**路径**:`{work-dir}/.eval/{session-id}/evaltask/eval-progress.json`
|
|
315
|
+
|
|
316
|
+
| 字段 | 类型 | 说明 |
|
|
317
|
+
|------|------|------|
|
|
318
|
+
| `evaltask_id` | `string` | 评测任务ID |
|
|
319
|
+
| `status` | `string` | 任务状态 |
|
|
320
|
+
| `progress` | `ProgressInfo` | 进度信息 |
|
|
321
|
+
| `current_stage` | `string` | 当前阶段描述 |
|
|
322
|
+
| `estimated_remaining_time` | `string` | 预估剩余时间 |
|
|
323
|
+
| `started_at` | `string` | 开始时间 |
|
|
324
|
+
| `updated_at` | `string` | 更新时间 |
|
|
325
|
+
| `result_url` | `string` | 结果页面URL(完成时) |
|
|
326
|
+
| `error` | `ErrorInfo` | 错误信息(失败时) |
|
|
327
|
+
|
|
328
|
+
**ProgressInfo 对象**:
|
|
329
|
+
|
|
330
|
+
| 字段 | 类型 | 说明 |
|
|
331
|
+
|------|------|------|
|
|
332
|
+
| `completed` | `int` | 已完成数量 |
|
|
333
|
+
| `total` | `int` | 总数量 |
|
|
334
|
+
| `percentage` | `int` | 完成百分比(0-100) |
|
|
335
|
+
|
|
336
|
+
### evaltask-result.md
|
|
337
|
+
|
|
338
|
+
**路径**:`{work-dir}/.eval/{session-id}/evaltask/evaltask-result.md`
|
|
339
|
+
|
|
340
|
+
由脚本从 `evaltask-result.json` 渲染生成的可读报告。
|
|
@@ -0,0 +1,149 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: builtin-templates
|
|
3
|
+
description: Use when selecting expert templates or dimension templates for evaluation scenarios
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# 内置模板说明
|
|
7
|
+
|
|
8
|
+
本文档说明评测系统中内置的专家模板和维度配置模板。
|
|
9
|
+
|
|
10
|
+
---
|
|
11
|
+
|
|
12
|
+
## 1. 专家模板
|
|
13
|
+
|
|
14
|
+
专家模板是针对特定评测场景预配置的完整评测方案,分为以下两类:
|
|
15
|
+
|
|
16
|
+
### 1.1 通用维度级评测
|
|
17
|
+
|
|
18
|
+
适合对场景化能力的宏观全面覆盖,如有效性、相关性、连贯性等通用评测维度。
|
|
19
|
+
|
|
20
|
+
| 模板名称 | 适用场景 | 模板位置 |
|
|
21
|
+
|----------|----------|----------|
|
|
22
|
+
| 内容创造 | 文章创作、文案生成、创意写作 | `assets/experts/content-generation.json` |
|
|
23
|
+
| 企业流程自动化 | 工具调用、API编排、业务流程自动化、Text2SQL | `assets/experts/business-process-automation.json` |
|
|
24
|
+
| 内容匹配 | 标准问答、选择题、简答题(精确匹配) | `assets/experts/content-match.json` |
|
|
25
|
+
| 个性化规划与推荐 | 旅游规划、学习计划、购物推荐 | `assets/experts/personalized-planning.json` |
|
|
26
|
+
| 信息聚合分析 | 数据分析、报告生成、竞品分析、市场研究 | `assets/experts/information-analysis.json` |
|
|
27
|
+
| 文本翻译 | 机器翻译、多语言内容转换 | `assets/experts/text-translation.json` |
|
|
28
|
+
| 旅游出行 | 旅游攻略生成、行程规划、景点推荐 | `assets/experts/tourism-travel.json` |
|
|
29
|
+
|
|
30
|
+
### 1.2 定制用例级评测
|
|
31
|
+
|
|
32
|
+
适合对场景化能力的细节偏好对齐,针对每条用例有定制的评测要点。
|
|
33
|
+
|
|
34
|
+
| 模板名称 | 适用场景 | 模板位置 |
|
|
35
|
+
|----------|----------|----------|
|
|
36
|
+
| 营销数字人 | 虚拟人多轮对话有效性评测 | `assets/experts/marketing-digital-human.json` |
|
|
37
|
+
|
|
38
|
+
**使用方式**:直接复制专家模板至会话目录,按需调整维度权重。
|
|
39
|
+
|
|
40
|
+
---
|
|
41
|
+
|
|
42
|
+
## 2. 维度配置模板
|
|
43
|
+
|
|
44
|
+
维度配置模板是可复用的单一评测维度定义,可灵活组合。分为以下两类:
|
|
45
|
+
|
|
46
|
+
### 2.1 通用维度级评测模板
|
|
47
|
+
|
|
48
|
+
适合对场景化能力的宏观全面覆盖,不依赖评测要点,可直接应用于整个评测集。
|
|
49
|
+
|
|
50
|
+
#### 通用维度
|
|
51
|
+
|
|
52
|
+
| 维度名称 | 适用场景 | 模板位置 |
|
|
53
|
+
|----------|----------|----------|
|
|
54
|
+
| 相关性 | 评估回答与问题的相关程度 | `assets/dimensions/相关性维度.json` |
|
|
55
|
+
| 有效性 | 评估回答是否有效解决问题 | `assets/dimensions/有效性维度.json` |
|
|
56
|
+
| 逻辑连贯性 | 评估内容组织和推理逻辑 | `assets/dimensions/逻辑连贯性维度.json` |
|
|
57
|
+
| 完整性 | 评估是否覆盖所有需求点 | `assets/dimensions/完整性维度.json` |
|
|
58
|
+
| 创意性/吸引性 | 评估创意性和吸引力 | `assets/dimensions/创意性-吸引性维度.json` |
|
|
59
|
+
| 创新性 | 评估新颖洞察和独特视角 | `assets/dimensions/创新性维度.json` |
|
|
60
|
+
|
|
61
|
+
#### 流程自动化维度
|
|
62
|
+
|
|
63
|
+
| 维度名称 | 适用场景 | 模板位置 |
|
|
64
|
+
|----------|----------|----------|
|
|
65
|
+
| 准确性 | 工具调用准确性、输出结果准确性 | `assets/dimensions/准确性维度.json` |
|
|
66
|
+
| 有效性-流程自动化 | 流程执行完整性和目标达成度 | `assets/dimensions/有效性维度-流程自动化.json` |
|
|
67
|
+
| 指令遵循 | 输出格式、字段要求、特殊约束 | `assets/dimensions/指令遵循维度.json` |
|
|
68
|
+
| 逻辑连贯性-流程自动化 | 多步骤编排合理性 | `assets/dimensions/逻辑连贯性维度-流程自动化.json` |
|
|
69
|
+
|
|
70
|
+
#### 内容匹配维度
|
|
71
|
+
|
|
72
|
+
| 维度名称 | 适用场景 | 模板位置 |
|
|
73
|
+
|----------|----------|----------|
|
|
74
|
+
| 内容精确 | 精确匹配(EQUAL函数) | `assets/dimensions/内容精确维度.json` |
|
|
75
|
+
| 格式遵循 | JSON格式校验(JSONFORMAT函数) | `assets/dimensions/格式遵循维度.json` |
|
|
76
|
+
|
|
77
|
+
#### 个性化规划维度
|
|
78
|
+
|
|
79
|
+
| 维度名称 | 适用场景 | 模板位置 |
|
|
80
|
+
|----------|----------|----------|
|
|
81
|
+
| 准确性-个性化规划 | 推荐项信息准确可靠 | `assets/dimensions/准确性维度-个性化规划.json` |
|
|
82
|
+
| 相关性-个性化规划 | 推荐内容与需求相关 | `assets/dimensions/相关性维度-个性化规划.json` |
|
|
83
|
+
| 有效性-个性化规划 | 方案实际可执行性 | `assets/dimensions/有效性维度-个性化规划.json` |
|
|
84
|
+
| 逻辑连贯性-个性化规划 | 时间、路线、顺序合理性 | `assets/dimensions/逻辑连贯性维度-个性化规划.json` |
|
|
85
|
+
|
|
86
|
+
#### 信息分析维度
|
|
87
|
+
|
|
88
|
+
| 维度名称 | 适用场景 | 模板位置 |
|
|
89
|
+
|----------|----------|----------|
|
|
90
|
+
| 准确性-信息分析 | 数据准确性、引用正确性 | `assets/dimensions/准确性维度-信息分析.json` |
|
|
91
|
+
| 有效性-信息分析 | 分析结论的实际价值 | `assets/dimensions/有效性维度-信息分析.json` |
|
|
92
|
+
| 完整性-信息分析 | 维度覆盖完整性 | `assets/dimensions/完整性维度-信息分析.json` |
|
|
93
|
+
| 逻辑连贯性-信息分析 | 分析过程逻辑性 | `assets/dimensions/逻辑连贯性维度-信息分析.json` |
|
|
94
|
+
|
|
95
|
+
#### 翻译维度(builtin类型)
|
|
96
|
+
|
|
97
|
+
| 维度名称 | 函数 | 模板位置 |
|
|
98
|
+
|----------|------|----------|
|
|
99
|
+
| 精确性-BLEU | BLEU | `assets/dimensions/精确性-BLUE维度.json` |
|
|
100
|
+
| 相似度-ROUGE | ROUGE | `assets/dimensions/相似度-ROUGE维度.json` |
|
|
101
|
+
| 相似度-BERTScore | BERTScore | `assets/dimensions/相似度-BERTScore维度.json` |
|
|
102
|
+
| 相似度-Cosine | Cosine | `assets/dimensions/相似度-Cosine维度.json` |
|
|
103
|
+
| 精确性-COMET | COMET | `assets/dimensions/精确性-COMET维度.json` |
|
|
104
|
+
| 文本差异度-TER | TER | `assets/dimensions/文本差异度-TER维度.json` |
|
|
105
|
+
|
|
106
|
+
#### 旅游出行维度
|
|
107
|
+
|
|
108
|
+
| 维度名称 | 适用场景 | 模板位置 |
|
|
109
|
+
|----------|----------|----------|
|
|
110
|
+
| 形式相关性 | 响应形式是否为旅游攻略 | `assets/dimensions/形式相关性维度.json` |
|
|
111
|
+
| 内容相关性 | 时间、路线、途经点匹配度 | `assets/dimensions/内容相关性维度.json` |
|
|
112
|
+
| 核心元素 | 交通、景点、住宿、饮食覆盖 | `assets/dimensions/核心元素维度.json` |
|
|
113
|
+
| 逻辑合理性 | 时间合理、路线可执行、行程闭环 | `assets/dimensions/逻辑合理性维度.json` |
|
|
114
|
+
| 准确性-旅游出行 | 景点信息、交通价格准确性 | `assets/dimensions/准确性维度-旅游出行.json` |
|
|
115
|
+
| 特色亮点 | 地域独特性和情感价值 | `assets/dimensions/特色亮点维度.json` |
|
|
116
|
+
| 忠诚度 | 引用链接、来源标识 | `assets/dimensions/忠诚度维度.json` |
|
|
117
|
+
|
|
118
|
+
### 2.2 定制用例级评测模板
|
|
119
|
+
|
|
120
|
+
适合对场景化能力的细节偏好对齐,依赖评测要点(keypoint),需针对每条用例配置评测要点后使用。
|
|
121
|
+
|
|
122
|
+
| 维度名称 | 适用场景 | 模板位置 |
|
|
123
|
+
|----------|----------|----------|
|
|
124
|
+
| 有效性(用例级) | 多轮对话场景下,基于评测要点评估回复有效性 | `assets/dimensions/用例级评测维度模板.json` |
|
|
125
|
+
|
|
126
|
+
**特点说明**:
|
|
127
|
+
- 模板中 `body.keypoint` 设置为 `true`,表示依赖评测要点
|
|
128
|
+
- 模板中 `body.context` 和 `body.history` 设置为 `true`,表示需要上下文和对话历史
|
|
129
|
+
- 适用于营销数字人等需要针对每条用例定制评测要点的场景
|
|
130
|
+
|
|
131
|
+
---
|
|
132
|
+
|
|
133
|
+
## 3. 维度类型说明
|
|
134
|
+
|
|
135
|
+
| 类型 | 说明 | 输出形式 |
|
|
136
|
+
|------|------|----------|
|
|
137
|
+
| llm-score | LLM评委打分 | 1-5分 |
|
|
138
|
+
| llm-judge | LLM评委判断 | 通过/不通过 |
|
|
139
|
+
| agent-score | Agent评委打分(需外部数据源) | 1-5分 |
|
|
140
|
+
| builtin | 内置函数计算 | 数值指标 |
|
|
141
|
+
|
|
142
|
+
---
|
|
143
|
+
|
|
144
|
+
## 4. 使用注意事项
|
|
145
|
+
|
|
146
|
+
- 同名维度在不同场景下有不同的定义和评分标准,使用时请注意区分
|
|
147
|
+
- Agent类型维度需配置外部数据源
|
|
148
|
+
- Builtin类型维度依赖评测服务支持对应函数
|
|
149
|
+
- 自定义评测方案时,维度权重总和应为1.0
|