astron-eval 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +119 -0
- package/bin/astron-eval.mjs +111 -0
- package/package.json +24 -0
- package/skills/astron-eval/SKILL.md +60 -0
- package/skills/model-evaluation/SKILL.md +180 -0
- package/skills/model-evaluation/assets/dimensions//345/206/205/345/256/271/347/233/270/345/205/263/346/200/247/347/273/264/345/272/246.json +20 -0
- package/skills/model-evaluation/assets/dimensions//345/206/205/345/256/271/347/262/276/347/241/256/347/273/264/345/272/246.json +19 -0
- package/skills/model-evaluation/assets/dimensions//345/207/206/347/241/256/346/200/247/347/273/264/345/272/246-/344/270/252/346/200/247/345/214/226/350/247/204/345/210/222.json +20 -0
- package/skills/model-evaluation/assets/dimensions//345/207/206/347/241/256/346/200/247/347/273/264/345/272/246-/344/277/241/346/201/257/345/210/206/346/236/220.json +20 -0
- package/skills/model-evaluation/assets/dimensions//345/207/206/347/241/256/346/200/247/347/273/264/345/272/246-/346/227/205/346/270/270/345/207/272/350/241/214.json +20 -0
- package/skills/model-evaluation/assets/dimensions//345/207/206/347/241/256/346/200/247/347/273/264/345/272/246.json +20 -0
- package/skills/model-evaluation/assets/dimensions//345/210/233/346/204/217/346/200/247-/345/220/270/345/274/225/346/200/247/347/273/264/345/272/246.json +21 -0
- package/skills/model-evaluation/assets/dimensions//345/210/233/346/226/260/346/200/247/347/273/264/345/272/246.json +20 -0
- package/skills/model-evaluation/assets/dimensions//345/256/214/346/225/264/346/200/247/347/273/264/345/272/246-/344/277/241/346/201/257/345/210/206/346/236/220.json +20 -0
- package/skills/model-evaluation/assets/dimensions//345/256/214/346/225/264/346/200/247/347/273/264/345/272/246.json +20 -0
- package/skills/model-evaluation/assets/dimensions//345/275/242/345/274/217/347/233/270/345/205/263/346/200/247/347/273/264/345/272/246.json +20 -0
- package/skills/model-evaluation/assets/dimensions//345/277/240/350/257/232/345/272/246/347/273/264/345/272/246.json +20 -0
- package/skills/model-evaluation/assets/dimensions//346/214/207/344/273/244/351/201/265/345/276/252/347/273/264/345/272/246.json +20 -0
- package/skills/model-evaluation/assets/dimensions//346/226/207/346/234/254/345/267/256/345/274/202/345/272/246-TER/347/273/264/345/272/246.json +20 -0
- package/skills/model-evaluation/assets/dimensions//346/234/211/346/225/210/346/200/247/347/273/264/345/272/246-/344/270/252/346/200/247/345/214/226/350/247/204/345/210/222.json +20 -0
- package/skills/model-evaluation/assets/dimensions//346/234/211/346/225/210/346/200/247/347/273/264/345/272/246-/344/277/241/346/201/257/345/210/206/346/236/220.json +20 -0
- package/skills/model-evaluation/assets/dimensions//346/234/211/346/225/210/346/200/247/347/273/264/345/272/246-/346/265/201/347/250/213/350/207/252/345/212/250/345/214/226.json +20 -0
- package/skills/model-evaluation/assets/dimensions//346/234/211/346/225/210/346/200/247/347/273/264/345/272/246.json +21 -0
- package/skills/model-evaluation/assets/dimensions//346/240/270/345/277/203/345/205/203/347/264/240/347/273/264/345/272/246.json +20 -0
- package/skills/model-evaluation/assets/dimensions//346/240/274/345/274/217/351/201/265/345/276/252/347/273/264/345/272/246.json +19 -0
- package/skills/model-evaluation/assets/dimensions//347/211/271/350/211/262/344/272/256/347/202/271/347/273/264/345/272/246.json +20 -0
- package/skills/model-evaluation/assets/dimensions//347/224/250/344/276/213/347/272/247/350/257/204/346/265/213/347/273/264/345/272/246/346/250/241/346/235/277.json +25 -0
- package/skills/model-evaluation/assets/dimensions//347/233/270/344/274/274/345/272/246-BERTScore/347/273/264/345/272/246.json +20 -0
- package/skills/model-evaluation/assets/dimensions//347/233/270/344/274/274/345/272/246-Cosine/347/273/264/345/272/246.json +20 -0
- package/skills/model-evaluation/assets/dimensions//347/233/270/344/274/274/345/272/246-ROUGE/347/273/264/345/272/246.json +20 -0
- package/skills/model-evaluation/assets/dimensions//347/233/270/345/205/263/346/200/247/347/273/264/345/272/246-/344/270/252/346/200/247/345/214/226/350/247/204/345/210/222.json +20 -0
- package/skills/model-evaluation/assets/dimensions//347/233/270/345/205/263/346/200/247/347/273/264/345/272/246.json +21 -0
- package/skills/model-evaluation/assets/dimensions//347/262/276/347/241/256/346/200/247-BLUE/347/273/264/345/272/246.json +20 -0
- package/skills/model-evaluation/assets/dimensions//347/262/276/347/241/256/346/200/247-COMET/347/273/264/345/272/246.json +20 -0
- package/skills/model-evaluation/assets/dimensions//351/200/273/350/276/221/345/220/210/347/220/206/346/200/247/347/273/264/345/272/246.json +20 -0
- package/skills/model-evaluation/assets/dimensions//351/200/273/350/276/221/350/277/236/350/264/257/346/200/247/347/273/264/345/272/246-/344/270/252/346/200/247/345/214/226/350/247/204/345/210/222.json +20 -0
- package/skills/model-evaluation/assets/dimensions//351/200/273/350/276/221/350/277/236/350/264/257/346/200/247/347/273/264/345/272/246-/344/277/241/346/201/257/345/210/206/346/236/220.json +20 -0
- package/skills/model-evaluation/assets/dimensions//351/200/273/350/276/221/350/277/236/350/264/257/346/200/247/347/273/264/345/272/246-/346/265/201/347/250/213/350/207/252/345/212/250/345/214/226.json +20 -0
- package/skills/model-evaluation/assets/dimensions//351/200/273/350/276/221/350/277/236/350/264/257/346/200/247/347/273/264/345/272/246.json +21 -0
- package/skills/model-evaluation/assets/eval-judge.json +11 -0
- package/skills/model-evaluation/assets/experts/business-process-automation.json +71 -0
- package/skills/model-evaluation/assets/experts/content-generation.json +75 -0
- package/skills/model-evaluation/assets/experts/content-match.json +37 -0
- package/skills/model-evaluation/assets/experts/information-analysis.json +87 -0
- package/skills/model-evaluation/assets/experts/marketing-digital-human.json +27 -0
- package/skills/model-evaluation/assets/experts/personalized-planning.json +87 -0
- package/skills/model-evaluation/assets/experts/text-translation.json +103 -0
- package/skills/model-evaluation/assets/experts/tourism-travel.json +119 -0
- package/skills/model-evaluation/assets/templates/custom-dimension.template.json +30 -0
- package/skills/model-evaluation/eval-build.md +281 -0
- package/skills/model-evaluation/eval-execute.md +196 -0
- package/skills/model-evaluation/eval-init.md +237 -0
- package/skills/model-evaluation/processes/dimension-process.md +207 -0
- package/skills/model-evaluation/processes/evalset-create-process.md +184 -0
- package/skills/model-evaluation/processes/evalset-parse-process.md +171 -0
- package/skills/model-evaluation/processes/evalset-supplement-process.md +136 -0
- package/skills/model-evaluation/processes/keypoint-process.md +148 -0
- package/skills/model-evaluation/processes/python-env-process.md +113 -0
- package/skills/model-evaluation/references//344/270/255/351/227/264/344/272/247/347/211/251/350/257/264/346/230/216.md +340 -0
- package/skills/model-evaluation/references//345/206/205/347/275/256/346/250/241/346/235/277/350/257/264/346/230/216.md +149 -0
- package/skills/model-evaluation/references//350/204/232/346/234/254/345/256/232/344/271/211.md +274 -0
- package/skills/model-evaluation/references//350/256/244/350/257/201/346/234/215/345/212/241/346/216/245/345/217/243/350/257/264/346/230/216.md +271 -0
- package/skills/model-evaluation/references//350/257/204/346/265/213/346/234/215/345/212/241/346/216/245/345/217/243/350/257/264/346/230/216.md +455 -0
- package/skills/model-evaluation/references//350/257/204/346/265/213/347/273/264/345/272/246/350/257/264/346/230/216.md +171 -0
- package/skills/model-evaluation/scripts/cfg/eval-auth.cfg +16 -0
- package/skills/model-evaluation/scripts/cfg/eval-server.cfg +1 -0
- package/skills/model-evaluation/scripts/clients/__init__.py +33 -0
- package/skills/model-evaluation/scripts/clients/api_client.py +97 -0
- package/skills/model-evaluation/scripts/clients/auth_client.py +96 -0
- package/skills/model-evaluation/scripts/clients/http_client.py +199 -0
- package/skills/model-evaluation/scripts/clients/oauth_callback.py +397 -0
- package/skills/model-evaluation/scripts/clients/token_manager.py +53 -0
- package/skills/model-evaluation/scripts/eval_auth.py +588 -0
- package/skills/model-evaluation/scripts/eval_dimension.py +240 -0
- package/skills/model-evaluation/scripts/eval_set.py +410 -0
- package/skills/model-evaluation/scripts/eval_task.py +324 -0
- package/skills/model-evaluation/scripts/files/__init__.py +38 -0
- package/skills/model-evaluation/scripts/files/file_utils.py +330 -0
- package/skills/model-evaluation/scripts/files/streaming.py +245 -0
- package/skills/model-evaluation/scripts/utils/__init__.py +128 -0
- package/skills/model-evaluation/scripts/utils/constants.py +101 -0
- package/skills/model-evaluation/scripts/utils/datetime_utils.py +60 -0
- package/skills/model-evaluation/scripts/utils/errors.py +244 -0
- package/skills/model-evaluation/scripts/utils/keypoint_prompts.py +73 -0
- package/skills/skill-driven-eval/SKILL.md +456 -0
- package/skills/skill-driven-eval/agents/grader.md +144 -0
- package/skills/skill-driven-eval/eval-viewer/__init__.py +1 -0
- package/skills/skill-driven-eval/eval-viewer/generate_report.py +485 -0
- package/skills/skill-driven-eval/eval-viewer/viewer.html +767 -0
- package/skills/skill-driven-eval/references/schemas.md +282 -0
- package/skills/skill-driven-eval/scripts/__init__.py +1 -0
- package/skills/skill-driven-eval/scripts/__main__.py +70 -0
- package/skills/skill-driven-eval/scripts/aggregate_results.py +681 -0
- package/skills/skill-driven-eval/scripts/extract_transcript.py +294 -0
- package/skills/skill-driven-eval/scripts/test_aggregate.py +244 -0
|
@@ -0,0 +1,240 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""维度配置工具:校验配置、更新judge_id"""
|
|
3
|
+
import argparse
|
|
4
|
+
import json
|
|
5
|
+
import sys
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import List, Optional, Union
|
|
8
|
+
|
|
9
|
+
from utils import (
|
|
10
|
+
result,
|
|
11
|
+
VALID_DIMENSION_TYPES,
|
|
12
|
+
BUILTIN_FUNCTIONS,
|
|
13
|
+
)
|
|
14
|
+
from files import (
|
|
15
|
+
load_json,
|
|
16
|
+
save_json,
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
# ============================================================================
|
|
21
|
+
# 校验逻辑
|
|
22
|
+
# ============================================================================
|
|
23
|
+
|
|
24
|
+
def _validate_weight(dim: dict, name: str) -> List[str]:
    """Return the error messages for the mandatory ``weight`` field of *dim*.

    A valid weight is an int or float in the closed range [0, 1].
    """
    weight = dim.get("weight")
    if weight is None:
        return [f"[{name}] missing 'weight'"]
    # bool is a subclass of int, so True/False would otherwise be accepted
    # as the weights 1 and 0.
    is_number = isinstance(weight, (int, float)) and not isinstance(weight, bool)
    if not is_number or not (0 <= weight <= 1):
        return [f"[{name}] invalid weight '{weight}'"]
    return []


def validate_dimension(dim: dict, idx: int) -> List[str]:
    """Validate a single dimension entry and return its error messages.

    Args:
        dim: One dimension object.
        idx: Position of the entry in its array; used to label messages when
            the entry has no usable ``name``.

    Returns:
        A list of error strings, empty when the entry is valid. Validation
        stops early when ``type`` is missing or not in VALID_DIMENSION_TYPES,
        because the remaining checks depend on the type.
    """
    errors: List[str] = []
    # Use the positional label when 'name' is absent *or* empty, so that
    # messages never carry a blank "[]" prefix.
    name = dim.get("name") or f"index_{idx}"

    if not dim.get("name"):
        errors.append(f"[{idx}] missing 'name'")

    dtype = dim.get("type")
    if not dtype:
        errors.append(f"[{name}] missing 'type'")
        return errors
    if dtype not in VALID_DIMENSION_TYPES:
        errors.append(f"[{name}] invalid type '{dtype}'")
        return errors

    # LLM-based dimensions (llm-score and llm-judge)
    if dtype in ("llm-score", "llm-judge"):
        # judge_id is required
        if not dim.get("judge_id"):
            errors.append(f"[{name}] missing 'judge_id'")

        # weight is required
        errors.extend(_validate_weight(dim, name))

        # prompt is required; when it is an object, each sub-field must be
        # present and non-empty. Other prompt shapes are not inspected.
        if "prompt" not in dim:
            errors.append(f"[{name}] missing 'prompt'")
        elif isinstance(dim["prompt"], dict):
            for f in ("definition", "instruct", "step"):
                if not dim["prompt"].get(f):
                    errors.append(f"[{name}] prompt.{f} missing")

    # Built-in function dimensions
    elif dtype == "builtin":
        # judge_id must not be present
        if "judge_id" in dim:
            errors.append(f"[{name}] builtin type should not have 'judge_id'")

        # func is required and must be a known built-in
        func = dim.get("func")
        if not func:
            errors.append(f"[{name}] missing 'func'")
        elif func not in BUILTIN_FUNCTIONS:
            errors.append(f"[{name}] invalid func '{func}'")

        # weight is required
        errors.extend(_validate_weight(dim, name))

    return errors
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def check_config(path: str) -> dict:
    """Validate a dimension configuration file.

    Args:
        path: Path of the JSON configuration file.

    Returns:
        A dict with ``success`` (bool), ``file`` (the given path), ``errors``
        (list of messages) and ``dimensions`` (one ``{"name", "valid"}``
        entry per validated dimension object).
    """
    result_obj = {"success": True, "file": path, "errors": [], "dimensions": []}

    if not Path(path).exists():
        return {**result_obj, "success": False, "errors": [f"file not found: {path}"]}

    # Load through the shared load_json helper
    load_result = load_json(path)
    if not load_result.get("success"):
        return {**result_obj, "success": False, "errors": [load_result.get("message", "load failed")]}

    data = load_result.get("data")
    if not isinstance(data, dict):
        return {**result_obj, "success": False, "errors": ["root must be object"]}

    errors = result_obj["errors"]

    # Reject known-wrong root fields. A tuple keeps the message order stable
    # between runs (a set would not).
    for field in ("scene", "scene_type", "dimensions"):
        if field not in data:
            continue
        if field == "dimensions" and "evals" not in data:
            # 'dimensions' used in place of 'evals': report it once, with the
            # more specific hint.
            errors.append("'dimensions' is invalid, use 'evals' instead")
        else:
            errors.append(f"invalid root field '{field}', use correct field name")

    if "evals" not in data and "dimensions" not in data:
        errors.append("missing 'evals' field")

    dims = data.get("evals", [])
    if not isinstance(dims, list):
        # Keep the errors collected so far instead of replacing them.
        errors.append("'evals' must be array")
        result_obj["success"] = False
        return result_obj

    # Validate each entry
    for i, d in enumerate(dims):
        if not isinstance(d, dict):
            errors.append(f"[{i}] must be object")
            continue

        # A 'config' object carrying 'type' means the dimension fields were
        # nested one level too deep.
        if "config" in d and isinstance(d["config"], dict):
            if "type" in d["config"]:
                name = d.get("name") or f"index_{i}"
                errors.append(f"[{name}] dimension fields should not be nested in 'config'")
                # Validate the nested object so its own problems are reported too
                d = d["config"]

        errs = validate_dimension(d, i)
        errors.extend(errs)
        result_obj["dimensions"].append({"name": d.get("name") or f"index_{i}", "valid": not errs})

    # Weights of the top-level entries must add up to 1.0; non-numeric
    # weights are left out of the sum.
    weights = [d["weight"] for d in dims if isinstance(d, dict) and "weight" in d]
    if weights:
        total = sum(w for w in weights if isinstance(w, (int, float)))
        if abs(total - 1.0) > 0.0001:
            errors.append(f"weight sum {total:.4f} != 1.0")

    if errors:
        result_obj["success"] = False
    return result_obj
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
# ============================================================================
|
|
152
|
+
# 更新 judge_id
|
|
153
|
+
# ============================================================================
|
|
154
|
+
|
|
155
|
+
def get_judge_id(config: Union[dict, list]) -> Optional[str]:
    """Extract the judge id from a judge configuration.

    Accepted shapes:
        * a list of judge objects: the ``id`` of the first entry is used;
        * an object with a top-level ``id``;
        * an object with a ``models`` list: the ``id`` of the first model.

    Returns:
        The id, or None when the configuration has none of these shapes
        (including empty lists and non-object entries).
    """
    def entry_id(entry):
        # Entries that are not objects simply have no id.
        return entry.get("id") if isinstance(entry, dict) else None

    if isinstance(config, list):
        return entry_id(config[0]) if config else None

    if isinstance(config, dict):
        top_level = config.get("id")
        if top_level:
            return top_level
        models = config.get("models")
        if isinstance(models, list) and models:
            return entry_id(models[0])

    return None
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
def update_config(dim_path: str, judge_path: str, output_path: Optional[str]) -> dict:
    """Write the judge id from a judge configuration into a dimension file.

    Every ``llm-score`` / ``llm-judge`` entry of the ``evals`` array gets its
    ``judge_id`` set to the id extracted from the judge configuration.

    Args:
        dim_path: Dimension configuration file to update.
        judge_path: Judge configuration file providing the id.
        output_path: Destination file; when falsy, ``dim_path`` is overwritten.

    Returns:
        A dict with ``success`` and ``errors``; on success also ``judge_id``,
        ``updated`` (number of entries changed) and ``output``.
    """
    result_obj = {"success": True, "errors": []}

    # Load the judge configuration
    judge_result = load_json(judge_path)
    if not judge_result.get("success"):
        return {"success": False, "errors": [f"load judge config failed: {judge_result.get('message')}"]}

    judge_id = get_judge_id(judge_result.get("data"))
    if not judge_id:
        return {"success": False, "errors": ["judge_id not found in config"]}
    result_obj["judge_id"] = judge_id

    # Load the dimension configuration
    dim_result = load_json(dim_path)
    if not dim_result.get("success"):
        return {"success": False, "errors": [f"load dimension config failed: {dim_result.get('message')}"]}
    dim_config = dim_result.get("data")

    # Report malformed files through the normal error channel instead of
    # failing with AttributeError / TypeError further down.
    if not isinstance(dim_config, dict):
        return {"success": False, "errors": ["dimension config root must be object"]}
    evals = dim_config.get("evals", [])
    if not isinstance(evals, list):
        return {"success": False, "errors": ["'evals' must be array"]}

    # Set judge_id on every LLM-based dimension
    updated = 0
    for dim in evals:
        if isinstance(dim, dict) and dim.get("type") in ("llm-score", "llm-judge"):
            dim["judge_id"] = judge_id
            updated += 1
    result_obj["updated"] = updated

    # Save, overwriting the input file when no output path was given
    out = output_path or dim_path
    save_result = save_json(out, dim_config)
    if not save_result.get("success"):
        result_obj["success"] = False
        result_obj["errors"].append(f"save failed: {save_result.get('message')}")
    else:
        result_obj["output"] = out

    return result_obj
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
# ============================================================================
|
|
207
|
+
# CLI 入口
|
|
208
|
+
# ============================================================================
|
|
209
|
+
|
|
210
|
+
def main():
    """Command-line entry point of the dimension configuration tool.

    ``--action check`` validates a dimension file; ``--action update`` writes
    the judge id into it. The result is always printed as JSON, and the
    process exits with 0 on success and 1 otherwise.
    """
    cli = argparse.ArgumentParser(description="维度配置工具")
    cli.add_argument("-a", "--action", required=True, choices=["check", "update"], help="操作类型")
    cli.add_argument("-d", "--dimension", help="维度配置文件")
    cli.add_argument("-j", "--judge", help="评委配置文件(update时必需)")
    cli.add_argument("-o", "--output", help="输出文件路径")
    cli.add_argument("--quiet", action="store_true", help="仅输出JSON")
    opts = cli.parse_args()

    if opts.action == "check":
        if not opts.dimension:
            cli.error("--dimension required for check")
        outcome = check_config(opts.dimension)

        # Human-readable summary ahead of the JSON, unless --quiet was given
        if not opts.quiet:
            if outcome["success"]:
                status = "PASS"
            else:
                status = f"FAIL ({len(outcome['errors'])} errors)"
            print(f"Validation: {status}")
            for message in outcome["errors"]:
                print(f" - {message}")

    elif opts.action == "update":
        if not (opts.dimension and opts.judge):
            cli.error("--dimension and --judge required for update")
        outcome = update_config(opts.dimension, opts.judge, opts.output)

    print(json.dumps(outcome, ensure_ascii=False, indent=2))

    exit_code = 0 if outcome.get("success") else 1
    sys.exit(exit_code)


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,410 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""评测集管理:解析、标准化、提交"""
|
|
3
|
+
import argparse
|
|
4
|
+
import json
|
|
5
|
+
import math
|
|
6
|
+
import random
|
|
7
|
+
import string
|
|
8
|
+
import sys
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
|
|
11
|
+
from utils import (
|
|
12
|
+
OPTIONAL_FIELDS,
|
|
13
|
+
ERR_REMOTE_DEFAULT,
|
|
14
|
+
handle_cli_error,
|
|
15
|
+
)
|
|
16
|
+
from files import (
|
|
17
|
+
load_json,
|
|
18
|
+
save_json,
|
|
19
|
+
load_config_kv,
|
|
20
|
+
load_data,
|
|
21
|
+
load_jsonl_stream,
|
|
22
|
+
extract_fields,
|
|
23
|
+
)
|
|
24
|
+
from clients import (
|
|
25
|
+
ApiClient,
|
|
26
|
+
TokenManager,
|
|
27
|
+
)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def cmd_analysis(args):
    """Inspect an evaluation-set file and write its structure description.

    The only artifact is the structure file (``args.output``), which holds
    the input path, the file format taken from the extension, the row count
    and the field information returned by ``extract_fields``. Field mapping
    is not inferred here; per the original note it is produced by Claude Code
    from its own rules.

    Args:
        args: Parsed CLI arguments with ``input`` and ``output``.

    Returns:
        A dict with ``success``, ``total_rows``, ``fields`` (field names) and
        ``structure_file``.

    Raises:
        ValueError: If the data cannot be loaded or the structure file cannot
            be saved.
    """
    load_result = load_data(args.input)
    if not load_result.get("success"):
        raise ValueError(f"数据加载失败: {load_result.get('message')}")
    # Tolerate a null 'data' / 'items' payload instead of crashing on it.
    data = (load_result.get("data") or {}).get("items") or []

    fields = extract_fields(data)

    structure = {
        "file": args.input,
        "format": Path(args.input).suffix.lower()[1:],
        "total_rows": len(data),
        "fields": fields
    }
    # Do not report success when the artifact was not actually written.
    save_result = save_json(args.output, structure)
    if not save_result.get("success"):
        raise ValueError(f"结构文件保存失败: {save_result.get('message')}")

    return {
        "success": True,
        "total_rows": len(data),
        "fields": list(fields.keys()),
        "structure_file": args.output
    }
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
# ============================================================================
|
|
62
|
+
# 标准化
|
|
63
|
+
# ============================================================================
|
|
64
|
+
|
|
65
|
+
def _is_blank(value) -> bool:
    """Return True for None, float NaN, and empty or whitespace-only values."""
    if value is None:
        return True
    if isinstance(value, float) and math.isnan(value):
        return True
    return not str(value).strip()


def normalize_data(data: list, mapping: dict) -> list:
    """Convert raw rows into the standard record format using a field mapping.

    Mapping format (an entry may also be a plain string, which is treated as
    the source field name for backward compatibility):

        {
            "question": {"source_field": "question", "default": null},
            "answer":   {"source_field": "answer", "default": null},
            "model":    {"source_field": null, "default": "deepseek-r1"},
            "case_id":  {"source_field": "id", "default": null}
        }

    Rules:
        * question / answer: required; rows where either is None, NaN or
          blank are skipped with a warning on stderr.
        * case_id: taken from the source field when it has a value, otherwise
          generated per distinct question (``case-0001``, ...); its default
          is never used.
        * model: source value, else the mapping default, else ``'default'``.
        * optional fields (OPTIONAL_FIELDS): source value when the row has
          the field and it is not blank; the default only when the row lacks
          the field.

    Args:
        data: Rows as dicts.
        mapping: Field mapping as described above.

    Returns:
        The list of normalized records (all values are strings).

    Raises:
        ValueError: If the mapping lacks a source field for question or answer.
    """
    if not isinstance(mapping, dict):
        mapping = {}

    def get_field_config(field_name):
        config = mapping.get(field_name)
        if isinstance(config, str):
            # Legacy format: the entry is the source field name itself
            return {"source_field": config, "default": None}
        # A null or otherwise malformed entry counts as "not configured"
        return config if isinstance(config, dict) else {}

    q_field = get_field_config('question').get('source_field')
    a_field = get_field_config('answer').get('source_field')
    if not q_field or not a_field:
        raise ValueError("字段映射必须包含 question 和 answer 的 source_field")

    # Loop-invariant lookups
    m_config = get_field_config('model')
    m_field = m_config.get('source_field')
    m_default = m_config.get('default')
    c_field = get_field_config('case_id').get('source_field')

    # Optional fields that are actually configured
    opt_field_configs = {}
    for f in OPTIONAL_FIELDS:
        config = get_field_config(f)
        if config.get('source_field') or config.get('default'):
            opt_field_configs[f] = config

    result = []

    # State for generated case ids: one id per distinct question
    question_to_case = {}
    case_counter = 0

    for idx, item in enumerate(data):
        is_row = isinstance(item, dict)
        raw_question = item.get(q_field) if is_row else None
        raw_answer = item.get(a_field) if is_row else None
        # None / NaN must count as missing; str() would turn them into the
        # literal texts "None" / "nan".
        if _is_blank(raw_question) or _is_blank(raw_answer):
            print(f"警告: 第{idx+1}行缺少必要字段", file=sys.stderr)
            continue
        question = str(raw_question)
        answer = str(raw_answer)

        # case_id: source value if present (0 is a legitimate id), otherwise
        # generated from the question
        raw_case = item.get(c_field) if c_field else None
        if not _is_blank(raw_case):
            case_id = str(raw_case)
        else:
            if question not in question_to_case:
                case_counter += 1
                question_to_case[question] = f'case-{case_counter:04d}'
            case_id = question_to_case[question]

        # model: source value, then mapping default, then 'default'
        raw_model = item.get(m_field) if m_field else None
        if not _is_blank(raw_model):
            model_value = str(raw_model)
        elif m_default:
            model_value = str(m_default)
        else:
            model_value = 'default'

        record = {
            "question": question,
            "answer": answer,
            "model": model_value,
            "case_id": case_id
        }

        # Optional fields
        for std_field, config in opt_field_configs.items():
            src_field = config.get('source_field')
            default_val = config.get('default')
            if src_field and src_field in item:
                value = item.get(src_field)
                # The original comment notes that pandas yields NaN for empty
                # Excel cells; blank values are simply left out.
                if not _is_blank(value):
                    record[std_field] = str(value)
            elif default_val:
                record[std_field] = str(default_val)

        result.append(record)
    return result
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def cmd_normalize(args):
    """Convert an evaluation set to the standard JSONL format.

    Reads ``args.input`` and the mapping file ``args.mapping``, normalizes
    the rows and writes one JSON object per line to ``args.output`` (parent
    directories are created as needed).

    Returns:
        A dict with ``success``, ``input_rows``, ``output_rows`` and
        ``output_file``.

    Raises:
        ValueError: If loading fails, or the input or the normalized set is
            empty.
    """
    loaded = load_data(args.input)
    if not loaded.get("success"):
        raise ValueError(f"数据加载失败: {loaded.get('message')}")
    rows = loaded.get("data", {}).get("items", [])
    if not rows:
        raise ValueError("评测集为空或无法解析")

    mapping_loaded = load_json(args.mapping)
    if not mapping_loaded.get("success"):
        raise ValueError(f"映射文件加载失败: {mapping_loaded.get('message')}")
    field_mapping = mapping_loaded.get("data", {})

    records = normalize_data(rows, field_mapping)
    if not records:
        raise ValueError("转换后的评测集为空")

    target = Path(args.output)
    target.parent.mkdir(parents=True, exist_ok=True)
    serialized = [json.dumps(record, ensure_ascii=False) for record in records]
    target.write_text('\n'.join(serialized), encoding='utf-8')

    return {
        "success": True,
        "input_rows": len(rows),
        "output_rows": len(records),
        "output_file": args.output,
    }
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
# ============================================================================
|
|
190
|
+
# 提交
|
|
191
|
+
# ============================================================================
|
|
192
|
+
|
|
193
|
+
def generate_evalset_id() -> str:
    """Return a new evaluation-set id: ``eval-`` plus 8 random characters.

    The characters are drawn from lowercase ASCII letters and digits.
    """
    alphabet = string.ascii_lowercase + string.digits
    picked = random.choices(alphabet, k=8)
    return "eval-" + ''.join(picked)
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
def cmd_submit(args):
    """Submit a standardized (JSONL) evaluation set to the backend service.

    Args:
        args: Parsed CLI arguments with ``evalset`` (JSONL file), ``config``
            (service configuration file), ``auth`` (credentials file) and
            ``output`` (result file).

    Returns:
        A dict with ``success``, ``evalset_id`` and ``total``.

    Raises:
        ValueError: If a line is not a JSON object or lacks ``question`` /
            ``answer``, the set is empty, the service configuration cannot be
            loaded, or the result file cannot be saved.
    """
    # Parse the evaluation set
    items = []
    lines = Path(args.evalset).read_text(encoding='utf-8').splitlines()
    for idx, line in enumerate(lines):
        if not line.strip():
            continue
        try:
            case = json.loads(line)
        except json.JSONDecodeError as e:
            raise ValueError(f"评测集第{idx+1}行解析失败: {e}") from e
        # Valid JSON that is not an object (e.g. a list) has no fields to
        # read; report it like any other unparsable line.
        if not isinstance(case, dict):
            raise ValueError(f"评测集第{idx+1}行解析失败: 不是JSON对象")
        try:
            # Required fields
            item = {
                "case_id": case.get('case_id', f'case-{idx+1:04d}'),
                "model": case.get('model', 'default'),
                "question": case['question'],
                "answer": case['answer']
            }
        except KeyError as e:
            raise ValueError(f"评测集第{idx+1}行解析失败: {e}") from e
        # Optional fields, copied only when non-empty
        for field in OPTIONAL_FIELDS:
            if field in case and case[field]:
                item[field] = case[field]
        items.append(item)

    if not items:
        raise ValueError("评测集为空")

    # Service configuration
    config_result = load_config_kv(args.config)
    if not config_result.get("success"):
        raise ValueError(f"配置文件加载失败: {config_result.get('message')}")
    config = config_result.get("data", {})

    token_manager = TokenManager(args.auth)
    client = ApiClient(token_manager, config.get('base_url', 'http://127.0.0.1:8080'))

    evalset_id = generate_evalset_id()

    client.post("/open/api/v1/evalset", json={"evalset_id": evalset_id, "items": items})

    # The set is already on the server at this point, so a failed save must
    # still surface the id or it would be lost.
    save_result = save_json(args.output, {"dataset": evalset_id, "total": len(items)})
    if not save_result.get("success"):
        raise ValueError(f"结果文件保存失败 (evalset_id={evalset_id}): {save_result.get('message')}")

    return {"success": True, "evalset_id": evalset_id, "total": len(items)}
|
|
242
|
+
|
|
243
|
+
|
|
244
|
+
# ============================================================================
|
|
245
|
+
# 批次提交(流式处理)
|
|
246
|
+
# ============================================================================
|
|
247
|
+
|
|
248
|
+
# D-36: the batch size is fixed at 500 records.
BATCH_SIZE = 500


def _post_batch(api_client, endpoint: str, batch: list, stats: dict, errors: list):
    """Submit one batch; return None on success or the failure result dict.

    Updates ``stats`` and ``errors`` in place and, on success, prints one
    JSON progress line to stderr (D-41, D-42, D-43).
    """
    try:
        api_client.post(endpoint, items=batch)
    except Exception as e:
        # D-38: a batch-level failure stops processing; the caller returns
        # the dict built here.
        stats["failed"] += len(batch)
        error_msg = str(e)
        if hasattr(e, 'message'):
            error_msg = e.message
        errors.append({"line": 0, "message": f"批次提交失败: {error_msg}"})
        return {
            "success": False,
            "code": getattr(e, 'code', ERR_REMOTE_DEFAULT),
            "message": error_msg,
            "stats": stats,
            "errors": errors
        }
    stats["success"] += len(batch)
    stats["batches"] += 1
    print(json.dumps({"progress": stats["total"], "batches": stats["batches"]}), file=sys.stderr)
    return None


def cmd_submit_batch(file_path: str, api_client, endpoint: str) -> dict:
    """Submit an evaluation set in batches while streaming the input file.

    Args:
        file_path: Path of the JSONL file.
        api_client: API client instance; its ``post`` method is called once
            per batch.
        endpoint: API endpoint.

    Returns:
        A dict with ``success``, ``stats`` and ``errors``; on success also
        ``evalset_id``, on batch failure also ``code`` and ``message``.

    Design decisions:
        - D-36: batch size is fixed at 500 records
        - D-37: batches are submitted sequentially and synchronously
        - D-38: processing stops at the first batch-level failure
        - D-39: per-record errors are collected in the ``errors`` list
        - D-40: each error entry has ``line`` and ``message``
        - D-41: progress is written to stderr
        - D-42: progress is written once per completed batch
        - D-43: progress is JSON
    """
    # 1. Stream the file
    stream = load_jsonl_stream(file_path)

    # 2. Initial state
    batch = []
    stats = {"total": 0, "success": 0, "failed": 0, "batches": 0}
    errors = []
    # NOTE(review): this id is returned but never sent with the batches,
    # whereas cmd_submit posts it in the JSON body — confirm against
    # ApiClient.post and the server contract.
    evalset_id = generate_evalset_id()

    # 3. Process the stream
    for item in stream:
        line_no = item.get("line", 0)

        # 3.1 Lines the reader itself flagged as bad
        if item.get("success") is False:
            stats["total"] += 1
            stats["failed"] += 1
            errors.append({
                "line": line_no,
                "message": item.get("message", "未知错误")
            })
            continue

        # 3.2 Build the record. A record without the required fields is a
        # per-record error (D-39) and must not abort the whole run.
        data = item.get("data")
        if not isinstance(data, dict) or 'question' not in data or 'answer' not in data:
            stats["total"] += 1
            stats["failed"] += 1
            errors.append({"line": line_no, "message": "缺少必填字段 question 或 answer"})
            continue

        record = {
            "case_id": data.get('case_id', f'case-{line_no:04d}'),
            "model": data.get('model', 'default'),
            "question": data['question'],
            "answer": data['answer']
        }
        # Optional fields, copied only when non-empty
        for field in OPTIONAL_FIELDS:
            if field in data and data[field]:
                record[field] = data[field]

        batch.append(record)
        stats["total"] += 1

        # 3.3 Submit when the batch is full
        if len(batch) >= BATCH_SIZE:
            failure = _post_batch(api_client, endpoint, batch, stats, errors)
            if failure is not None:
                return failure
            batch = []

    # 4. Submit the final, partial batch
    if batch:
        failure = _post_batch(api_client, endpoint, batch, stats, errors)
        if failure is not None:
            return failure

    # 5. Result
    return {
        "success": True,
        "evalset_id": evalset_id,
        "stats": stats,
        "errors": errors
    }
|
|
365
|
+
|
|
366
|
+
|
|
367
|
+
# ============================================================================
|
|
368
|
+
# CLI 入口
|
|
369
|
+
# ============================================================================
|
|
370
|
+
|
|
371
|
+
def main():
    """Command-line entry point of the evaluation-set tool.

    Registers the ``analysis``, ``normalize`` and ``submit`` subcommands,
    runs the selected one and prints its result as a single JSON line. Any
    exception raised by the subcommand is passed to ``handle_cli_error``.
    """
    cli = argparse.ArgumentParser(description='评测集管理')
    commands = cli.add_subparsers(dest='command', help='子命令')

    # (name, help text, handler, ((flag, help text), ...)); every flag is required
    command_specs = (
        ('analysis', '解析评测集结构', cmd_analysis, (
            ('--input', '评测集文件路径'),
            ('--output', '输出结构文件路径'),
        )),
        ('normalize', '标准化评测集', cmd_normalize, (
            ('--input', '原始评测集文件路径'),
            ('--mapping', '字段映射文件路径'),
            ('--output', '输出文件路径'),
        )),
        ('submit', '提交评测集', cmd_submit, (
            ('--evalset', '标准化评测集文件路径'),
            ('--config', '服务配置文件'),
            ('--auth', '鉴权信息文件'),
            ('--output', '输出文件路径'),
        )),
    )
    for command_name, summary, handler, options in command_specs:
        sub = commands.add_parser(command_name, help=summary)
        for flag, description in options:
            sub.add_argument(flag, required=True, help=description)
        sub.set_defaults(func=handler)

    parsed = cli.parse_args()

    # Python 3.6 compatibility: check for a missing subcommand by hand
    if parsed.command is None:
        cli.error("请指定子命令: analysis, normalize, submit")

    try:
        payload = parsed.func(parsed)
        print(json.dumps(payload, ensure_ascii=False))
    except Exception as exc:
        handle_cli_error(exc)


if __name__ == '__main__':
    main()
|