astron-eval 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +119 -0
  3. package/bin/astron-eval.mjs +111 -0
  4. package/package.json +24 -0
  5. package/skills/astron-eval/SKILL.md +60 -0
  6. package/skills/model-evaluation/SKILL.md +180 -0
  7. package/skills/model-evaluation/assets/dimensions/内容相关性维度.json +20 -0
  8. package/skills/model-evaluation/assets/dimensions/内容精确维度.json +19 -0
  9. package/skills/model-evaluation/assets/dimensions/准确性维度-个性化规划.json +20 -0
  10. package/skills/model-evaluation/assets/dimensions/准确性维度-信息分析.json +20 -0
  11. package/skills/model-evaluation/assets/dimensions/准确性维度-旅游出行.json +20 -0
  12. package/skills/model-evaluation/assets/dimensions/准确性维度.json +20 -0
  13. package/skills/model-evaluation/assets/dimensions/创意性-吸引性维度.json +21 -0
  14. package/skills/model-evaluation/assets/dimensions/创新性维度.json +20 -0
  15. package/skills/model-evaluation/assets/dimensions/完整性维度-信息分析.json +20 -0
  16. package/skills/model-evaluation/assets/dimensions/完整性维度.json +20 -0
  17. package/skills/model-evaluation/assets/dimensions/形式相关性维度.json +20 -0
  18. package/skills/model-evaluation/assets/dimensions/忠诚度维度.json +20 -0
  19. package/skills/model-evaluation/assets/dimensions/指令遵循维度.json +20 -0
  20. package/skills/model-evaluation/assets/dimensions/文本差异度-TER维度.json +20 -0
  21. package/skills/model-evaluation/assets/dimensions/有效性维度-个性化规划.json +20 -0
  22. package/skills/model-evaluation/assets/dimensions/有效性维度-信息分析.json +20 -0
  23. package/skills/model-evaluation/assets/dimensions/有效性维度-流程自动化.json +20 -0
  24. package/skills/model-evaluation/assets/dimensions/有效性维度.json +21 -0
  25. package/skills/model-evaluation/assets/dimensions/核心元素维度.json +20 -0
  26. package/skills/model-evaluation/assets/dimensions/格式遵循维度.json +19 -0
  27. package/skills/model-evaluation/assets/dimensions/特色亮点维度.json +20 -0
  28. package/skills/model-evaluation/assets/dimensions/用例级评测维度模板.json +25 -0
  29. package/skills/model-evaluation/assets/dimensions/相似度-BERTScore维度.json +20 -0
  30. package/skills/model-evaluation/assets/dimensions/相似度-Cosine维度.json +20 -0
  31. package/skills/model-evaluation/assets/dimensions/相似度-ROUGE维度.json +20 -0
  32. package/skills/model-evaluation/assets/dimensions/相关性维度-个性化规划.json +20 -0
  33. package/skills/model-evaluation/assets/dimensions/相关性维度.json +21 -0
  34. package/skills/model-evaluation/assets/dimensions/精确性-BLUE维度.json +20 -0
  35. package/skills/model-evaluation/assets/dimensions/精确性-COMET维度.json +20 -0
  36. package/skills/model-evaluation/assets/dimensions/逻辑合理性维度.json +20 -0
  37. package/skills/model-evaluation/assets/dimensions/逻辑连贯性维度-个性化规划.json +20 -0
  38. package/skills/model-evaluation/assets/dimensions/逻辑连贯性维度-信息分析.json +20 -0
  39. package/skills/model-evaluation/assets/dimensions/逻辑连贯性维度-流程自动化.json +20 -0
  40. package/skills/model-evaluation/assets/dimensions/逻辑连贯性维度.json +21 -0
  41. package/skills/model-evaluation/assets/eval-judge.json +11 -0
  42. package/skills/model-evaluation/assets/experts/business-process-automation.json +71 -0
  43. package/skills/model-evaluation/assets/experts/content-generation.json +75 -0
  44. package/skills/model-evaluation/assets/experts/content-match.json +37 -0
  45. package/skills/model-evaluation/assets/experts/information-analysis.json +87 -0
  46. package/skills/model-evaluation/assets/experts/marketing-digital-human.json +27 -0
  47. package/skills/model-evaluation/assets/experts/personalized-planning.json +87 -0
  48. package/skills/model-evaluation/assets/experts/text-translation.json +103 -0
  49. package/skills/model-evaluation/assets/experts/tourism-travel.json +119 -0
  50. package/skills/model-evaluation/assets/templates/custom-dimension.template.json +30 -0
  51. package/skills/model-evaluation/eval-build.md +281 -0
  52. package/skills/model-evaluation/eval-execute.md +196 -0
  53. package/skills/model-evaluation/eval-init.md +237 -0
  54. package/skills/model-evaluation/processes/dimension-process.md +207 -0
  55. package/skills/model-evaluation/processes/evalset-create-process.md +184 -0
  56. package/skills/model-evaluation/processes/evalset-parse-process.md +171 -0
  57. package/skills/model-evaluation/processes/evalset-supplement-process.md +136 -0
  58. package/skills/model-evaluation/processes/keypoint-process.md +148 -0
  59. package/skills/model-evaluation/processes/python-env-process.md +113 -0
  60. package/skills/model-evaluation/references/中间产物说明.md +340 -0
  61. package/skills/model-evaluation/references/内置模板说明.md +149 -0
  62. package/skills/model-evaluation/references/脚本定义.md +274 -0
  63. package/skills/model-evaluation/references/认证服务接口说明.md +271 -0
  64. package/skills/model-evaluation/references/评测服务接口说明.md +455 -0
  65. package/skills/model-evaluation/references/评测维度说明.md +171 -0
  66. package/skills/model-evaluation/scripts/cfg/eval-auth.cfg +16 -0
  67. package/skills/model-evaluation/scripts/cfg/eval-server.cfg +1 -0
  68. package/skills/model-evaluation/scripts/clients/__init__.py +33 -0
  69. package/skills/model-evaluation/scripts/clients/api_client.py +97 -0
  70. package/skills/model-evaluation/scripts/clients/auth_client.py +96 -0
  71. package/skills/model-evaluation/scripts/clients/http_client.py +199 -0
  72. package/skills/model-evaluation/scripts/clients/oauth_callback.py +397 -0
  73. package/skills/model-evaluation/scripts/clients/token_manager.py +53 -0
  74. package/skills/model-evaluation/scripts/eval_auth.py +588 -0
  75. package/skills/model-evaluation/scripts/eval_dimension.py +240 -0
  76. package/skills/model-evaluation/scripts/eval_set.py +410 -0
  77. package/skills/model-evaluation/scripts/eval_task.py +324 -0
  78. package/skills/model-evaluation/scripts/files/__init__.py +38 -0
  79. package/skills/model-evaluation/scripts/files/file_utils.py +330 -0
  80. package/skills/model-evaluation/scripts/files/streaming.py +245 -0
  81. package/skills/model-evaluation/scripts/utils/__init__.py +128 -0
  82. package/skills/model-evaluation/scripts/utils/constants.py +101 -0
  83. package/skills/model-evaluation/scripts/utils/datetime_utils.py +60 -0
  84. package/skills/model-evaluation/scripts/utils/errors.py +244 -0
  85. package/skills/model-evaluation/scripts/utils/keypoint_prompts.py +73 -0
  86. package/skills/skill-driven-eval/SKILL.md +456 -0
  87. package/skills/skill-driven-eval/agents/grader.md +144 -0
  88. package/skills/skill-driven-eval/eval-viewer/__init__.py +1 -0
  89. package/skills/skill-driven-eval/eval-viewer/generate_report.py +485 -0
  90. package/skills/skill-driven-eval/eval-viewer/viewer.html +767 -0
  91. package/skills/skill-driven-eval/references/schemas.md +282 -0
  92. package/skills/skill-driven-eval/scripts/__init__.py +1 -0
  93. package/skills/skill-driven-eval/scripts/__main__.py +70 -0
  94. package/skills/skill-driven-eval/scripts/aggregate_results.py +681 -0
  95. package/skills/skill-driven-eval/scripts/extract_transcript.py +294 -0
  96. package/skills/skill-driven-eval/scripts/test_aggregate.py +244 -0
@@ -0,0 +1,240 @@
1
+ #!/usr/bin/env python3
2
+ """维度配置工具:校验配置、更新judge_id"""
3
+ import argparse
4
+ import json
5
+ import sys
6
+ from pathlib import Path
7
+ from typing import List, Optional, Union
8
+
9
+ from utils import (
10
+ result,
11
+ VALID_DIMENSION_TYPES,
12
+ BUILTIN_FUNCTIONS,
13
+ )
14
+ from files import (
15
+ load_json,
16
+ save_json,
17
+ )
18
+
19
+
20
+ # ============================================================================
21
+ # 校验逻辑
22
+ # ============================================================================
23
+
24
def validate_dimension(dim: dict, idx: int) -> List[str]:
    """Validate a single dimension entry and return the problems found.

    Args:
        dim: One entry of the ``evals`` array (already known to be a dict).
        idx: Position of the entry in the array; used to label errors when
            the entry has no usable name.

    Returns:
        A list of human-readable error strings; empty when the entry is valid.
    """
    errors: List[str] = []
    # Fall back to the positional label when 'name' is absent *or* empty, so
    # later messages never read "[] ..." or "[None] ...".
    name = dim.get("name") or f"index_{idx}"

    if not dim.get("name"):
        errors.append(f"[{idx}] missing 'name'")
    if not dim.get("type"):
        errors.append(f"[{name}] missing 'type'")
        return errors

    dtype = dim["type"]
    if dtype not in VALID_DIMENSION_TYPES:
        errors.append(f"[{name}] invalid type '{dtype}'")
        return errors

    if dtype in ("llm-score", "llm-judge"):
        # LLM-based dimensions need a judge, a weight and a prompt.
        if not dim.get("judge_id"):
            errors.append(f"[{name}] missing 'judge_id'")

        errors.extend(_check_weight(dim, name))

        if "prompt" not in dim:
            errors.append(f"[{name}] missing 'prompt'")
        elif isinstance(dim["prompt"], dict):
            # A structured prompt must carry all three sections; a prompt of
            # any other type is accepted as-is.
            for section in ("definition", "instruct", "step"):
                if not dim["prompt"].get(section):
                    errors.append(f"[{name}] prompt.{section} missing")

    elif dtype == "builtin":
        # Built-in functions are evaluated locally, so a judge is an error.
        if "judge_id" in dim:
            errors.append(f"[{name}] builtin type should not have 'judge_id'")

        func = dim.get("func")
        if not func:
            errors.append(f"[{name}] missing 'func'")
        elif func not in BUILTIN_FUNCTIONS:
            errors.append(f"[{name}] invalid func '{func}'")

        errors.extend(_check_weight(dim, name))

    return errors


def _check_weight(dim: dict, name: str) -> List[str]:
    """Return the errors for the required 'weight' field (a number in [0, 1])."""
    w = dim.get("weight")
    if w is None:
        return [f"[{name}] missing 'weight'"]
    # bool is a subclass of int; a JSON true/false is not a meaningful weight.
    if isinstance(w, bool) or not isinstance(w, (int, float)) or not (0 <= w <= 1):
        return [f"[{name}] invalid weight '{w}'"]
    return []
82
+
83
+
84
def check_config(path: str) -> dict:
    """Validate a dimension configuration file.

    Args:
        path: Path of the JSON configuration file.

    Returns:
        A dict with ``success`` (bool), ``file`` (the given path), ``errors``
        (list of messages) and ``dimensions`` (one ``{"name", "valid"}``
        entry per validated dimension).
    """
    result_obj = {"success": True, "file": path, "errors": [], "dimensions": []}

    if not Path(path).exists():
        return {**result_obj, "success": False, "errors": [f"file not found: {path}"]}

    load_result = load_json(path)
    if not load_result.get("success"):
        return {**result_obj, "success": False, "errors": [load_result.get("message", "load failed")]}

    data = load_result.get("data")
    if not isinstance(data, dict):
        return {**result_obj, "success": False, "errors": ["root must be object"]}

    errors = result_obj["errors"]

    # Root fields that are known mistakes. A tuple (not a set) keeps the
    # report order stable between runs.
    for field in ("scene", "scene_type", "dimensions"):
        if field in data:
            errors.append(f"invalid root field '{field}', use correct field name")

    # The dimension array must live under 'evals'.
    if "evals" not in data:
        if "dimensions" in data:
            errors.append("'dimensions' is invalid, use 'evals' instead")
        else:
            errors.append("missing 'evals' field")

    dims = data.get("evals", [])
    if not isinstance(dims, list):
        # Keep the root-level errors collected so far instead of dropping them.
        errors.append("'evals' must be array")
        result_obj["success"] = False
        return result_obj

    checked = []  # the dict that was actually validated for each entry
    for i, d in enumerate(dims):
        if not isinstance(d, dict):
            errors.append(f"[{i}] must be object")
            continue

        # A 'config' object carrying 'type' means the dimension fields were
        # wrongly nested; report it, then validate the nested fields.
        nested = d.get("config")
        if isinstance(nested, dict) and "type" in nested:
            name = d.get("name", f"index_{i}")
            errors.append(f"[{name}] dimension fields should not be nested in 'config'")
            d = nested

        errs = validate_dimension(d, i)
        errors.extend(errs)
        result_obj["dimensions"].append({"name": d.get("name", f"index_{i}"), "valid": not errs})
        checked.append(d)

    # Weights must add up to 1. Sum over the dicts that were validated (so
    # weights unwrapped from 'config' count) and skip non-numeric values,
    # which have already been reported per dimension.
    weights = [d["weight"] for d in checked if "weight" in d]
    if weights:
        total = sum(
            w for w in weights
            if isinstance(w, (int, float)) and not isinstance(w, bool)
        )
        if abs(total - 1.0) > 0.0001:
            errors.append(f"weight sum {total:.4f} != 1.0")

    if errors:
        result_obj["success"] = False
    return result_obj
149
+
150
+
151
+ # ============================================================================
152
+ # 更新 judge_id
153
+ # ============================================================================
154
+
155
def get_judge_id(config: Union[dict, list]) -> Optional[str]:
    """Extract the judge id from a judge configuration.

    Accepted shapes:
        * a list whose first element is an object with an ``id``;
        * an object with a top-level ``id``;
        * an object whose ``models`` list starts with an object with an ``id``.

    Returns:
        The id, or ``None`` when the configuration has none of these shapes
        (including an empty or null ``models``).
    """
    if isinstance(config, list):
        first = config[0] if config else None
        return first.get("id") if isinstance(first, dict) else None

    if isinstance(config, dict):
        if config.get("id"):
            return config["id"]
        models = config.get("models")
        # Guard against "models": [] / null / non-object entries, which
        # would otherwise raise IndexError / TypeError / AttributeError.
        if isinstance(models, list) and models and isinstance(models[0], dict):
            return models[0].get("id")

    return None
162
+
163
+
164
def update_config(dim_path: str, judge_path: str, output_path: Optional[str]) -> dict:
    """Write the judge id from a judge config into every LLM dimension.

    Args:
        dim_path: Dimension configuration file to update.
        judge_path: Judge configuration file the id is read from.
        output_path: Where to save the result; ``None`` overwrites ``dim_path``.

    Returns:
        A dict with ``success`` and ``errors``; on success also ``judge_id``,
        ``updated`` (number of dimensions changed) and ``output``.
    """
    result_obj = {"success": True, "errors": []}

    # Judge configuration -> judge id.
    judge_result = load_json(judge_path)
    if not judge_result.get("success"):
        return {"success": False, "errors": [f"load judge config failed: {judge_result.get('message')}"]}

    judge_id = get_judge_id(judge_result.get("data"))
    if not judge_id:
        return {"success": False, "errors": ["judge_id not found in config"]}
    result_obj["judge_id"] = judge_id

    # Dimension configuration.
    dim_result = load_json(dim_path)
    if not dim_result.get("success"):
        return {"success": False, "errors": [f"load dimension config failed: {dim_result.get('message')}"]}
    dim_config = dim_result.get("data")

    # Refuse malformed documents rather than crashing or rewriting a file we
    # could not interpret.
    if not isinstance(dim_config, dict):
        return {"success": False, "errors": ["root must be object"]}
    evals = dim_config.get("evals", [])
    if not isinstance(evals, list):
        return {"success": False, "errors": ["'evals' must be array"]}

    # Only LLM-based dimensions carry a judge_id.
    updated = 0
    for dim in evals:
        if isinstance(dim, dict) and dim.get("type") in ("llm-score", "llm-judge"):
            dim["judge_id"] = judge_id
            updated += 1
    result_obj["updated"] = updated

    out = output_path or dim_path
    save_result = save_json(out, dim_config)
    if not save_result.get("success"):
        result_obj["success"] = False
        result_obj["errors"].append(f"save failed: {save_result.get('message')}")
    else:
        result_obj["output"] = out

    return result_obj
204
+
205
+
206
+ # ============================================================================
207
+ # CLI 入口
208
+ # ============================================================================
209
+
210
def main():
    """CLI entry point: validate a dimension config or update its judge_id.

    The result is printed to stdout as JSON for both actions. For ``check``
    a human-readable summary is printed first unless ``--quiet`` is given.
    The process exits with status 0 on success and 1 otherwise.
    """
    parser = argparse.ArgumentParser(description="维度配置工具")
    parser.add_argument("-a", "--action", required=True, choices=["check", "update"], help="操作类型")
    parser.add_argument("-d", "--dimension", help="维度配置文件")
    parser.add_argument("-j", "--judge", help="评委配置文件(update时必需)")
    parser.add_argument("-o", "--output", help="输出文件路径")
    parser.add_argument("--quiet", action="store_true", help="仅输出JSON")
    args = parser.parse_args()

    # Named 'outcome' so it does not shadow the 'result' helper imported at
    # module level.
    if args.action == "check":
        if not args.dimension:
            parser.error("--dimension required for check")
        outcome = check_config(args.dimension)

        if not args.quiet:
            status = "PASS" if outcome["success"] else f"FAIL ({len(outcome['errors'])} errors)"
            print(f"Validation: {status}")
            for e in outcome["errors"]:
                print(f"  - {e}")
    else:
        # argparse 'choices' guarantees the only other action is "update".
        if not args.dimension or not args.judge:
            parser.error("--dimension and --judge required for update")
        outcome = update_config(args.dimension, args.judge, args.output)

    # The JSON result is emitted for every action, which is what --quiet
    # ("only output JSON") relies on.
    print(json.dumps(outcome, ensure_ascii=False, indent=2))

    sys.exit(0 if outcome.get("success") else 1)


if __name__ == "__main__":
    main()
@@ -0,0 +1,410 @@
1
+ #!/usr/bin/env python3
2
+ """评测集管理:解析、标准化、提交"""
3
+ import argparse
4
+ import json
5
+ import math
6
+ import random
7
+ import string
8
+ import sys
9
+ from pathlib import Path
10
+
11
+ from utils import (
12
+ OPTIONAL_FIELDS,
13
+ ERR_REMOTE_DEFAULT,
14
+ handle_cli_error,
15
+ )
16
+ from files import (
17
+ load_json,
18
+ save_json,
19
+ load_config_kv,
20
+ load_data,
21
+ load_jsonl_stream,
22
+ extract_fields,
23
+ )
24
+ from clients import (
25
+ ApiClient,
26
+ TokenManager,
27
+ )
28
+
29
+
30
def cmd_analysis(args):
    """Inspect an evaluation-set file and write its structure description.

    The only artifact is the structure file (file path, format, row count and
    field information). Field mapping is produced separately by the calling
    agent, not inferred here.

    Args:
        args: Parsed CLI arguments providing ``input`` and ``output`` paths.

    Returns:
        A dict with ``success``, ``total_rows``, ``fields`` (field names) and
        ``structure_file``.

    Raises:
        ValueError: If the input cannot be loaded or the structure file
            cannot be saved.
    """
    load_result = load_data(args.input)
    if not load_result.get("success"):
        raise ValueError(f"数据加载失败: {load_result.get('message')}")
    # Tolerate a null 'data' / 'items' payload instead of crashing on it.
    data = (load_result.get("data") or {}).get("items") or []

    fields = extract_fields(data)

    structure = {
        "file": args.input,
        # File extension without the leading dot, e.g. "jsonl".
        "format": Path(args.input).suffix.lower()[1:],
        "total_rows": len(data),
        "fields": fields
    }
    # Do not report success when the artifact was not actually written.
    save_result = save_json(args.output, structure)
    if isinstance(save_result, dict) and not save_result.get("success"):
        raise ValueError(f"结构文件保存失败: {save_result.get('message')}")

    return {
        "success": True,
        "total_rows": len(data),
        "fields": list(fields.keys()),
        "structure_file": args.output
    }
59
+
60
+
61
+ # ============================================================================
62
+ # 标准化
63
+ # ============================================================================
64
+
65
def normalize_data(data: list, mapping: dict) -> list:
    """Convert raw rows into the standard evaluation-set record format.

    Mapping format::

        {
            "question": {"source_field": "question", "default": null},
            "answer":   {"source_field": "answer",   "default": null},
            "model":    {"source_field": null, "default": "deepseek-r1"},
            "case_id":  {"source_field": "id", "default": null}
        }

    A mapping value may also be a plain string (legacy format), which is
    treated as the source field name.

    Rules:
        * question / answer: required; rows where either is missing, NaN or
          blank are skipped with a warning on stderr.
        * case_id: taken from the source field when it has a value,
          otherwise generated per distinct question (``case-0001``, ...).
          The mapping's ``default`` is not used for case_id.
        * model: source value, else the mapping default, else ``"default"``.
        * optional fields: source value when the row has the field and the
          value is non-empty; the default only when the row lacks the field.

    Args:
        data: Raw rows (dicts).
        mapping: Field mapping as described above.

    Returns:
        The list of normalized records.

    Raises:
        ValueError: If the mapping lacks a source field for question or
            answer, or a mapping entry has an unsupported type.
    """
    def is_missing(value) -> bool:
        # None, NaN (pandas yields NaN for empty spreadsheet cells) and blank
        # text all count as "no value"; str() of the first two would
        # otherwise leak "None" / "nan" into the output.
        if value is None:
            return True
        if isinstance(value, float) and math.isnan(value):
            return True
        return not str(value).strip()

    def get_field_config(field_name):
        config = mapping.get(field_name)
        if config is None:
            return {}
        if isinstance(config, str):
            # Legacy format: the value is the source field name itself.
            return {"source_field": config, "default": None}
        if not isinstance(config, dict):
            raise ValueError(f"字段映射 {field_name} 格式无效")
        return config

    q_field = get_field_config('question').get('source_field')
    a_field = get_field_config('answer').get('source_field')
    if not q_field or not a_field:
        raise ValueError("字段映射必须包含 question 和 answer 的 source_field")

    m_config = get_field_config('model')
    m_field = m_config.get('source_field')
    m_default = m_config.get('default')
    c_field = get_field_config('case_id').get('source_field')

    # Only optional fields that are actually mapped or defaulted matter.
    opt_field_configs = {}
    for f in OPTIONAL_FIELDS:
        config = get_field_config(f)
        if config.get('source_field') or config.get('default'):
            opt_field_configs[f] = config

    records = []
    question_to_case = {}  # question text -> generated case id
    case_counter = 0

    for idx, item in enumerate(data):
        raw_question = item.get(q_field) if isinstance(item, dict) else None
        raw_answer = item.get(a_field) if isinstance(item, dict) else None
        if is_missing(raw_question) or is_missing(raw_answer):
            print(f"警告: 第{idx+1}行缺少必要字段", file=sys.stderr)
            continue
        question = str(raw_question)
        answer = str(raw_answer)

        # case_id: source value if present, else one id per distinct question.
        raw_case = item.get(c_field) if c_field else None
        if not is_missing(raw_case):
            case_id = str(raw_case)
        else:
            if question not in question_to_case:
                case_counter += 1
                question_to_case[question] = f'case-{case_counter:04d}'
            case_id = question_to_case[question]

        # model: source value -> mapping default -> literal 'default'.
        raw_model = item.get(m_field) if m_field else None
        if not is_missing(raw_model):
            model_value = str(raw_model)
        elif m_default:
            model_value = str(m_default)
        else:
            model_value = 'default'

        record = {
            "question": question,
            "answer": answer,
            "model": model_value,
            "case_id": case_id
        }

        for std_field, config in opt_field_configs.items():
            src_field = config.get('source_field')
            default_val = config.get('default')
            if src_field and src_field in item:
                value = item.get(src_field)
                if not is_missing(value):
                    record[std_field] = str(value)
            elif default_val:
                record[std_field] = str(default_val)

        records.append(record)
    return records
163
+
164
+
165
def cmd_normalize(args):
    """Normalize an evaluation set and write it out as JSON Lines.

    Args:
        args: Parsed CLI arguments providing ``input``, ``mapping`` and
            ``output`` paths.

    Returns:
        A dict with ``success``, ``input_rows``, ``output_rows`` and
        ``output_file``.

    Raises:
        ValueError: If the input or mapping cannot be loaded, or if the
            evaluation set is empty before or after conversion.
    """
    loaded = load_data(args.input)
    if not loaded.get("success"):
        raise ValueError(f"数据加载失败: {loaded.get('message')}")
    rows = loaded.get("data", {}).get("items", [])
    if not rows:
        raise ValueError("评测集为空或无法解析")

    mapping_loaded = load_json(args.mapping)
    if not mapping_loaded.get("success"):
        raise ValueError(f"映射文件加载失败: {mapping_loaded.get('message')}")

    converted = normalize_data(rows, mapping_loaded.get("data", {}))
    if not converted:
        raise ValueError("转换后的评测集为空")

    # One JSON document per line, newline-separated.
    target = Path(args.output)
    target.parent.mkdir(parents=True, exist_ok=True)
    serialized = [json.dumps(record, ensure_ascii=False) for record in converted]
    target.write_text('\n'.join(serialized), encoding='utf-8')

    return {
        "success": True,
        "input_rows": len(rows),
        "output_rows": len(converted),
        "output_file": args.output,
    }
187
+
188
+
189
+ # ============================================================================
190
+ # 提交
191
+ # ============================================================================
192
+
193
def generate_evalset_id() -> str:
    """Return a new evaluation-set id: ``eval-`` plus 8 random characters.

    The characters are drawn from lowercase ASCII letters and digits.
    """
    alphabet = string.ascii_lowercase + string.digits
    picked = random.choices(alphabet, k=8)
    return "eval-" + "".join(picked)
197
+
198
+
199
def cmd_submit(args):
    """Submit a normalized (JSON Lines) evaluation set to the backend.

    Args:
        args: Parsed CLI arguments providing ``evalset``, ``config``,
            ``auth`` and ``output`` paths.

    Returns:
        A dict with ``evalset_id`` and ``total`` (number of items sent).

    Raises:
        ValueError: If a line cannot be parsed or lacks question/answer, the
            set is empty, the service config cannot be loaded, or the result
            file cannot be written.
    """
    items = []
    # Split on '\n' only: str.splitlines() also breaks on U+2028, U+2029 and
    # U+0085, which json.dumps(ensure_ascii=False) leaves unescaped inside
    # strings, so a record containing them would be cut in half.
    raw_lines = Path(args.evalset).read_text(encoding='utf-8').split('\n')
    for idx, line in enumerate(raw_lines):
        if not line.strip():
            continue
        try:
            case = json.loads(line)
            if not isinstance(case, dict):
                raise TypeError("每行必须是 JSON 对象")
            # Required fields; case_id and model fall back to defaults.
            item = {
                "case_id": case.get('case_id', f'case-{idx+1:04d}'),
                "model": case.get('model', 'default'),
                "question": case['question'],
                "answer": case['answer']
            }
            # Optional fields are forwarded only when they hold a value.
            for field in OPTIONAL_FIELDS:
                if field in case and case[field]:
                    item[field] = case[field]
            items.append(item)
        except (json.JSONDecodeError, KeyError, TypeError) as e:
            raise ValueError(f"评测集第{idx+1}行解析失败: {e}") from e

    if not items:
        raise ValueError("评测集为空")

    config_result = load_config_kv(args.config)
    if not config_result.get("success"):
        raise ValueError(f"配置文件加载失败: {config_result.get('message')}")
    config = config_result.get("data", {})

    token_manager = TokenManager(args.auth)
    client = ApiClient(token_manager, config.get('base_url', 'http://127.0.0.1:8080'))

    evalset_id = generate_evalset_id()
    client.post("/open/api/v1/evalset", json={"evalset_id": evalset_id, "items": items})

    # The set is already submitted at this point, so a failed write must
    # still surface the id to the caller.
    save_result = save_json(args.output, {"dataset": evalset_id, "total": len(items)})
    if isinstance(save_result, dict) and not save_result.get("success"):
        raise ValueError(
            f"结果文件保存失败 (evalset_id={evalset_id}): {save_result.get('message')}"
        )
    return {"evalset_id": evalset_id, "total": len(items)}
242
+
243
+
244
+ # ============================================================================
245
+ # 批次提交(流式处理)
246
+ # ============================================================================
247
+
248
# D-36: batch size is fixed at 500 records
BATCH_SIZE = 500


def _post_batch(api_client, endpoint, batch, stats, errors):
    """Post one batch; return None on success or the failure result dict."""
    try:
        api_client.post(endpoint, items=batch)
    except Exception as e:
        # D-38: a batch-level failure stops processing immediately.
        stats["failed"] += len(batch)
        error_msg = e.message if hasattr(e, 'message') else str(e)
        errors.append({"line": 0, "message": f"批次提交失败: {error_msg}"})
        return {
            "success": False,
            "code": getattr(e, 'code', ERR_REMOTE_DEFAULT),
            "message": error_msg,
            "stats": stats,
            "errors": errors
        }
    stats["success"] += len(batch)
    stats["batches"] += 1
    # D-41, D-42, D-43: one JSON progress line on stderr per finished batch.
    print(json.dumps({"progress": stats["total"], "batches": stats["batches"]}), file=sys.stderr)
    return None


def cmd_submit_batch(file_path: str, api_client, endpoint: str) -> dict:
    """Submit an evaluation set in batches while streaming the file.

    Args:
        file_path: Path of the JSONL file.
        api_client: API client instance; its ``post`` method is called as
            ``post(endpoint, items=batch)``.
        endpoint: API endpoint.

    Returns:
        On success: ``success``, ``evalset_id``, ``stats`` and ``errors``.
        On a batch failure: ``success`` (False), ``code``, ``message``,
        ``stats`` and ``errors``.

    Design decisions:
        - D-36: batch size is fixed at 500 records
        - D-37: batches are submitted sequentially and synchronously
        - D-38: processing stops at the first batch-level failure
        - D-39: per-record errors are collected in the ``errors`` array
        - D-40: each error report has ``line`` and ``message``
        - D-41: progress goes to stderr
        - D-42: progress is printed once per finished batch
        - D-43: progress is JSON formatted
    """
    stream = load_jsonl_stream(file_path)

    batch = []
    stats = {"total": 0, "success": 0, "failed": 0, "batches": 0}
    errors = []
    # NOTE(review): this id is returned to the caller but is not part of any
    # request sent here, and the call shape (items=...) differs from
    # cmd_submit's post(..., json={...}) — confirm against ApiClient.post.
    evalset_id = generate_evalset_id()

    for item in stream:
        # Records the stream already flagged as broken.
        if item.get("success") is False:
            stats["total"] += 1
            stats["failed"] += 1
            errors.append({
                "line": item.get("line", 0),
                "message": item.get("message", "未知错误")
            })
            continue

        # presumably 'line' is the integer source line number — verify
        # against load_jsonl_stream.
        line_no = item.get("line", 0)
        data = item.get("data")

        # D-39: a record without the required fields is a per-record error,
        # not a crash.
        if not isinstance(data, dict) or 'question' not in data or 'answer' not in data:
            stats["total"] += 1
            stats["failed"] += 1
            errors.append({"line": line_no, "message": "缺少必要字段 question/answer"})
            continue

        record = {
            # Build the fallback id only when it is needed.
            "case_id": data['case_id'] if 'case_id' in data else f'case-{line_no:04d}',
            "model": data.get('model', 'default'),
            "question": data['question'],
            "answer": data['answer']
        }
        for field in OPTIONAL_FIELDS:
            if field in data and data[field]:
                record[field] = data[field]

        batch.append(record)
        stats["total"] += 1

        if len(batch) >= BATCH_SIZE:
            failure = _post_batch(api_client, endpoint, batch, stats, errors)
            if failure is not None:
                return failure
            batch = []

    # Whatever is left after the stream ends forms the last, short batch.
    if batch:
        failure = _post_batch(api_client, endpoint, batch, stats, errors)
        if failure is not None:
            return failure

    return {
        "success": True,
        "evalset_id": evalset_id,
        "stats": stats,
        "errors": errors
    }
365
+
366
+
367
+ # ============================================================================
368
+ # CLI 入口
369
+ # ============================================================================
370
+
371
def main():
    """CLI entry point: dispatch to the analysis / normalize / submit command.

    The selected command's result is printed to stdout as one JSON document;
    any exception is passed to ``handle_cli_error``.
    """
    parser = argparse.ArgumentParser(description='评测集管理')
    subparsers = parser.add_subparsers(dest='command', help='子命令')

    # (name, help, handler, ((flag, help), ...)) — every flag is required.
    command_specs = (
        ('analysis', '解析评测集结构', cmd_analysis, (
            ('--input', '评测集文件路径'),
            ('--output', '输出结构文件路径'),
        )),
        ('normalize', '标准化评测集', cmd_normalize, (
            ('--input', '原始评测集文件路径'),
            ('--mapping', '字段映射文件路径'),
            ('--output', '输出文件路径'),
        )),
        ('submit', '提交评测集', cmd_submit, (
            ('--evalset', '标准化评测集文件路径'),
            ('--config', '服务配置文件'),
            ('--auth', '鉴权信息文件'),
            ('--output', '输出文件路径'),
        )),
    )
    for cmd_name, cmd_help, handler, options in command_specs:
        sub = subparsers.add_parser(cmd_name, help=cmd_help)
        for flag, flag_help in options:
            sub.add_argument(flag, required=True, help=flag_help)
        sub.set_defaults(func=handler)

    args = parser.parse_args()

    # Python 3.6 compatibility: check for a missing sub-command by hand.
    if args.command is None:
        parser.error("请指定子命令: analysis, normalize, submit")

    try:
        outcome = args.func(args)
        print(json.dumps(outcome, ensure_ascii=False))
    except Exception as e:
        handle_cli_error(e)


if __name__ == '__main__':
    main()