astron-eval 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +119 -0
  3. package/bin/astron-eval.mjs +111 -0
  4. package/package.json +24 -0
  5. package/skills/astron-eval/SKILL.md +60 -0
  6. package/skills/model-evaluation/SKILL.md +180 -0
  7. package/skills/model-evaluation/assets/dimensions//345/206/205/345/256/271/347/233/270/345/205/263/346/200/247/347/273/264/345/272/246.json +20 -0
  8. package/skills/model-evaluation/assets/dimensions//345/206/205/345/256/271/347/262/276/347/241/256/347/273/264/345/272/246.json +19 -0
  9. package/skills/model-evaluation/assets/dimensions//345/207/206/347/241/256/346/200/247/347/273/264/345/272/246-/344/270/252/346/200/247/345/214/226/350/247/204/345/210/222.json +20 -0
  10. package/skills/model-evaluation/assets/dimensions//345/207/206/347/241/256/346/200/247/347/273/264/345/272/246-/344/277/241/346/201/257/345/210/206/346/236/220.json +20 -0
  11. package/skills/model-evaluation/assets/dimensions//345/207/206/347/241/256/346/200/247/347/273/264/345/272/246-/346/227/205/346/270/270/345/207/272/350/241/214.json +20 -0
  12. package/skills/model-evaluation/assets/dimensions//345/207/206/347/241/256/346/200/247/347/273/264/345/272/246.json +20 -0
  13. package/skills/model-evaluation/assets/dimensions//345/210/233/346/204/217/346/200/247-/345/220/270/345/274/225/346/200/247/347/273/264/345/272/246.json +21 -0
  14. package/skills/model-evaluation/assets/dimensions//345/210/233/346/226/260/346/200/247/347/273/264/345/272/246.json +20 -0
  15. package/skills/model-evaluation/assets/dimensions//345/256/214/346/225/264/346/200/247/347/273/264/345/272/246-/344/277/241/346/201/257/345/210/206/346/236/220.json +20 -0
  16. package/skills/model-evaluation/assets/dimensions//345/256/214/346/225/264/346/200/247/347/273/264/345/272/246.json +20 -0
  17. package/skills/model-evaluation/assets/dimensions//345/275/242/345/274/217/347/233/270/345/205/263/346/200/247/347/273/264/345/272/246.json +20 -0
  18. package/skills/model-evaluation/assets/dimensions//345/277/240/350/257/232/345/272/246/347/273/264/345/272/246.json +20 -0
  19. package/skills/model-evaluation/assets/dimensions//346/214/207/344/273/244/351/201/265/345/276/252/347/273/264/345/272/246.json +20 -0
  20. package/skills/model-evaluation/assets/dimensions//346/226/207/346/234/254/345/267/256/345/274/202/345/272/246-TER/347/273/264/345/272/246.json +20 -0
  21. package/skills/model-evaluation/assets/dimensions//346/234/211/346/225/210/346/200/247/347/273/264/345/272/246-/344/270/252/346/200/247/345/214/226/350/247/204/345/210/222.json +20 -0
  22. package/skills/model-evaluation/assets/dimensions//346/234/211/346/225/210/346/200/247/347/273/264/345/272/246-/344/277/241/346/201/257/345/210/206/346/236/220.json +20 -0
  23. package/skills/model-evaluation/assets/dimensions//346/234/211/346/225/210/346/200/247/347/273/264/345/272/246-/346/265/201/347/250/213/350/207/252/345/212/250/345/214/226.json +20 -0
  24. package/skills/model-evaluation/assets/dimensions//346/234/211/346/225/210/346/200/247/347/273/264/345/272/246.json +21 -0
  25. package/skills/model-evaluation/assets/dimensions//346/240/270/345/277/203/345/205/203/347/264/240/347/273/264/345/272/246.json +20 -0
  26. package/skills/model-evaluation/assets/dimensions//346/240/274/345/274/217/351/201/265/345/276/252/347/273/264/345/272/246.json +19 -0
  27. package/skills/model-evaluation/assets/dimensions//347/211/271/350/211/262/344/272/256/347/202/271/347/273/264/345/272/246.json +20 -0
  28. package/skills/model-evaluation/assets/dimensions//347/224/250/344/276/213/347/272/247/350/257/204/346/265/213/347/273/264/345/272/246/346/250/241/346/235/277.json +25 -0
  29. package/skills/model-evaluation/assets/dimensions//347/233/270/344/274/274/345/272/246-BERTScore/347/273/264/345/272/246.json +20 -0
  30. package/skills/model-evaluation/assets/dimensions//347/233/270/344/274/274/345/272/246-Cosine/347/273/264/345/272/246.json +20 -0
  31. package/skills/model-evaluation/assets/dimensions//347/233/270/344/274/274/345/272/246-ROUGE/347/273/264/345/272/246.json +20 -0
  32. package/skills/model-evaluation/assets/dimensions//347/233/270/345/205/263/346/200/247/347/273/264/345/272/246-/344/270/252/346/200/247/345/214/226/350/247/204/345/210/222.json +20 -0
  33. package/skills/model-evaluation/assets/dimensions//347/233/270/345/205/263/346/200/247/347/273/264/345/272/246.json +21 -0
  34. package/skills/model-evaluation/assets/dimensions//347/262/276/347/241/256/346/200/247-BLUE/347/273/264/345/272/246.json +20 -0
  35. package/skills/model-evaluation/assets/dimensions//347/262/276/347/241/256/346/200/247-COMET/347/273/264/345/272/246.json +20 -0
  36. package/skills/model-evaluation/assets/dimensions//351/200/273/350/276/221/345/220/210/347/220/206/346/200/247/347/273/264/345/272/246.json +20 -0
  37. package/skills/model-evaluation/assets/dimensions//351/200/273/350/276/221/350/277/236/350/264/257/346/200/247/347/273/264/345/272/246-/344/270/252/346/200/247/345/214/226/350/247/204/345/210/222.json +20 -0
  38. package/skills/model-evaluation/assets/dimensions//351/200/273/350/276/221/350/277/236/350/264/257/346/200/247/347/273/264/345/272/246-/344/277/241/346/201/257/345/210/206/346/236/220.json +20 -0
  39. package/skills/model-evaluation/assets/dimensions//351/200/273/350/276/221/350/277/236/350/264/257/346/200/247/347/273/264/345/272/246-/346/265/201/347/250/213/350/207/252/345/212/250/345/214/226.json +20 -0
  40. package/skills/model-evaluation/assets/dimensions//351/200/273/350/276/221/350/277/236/350/264/257/346/200/247/347/273/264/345/272/246.json +21 -0
  41. package/skills/model-evaluation/assets/eval-judge.json +11 -0
  42. package/skills/model-evaluation/assets/experts/business-process-automation.json +71 -0
  43. package/skills/model-evaluation/assets/experts/content-generation.json +75 -0
  44. package/skills/model-evaluation/assets/experts/content-match.json +37 -0
  45. package/skills/model-evaluation/assets/experts/information-analysis.json +87 -0
  46. package/skills/model-evaluation/assets/experts/marketing-digital-human.json +27 -0
  47. package/skills/model-evaluation/assets/experts/personalized-planning.json +87 -0
  48. package/skills/model-evaluation/assets/experts/text-translation.json +103 -0
  49. package/skills/model-evaluation/assets/experts/tourism-travel.json +119 -0
  50. package/skills/model-evaluation/assets/templates/custom-dimension.template.json +30 -0
  51. package/skills/model-evaluation/eval-build.md +281 -0
  52. package/skills/model-evaluation/eval-execute.md +196 -0
  53. package/skills/model-evaluation/eval-init.md +237 -0
  54. package/skills/model-evaluation/processes/dimension-process.md +207 -0
  55. package/skills/model-evaluation/processes/evalset-create-process.md +184 -0
  56. package/skills/model-evaluation/processes/evalset-parse-process.md +171 -0
  57. package/skills/model-evaluation/processes/evalset-supplement-process.md +136 -0
  58. package/skills/model-evaluation/processes/keypoint-process.md +148 -0
  59. package/skills/model-evaluation/processes/python-env-process.md +113 -0
  60. package/skills/model-evaluation/references//344/270/255/351/227/264/344/272/247/347/211/251/350/257/264/346/230/216.md +340 -0
  61. package/skills/model-evaluation/references//345/206/205/347/275/256/346/250/241/346/235/277/350/257/264/346/230/216.md +149 -0
  62. package/skills/model-evaluation/references//350/204/232/346/234/254/345/256/232/344/271/211.md +274 -0
  63. package/skills/model-evaluation/references//350/256/244/350/257/201/346/234/215/345/212/241/346/216/245/345/217/243/350/257/264/346/230/216.md +271 -0
  64. package/skills/model-evaluation/references//350/257/204/346/265/213/346/234/215/345/212/241/346/216/245/345/217/243/350/257/264/346/230/216.md +455 -0
  65. package/skills/model-evaluation/references//350/257/204/346/265/213/347/273/264/345/272/246/350/257/264/346/230/216.md +171 -0
  66. package/skills/model-evaluation/scripts/cfg/eval-auth.cfg +16 -0
  67. package/skills/model-evaluation/scripts/cfg/eval-server.cfg +1 -0
  68. package/skills/model-evaluation/scripts/clients/__init__.py +33 -0
  69. package/skills/model-evaluation/scripts/clients/api_client.py +97 -0
  70. package/skills/model-evaluation/scripts/clients/auth_client.py +96 -0
  71. package/skills/model-evaluation/scripts/clients/http_client.py +199 -0
  72. package/skills/model-evaluation/scripts/clients/oauth_callback.py +397 -0
  73. package/skills/model-evaluation/scripts/clients/token_manager.py +53 -0
  74. package/skills/model-evaluation/scripts/eval_auth.py +588 -0
  75. package/skills/model-evaluation/scripts/eval_dimension.py +240 -0
  76. package/skills/model-evaluation/scripts/eval_set.py +410 -0
  77. package/skills/model-evaluation/scripts/eval_task.py +324 -0
  78. package/skills/model-evaluation/scripts/files/__init__.py +38 -0
  79. package/skills/model-evaluation/scripts/files/file_utils.py +330 -0
  80. package/skills/model-evaluation/scripts/files/streaming.py +245 -0
  81. package/skills/model-evaluation/scripts/utils/__init__.py +128 -0
  82. package/skills/model-evaluation/scripts/utils/constants.py +101 -0
  83. package/skills/model-evaluation/scripts/utils/datetime_utils.py +60 -0
  84. package/skills/model-evaluation/scripts/utils/errors.py +244 -0
  85. package/skills/model-evaluation/scripts/utils/keypoint_prompts.py +73 -0
  86. package/skills/skill-driven-eval/SKILL.md +456 -0
  87. package/skills/skill-driven-eval/agents/grader.md +144 -0
  88. package/skills/skill-driven-eval/eval-viewer/__init__.py +1 -0
  89. package/skills/skill-driven-eval/eval-viewer/generate_report.py +485 -0
  90. package/skills/skill-driven-eval/eval-viewer/viewer.html +767 -0
  91. package/skills/skill-driven-eval/references/schemas.md +282 -0
  92. package/skills/skill-driven-eval/scripts/__init__.py +1 -0
  93. package/skills/skill-driven-eval/scripts/__main__.py +70 -0
  94. package/skills/skill-driven-eval/scripts/aggregate_results.py +681 -0
  95. package/skills/skill-driven-eval/scripts/extract_transcript.py +294 -0
  96. package/skills/skill-driven-eval/scripts/test_aggregate.py +244 -0
@@ -0,0 +1,282 @@
1
+ # JSON 模式
2
+
3
+ 本文档定义了 skill-driven-eval 使用的 JSON 模式。
4
+
5
+ ---
6
+
7
+ ## evals.json
8
+
9
+ 定义用于跨模型评估技能的测试用例。
10
+
11
+ ```json
12
+ {
13
+ "target_skill": "example-skill",
14
+ "target_skill_path": "/path/to/skill",
15
+ "models_to_compare": ["model-a", "model-b"],
16
+ "evals": [
17
+ {
18
+ "id": 1,
19
+ "name": "descriptive-name",
20
+ "prompt": "用户的任务提示词",
21
+ "expected_output": "预期结果的描述",
22
+ "assertions": [
23
+ "输出包含 X",
24
+ "技能成功完成了 Y"
25
+ ]
26
+ }
27
+ ]
28
+ }
29
+ ```
30
+
31
+ **字段:**
32
+ - `target_skill`:被评估的技能名称
33
+ - `target_skill_path`:技能目录的绝对路径
34
+ - `models_to_compare`:要比较的模型标识符列表(例如 "opus"、"sonnet"、"haiku" 或自定义模型 ID)
35
+ - `evals[].id`:唯一的整数标识符
36
+ - `evals[].name`:测试用例的描述性名称
37
+ - `evals[].prompt`:要执行的任务
38
+ - `evals[].expected_output`:成功的人类可读描述
39
+ - `evals[].assertions`:可验证的语句列表
40
+
41
+ ---
42
+
43
+ ## mapping.json
44
+
45
+ **对盲评估至关重要。** 将匿名运行 ID 映射到实际模型名称。由 MainAgent 在评分完成后创建,以防止评分者偏见。
46
+
47
+ 位于 `<workspace>/mapping.json`。
48
+
49
+ ```json
50
+ {
51
+ "run-001": {
52
+ "model": "opus",
53
+ "eval_id": 1,
54
+ "eval_name": "表单填写"
55
+ },
56
+ "run-002": {
57
+ "model": "sonnet",
58
+ "eval_id": 1,
59
+ "eval_name": "表单填写"
60
+ },
61
+ "run-003": {
62
+ "model": "opus",
63
+ "eval_id": 2,
64
+ "eval_name": "文本提取"
65
+ },
66
+ "run-004": {
67
+ "model": "sonnet",
68
+ "eval_id": 2,
69
+ "eval_name": "文本提取"
70
+ }
71
+ }
72
+ ```
73
+
74
+ **字段:**
75
+ - 键:匿名运行 ID(例如 "run-001")
76
+ - `model`:实际的模型标识符
77
+ - `eval_id`:此运行所属的评估 ID
78
+ - `eval_name`:人类可读的评估名称
79
+
80
+ **重要:** 此文件必须在评分完成后才能创建。评分子智能体不能访问此信息。
81
+
82
+ ---
83
+
84
+ ## timing.json
85
+
86
+ 单次运行的时间数据。位于 `<workspace>/run-<ID>/timing.json`。
87
+
88
+ ```json
89
+ {
90
+ "run_id": "run-001",
91
+ "total_tokens": 84852,
92
+ "duration_ms": 23332,
93
+ "total_duration_seconds": 23.3,
94
+ "start_time": "2026-03-19T10:30:00Z",
95
+   "end_time": "2026-03-19T10:30:23Z"
96
+ }
97
+ ```
98
+
99
+ **字段:**
100
+ - `run_id`:匿名运行标识符(不是模型名称)
101
+ - `total_tokens`:消耗的总 token 数(来自任务通知)
102
+ - `duration_ms`:持续时间(毫秒)
103
+ - `total_duration_seconds`:持续时间(秒)
104
+
105
+ ---
106
+
107
+ ## grading.json
108
+
109
+ 评分智能体的输出。位于 `<workspace>/run-<ID>/grading.json`。
110
+
111
+ **关键:** 不能包含模型名称或任何关于哪个模型产生了输出的识别信息。
112
+
113
+ ```json
114
+ {
115
+ "run_id": "run-001",
116
+ "expectations": [
117
+ {
118
+ "text": "输出包含名称 'John Smith'",
119
+ "passed": true,
120
+ "evidence": "在 output.txt 中找到:'提取的名称:John Smith'"
121
+ },
122
+ {
123
+ "text": "电子表格有 SUM 公式",
124
+ "passed": false,
125
+ "evidence": "没有创建电子表格"
126
+ }
127
+ ],
128
+ "summary": {
129
+   "passed": 1,
130
+   "failed": 1,
131
+   "total": 2,
132
+   "pass_rate": 0.5
133
+ },
134
+ "execution_metrics": {
135
+ "tool_calls": {
136
+ "Read": 5,
137
+ "Write": 2,
138
+ "Bash": 8
139
+ },
140
+ "total_tool_calls": 15,
141
+ "errors_encountered": 0
142
+ },
143
+ "issues": []
144
+ }
145
+ ```
146
+
147
+ **字段:**
148
+ - `run_id`:匿名运行标识符(不要包含模型)
149
+ - `expectations[]`:带证据的已评分预期
150
+ - `text`:预期文本
151
+ - `passed`:布尔值
152
+ - `evidence`:支持或反驳的证据
153
+ - `summary`:汇总统计数据
154
+ - `execution_metrics`:工具使用(可选)
155
+ - `issues`:观察到的问题(可选)
156
+
157
+ ---
158
+
159
+ ## benchmark.json
160
+
161
+ 汇总比较结果。位于 `<workspace>/benchmark.json`。
162
+
163
+ 由 MainAgent 在结合 grading.json 与 mapping.json 后生成。
164
+
165
+ ```json
166
+ {
167
+ "metadata": {
168
+ "target_skill": "pdf",
169
+ "target_skill_path": "/path/to/pdf",
170
+ "models_compared": ["opus", "sonnet"],
171
+ "timestamp": "2026-03-19T10:30:00Z",
172
+ "evals_run": [1, 2, 3],
173
+ "note": "结果基于盲评估"
174
+ },
175
+
176
+ "runs": [
177
+ {
178
+ "run_id": "run-001",
179
+ "eval_id": 1,
180
+ "eval_name": "表单填写",
181
+ "model": "opus",
182
+ "result": {
183
+ "pass_rate": 0.85,
184
+ "passed": 6,
185
+ "failed": 1,
186
+ "total": 7,
187
+ "time_seconds": 42.5,
188
+ "tokens": 38000,
189
+ "tool_calls": 18,
190
+ "errors": 0
191
+ },
192
+ "expectations": [
193
+ {"text": "...", "passed": true, "evidence": "..."}
194
+ ],
195
+ "issues": []
196
+ }
197
+ ],
198
+
199
+ "model_summary": {
200
+ "opus": {
201
+ "pass_rate": {"mean": 0.85, "stddev": 0.05, "min": 0.80, "max": 0.90},
202
+ "time_seconds": {"mean": 45.0, "stddev": 12.0, "min": 32.0, "max": 58.0},
203
+ "tokens": {"mean": 42000, "stddev": 4000, "min": 36000, "max": 48000}
204
+ },
205
+ "sonnet": {
206
+ "pass_rate": {"mean": 0.72, "stddev": 0.08, "min": 0.65, "max": 0.80},
207
+ "time_seconds": {"mean": 32.0, "stddev": 8.0, "min": 24.0, "max": 42.0},
208
+ "tokens": {"mean": 28000, "stddev": 3000, "min": 24000, "max": 32000}
209
+ }
210
+ },
211
+
212
+ "comparison": {
213
+ "pass_rate_delta": "+0.13",
214
+ "time_delta": "+13.0s",
215
+ "token_delta": "+14000",
216
+ "cost_efficiency": {
217
+ "opus": 20.2,
218
+ "sonnet": 25.7
219
+ }
220
+ },
221
+
222
+ "recommendations": [
223
+ {
224
+ "scenario": "高风险任务",
225
+ "recommended_model": "opus",
226
+ "reason": "13% 更高的通过率证明了关键操作的成本合理性"
227
+ },
228
+ {
229
+ "scenario": "大批量任务",
230
+ "recommended_model": "sonnet",
231
+ "reason": "批量操作具有更好的成本效益"
232
+ }
233
+ ],
234
+
235
+ "notes": [
236
+ "Opus 达到 85% 通过率,而 Sonnet 为 72%",
237
+ "Sonnet 平均快 30%"
238
+ ]
239
+ }
240
+ ```
241
+
242
+ **字段:**
243
+ - `metadata`:关于评估运行的信息
244
+ - `runs[]`:单次运行结果(映射后现在包含模型名称)
245
+ - `model_summary`:每个模型的统计汇总
246
+ - `comparison`:模型之间的差异和成本效益分数
247
+ - `recommendations`:数据驱动的建议(非预设)
248
+ - `notes`:分析者观察
249
+
250
+ ---
251
+
252
+ ## 成本效益分数
253
+
254
+ 成本效益计算公式:`pass_rate * 1000 / (tokens / 1000)`
255
+
256
+ 越高越好。这表示"每千 token 的质量分数"。
257
+
258
+ ---
259
+
260
+ ## 工作区结构
261
+
262
+ ```
263
+ <skill-name>-eval-workspace/
264
+ ├── evals.json # 测试用例定义
265
+ ├── mapping.json # 模型映射(评分后创建)
266
+ ├── run-001/ # 匿名运行 ID
267
+ │ ├── outputs/ # 模型的输出
268
+ │ ├── transcript.md # 执行记录
269
+ │ ├── grading.json # 评分结果(盲评)
270
+ │ └── timing.json # 时间数据
271
+ ├── run-002/
272
+ │ └── ...
273
+ ├── benchmark.json # 汇总结果(含模型)
274
+ ├── benchmark.md # 人类可读摘要
275
+ └── report.html # 可视化比较报告
276
+ ```
277
+
278
+ **信息流:**
279
+ 1. MainAgent 生成执行子智能体 → 输出到 `run-XXX/outputs/`
280
+ 2. MainAgent 生成评分子智能体 → 评分到 `run-XXX/grading.json`
281
+ 3. MainAgent 创建 `mapping.json`(所有评分完成后)
282
+ 4. MainAgent 运行聚合 → 结合评分 + 映射 → `benchmark.json`
@@ -0,0 +1 @@
1
+ """Scripts for skill-driven-eval."""
@@ -0,0 +1,70 @@
1
#!/usr/bin/env python3
"""Main entry point for skill-driven-eval scripts.

Usage:
    python -m scripts aggregate <workspace>
    python -m scripts test
"""

import json
import sys
from pathlib import Path

# Add parent directory to path so sibling modules (aggregate_results,
# test_aggregate) resolve when run as ``python -m scripts``.
sys.path.insert(0, str(Path(__file__).parent.parent))


def _cmd_aggregate(workspace: Path) -> None:
    """Aggregate per-run results in *workspace* into benchmark.json / benchmark.md.

    Deferred project import: aggregate_results lives next to this module and
    is only needed for this subcommand.
    """
    from aggregate_results import generate_benchmark, generate_markdown

    benchmark = generate_benchmark(workspace)

    # Write benchmark.json. ensure_ascii=False keeps non-ASCII eval names
    # (the data contains Chinese) human-readable instead of \uXXXX escapes.
    output_json = workspace / "benchmark.json"
    with open(output_json, "w", encoding="utf-8") as f:
        json.dump(benchmark, f, indent=2, ensure_ascii=False)
    print(f"Generated: {output_json}")

    # Write the human-readable markdown summary.
    output_md = workspace / "benchmark.md"
    with open(output_md, "w", encoding="utf-8") as f:
        f.write(generate_markdown(benchmark))
    print(f"Generated: {output_md}")

    # Per-model pass-rate summary (plain string: no placeholders needed).
    print("\nSummary:")
    for model, summary in benchmark["model_summary"].items():
        pass_rate = summary["pass_rate"]["mean"]
        print(f" {model}: {pass_rate * 100:.1f}% pass rate")


def main() -> None:
    """Dispatch the CLI subcommand.

    Exits with status 1 on a usage error, an unknown command, or a missing
    workspace directory.
    """
    if len(sys.argv) < 2:
        print("Usage: python -m scripts <command> [args]")
        print("Commands:")
        print(" aggregate <workspace> - Aggregate results into benchmark.json")
        print(" test - Run test with sample data")
        sys.exit(1)

    command = sys.argv[1]

    if command == "aggregate":
        if len(sys.argv) < 3:
            print("Usage: python -m scripts aggregate <workspace>")
            sys.exit(1)

        # Validate the path before touching project imports so a bad
        # argument yields a clear error message, not an import failure.
        workspace = Path(sys.argv[2])
        if not workspace.exists():
            print(f"Error: Directory not found: {workspace}")
            sys.exit(1)

        _cmd_aggregate(workspace)

    elif command == "test":
        from test_aggregate import main as test_main
        test_main()

    else:
        print(f"Unknown command: {command}")
        sys.exit(1)


if __name__ == "__main__":
    main()