astron-eval 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +119 -0
  3. package/bin/astron-eval.mjs +111 -0
  4. package/package.json +24 -0
  5. package/skills/astron-eval/SKILL.md +60 -0
  6. package/skills/model-evaluation/SKILL.md +180 -0
  7. package/skills/model-evaluation/assets/dimensions/内容相关性维度.json +20 -0
  8. package/skills/model-evaluation/assets/dimensions/内容精确维度.json +19 -0
  9. package/skills/model-evaluation/assets/dimensions/准确性维度-个性化规划.json +20 -0
  10. package/skills/model-evaluation/assets/dimensions/准确性维度-信息分析.json +20 -0
  11. package/skills/model-evaluation/assets/dimensions/准确性维度-旅游出行.json +20 -0
  12. package/skills/model-evaluation/assets/dimensions/准确性维度.json +20 -0
  13. package/skills/model-evaluation/assets/dimensions/创意性-吸引性维度.json +21 -0
  14. package/skills/model-evaluation/assets/dimensions/创新性维度.json +20 -0
  15. package/skills/model-evaluation/assets/dimensions/完整性维度-信息分析.json +20 -0
  16. package/skills/model-evaluation/assets/dimensions/完整性维度.json +20 -0
  17. package/skills/model-evaluation/assets/dimensions/形式相关性维度.json +20 -0
  18. package/skills/model-evaluation/assets/dimensions/忠诚度维度.json +20 -0
  19. package/skills/model-evaluation/assets/dimensions/指令遵循维度.json +20 -0
  20. package/skills/model-evaluation/assets/dimensions/文本差异度-TER维度.json +20 -0
  21. package/skills/model-evaluation/assets/dimensions/有效性维度-个性化规划.json +20 -0
  22. package/skills/model-evaluation/assets/dimensions/有效性维度-信息分析.json +20 -0
  23. package/skills/model-evaluation/assets/dimensions/有效性维度-流程自动化.json +20 -0
  24. package/skills/model-evaluation/assets/dimensions/有效性维度.json +21 -0
  25. package/skills/model-evaluation/assets/dimensions/核心元素维度.json +20 -0
  26. package/skills/model-evaluation/assets/dimensions/格式遵循维度.json +19 -0
  27. package/skills/model-evaluation/assets/dimensions/特色亮点维度.json +20 -0
  28. package/skills/model-evaluation/assets/dimensions/用例级评测维度模板.json +25 -0
  29. package/skills/model-evaluation/assets/dimensions/相似度-BERTScore维度.json +20 -0
  30. package/skills/model-evaluation/assets/dimensions/相似度-Cosine维度.json +20 -0
  31. package/skills/model-evaluation/assets/dimensions/相似度-ROUGE维度.json +20 -0
  32. package/skills/model-evaluation/assets/dimensions/相关性维度-个性化规划.json +20 -0
  33. package/skills/model-evaluation/assets/dimensions/相关性维度.json +21 -0
  34. package/skills/model-evaluation/assets/dimensions/精确性-BLUE维度.json +20 -0
  35. package/skills/model-evaluation/assets/dimensions/精确性-COMET维度.json +20 -0
  36. package/skills/model-evaluation/assets/dimensions/逻辑合理性维度.json +20 -0
  37. package/skills/model-evaluation/assets/dimensions/逻辑连贯性维度-个性化规划.json +20 -0
  38. package/skills/model-evaluation/assets/dimensions/逻辑连贯性维度-信息分析.json +20 -0
  39. package/skills/model-evaluation/assets/dimensions/逻辑连贯性维度-流程自动化.json +20 -0
  40. package/skills/model-evaluation/assets/dimensions/逻辑连贯性维度.json +21 -0
  41. package/skills/model-evaluation/assets/eval-judge.json +11 -0
  42. package/skills/model-evaluation/assets/experts/business-process-automation.json +71 -0
  43. package/skills/model-evaluation/assets/experts/content-generation.json +75 -0
  44. package/skills/model-evaluation/assets/experts/content-match.json +37 -0
  45. package/skills/model-evaluation/assets/experts/information-analysis.json +87 -0
  46. package/skills/model-evaluation/assets/experts/marketing-digital-human.json +27 -0
  47. package/skills/model-evaluation/assets/experts/personalized-planning.json +87 -0
  48. package/skills/model-evaluation/assets/experts/text-translation.json +103 -0
  49. package/skills/model-evaluation/assets/experts/tourism-travel.json +119 -0
  50. package/skills/model-evaluation/assets/templates/custom-dimension.template.json +30 -0
  51. package/skills/model-evaluation/eval-build.md +281 -0
  52. package/skills/model-evaluation/eval-execute.md +196 -0
  53. package/skills/model-evaluation/eval-init.md +237 -0
  54. package/skills/model-evaluation/processes/dimension-process.md +207 -0
  55. package/skills/model-evaluation/processes/evalset-create-process.md +184 -0
  56. package/skills/model-evaluation/processes/evalset-parse-process.md +171 -0
  57. package/skills/model-evaluation/processes/evalset-supplement-process.md +136 -0
  58. package/skills/model-evaluation/processes/keypoint-process.md +148 -0
  59. package/skills/model-evaluation/processes/python-env-process.md +113 -0
  60. package/skills/model-evaluation/references/中间产物说明.md +340 -0
  61. package/skills/model-evaluation/references/内置模板说明.md +149 -0
  62. package/skills/model-evaluation/references/脚本定义.md +274 -0
  63. package/skills/model-evaluation/references/认证服务接口说明.md +271 -0
  64. package/skills/model-evaluation/references/评测服务接口说明.md +455 -0
  65. package/skills/model-evaluation/references/评测维度说明.md +171 -0
  66. package/skills/model-evaluation/scripts/cfg/eval-auth.cfg +16 -0
  67. package/skills/model-evaluation/scripts/cfg/eval-server.cfg +1 -0
  68. package/skills/model-evaluation/scripts/clients/__init__.py +33 -0
  69. package/skills/model-evaluation/scripts/clients/api_client.py +97 -0
  70. package/skills/model-evaluation/scripts/clients/auth_client.py +96 -0
  71. package/skills/model-evaluation/scripts/clients/http_client.py +199 -0
  72. package/skills/model-evaluation/scripts/clients/oauth_callback.py +397 -0
  73. package/skills/model-evaluation/scripts/clients/token_manager.py +53 -0
  74. package/skills/model-evaluation/scripts/eval_auth.py +588 -0
  75. package/skills/model-evaluation/scripts/eval_dimension.py +240 -0
  76. package/skills/model-evaluation/scripts/eval_set.py +410 -0
  77. package/skills/model-evaluation/scripts/eval_task.py +324 -0
  78. package/skills/model-evaluation/scripts/files/__init__.py +38 -0
  79. package/skills/model-evaluation/scripts/files/file_utils.py +330 -0
  80. package/skills/model-evaluation/scripts/files/streaming.py +245 -0
  81. package/skills/model-evaluation/scripts/utils/__init__.py +128 -0
  82. package/skills/model-evaluation/scripts/utils/constants.py +101 -0
  83. package/skills/model-evaluation/scripts/utils/datetime_utils.py +60 -0
  84. package/skills/model-evaluation/scripts/utils/errors.py +244 -0
  85. package/skills/model-evaluation/scripts/utils/keypoint_prompts.py +73 -0
  86. package/skills/skill-driven-eval/SKILL.md +456 -0
  87. package/skills/skill-driven-eval/agents/grader.md +144 -0
  88. package/skills/skill-driven-eval/eval-viewer/__init__.py +1 -0
  89. package/skills/skill-driven-eval/eval-viewer/generate_report.py +485 -0
  90. package/skills/skill-driven-eval/eval-viewer/viewer.html +767 -0
  91. package/skills/skill-driven-eval/references/schemas.md +282 -0
  92. package/skills/skill-driven-eval/scripts/__init__.py +1 -0
  93. package/skills/skill-driven-eval/scripts/__main__.py +70 -0
  94. package/skills/skill-driven-eval/scripts/aggregate_results.py +681 -0
  95. package/skills/skill-driven-eval/scripts/extract_transcript.py +294 -0
  96. package/skills/skill-driven-eval/scripts/test_aggregate.py +244 -0
@@ -0,0 +1,245 @@
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ 流式文件读取模块
4
+
5
+ 提供大文件的流式读取能力,使用生成器模式逐行处理,
6
+ 避免一次性加载整个文件到内存。
7
+
8
+ 函数:
9
+ load_jsonl_stream: 流式读取 JSONL 文件
10
+ load_csv_stream: 流式读取 CSV 文件
11
+ """
12
+ import json
13
+ import csv
14
+ from pathlib import Path
15
+ from typing import Generator, Dict, Any
16
+
17
+ from utils.constants import ERR_FILE_NOT_FOUND, ERR_FILE_ENCODING, ERR_FILE_PARSE
18
+
19
+
20
+ # ============================================================================
21
+ # 内部迭代器类 - 用于支持 skipped_lines 属性
22
+ # ============================================================================
23
+
24
+ class _ErrorGenerator:
25
+ """
26
+ 错误生成器 - 用于文件不存在等场景
27
+
28
+ yield 一个错误对象后结束,支持 skipped_lines 属性
29
+ """
30
+ def __init__(self, code: int, message: str):
31
+ self._code = code
32
+ self._message = message
33
+ self._yielded = False
34
+ self.skipped_lines = 0
35
+
36
+ def __iter__(self):
37
+ return self
38
+
39
+ def __next__(self):
40
+ if self._yielded:
41
+ raise StopIteration
42
+ self._yielded = True
43
+ return {
44
+ "success": False,
45
+ "message": self._message,
46
+ "code": self._code
47
+ }
48
+
49
+
50
+ class _JsonlStreamIterator:
51
+ """
52
+ JSONL 流式读取迭代器
53
+
54
+ 逐行读取 JSONL 文件,支持 skipped_lines 属性
55
+ """
56
+ def __init__(self, path: Path, encoding: str):
57
+ self._path = path
58
+ self._encoding = encoding
59
+ self._file = None
60
+ self._line_num = 0
61
+ self.skipped_lines = 0
62
+ self._encoding_error = False
63
+
64
+ def __iter__(self):
65
+ return self
66
+
67
+ def __next__(self):
68
+ # 编码错误已在第一次读取时检测
69
+ if self._encoding_error:
70
+ raise StopIteration
71
+
72
+ # 延迟打开文件
73
+ if self._file is None:
74
+ try:
75
+ self._file = open(self._path, 'r', encoding=self._encoding)
76
+ except UnicodeDecodeError:
77
+ self._encoding_error = True
78
+ # 返回编码错误,下次迭代结束
79
+ return {
80
+ "success": False,
81
+ "message": f"无法使用 {self._encoding} 编码读取文件: {self._path}",
82
+ "code": ERR_FILE_ENCODING
83
+ }
84
+
85
+ # 逐行读取
86
+ while True:
87
+ try:
88
+ line = self._file.readline()
89
+ except UnicodeDecodeError:
90
+ # 编码错误可能在读取时发生
91
+ self._encoding_error = True
92
+ return {
93
+ "success": False,
94
+ "message": f"无法使用 {self._encoding} 编码读取文件: {self._path}",
95
+ "code": ERR_FILE_ENCODING
96
+ }
97
+
98
+ if not line:
99
+ self._file.close()
100
+ raise StopIteration
101
+
102
+ self._line_num += 1
103
+ stripped = line.strip()
104
+
105
+ # 跳过空行(D-31)
106
+ if not stripped:
107
+ self.skipped_lines += 1
108
+ continue
109
+
110
+ # 解析 JSON
111
+ try:
112
+ data = json.loads(stripped)
113
+ return {"data": data, "line": self._line_num}
114
+ except json.JSONDecodeError as e:
115
+ # D-29: yield 错误对象,不抛异常
116
+ # D-33: 使用 ERR_FILE_PARSE (1003)
117
+ return {
118
+ "success": False,
119
+ "line": self._line_num,
120
+ "message": f"JSON 解析失败: {e}",
121
+ "code": ERR_FILE_PARSE
122
+ }
123
+
124
+
125
+ class _CsvStreamIterator:
126
+ """
127
+ CSV 流式读取迭代器
128
+
129
+ 逐行读取 CSV 文件,支持 skipped_lines 属性
130
+ """
131
+ def __init__(self, path: Path, encoding: str):
132
+ self._path = path
133
+ self._encoding = encoding
134
+ self._file = None
135
+ self._reader = None
136
+ self.skipped_lines = 0
137
+ self._encoding_error = False
138
+
139
+ def __iter__(self):
140
+ return self
141
+
142
+ def __next__(self):
143
+ # 编码错误已在初始化时检测
144
+ if self._encoding_error:
145
+ raise StopIteration
146
+
147
+ # 延迟打开文件
148
+ if self._file is None:
149
+ try:
150
+ self._file = open(self._path, 'r', encoding=self._encoding, newline='')
151
+ self._reader = csv.DictReader(self._file)
152
+ except UnicodeDecodeError:
153
+ self._encoding_error = True
154
+ return {
155
+ "success": False,
156
+ "message": f"无法使用 {self._encoding} 编码读取文件: {self._path}",
157
+ "code": ERR_FILE_ENCODING
158
+ }
159
+
160
+ # 逐行读取
161
+ while True:
162
+ try:
163
+ row = next(self._reader)
164
+ except StopIteration:
165
+ self._file.close()
166
+ raise
167
+ except UnicodeDecodeError:
168
+ # 编码错误可能在读取时发生
169
+ self._encoding_error = True
170
+ return {
171
+ "success": False,
172
+ "message": f"无法使用 {self._encoding} 编码读取文件: {self._path}",
173
+ "code": ERR_FILE_ENCODING
174
+ }
175
+
176
+ # CSV 行号:header=1,数据从 2 开始
177
+ line_num = self._reader.line_num
178
+
179
+ # 检查是否为空行(所有值为空或 None)
180
+ if not row or all(v is None or v.strip() == '' for v in row.values() if v):
181
+ self.skipped_lines += 1
182
+ continue
183
+
184
+ return {"data": dict(row), "line": line_num}
185
+
186
+
187
def load_jsonl_stream(path: str, encoding: str = "utf-8") -> Generator[Dict[str, Any], None, None]:
    """
    Stream a JSONL file one line at a time.

    Args:
        path: Path of the file to read.
        encoding: Text encoding of the file (defaults to utf-8).

    Yields:
        On success: {"data": <parsed value>, "line": <line number>}
        On a line that is not valid JSON:
            {"success": False, "line": <line number>, "message": "...", "code": <error code>}
        On a file-level failure (missing file, undecodable bytes):
            {"success": False, "message": "...", "code": <error code>} -- no "line" key.

    Attributes:
        skipped_lines (int): available on the returned iterator; the number of
            blank lines skipped so far (always 0 for a missing file).

    Example:
        >>> gen = load_jsonl_stream("data.jsonl")
        >>> for item in gen:
        ...     if item.get("success") is False:
        ...         print(f"Error at line {item.get('line')}: {item['message']}")
        ...     else:
        ...         process(item["data"])
        >>> print(f"Skipped {gen.skipped_lines} empty lines")
    """
    source = Path(path)

    # Existing path: hand back the iterator object (not a plain generator)
    # so that the caller can read skipped_lines from it.
    if source.exists():
        return _JsonlStreamIterator(source, encoding)

    # Missing path: a one-item iterator carrying the error.
    return _ErrorGenerator(ERR_FILE_NOT_FOUND, f"文件不存在: {path}")
219
+
220
+
221
def load_csv_stream(path: str, encoding: str = "utf-8") -> Generator[Dict[str, Any], None, None]:
    """
    Stream a CSV file one row at a time.

    Args:
        path: Path of the file to read.
        encoding: Text encoding of the file (defaults to utf-8).

    Yields:
        On success: {"data": <row as a dict>, "line": <line number>}
        On a file-level failure (missing file, undecodable bytes):
            {"success": False, "message": "...", "code": <error code>}

    Attributes:
        skipped_lines (int): available on the returned iterator; the number of
            empty rows skipped so far (always 0 for a missing file).

    Note:
        The first line of the file is the header, so data rows start at line=2.
    """
    source = Path(path)

    # Existing path: hand back the iterator object (not a plain generator)
    # so that the caller can read skipped_lines from it.
    if source.exists():
        return _CsvStreamIterator(source, encoding)

    # Missing path: a one-item iterator carrying the error.
    return _ErrorGenerator(ERR_FILE_NOT_FOUND, f"文件不存在: {path}")
@@ -0,0 +1,128 @@
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ 基础工具模块
4
+
5
+ 包含通用基础设施:
6
+ - 常量定义
7
+ - 异常类和结果构建器
8
+ - 时间处理工具
9
+ """
10
+
11
+ from .constants import (
12
+ # 超时配置
13
+ DEFAULT_TIMEOUT,
14
+ DEFAULT_TOKEN_EXPIRY,
15
+ DEFAULT_POLL_INTERVAL,
16
+ DEFAULT_POLL_TIMEOUT,
17
+ # 重试配置
18
+ MAX_RETRIES,
19
+ RETRY_BACKOFF_FACTOR,
20
+ # 错误码
21
+ ERR_FILE_NOT_FOUND,
22
+ ERR_FILE_ENCODING,
23
+ ERR_FILE_PARSE,
24
+ ERR_CONFIG_INVALID,
25
+ ERR_NETWORK_TIMEOUT,
26
+ ERR_NETWORK_CONNECTION,
27
+ ERR_NETWORK_RETRY_EXHAUSTED,
28
+ ERR_REMOTE_AUTH_EXPIRED,
29
+ ERR_REMOTE_DEFAULT,
30
+ # 默认路径
31
+ DEFAULT_AUTH_CONFIG,
32
+ DEFAULT_SERVER_CONFIG,
33
+ DEFAULT_AUTH_CACHE,
34
+ # OAuth 配置
35
+ OOB_REDIRECT,
36
+ DEFAULT_CALLBACK_HOST,
37
+ DEFAULT_CALLBACK_PORT,
38
+ DEFAULT_CALLBACK_PATH,
39
+ DEFAULT_CALLBACK_TIMEOUT,
40
+ # 状态
41
+ TERMINAL_STATES,
42
+ # 维度配置
43
+ VALID_DIMENSION_TYPES,
44
+ BUILTIN_FUNCTIONS,
45
+ # 字段映射
46
+ FIELD_PATTERNS,
47
+ REQUIRED_FIELDS,
48
+ OPTIONAL_FIELDS,
49
+ )
50
+
51
+ from .errors import (
52
+ result,
53
+ ResultDict,
54
+ handle_cli_error,
55
+ EvalError,
56
+ FileEncodingError,
57
+ FileParseError,
58
+ FileNotFoundError,
59
+ ConfigError,
60
+ NetworkError,
61
+ NetworkTimeoutError,
62
+ NetworkConnectionError,
63
+ AuthExpiredError,
64
+ ApiError,
65
+ )
66
+
67
+ from .datetime_utils import (
68
+ parse_iso_datetime,
69
+ is_expired,
70
+ )
71
+
72
+ from .keypoint_prompts import (
73
+ SYSTEM_PROMPT,
74
+ build_user_prompt,
75
+ )
76
+
77
__all__ = [
    # Constants: timeouts and polling
    "DEFAULT_TIMEOUT",
    "DEFAULT_TOKEN_EXPIRY",
    "DEFAULT_POLL_INTERVAL",
    "DEFAULT_POLL_TIMEOUT",
    # Constants: retry policy
    "MAX_RETRIES",
    "RETRY_BACKOFF_FACTOR",
    # Constants: error codes
    "ERR_FILE_NOT_FOUND",
    "ERR_FILE_ENCODING",
    "ERR_FILE_PARSE",
    "ERR_CONFIG_INVALID",
    "ERR_NETWORK_TIMEOUT",
    "ERR_NETWORK_CONNECTION",
    "ERR_NETWORK_RETRY_EXHAUSTED",
    "ERR_REMOTE_AUTH_EXPIRED",
    "ERR_REMOTE_DEFAULT",
    # Constants: default paths
    "DEFAULT_AUTH_CONFIG",
    "DEFAULT_SERVER_CONFIG",
    "DEFAULT_AUTH_CACHE",
    # Constants: OAuth
    "OOB_REDIRECT",
    "DEFAULT_CALLBACK_HOST",
    "DEFAULT_CALLBACK_PORT",
    "DEFAULT_CALLBACK_PATH",
    "DEFAULT_CALLBACK_TIMEOUT",
    # Constants: task states, dimensions, field mapping
    "TERMINAL_STATES",
    "VALID_DIMENSION_TYPES",
    "BUILTIN_FUNCTIONS",
    "FIELD_PATTERNS",
    "REQUIRED_FIELDS",
    "OPTIONAL_FIELDS",
    # Error handling
    "result",
    "ResultDict",
    "handle_cli_error",
    "EvalError",
    "FileEncodingError",
    "FileParseError",
    "FileNotFoundError",
    "ConfigError",
    "NetworkError",
    "NetworkTimeoutError",
    "NetworkConnectionError",
    "AuthExpiredError",
    "ApiError",
    # Date/time helpers
    "parse_iso_datetime",
    "is_expired",
    # Key-point generation prompts
    "SYSTEM_PROMPT",
    "build_user_prompt",
]
@@ -0,0 +1,101 @@
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ 常量定义模块
4
+ 集中管理超时、默认值、错误码等
5
+ """
6
+
7
# ============================================================================
# Timeouts (seconds)
# ============================================================================
DEFAULT_TIMEOUT = 30                 # default timeout of one HTTP request
DEFAULT_TOKEN_EXPIRY = 2 * 60 * 60   # default token lifetime (2 hours)
DEFAULT_POLL_INTERVAL = 30           # pause between two task polls
DEFAULT_POLL_TIMEOUT = 60 * 60       # overall task polling budget (1 hour)

# ============================================================================
# Retry policy
# ============================================================================
MAX_RETRIES = 3
RETRY_BACKOFF_FACTOR = 1.0

# ============================================================================
# Error codes
# ============================================================================
# Ranges:
#   - local script errors:   1000-4999
#   - remote service errors: 10000-99999 (passed through unchanged)

# File errors (1000-1999)
ERR_FILE_NOT_FOUND = 1001
ERR_FILE_ENCODING = 1002
ERR_FILE_PARSE = 1003

# Configuration errors (2000-2999)
ERR_CONFIG_INVALID = 2001
ERR_CONFIG_MISSING = 2002

# Network errors (3000-3999)
ERR_NETWORK_TIMEOUT = 3001
ERR_NETWORK_CONNECTION = 3002
ERR_NETWORK_RETRY_EXHAUSTED = 3003

# Data errors (4000-4999)
ERR_DATA_INVALID = 4001
ERR_DATA_MISSING_FIELD = 4002

# Remote service error codes (passed through; listed for reference only)
# Authentication service codes: 10000-19999
ERR_REMOTE_AUTH_EXPIRED = 10002  # token expired
ERR_REMOTE_DEFAULT = 10001       # unknown remote error

# ============================================================================
# Default paths
# ============================================================================
DEFAULT_AUTH_CONFIG = "scripts/cfg/eval-auth.cfg"
DEFAULT_SERVER_CONFIG = "scripts/cfg/eval-server.cfg"
DEFAULT_AUTH_CACHE = "./.eval/auth.json"

# ============================================================================
# OAuth
# ============================================================================
OOB_REDIRECT = "urn:ietf:wg:oauth:2.0:oob"

# OAuth2 callback settings (loopback mode)
DEFAULT_CALLBACK_HOST = "127.0.0.1"
DEFAULT_CALLBACK_PORT = 51943
DEFAULT_CALLBACK_PATH = "/callback"
DEFAULT_CALLBACK_TIMEOUT = 120  # seconds

# ============================================================================
# Task states
# ============================================================================
TERMINAL_STATES = {"Succeeded", "Failed", "Cancelled"}

# ============================================================================
# Dimension configuration
# ============================================================================
VALID_DIMENSION_TYPES = {"llm-score", "llm-judge", "builtin"}
BUILTIN_FUNCTIONS = {"BLEU", "ROUGE", "BERTScore", "COMET", "TER", "Cosine"}

# ============================================================================
# Evaluation-set field mapping
# ============================================================================
# Canonical field name -> column names accepted as that field.
FIELD_PATTERNS = {
    # Required fields
    "question": ["question", "prompt", "input", "query", "问题", "提问", "用户问题"],
    "answer": ["answer", "response", "output", "reply", "回答", "回复", "模型回复"],
    "model": [
        "model", "model_name", "model_id", "llm", "llm_name",
        "模型", "模型名称", "大模型", "大语言模型",
    ],
    "case_id": ["case_id", "caseid", "用例id", "用例ID"],
    # Optional fields
    "system": ["system", "system_prompt", "系统提示", "系统提示词"],
    "context": ["context", "上下文"],
    "category": ["category", "type", "分类", "类别"],
    "reference": ["reference", "ref", "gold", "参考答案", "标准答案"],
    "keypoint": ["keypoint", "keypoints", "关键点", "评测点", "评估点"],
}

# Special case: a column named 'id' maps to case_id only on an exact match,
# so that names such as seq_id or user_id are not mistaken for it.
CASE_ID_EXACT_MATCH = ["id"]

REQUIRED_FIELDS = ["question", "answer", "model", "case_id"]
OPTIONAL_FIELDS = ["system", "context", "category", "reference", "keypoint"]
@@ -0,0 +1,60 @@
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ 时间处理工具函数
4
+ 提供 ISO 格式时间解析和过期检查功能
5
+ """
6
+ from datetime import datetime
7
+ from typing import Optional
8
+
9
+
10
def parse_iso_datetime(dt_str: str) -> Optional[datetime]:
    """
    Parse an ISO-format timestamp, with or without a UTC offset.

    Args:
        dt_str: ISO-format date/time string. A trailing ``Z`` is read as UTC.

    Returns:
        The parsed datetime (timezone-aware when the input carries an offset
        or ``Z``, naive otherwise), or None when the input is not a string or
        cannot be parsed.

    Examples:
        >>> parse_iso_datetime("2024-01-15T10:30:00")
        datetime(2024, 1, 15, 10, 30, 0)
        >>> parse_iso_datetime("2024-01-15T10:30:00Z")
        datetime(2024, 1, 15, 10, 30, 0, tzinfo=timezone.utc)
        >>> parse_iso_datetime("2024-01-15T10:30:00+08:00")
        datetime(2024, 1, 15, 10, 30, 0, tzinfo=timezone(timedelta(hours=8)))
    """
    # A missing or non-string value (e.g. None read from a cache file) is a
    # parse failure, not a crash.
    if not isinstance(dt_str, str):
        return None

    # datetime.fromisoformat only understands the "Z" suffix from Python 3.11
    # on, so rewrite it as an explicit UTC offset. Only the suffix is touched.
    text = dt_str[:-1] + '+00:00' if dt_str.endswith('Z') else dt_str

    try:
        return datetime.fromisoformat(text)
    except ValueError:
        return None
35
+
36
+
37
def is_expired(expires_at: str) -> bool:
    """
    Tell whether an expiry timestamp lies in the past.

    Args:
        expires_at: Expiry time as an ISO-format string.

    Returns:
        True when the time has been reached or the string cannot be parsed;
        False while the expiry time is still in the future.

    Examples:
        >>> is_expired("2020-01-01T00:00:00")  # in the past
        True
        >>> is_expired("2099-12-31T23:59:59")  # in the future
        False
    """
    deadline = parse_iso_datetime(expires_at)

    # An unreadable expiry is treated as already expired.
    if deadline is None:
        return True

    # Compare like with like: a timezone-aware deadline needs an aware "now",
    # a naive deadline is compared against naive local time.
    current = datetime.now()
    if deadline.tzinfo:
        current = current.astimezone()
    return current >= deadline