code-abyss 1.6.16 → 1.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +2 -2
- package/skills/SKILL.md +24 -16
- package/skills/domains/ai/SKILL.md +2 -2
- package/skills/domains/ai/prompt-and-eval.md +279 -0
- package/skills/domains/architecture/SKILL.md +2 -3
- package/skills/domains/architecture/security-arch.md +87 -0
- package/skills/domains/data-engineering/SKILL.md +188 -26
- package/skills/domains/development/SKILL.md +1 -4
- package/skills/domains/devops/SKILL.md +3 -5
- package/skills/domains/devops/performance.md +63 -0
- package/skills/domains/devops/testing.md +97 -0
- package/skills/domains/frontend-design/SKILL.md +12 -3
- package/skills/domains/frontend-design/claymorphism/SKILL.md +117 -0
- package/skills/domains/frontend-design/claymorphism/references/tokens.css +52 -0
- package/skills/domains/frontend-design/engineering.md +287 -0
- package/skills/domains/frontend-design/glassmorphism/SKILL.md +138 -0
- package/skills/domains/frontend-design/glassmorphism/references/tokens.css +32 -0
- package/skills/domains/frontend-design/liquid-glass/SKILL.md +135 -0
- package/skills/domains/frontend-design/liquid-glass/references/tokens.css +81 -0
- package/skills/domains/frontend-design/neubrutalism/SKILL.md +141 -0
- package/skills/domains/frontend-design/neubrutalism/references/tokens.css +44 -0
- package/skills/domains/infrastructure/SKILL.md +174 -34
- package/skills/domains/mobile/SKILL.md +211 -21
- package/skills/domains/orchestration/SKILL.md +1 -0
- package/skills/domains/security/SKILL.md +4 -6
- package/skills/domains/security/blue-team.md +57 -0
- package/skills/domains/security/red-team.md +54 -0
- package/skills/domains/security/threat-intel.md +50 -0
- package/skills/orchestration/multi-agent/SKILL.md +195 -46
- package/skills/run_skill.js +134 -0
- package/skills/tools/gen-docs/SKILL.md +6 -4
- package/skills/tools/gen-docs/scripts/doc_generator.js +349 -0
- package/skills/tools/verify-change/SKILL.md +8 -6
- package/skills/tools/verify-change/scripts/change_analyzer.js +270 -0
- package/skills/tools/verify-module/SKILL.md +6 -4
- package/skills/tools/verify-module/scripts/module_scanner.js +145 -0
- package/skills/tools/verify-quality/SKILL.md +5 -3
- package/skills/tools/verify-quality/scripts/quality_checker.js +276 -0
- package/skills/tools/verify-security/SKILL.md +7 -5
- package/skills/tools/verify-security/scripts/security_scanner.js +133 -0
- package/skills/__pycache__/run_skill.cpython-312.pyc +0 -0
- package/skills/domains/COVERAGE_PLAN.md +0 -232
- package/skills/domains/ai/model-evaluation.md +0 -790
- package/skills/domains/ai/prompt-engineering.md +0 -703
- package/skills/domains/architecture/compliance.md +0 -299
- package/skills/domains/architecture/data-security.md +0 -184
- package/skills/domains/data-engineering/data-pipeline.md +0 -762
- package/skills/domains/data-engineering/data-quality.md +0 -894
- package/skills/domains/data-engineering/stream-processing.md +0 -791
- package/skills/domains/development/dart.md +0 -963
- package/skills/domains/development/kotlin.md +0 -834
- package/skills/domains/development/php.md +0 -659
- package/skills/domains/development/swift.md +0 -755
- package/skills/domains/devops/e2e-testing.md +0 -914
- package/skills/domains/devops/performance-testing.md +0 -734
- package/skills/domains/devops/testing-strategy.md +0 -667
- package/skills/domains/frontend-design/build-tools.md +0 -743
- package/skills/domains/frontend-design/performance.md +0 -734
- package/skills/domains/frontend-design/testing.md +0 -699
- package/skills/domains/infrastructure/gitops.md +0 -735
- package/skills/domains/infrastructure/iac.md +0 -855
- package/skills/domains/infrastructure/kubernetes.md +0 -1018
- package/skills/domains/mobile/android-dev.md +0 -979
- package/skills/domains/mobile/cross-platform.md +0 -795
- package/skills/domains/mobile/ios-dev.md +0 -931
- package/skills/domains/security/secrets-management.md +0 -834
- package/skills/domains/security/supply-chain.md +0 -931
- package/skills/domains/security/threat-modeling.md +0 -828
- package/skills/run_skill.py +0 -153
- package/skills/tests/README.md +0 -225
- package/skills/tests/SUMMARY.md +0 -362
- package/skills/tests/__init__.py +0 -3
- package/skills/tests/__pycache__/test_change_analyzer.cpython-312.pyc +0 -0
- package/skills/tests/__pycache__/test_doc_generator.cpython-312.pyc +0 -0
- package/skills/tests/__pycache__/test_module_scanner.cpython-312.pyc +0 -0
- package/skills/tests/__pycache__/test_quality_checker.cpython-312.pyc +0 -0
- package/skills/tests/__pycache__/test_security_scanner.cpython-312.pyc +0 -0
- package/skills/tests/test_change_analyzer.py +0 -558
- package/skills/tests/test_doc_generator.py +0 -538
- package/skills/tests/test_module_scanner.py +0 -376
- package/skills/tests/test_quality_checker.py +0 -516
- package/skills/tests/test_security_scanner.py +0 -426
- package/skills/tools/gen-docs/scripts/__pycache__/doc_generator.cpython-312.pyc +0 -0
- package/skills/tools/gen-docs/scripts/doc_generator.py +0 -520
- package/skills/tools/verify-change/scripts/__pycache__/change_analyzer.cpython-312.pyc +0 -0
- package/skills/tools/verify-change/scripts/change_analyzer.py +0 -529
- package/skills/tools/verify-module/scripts/__pycache__/module_scanner.cpython-312.pyc +0 -0
- package/skills/tools/verify-module/scripts/module_scanner.py +0 -321
- package/skills/tools/verify-quality/scripts/__pycache__/quality_checker.cpython-312.pyc +0 -0
- package/skills/tools/verify-quality/scripts/quality_checker.py +0 -481
- package/skills/tools/verify-security/scripts/__pycache__/security_scanner.cpython-312.pyc +0 -0
- package/skills/tools/verify-security/scripts/security_scanner.py +0 -374
|
@@ -1,790 +0,0 @@
|
|
|
1
|
-
---
|
|
2
|
-
name: model-evaluation
|
|
3
|
-
description: AI 模型评估技术。RAGAS、LLM-as-Judge、评估指标、基准测试、A/B 测试。当用户提到模型评估、RAGAS、LLM-as-Judge、基准测试、评估指标、模型对比时使用。
|
|
4
|
-
---
|
|
5
|
-
|
|
6
|
-
# 📊 天机秘典 · 模型评估 (Model Evaluation)
|
|
7
|
-
|
|
8
|
-
## 评估体系
|
|
9
|
-
|
|
10
|
-
```
|
|
11
|
-
离线评估 → 在线评估 → 持续监控
|
|
12
|
-
│ │ │
|
|
13
|
-
├─ 基准测试 ├─ A/B 测试 ├─ 指标追踪
|
|
14
|
-
├─ 人工评估 ├─ 用户反馈 ├─ 异常检测
|
|
15
|
-
└─ 自动评估 └─ 实时分析 └─ 质量报告
|
|
16
|
-
```
|
|
17
|
-
|
|
18
|
-
### 评估维度
|
|
19
|
-
| 维度 | 指标 | 适用场景 |
|
|
20
|
-
|------|------|----------|
|
|
21
|
-
| 准确性 | Accuracy, F1, Precision, Recall | 分类、NER |
|
|
22
|
-
| 相关性 | Relevance, Context Precision | RAG、检索 |
|
|
23
|
-
| 忠实性 | Faithfulness, Hallucination Rate | 生成任务 |
|
|
24
|
-
| 连贯性 | Coherence, Fluency | 文本生成 |
|
|
25
|
-
| 效率 | Latency, Throughput, Cost | 生产部署 |
|
|
26
|
-
|
|
27
|
-
## RAGAS 框架
|
|
28
|
-
|
|
29
|
-
### 核心指标
|
|
30
|
-
```python
|
|
31
|
-
from ragas import evaluate
|
|
32
|
-
from ragas.metrics import (
|
|
33
|
-
faithfulness,
|
|
34
|
-
answer_relevancy,
|
|
35
|
-
context_precision,
|
|
36
|
-
context_recall,
|
|
37
|
-
)
|
|
38
|
-
from datasets import Dataset
|
|
39
|
-
|
|
40
|
-
# 准备评估数据
|
|
41
|
-
data = {
|
|
42
|
-
"question": ["什么是 RAG?", "如何优化检索?"],
|
|
43
|
-
"answer": ["RAG 是检索增强生成...", "可以使用混合检索..."],
|
|
44
|
-
"contexts": [
|
|
45
|
-
["RAG 结合了检索和生成...", "向量数据库用于存储..."],
|
|
46
|
-
["混合检索结合向量和关键词...", "重排可以提升相关性..."]
|
|
47
|
-
],
|
|
48
|
-
"ground_truth": ["RAG 是一种结合检索和生成的技术", "使用混合检索和重排"]
|
|
49
|
-
}
|
|
50
|
-
|
|
51
|
-
dataset = Dataset.from_dict(data)
|
|
52
|
-
|
|
53
|
-
# 评估
|
|
54
|
-
result = evaluate(
|
|
55
|
-
dataset,
|
|
56
|
-
metrics=[
|
|
57
|
-
faithfulness,
|
|
58
|
-
answer_relevancy,
|
|
59
|
-
context_precision,
|
|
60
|
-
context_recall,
|
|
61
|
-
],
|
|
62
|
-
)
|
|
63
|
-
|
|
64
|
-
print(result)
|
|
65
|
-
```
|
|
66
|
-
|
|
67
|
-
### Faithfulness(忠实性)
|
|
68
|
-
```python
|
|
69
|
-
from ragas.metrics import faithfulness
|
|
70
|
-
|
|
71
|
-
# 衡量答案是否基于提供的上下文
|
|
72
|
-
# 分数 0-1,越高越好
|
|
73
|
-
|
|
74
|
-
# 示例
|
|
75
|
-
question = "Python 的创始人是谁?"
|
|
76
|
-
context = ["Python 由 Guido van Rossum 在 1991 年创建"]
|
|
77
|
-
answer = "Python 由 Guido van Rossum 创建" # 高忠实性
|
|
78
|
-
|
|
79
|
-
# 低忠实性示例
|
|
80
|
-
bad_answer = "Python 由 James Gosling 创建" # 编造信息
|
|
81
|
-
|
|
82
|
-
score = faithfulness.score(
|
|
83
|
-
question=question,
|
|
84
|
-
answer=answer,
|
|
85
|
-
contexts=context
|
|
86
|
-
)
|
|
87
|
-
```
|
|
88
|
-
|
|
89
|
-
### Answer Relevancy(答案相关性)
|
|
90
|
-
```python
|
|
91
|
-
from ragas.metrics import answer_relevancy
|
|
92
|
-
|
|
93
|
-
# 衡量答案与问题的相关程度
|
|
94
|
-
# 分数 0-1,越高越好
|
|
95
|
-
|
|
96
|
-
question = "如何防御 SQL 注入?"
|
|
97
|
-
answer = "使用参数化查询和 ORM 框架可以有效防御 SQL 注入" # 高相关性
|
|
98
|
-
bad_answer = "SQL 是一种数据库查询语言" # 低相关性
|
|
99
|
-
|
|
100
|
-
score = answer_relevancy.score(
|
|
101
|
-
question=question,
|
|
102
|
-
answer=answer
|
|
103
|
-
)
|
|
104
|
-
```
|
|
105
|
-
|
|
106
|
-
### Context Precision(上下文精确度)
|
|
107
|
-
```python
|
|
108
|
-
from ragas.metrics import context_precision
|
|
109
|
-
|
|
110
|
-
# 衡量检索到的上下文中相关信息的比例
|
|
111
|
-
# 分数 0-1,越高越好(相关文档排在前面)
|
|
112
|
-
|
|
113
|
-
question = "什么是向量数据库?"
|
|
114
|
-
contexts = [
|
|
115
|
-
"向量数据库用于存储和检索高维向量", # 相关
|
|
116
|
-
"Pinecone 是一个向量数据库", # 相关
|
|
117
|
-
"Python 是一种编程语言", # 不相关
|
|
118
|
-
]
|
|
119
|
-
ground_truth = "向量数据库是专门用于存储和检索向量的数据库"
|
|
120
|
-
|
|
121
|
-
score = context_precision.score(
|
|
122
|
-
question=question,
|
|
123
|
-
contexts=contexts,
|
|
124
|
-
ground_truth=ground_truth
|
|
125
|
-
)
|
|
126
|
-
```
|
|
127
|
-
|
|
128
|
-
### Context Recall(上下文召回率)
|
|
129
|
-
```python
|
|
130
|
-
from ragas.metrics import context_recall
|
|
131
|
-
|
|
132
|
-
# 衡量检索到的上下文是否包含回答问题所需的所有信息
|
|
133
|
-
# 分数 0-1,越高越好
|
|
134
|
-
|
|
135
|
-
question = "RAG 的优势是什么?"
|
|
136
|
-
contexts = [
|
|
137
|
-
"RAG 可以减少幻觉",
|
|
138
|
-
"RAG 可以使用最新信息"
|
|
139
|
-
]
|
|
140
|
-
ground_truth = "RAG 的优势包括减少幻觉、使用最新信息、可解释性强"
|
|
141
|
-
|
|
142
|
-
# 召回率 = 2/3 = 0.67(缺少"可解释性强")
|
|
143
|
-
```
|
|
144
|
-
|
|
145
|
-
### 完整 RAG 评估流程
|
|
146
|
-
```python
|
|
147
|
-
from langchain.chains import RetrievalQA
|
|
148
|
-
from ragas import evaluate
|
|
149
|
-
from ragas.metrics import faithfulness, answer_relevancy
|
|
150
|
-
|
|
151
|
-
class RAGEvaluator:
    """Run a QA chain over a test set and score the outputs with RAGAS.

    ``qa_chain`` is called with ``{"query": ...}`` and its response is read
    through the ``"result"`` and ``"source_documents"`` keys.
    NOTE(review): presumably the chain must be configured to return source
    documents; confirm against the chain construction, which is not shown.

    ``test_dataset`` is an iterable of mappings with ``"question"`` and
    ``"ground_truth"`` keys.
    """

    # Column order of the dataset handed to RAGAS.
    _COLUMNS = ("question", "answer", "contexts", "ground_truth")

    def __init__(self, qa_chain, test_dataset):
        self.qa_chain = qa_chain
        self.test_dataset = test_dataset

    def run_evaluation(self):
        """Answer every test question, then return the RAGAS scores."""
        # Imported locally: this snippet's header imports RetrievalQA and
        # ragas names only, so ``Dataset`` would otherwise be undefined.
        from datasets import Dataset

        records = [self._answer(item) for item in self.test_dataset]
        dataset = Dataset.from_dict(
            {column: [record[column] for record in records] for column in self._COLUMNS}
        )
        return evaluate(dataset, metrics=[faithfulness, answer_relevancy])

    def _answer(self, item):
        """Run the chain for one test item and collect the fields RAGAS needs."""
        response = self.qa_chain({"query": item["question"]})
        return {
            "question": item["question"],
            "answer": response["result"],
            "contexts": [doc.page_content for doc in response["source_documents"]],
            "ground_truth": item["ground_truth"],
        }
|
|
186
|
-
|
|
187
|
-
# 使用
|
|
188
|
-
evaluator = RAGEvaluator(qa_chain, test_data)
|
|
189
|
-
scores = evaluator.run_evaluation()
|
|
190
|
-
print(f"Faithfulness: {scores['faithfulness']:.3f}")
|
|
191
|
-
print(f"Answer Relevancy: {scores['answer_relevancy']:.3f}")
|
|
192
|
-
```
|
|
193
|
-
|
|
194
|
-
## LLM-as-Judge
|
|
195
|
-
|
|
196
|
-
### 基础评估器
|
|
197
|
-
```python
|
|
198
|
-
from langchain.chat_models import ChatOpenAI
|
|
199
|
-
from langchain.prompts import ChatPromptTemplate
|
|
200
|
-
|
|
201
|
-
class LLMJudge:
    """Score a single answer with an LLM acting as the judge.

    The judge is asked for 1-5 scores on accuracy, completeness, clarity and
    relevance, plus an overall score and free-text feedback, as a JSON object.
    """

    # Prompt sent to the judge. Doubled braces are literal braces in the
    # prompt-template syntax; single-brace names are filled in per call.
    PROMPT_TEMPLATE = """
你是一位专业的评估专家。请评估以下答案的质量。

问题: {question}

答案: {answer}

评估标准: {criteria}

请从以下维度评分(1-5 分):
1. 准确性: 信息是否正确
2. 完整性: 是否充分回答问题
3. 清晰度: 表达是否清晰易懂
4. 相关性: 是否切题

输出格式:
{{
"accuracy": <分数>,
"completeness": <分数>,
"clarity": <分数>,
"relevance": <分数>,
"overall": <总分>,
"feedback": "<详细反馈>"
}}
"""

    def __init__(self, model="gpt-4"):
        # temperature=0 asks for the most repeatable scoring the backend allows.
        self.llm = ChatOpenAI(model=model, temperature=0)

    def evaluate_answer(self, question: str, answer: str, criteria: str):
        """Return the judge's verdict for *answer* as a parsed JSON object.

        Raises:
            json.JSONDecodeError: if the reply contains no parseable JSON.
        """
        prompt = ChatPromptTemplate.from_template(self.PROMPT_TEMPLATE)
        chain = prompt | self.llm
        result = chain.invoke(
            {"question": question, "answer": answer, "criteria": criteria}
        )
        return self._parse_reply(result.content)

    @staticmethod
    def _parse_reply(text: str):
        """Parse the JSON object embedded in a judge reply.

        Models often wrap JSON in a Markdown code fence or add a sentence
        around it, so only the outermost ``{...}`` span is parsed. When no
        such span exists the raw text is parsed, which raises the usual
        ``json.JSONDecodeError``.
        """
        # Imported locally: the snippet never imports json at file level.
        import json

        start, end = text.find("{"), text.rfind("}")
        payload = text[start:end + 1] if 0 <= start < end else text
        return json.loads(payload)
|
|
240
|
-
|
|
241
|
-
# 使用
|
|
242
|
-
judge = LLMJudge()
|
|
243
|
-
score = judge.evaluate_answer(
|
|
244
|
-
question="什么是 RAG?",
|
|
245
|
-
answer="RAG 是检索增强生成技术...",
|
|
246
|
-
criteria="技术准确性和清晰度"
|
|
247
|
-
)
|
|
248
|
-
```
|
|
249
|
-
|
|
250
|
-
### 成对比较
|
|
251
|
-
```python
|
|
252
|
-
# Prompt for a head-to-head comparison; doubled braces are literal braces.
_PAIRWISE_PROMPT = """
问题: {question}

答案 A: {answer_a}

答案 B: {answer_b}

请比较两个答案的质量,从以下维度评估:
1. 准确性
2. 完整性
3. 清晰度

选择更好的答案(A 或 B),并说明理由。

输出格式:
{{
"winner": "A" or "B",
"reason": "<理由>",
"confidence": <0-1>
}}
"""


def pairwise_comparison(question: str, answer_a: str, answer_b: str, model=None):
    """Ask an LLM which of two answers is better.

    Args:
        question: The question both answers respond to.
        answer_a: Candidate shown to the judge as "A".
        answer_b: Candidate shown to the judge as "B".
        model: Object exposing ``predict(prompt) -> str``. When omitted, the
            module-level ``llm`` is used, as in the original snippet.

    Returns:
        The parsed JSON verdict with ``winner``, ``reason`` and ``confidence``.

    Raises:
        json.JSONDecodeError: if the reply contains no parseable JSON.
    """
    # Imported locally: the snippet never imports json at file level.
    import json

    judge = llm if model is None else model
    reply = judge.predict(
        _PAIRWISE_PROMPT.format(
            question=question, answer_a=answer_a, answer_b=answer_b
        )
    )
    # Tolerate a Markdown code fence or surrounding prose around the JSON.
    start, end = reply.find("{"), reply.rfind("}")
    payload = reply[start:end + 1] if 0 <= start < end else reply
    return json.loads(payload)
|
|
277
|
-
|
|
278
|
-
# ELO 排名系统
|
|
279
|
-
class ELORanking:
    """Elo rating table updated from pairwise match results.

    Args:
        k: Step size; the most a single match can move a rating.
        initial_rating: Rating given to a model the first time it is seen.
    """

    def __init__(self, k=32, initial_rating=1500):
        self.k = k
        self.initial_rating = initial_rating
        self.ratings = {}

    def update_ratings(self, model_a: str, model_b: str, winner: str):
        """Apply one match result to both models' ratings.

        ``winner`` equal to ``model_a`` or ``model_b`` is a win for that
        model; any other value (for example ``None``) is scored as a draw.
        """
        rating_a = self.ratings.get(model_a, self.initial_rating)
        rating_b = self.ratings.get(model_b, self.initial_rating)

        # Expected scores from the logistic Elo curve (400-point scale).
        expected_a = 1 / (1 + 10 ** ((rating_b - rating_a) / 400))
        expected_b = 1 / (1 + 10 ** ((rating_a - rating_b) / 400))

        if winner == model_a:
            actual_a, actual_b = 1, 0
        elif winner == model_b:
            actual_a, actual_b = 0, 1
        else:
            actual_a, actual_b = 0.5, 0.5

        self.ratings[model_a] = rating_a + self.k * (actual_a - expected_a)
        self.ratings[model_b] = rating_b + self.k * (actual_b - expected_b)

    def get_leaderboard(self):
        """Return ``(model, rating)`` pairs sorted from best to worst."""
        return sorted(self.ratings.items(), key=lambda entry: entry[1], reverse=True)
|
|
303
|
-
```
|
|
304
|
-
|
|
305
|
-
### 多维度评估
|
|
306
|
-
```python
|
|
307
|
-
# Dimension name -> description shown to the judge.
EVALUATION_DIMENSIONS = {
    "correctness": "答案是否准确无误",
    "completeness": "是否完整回答了问题",
    "conciseness": "表达是否简洁",
    "relevance": "是否切题",
    "helpfulness": "对用户是否有帮助",
    "safety": "是否安全无害",
}

# Contribution of each dimension to the total score; the weights sum to 1.
DIMENSION_WEIGHTS = {
    "correctness": 0.3,
    "completeness": 0.2,
    "conciseness": 0.1,
    "relevance": 0.2,
    "helpfulness": 0.15,
    "safety": 0.05,
}

# Per-dimension judge prompt; doubled braces are literal braces.
_DIMENSION_PROMPT = """
评估维度: {dimension} - {description}

问题: {question}
答案: {answer}

请对该维度评分(1-5 分)并说明理由。

输出格式:
{{
"score": <分数>,
"reason": "<理由>"
}}
"""


def multi_dimensional_eval(question: str, answer: str, model=None):
    """Score an answer on every dimension and add a weighted total.

    One judge call is made per entry of ``EVALUATION_DIMENSIONS``.

    Args:
        question: The question that was asked.
        answer: The answer to evaluate.
        model: Object exposing ``predict(prompt) -> str``. When omitted, the
            module-level ``llm`` is used, as in the original snippet.

    Returns:
        A dict mapping each dimension to the judge's ``{"score", "reason"}``
        object, plus ``"total_score"`` holding the weighted sum of scores.

    Raises:
        json.JSONDecodeError: if a reply contains no parseable JSON.
    """
    # Imported locally: the snippet never imports json at file level.
    import json

    judge = llm if model is None else model
    results = {}
    for dimension, description in EVALUATION_DIMENSIONS.items():
        reply = judge.predict(
            _DIMENSION_PROMPT.format(
                dimension=dimension,
                description=description,
                question=question,
                answer=answer,
            )
        )
        # Tolerate a Markdown code fence or surrounding prose around the JSON.
        start, end = reply.find("{"), reply.rfind("}")
        payload = reply[start:end + 1] if 0 <= start < end else reply
        results[dimension] = json.loads(payload)

    results["total_score"] = sum(
        results[dimension]["score"] * DIMENSION_WEIGHTS[dimension]
        for dimension in EVALUATION_DIMENSIONS
    )
    return results
|
|
354
|
-
```
|
|
355
|
-
|
|
356
|
-
## 基准测试
|
|
357
|
-
|
|
358
|
-
### MMLU(大规模多任务语言理解)
|
|
359
|
-
```python
|
|
360
|
-
from datasets import load_dataset
|
|
361
|
-
|
|
362
|
-
# 加载 MMLU 数据集
|
|
363
|
-
dataset = load_dataset("cais/mmlu", "all")
|
|
364
|
-
|
|
365
|
-
# Multiple-choice prompt; ``choices`` is indexed directly by the template.
_MMLU_PROMPT = """
问题: {question}

选项:
A. {choices[0]}
B. {choices[1]}
C. {choices[2]}
D. {choices[3]}

请选择正确答案(仅输出 A/B/C/D):
"""


def _first_choice_letter(reply):
    """Return the first standalone A-D letter in *reply*, or None."""
    import re

    pattern = r"(?<![A-Za-z])([ABCD])(?![A-Za-z])"
    # Fall back to the upper-cased reply so a bare "a" still counts as "A".
    match = re.search(pattern, reply) or re.search(pattern, reply.upper())
    return match.group(1) if match else None


def evaluate_mmlu(model, subject="all", num_samples=100, examples=None):
    """Measure multiple-choice accuracy on MMLU-style items.

    Args:
        model: Object exposing ``predict(prompt) -> str``.
        subject: ``"all"`` to use every item, otherwise only items whose
            ``"subject"`` field equals this value are evaluated.
        num_samples: Maximum number of items to evaluate.
        examples: Iterable of item dicts with ``"question"``, ``"choices"``
            (four options) and ``"answer"`` (index 0-3). Defaults to the
            test split of the module-level ``dataset``.

    Returns:
        Fraction of evaluated items answered correctly; ``0.0`` when no
        item was evaluated.
    """
    from itertools import islice

    rows = dataset["test"] if examples is None else examples
    if subject != "all":
        rows = (row for row in rows if row.get("subject") == subject)

    correct = 0
    total = 0
    # islice stops early instead of raising when fewer items are available.
    for item in islice(rows, num_samples):
        prompt = _MMLU_PROMPT.format(
            question=item["question"], choices=item["choices"]
        )
        reply = model.predict(prompt).strip()
        if _first_choice_letter(reply) == "ABCD"[item["answer"]]:
            correct += 1
        total += 1

    return correct / total if total else 0.0
|
|
395
|
-
|
|
396
|
-
# 使用
|
|
397
|
-
accuracy = evaluate_mmlu(llm, num_samples=100)
|
|
398
|
-
print(f"MMLU Accuracy: {accuracy:.2%}")
|
|
399
|
-
```
|
|
400
|
-
|
|
401
|
-
### HumanEval(代码生成)
|
|
402
|
-
```python
|
|
403
|
-
from human_eval.data import read_problems
|
|
404
|
-
from human_eval.evaluation import evaluate_functional_correctness
|
|
405
|
-
|
|
406
|
-
def evaluate_code_generation(model):
    """Generate one completion per HumanEval problem and score them.

    Completions are written to ``samples.jsonl`` in the current directory,
    one JSON object per line, and that file is passed to
    ``evaluate_functional_correctness``.

    Args:
        model: Object exposing ``predict(prompt) -> str``.

    Returns:
        Whatever ``evaluate_functional_correctness`` returns for the file.
    """
    # Imported locally: the snippet never imports json at file level.
    import json

    problems = read_problems()

    samples = []
    for task_id, problem in problems.items():
        prompt = problem["prompt"]
        completion = model.predict(f"完成以下 Python 函数:\n\n{prompt}")
        samples.append({"task_id": task_id, "completion": completion})

    # Explicit encoding so the file does not depend on the platform default.
    with open("samples.jsonl", "w", encoding="utf-8") as handle:
        handle.writelines(json.dumps(sample) + "\n" for sample in samples)

    return evaluate_functional_correctness("samples.jsonl")
|
|
429
|
-
|
|
430
|
-
# Pass@k 指标
|
|
431
|
-
# Pass@1: 生成 1 次代码的通过率
|
|
432
|
-
# Pass@10: 生成 10 次代码中至少 1 次通过的概率
|
|
433
|
-
```
|
|
434
|
-
|
|
435
|
-
### GSM8K(数学推理)
|
|
436
|
-
```python
|
|
437
|
-
from datasets import load_dataset
|
|
438
|
-
|
|
439
|
-
dataset = load_dataset("gsm8k", "main")
|
|
440
|
-
|
|
441
|
-
# Chain-of-thought prompt for a single word problem.
_GSM8K_PROMPT = """
问题: {question}

让我们一步步思考:
"""


def _last_number(text):
    """Return the last numeric literal in *text* as a float, or None."""
    import re

    found = re.findall(r"-?\d[\d,]*(?:\.\d+)?", text)
    if not found:
        return None
    # Thousands separators ("1,000") are dropped before conversion.
    return float(found[-1].replace(",", ""))


def evaluate_gsm8k(model, num_samples=100, examples=None):
    """Measure final-answer accuracy on GSM8K-style items.

    The gold answer is the integer after ``####`` in each item's
    ``"answer"`` field. The model's answer is taken to be the last number
    in its reply; a reply without any number counts as wrong.

    Args:
        model: Object exposing ``predict(prompt) -> str``.
        num_samples: Maximum number of items to evaluate.
        examples: Iterable of item dicts with ``"question"`` and
            ``"answer"``. Defaults to the test split of the module-level
            ``dataset``.

    Returns:
        Fraction of evaluated items answered correctly; ``0.0`` when no
        item was evaluated.

    Raises:
        IndexError, ValueError: if a gold answer lacks a ``#### <integer>``
            suffix.
    """
    from itertools import islice

    rows = dataset["test"] if examples is None else examples

    correct = 0
    total = 0
    for item in islice(rows, num_samples):
        gold = int(item["answer"].split("####")[1].strip().replace(",", ""))
        reply = model.predict(_GSM8K_PROMPT.format(question=item["question"]))

        # The original called an undefined extract_number() inside a bare
        # except, so the NameError was swallowed and nothing ever scored.
        if _last_number(reply) == gold:
            correct += 1
        total += 1

    # Divide by the items actually evaluated, not by the requested count.
    return correct / total if total else 0.0
|
|
470
|
-
```
|
|
471
|
-
|
|
472
|
-
### 自定义基准测试
|
|
473
|
-
```python
|
|
474
|
-
class CustomBenchmark:
    """Run a model over custom test cases and grade it with an LLM judge.

    Each test case is a mapping with ``"input"``, ``"expected_output"`` and
    ``"criteria"`` keys.

    Args:
        test_cases: The cases to run.
        judge: Object exposing ``evaluate_answer(question, answer, criteria)``
            that returns a dict with 1-5 ``accuracy``, ``completeness``,
            ``clarity`` and ``relevance`` scores. Defaults to ``LLMJudge()``,
            created on first use.
    """

    # Judge dimensions averaged into one score, and their maximum value.
    _SCORE_KEYS = ("accuracy", "completeness", "clarity", "relevance")
    _MAX_DIMENSION_SCORE = 5

    def __init__(self, test_cases: list, judge=None):
        self.test_cases = test_cases
        self.judge = judge

    def run(self, model):
        """Evaluate *model* on every case and return the aggregate report."""
        # Imported locally: the snippet never imports time at file level.
        import time

        results = []
        for case in self.test_cases:
            # perf_counter is monotonic, so clock changes cannot skew latency.
            started = time.perf_counter()
            response = model.predict(case["input"])
            latency = time.perf_counter() - started

            score = self._evaluate(
                response, case["expected_output"], case["criteria"]
            )
            results.append(
                {
                    "input": case["input"],
                    "output": response,
                    "expected": case["expected_output"],
                    "score": score,
                    "latency": latency,
                }
            )

        return self._aggregate_results(results)

    def _evaluate(self, output, expected, criteria):
        """Return a 0-1 score for *output* from the LLM judge.

        The original called ``judge.evaluate(...)``, which ``LLMJudge`` does
        not define. This uses ``evaluate_answer`` instead, appends the
        expected output to the criteria as the reference answer, and averages
        the four dimension scores onto a 0-1 scale so the 0.8 pass threshold
        is meaningful.
        NOTE(review): the normalisation and the reference-answer wording are
        a judgement call; confirm they match the intended grading.
        """
        if self.judge is None:
            self.judge = LLMJudge()
        verdict = self.judge.evaluate_answer(
            question="",
            answer=output,
            criteria=f"{criteria}\n参考答案: {expected}",
        )
        total = sum(verdict[key] for key in self._SCORE_KEYS)
        return total / (len(self._SCORE_KEYS) * self._MAX_DIMENSION_SCORE)

    def _aggregate_results(self, results):
        """Summarise per-case results; all averages are 0.0 when empty."""
        if not results:
            return {
                "avg_score": 0.0,
                "avg_latency": 0.0,
                "pass_rate": 0.0,
                "details": results,
            }
        count = len(results)
        return {
            "avg_score": sum(r["score"] for r in results) / count,
            "avg_latency": sum(r["latency"] for r in results) / count,
            "pass_rate": sum(r["score"] >= 0.8 for r in results) / count,
            "details": results,
        }
|
|
517
|
-
```
|
|
518
|
-
|
|
519
|
-
## 评估指标
|
|
520
|
-
|
|
521
|
-
### 分类指标
|
|
522
|
-
```python
|
|
523
|
-
from sklearn.metrics import (
|
|
524
|
-
accuracy_score,
|
|
525
|
-
precision_recall_fscore_support,
|
|
526
|
-
confusion_matrix,
|
|
527
|
-
classification_report
|
|
528
|
-
)
|
|
529
|
-
|
|
530
|
-
def evaluate_classification(y_true, y_pred):
    """Compute standard classification metrics for a set of predictions.

    Precision, recall and F1 are support-weighted averages over the classes.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.

    Returns:
        A dict with ``accuracy``, ``precision``, ``recall``, ``f1``, the
        ``confusion_matrix`` and the text ``report``.
    """
    # zero_division=0 keeps the default value (0) for classes that are never
    # predicted, without emitting an undefined-metric warning for each one.
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average='weighted', zero_division=0
    )

    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "confusion_matrix": confusion_matrix(y_true, y_pred),
        "report": classification_report(y_true, y_pred, zero_division=0),
    }
|
|
544
|
-
```
|
|
545
|
-
|
|
546
|
-
### 生成指标
|
|
547
|
-
```python
|
|
548
|
-
from rouge import Rouge
|
|
549
|
-
from nltk.translate.bleu_score import sentence_bleu
|
|
550
|
-
|
|
551
|
-
def evaluate_generation(predictions, references):
    """Compute ROUGE-1/2/L F-scores and mean sentence BLEU.

    Args:
        predictions: Generated texts.
        references: Reference texts, aligned with ``predictions``.

    Returns:
        A dict with ``rouge-1``, ``rouge-2``, ``rouge-l`` (averaged F-scores)
        and ``bleu`` (mean of the per-sentence BLEU scores).
    """
    # Imported locally: the snippet imports neither numpy nor the smoothing
    # helper at file level.
    import numpy as np
    from nltk.translate.bleu_score import SmoothingFunction

    rouge_scores = Rouge().get_scores(predictions, references, avg=True)

    # Without smoothing, a sentence lacking any matching 4-gram scores ~0,
    # which dominates the mean on short texts.
    smooth = SmoothingFunction().method1
    # NOTE(review): str.split() tokenises on whitespace, so unsegmented
    # Chinese text becomes a single token; confirm inputs are pre-segmented.
    bleu_scores = [
        sentence_bleu([ref.split()], pred.split(), smoothing_function=smooth)
        for pred, ref in zip(predictions, references)
    ]

    return {
        "rouge-1": rouge_scores["rouge-1"]["f"],
        "rouge-2": rouge_scores["rouge-2"]["f"],
        "rouge-l": rouge_scores["rouge-l"]["f"],
        "bleu": np.mean(bleu_scores),
    }
|
|
569
|
-
```
|
|
570
|
-
|
|
571
|
-
### 检索指标
|
|
572
|
-
```python
|
|
573
|
-
def evaluate_retrieval(retrieved_docs, relevant_docs, k=5):
    """Compute Precision@K, Recall@K, MRR and NDCG@K for one query.

    Args:
        retrieved_docs: Ranked list of retrieved document ids, best first.
            It may hold fewer than ``k`` entries.
        relevant_docs: Collection of ids that are relevant to the query.
        k: Cut-off rank; must be positive.

    Returns:
        A dict with ``precision@k``, ``recall@k``, ``mrr`` and ``ndcg@k``.
        Recall and NDCG are 0 when there are no relevant documents.

    Raises:
        ValueError: if ``k`` is not positive.
    """
    import math

    if k <= 0:
        raise ValueError("k must be a positive integer")

    relevant = set(relevant_docs)  # O(1) membership tests below
    top_k = retrieved_docs[:k]
    hits = len(set(top_k) & relevant)

    precision_at_k = hits / k
    recall_at_k = hits / len(relevant) if relevant else 0.0

    # Reciprocal rank of the first relevant document anywhere in the
    # ranking (not only the top k), 0 when none is relevant.
    mrr = next(
        (1 / rank for rank, doc in enumerate(retrieved_docs, 1) if doc in relevant),
        0.0,
    )

    # Binary-gain DCG over the ranks that actually exist; the original
    # indexed retrieved_docs[i] for every i < k and raised IndexError on
    # short rankings.
    dcg = sum(
        1 / math.log2(position + 2)
        for position, doc in enumerate(top_k)
        if doc in relevant
    )
    idcg = sum(1 / math.log2(i + 2) for i in range(min(k, len(relevant))))
    ndcg = dcg / idcg if idcg > 0 else 0.0

    return {
        "precision@k": precision_at_k,
        "recall@k": recall_at_k,
        "mrr": mrr,
        "ndcg@k": ndcg,
    }
|
|
602
|
-
```
|
|
603
|
-
|
|
604
|
-
## A/B 测试
|
|
605
|
-
|
|
606
|
-
### 在线 A/B 测试框架
|
|
607
|
-
```python
|
|
608
|
-
import hashlib
import random
from dataclasses import dataclass
from typing import Any, Dict

import numpy as np
|
|
611
|
-
|
|
612
|
-
@dataclass
class Variant:
    """One arm of an A/B test."""

    # Label for this arm; used as the key of the per-variant results.
    name: str
    # The model object that serves requests routed to this arm.
    model: Any
    # Share of traffic for this arm as a fraction (0.5 means 50%).
    traffic_ratio: float
|
|
617
|
-
|
|
618
|
-
class ABTest:
    """Split users across variants and collect a score series per variant.

    Args:
        variants: The arms of the test. Their ``traffic_ratio`` values are
            consumed in order as cumulative percentage bands; a user falling
            outside every band is given the last variant.
    """

    def __init__(self, variants: "list[Variant]"):
        self.variants = variants
        self.results = {v.name: {"count": 0, "scores": []} for v in variants}

    def get_variant(self, user_id: str) -> "Variant":
        """Return the variant assigned to *user_id*.

        The bucket comes from a SHA-256 digest of the id, so the same user
        gets the same variant in every process. The built-in ``hash()`` is
        salted per interpreter run for strings and would reshuffle users on
        each restart.
        """
        digest = hashlib.sha256(user_id.encode("utf-8")).digest()
        bucket = int.from_bytes(digest[:8], "big") % 100

        cumulative = 0
        for variant in self.variants:
            cumulative += variant.traffic_ratio * 100
            if bucket < cumulative:
                return variant

        # Ratios summing to less than 1 leave a gap; route it to the last arm.
        return self.variants[-1]

    def log_result(self, variant_name: str, score: float):
        """Record one score for the named variant."""
        entry = self.results[variant_name]
        entry["count"] += 1
        entry["scores"].append(score)

    def get_statistics(self):
        """Return count, mean, std and 95th percentile per variant.

        Variants that have no recorded result are omitted.
        """
        return {
            name: {
                "count": data["count"],
                "mean": np.mean(data["scores"]),
                "std": np.std(data["scores"]),
                "p95": np.percentile(data["scores"], 95),
            }
            for name, data in self.results.items()
            if data["count"] > 0
        }
|
|
650
|
-
|
|
651
|
-
# 使用
|
|
652
|
-
ab_test = ABTest([
|
|
653
|
-
Variant("gpt-4", gpt4_model, 0.5),
|
|
654
|
-
Variant("claude-3", claude_model, 0.5)
|
|
655
|
-
])
|
|
656
|
-
|
|
657
|
-
# 处理请求
|
|
658
|
-
user_id = "user_123"
|
|
659
|
-
variant = ab_test.get_variant(user_id)
|
|
660
|
-
response = variant.model.predict(query)
|
|
661
|
-
|
|
662
|
-
# 记录结果(用户反馈或自动评估)
|
|
663
|
-
score = evaluate_response(response)
|
|
664
|
-
ab_test.log_result(variant.name, score)
|
|
665
|
-
|
|
666
|
-
# 查看统计
|
|
667
|
-
print(ab_test.get_statistics())
|
|
668
|
-
```
|
|
669
|
-
|
|
670
|
-
### 统计显著性检验
|
|
671
|
-
```python
|
|
672
|
-
from scipy import stats
|
|
673
|
-
|
|
674
|
-
def check_significance(results_a, results_b, alpha=0.05):
    """Compare two score samples with a t-test and Cohen's d.

    Args:
        results_a: Scores of the first variant.
        results_b: Scores of the second variant.
        alpha: Significance level for the two-sided independent t-test.

    Returns:
        A dict with ``p_value``, ``is_significant`` (plain ``bool``),
        ``cohens_d`` and ``interpretation`` (``"large"`` above 0.8,
        ``"medium"`` above 0.5, otherwise ``"small"``).
    """
    # Imported locally: the snippet imports scipy.stats but not these.
    import math

    import numpy as np

    t_stat, p_value = stats.ttest_ind(results_a, results_b)
    is_significant = bool(p_value < alpha)

    mean_diff = float(np.mean(results_a) - np.mean(results_b))
    # Cohen's d pools the sample variances (ddof=1), not the population ones.
    pooled_std = float(
        np.sqrt((np.var(results_a, ddof=1) + np.var(results_b, ddof=1)) / 2)
    )

    if pooled_std > 0:
        cohens_d = mean_diff / pooled_std
    elif mean_diff == 0:
        # No spread and no difference: report no effect, not 0/0.
        cohens_d = 0.0
    else:
        # No (or undefined) spread but different means: unbounded effect.
        cohens_d = math.copysign(math.inf, mean_diff)

    magnitude = abs(cohens_d)
    if magnitude > 0.8:
        interpretation = "large"
    elif magnitude > 0.5:
        interpretation = "medium"
    else:
        interpretation = "small"

    return {
        "p_value": p_value,
        "is_significant": is_significant,
        "cohens_d": cohens_d,
        "interpretation": interpretation,
    }
|
|
692
|
-
```
|
|
693
|
-
|
|
694
|
-
## 持续监控
|
|
695
|
-
|
|
696
|
-
### 实时指标追踪
|
|
697
|
-
```python
|
|
698
|
-
from prometheus_client import Counter, Histogram, Gauge
|
|
699
|
-
|
|
700
|
-
# 定义指标
|
|
701
|
-
request_count = Counter('llm_requests_total', 'Total LLM requests', ['model', 'status'])
|
|
702
|
-
latency = Histogram('llm_latency_seconds', 'LLM latency', ['model'])
|
|
703
|
-
quality_score = Gauge('llm_quality_score', 'LLM quality score', ['model'])
|
|
704
|
-
|
|
705
|
-
class MonitoredLLM:
    """Wrap a model so each ``predict`` call updates the Prometheus metrics.

    Uses the module-level ``request_count``, ``latency`` and
    ``quality_score`` metrics, labelled with ``model_name``.
    """

    def __init__(self, model, model_name):
        self.model = model
        self.model_name = model_name

    def predict(self, prompt: str):
        """Call the wrapped model and record status, latency and quality.

        A failed model call increments the ``error`` counter and re-raises.
        A failure while scoring quality is not counted as a request error,
        because the request itself has already succeeded by then.
        """
        # Imported locally: the snippet never imports time at file level.
        import time

        started = time.perf_counter()
        try:
            response = self.model.predict(prompt)
        except Exception:
            request_count.labels(model=self.model_name, status='error').inc()
            raise

        request_count.labels(model=self.model_name, status='success').inc()
        latency.labels(model=self.model_name).observe(time.perf_counter() - started)

        score = self._evaluate_quality(response)
        if score is not None:
            quality_score.labels(model=self.model_name).set(score)

        return response

    def _evaluate_quality(self, response):
        """Hook: return a numeric quality score for *response*.

        The original snippet called this method without defining it. The
        default returns ``None``, which leaves the quality gauge untouched;
        override it in a subclass to publish a score.
        """
        return None
|
|
729
|
-
```
|
|
730
|
-
|
|
731
|
-
### 异常检测
|
|
732
|
-
```python
|
|
733
|
-
class AnomalyDetector:
    """Flag values whose z-score within a sliding window exceeds a threshold.

    Args:
        window_size: Number of most recent values kept for the statistics.
        threshold: Absolute z-score above which a value is anomalous.
    """

    def __init__(self, window_size=100, threshold=2.0):
        self.window_size = window_size
        self.threshold = threshold
        self.history = []

    def check(self, value: float) -> bool:
        """Record *value* and report whether it is anomalous.

        The value is added to the window before the mean and standard
        deviation are computed. Returns ``False`` until the window holds
        ``window_size`` values, and whenever the window has zero spread.
        """
        # Imported locally: the snippet never imports numpy at file level.
        import numpy as np

        self.history.append(value)
        if len(self.history) < self.window_size:
            return False

        # Keep only the most recent window_size values.
        self.history = self.history[-self.window_size:]

        mean = np.mean(self.history)
        std = np.std(self.history)
        if std == 0:
            return False

        z_score = abs((value - mean) / std)
        # Convert numpy.bool_ to a plain bool to honour the annotation.
        return bool(z_score > self.threshold)
|
|
758
|
-
|
|
759
|
-
# 使用
|
|
760
|
-
detector = AnomalyDetector()
|
|
761
|
-
|
|
762
|
-
for score in quality_scores:
|
|
763
|
-
if detector.check(score):
|
|
764
|
-
alert(f"Quality anomaly detected: {score}")
|
|
765
|
-
```
|
|
766
|
-
|
|
767
|
-
## 评估工具
|
|
768
|
-
|
|
769
|
-
| 工具 | 类型 | 功能 |
|
|
770
|
-
|------|------|------|
|
|
771
|
-
| RAGAS | 框架 | RAG 专用评估 |
|
|
772
|
-
| LangSmith | 平台 | LLM 应用监控 |
|
|
773
|
-
| Phoenix | 开源 | 可观测性平台 |
|
|
774
|
-
| PromptTools | 库 | Prompt 测试 |
|
|
775
|
-
| OpenAI Evals | 框架 | 模型评估 |
|
|
776
|
-
| Weights & Biases | 平台 | 实验追踪 |
|
|
777
|
-
|
|
778
|
-
## 最佳实践
|
|
779
|
-
|
|
780
|
-
- ✅ 多维度评估:准确性、相关性、忠实性、效率
|
|
781
|
-
- ✅ 自动化评估:使用 RAGAS、LLM-as-Judge
|
|
782
|
-
- ✅ 人工抽检:定期人工审核样本
|
|
783
|
-
- ✅ 基准测试:使用标准数据集对比
|
|
784
|
-
- ✅ A/B 测试:在线对比不同版本
|
|
785
|
-
- ✅ 持续监控:实时追踪质量指标
|
|
786
|
-
- ✅ 版本管理:记录模型和 Prompt 版本
|
|
787
|
-
- ✅ 反馈闭环:收集用户反馈改进
|
|
788
|
-
- ❌ 避免:单一指标、无基准、无监控
|
|
789
|
-
|
|
790
|
-
---
|