code-abyss 1.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +197 -0
- package/bin/install.js +193 -0
- package/bin/uninstall.js +42 -0
- package/config/AGENTS.md +247 -0
- package/config/CLAUDE.md +207 -0
- package/config/settings.example.json +27 -0
- package/output-styles/abyss-cultivator.md +399 -0
- package/package.json +41 -0
- package/skills/SKILL.md +115 -0
- package/skills/ai/SKILL.md +29 -0
- package/skills/ai/agent-dev.md +242 -0
- package/skills/ai/llm-security.md +288 -0
- package/skills/architecture/SKILL.md +41 -0
- package/skills/architecture/api-design.md +225 -0
- package/skills/architecture/caching.md +299 -0
- package/skills/architecture/cloud-native.md +285 -0
- package/skills/architecture/compliance.md +299 -0
- package/skills/architecture/data-security.md +184 -0
- package/skills/architecture/message-queue.md +329 -0
- package/skills/architecture/security-arch.md +210 -0
- package/skills/development/SKILL.md +43 -0
- package/skills/development/cpp.md +246 -0
- package/skills/development/go.md +323 -0
- package/skills/development/java.md +277 -0
- package/skills/development/python.md +288 -0
- package/skills/development/rust.md +313 -0
- package/skills/development/shell.md +313 -0
- package/skills/development/typescript.md +277 -0
- package/skills/devops/SKILL.md +36 -0
- package/skills/devops/cost-optimization.md +272 -0
- package/skills/devops/database.md +217 -0
- package/skills/devops/devsecops.md +198 -0
- package/skills/devops/git-workflow.md +181 -0
- package/skills/devops/observability.md +280 -0
- package/skills/devops/performance.md +273 -0
- package/skills/devops/testing.md +186 -0
- package/skills/gen-docs/SKILL.md +114 -0
- package/skills/gen-docs/scripts/doc_generator.py +491 -0
- package/skills/multi-agent/SKILL.md +268 -0
- package/skills/run_skill.py +88 -0
- package/skills/security/SKILL.md +51 -0
- package/skills/security/blue-team.md +379 -0
- package/skills/security/code-audit.md +265 -0
- package/skills/security/pentest.md +226 -0
- package/skills/security/red-team.md +321 -0
- package/skills/security/threat-intel.md +322 -0
- package/skills/security/vuln-research.md +369 -0
- package/skills/tests/README.md +225 -0
- package/skills/tests/SUMMARY.md +362 -0
- package/skills/tests/__init__.py +3 -0
- package/skills/tests/test_change_analyzer.py +558 -0
- package/skills/tests/test_doc_generator.py +538 -0
- package/skills/tests/test_module_scanner.py +376 -0
- package/skills/tests/test_quality_checker.py +516 -0
- package/skills/tests/test_security_scanner.py +426 -0
- package/skills/verify-change/SKILL.md +138 -0
- package/skills/verify-change/scripts/change_analyzer.py +529 -0
- package/skills/verify-module/SKILL.md +125 -0
- package/skills/verify-module/scripts/module_scanner.py +321 -0
- package/skills/verify-quality/SKILL.md +158 -0
- package/skills/verify-quality/scripts/quality_checker.py +481 -0
- package/skills/verify-security/SKILL.md +141 -0
- package/skills/verify-security/scripts/security_scanner.py +368 -0
|
@@ -0,0 +1,280 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: observability
|
|
3
|
+
description: 可观测性秘典。日志、指标、追踪三大支柱,告警设计,SLI/SLO/SLA。当用户提到可观测性、日志、监控、指标、追踪、告警、SLO时路由到此。
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# 🔧 炼器秘典 · 可观测性
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
## 三大支柱
|
|
10
|
+
|
|
11
|
+
```
|
|
12
|
+
┌─────────────────────────────────────────┐
|
|
13
|
+
│ 可观测性 (Observability) │
|
|
14
|
+
├─────────────┬─────────────┬─────────────┤
|
|
15
|
+
│ 📋 日志 │ 📊 指标 │ 🔗 追踪 │
|
|
16
|
+
│ Logs │ Metrics │ Traces │
|
|
17
|
+
│ 离散事件 │ 聚合数值 │ 请求链路 │
|
|
18
|
+
│ What │ How much │ Where │
|
|
19
|
+
└─────────────┴─────────────┴─────────────┘
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
| 支柱 | 特征 | 适用场景 | 代表工具 |
|
|
23
|
+
|------|------|----------|----------|
|
|
24
|
+
| 日志 | 离散、非结构化/结构化事件 | 调试、审计、错误追踪 | ELK, Loki, CloudWatch |
|
|
25
|
+
| 指标 | 聚合数值、时间序列 | 告警、趋势、容量规划 | Prometheus, Datadog, CloudWatch |
|
|
26
|
+
| 追踪 | 分布式请求链路 | 延迟分析、依赖映射 | Jaeger, Zipkin, X-Ray |
|
|
27
|
+
|
|
28
|
+
---
|
|
29
|
+
|
|
30
|
+
## 日志 (Logs)
|
|
31
|
+
|
|
32
|
+
### 结构化日志
|
|
33
|
+
|
|
34
|
+
```json
|
|
35
|
+
{
|
|
36
|
+
"timestamp": "2024-01-15T10:30:00.123Z",
|
|
37
|
+
"level": "ERROR",
|
|
38
|
+
"service": "order-service",
|
|
39
|
+
"trace_id": "abc123",
|
|
40
|
+
"span_id": "def456",
|
|
41
|
+
"message": "Payment failed",
|
|
42
|
+
"error": "InsufficientFunds",
|
|
43
|
+
"user_id": "u-789",
|
|
44
|
+
"order_id": "o-012",
|
|
45
|
+
"amount": 99.99,
|
|
46
|
+
"duration_ms": 234
|
|
47
|
+
}
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
### 日志级别规范
|
|
51
|
+
|
|
52
|
+
| 级别 | 用途 | 生产环境 |
|
|
53
|
+
|------|------|----------|
|
|
54
|
+
| TRACE | 极细粒度调试 | ❌ 关闭 |
|
|
55
|
+
| DEBUG | 开发调试信息 | ❌ 关闭 |
|
|
56
|
+
| INFO | 业务关键事件 | ✅ 开启 |
|
|
57
|
+
| WARN | 潜在问题,可自愈 | ✅ 开启 |
|
|
58
|
+
| ERROR | 错误,需关注 | ✅ 开启 + 告警 |
|
|
59
|
+
| FATAL | 致命错误,服务不可用 | ✅ 开启 + 紧急告警 |
|
|
60
|
+
|
|
61
|
+
### 日志聚合架构
|
|
62
|
+
|
|
63
|
+
```
|
|
64
|
+
应用 → Filebeat/Fluentd → Kafka(缓冲) → Logstash → Elasticsearch → Kibana
|
|
65
|
+
→ S3(归档)
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
### 日志最佳实践
|
|
69
|
+
|
|
70
|
+
- ✅ 结构化 JSON 格式
|
|
71
|
+
- ✅ 包含 trace_id 关联追踪
|
|
72
|
+
- ✅ 敏感数据脱敏
|
|
73
|
+
- ✅ 合理的保留策略(热/温/冷)
|
|
74
|
+
- ❌ 不记录密码/Token
|
|
75
|
+
- ❌ 不在循环中打日志
|
|
76
|
+
- ❌ 不用字符串拼接(用参数化)
|
|
77
|
+
|
|
78
|
+
---
|
|
79
|
+
|
|
80
|
+
## 指标 (Metrics)
|
|
81
|
+
|
|
82
|
+
### Prometheus 指标类型
|
|
83
|
+
|
|
84
|
+
| 类型 | 用途 | 示例 |
|
|
85
|
+
|------|------|------|
|
|
86
|
+
| Counter | 只增不减的计数器 | 请求总数、错误总数 |
|
|
87
|
+
| Gauge | 可增可减的瞬时值 | 当前连接数、队列长度 |
|
|
88
|
+
| Histogram | 分布统计(桶) | 请求延迟分布 |
|
|
89
|
+
| Summary | 分布统计(分位数) | 请求延迟 P99 |
|
|
90
|
+
|
|
91
|
+
### 关键 PromQL
|
|
92
|
+
|
|
93
|
+
```promql
|
|
94
|
+
# 请求速率
|
|
95
|
+
rate(http_requests_total[5m])
|
|
96
|
+
|
|
97
|
+
# 错误率
|
|
98
|
+
sum(rate(http_requests_total{status=~"5.."}[5m])) / sum(rate(http_requests_total[5m]))
|
|
99
|
+
|
|
100
|
+
# P99 延迟
|
|
101
|
+
histogram_quantile(0.99, sum by (le) (rate(http_request_duration_seconds_bucket[5m])))
|
|
102
|
+
|
|
103
|
+
# CPU 使用率
|
|
104
|
+
1 - avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) by (instance)
|
|
105
|
+
|
|
106
|
+
# 内存使用率
|
|
107
|
+
(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
### Grafana Dashboard 设计
|
|
111
|
+
|
|
112
|
+
```yaml
|
|
113
|
+
四大黄金信号 Dashboard:
|
|
114
|
+
Row 1 - 流量:
|
|
115
|
+
- QPS (rate)
|
|
116
|
+
- 按 endpoint 分组
|
|
117
|
+
Row 2 - 错误:
|
|
118
|
+
- 错误率 (%)
|
|
119
|
+
- 按错误类型分组
|
|
120
|
+
Row 3 - 延迟:
|
|
121
|
+
- P50/P95/P99
|
|
122
|
+
- 延迟热力图
|
|
123
|
+
Row 4 - 饱和度:
|
|
124
|
+
- CPU/Memory/Disk
|
|
125
|
+
- 连接池使用率
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
---
|
|
129
|
+
|
|
130
|
+
## 追踪 (Traces)
|
|
131
|
+
|
|
132
|
+
### OpenTelemetry 集成
|
|
133
|
+
|
|
134
|
+
```python
|
|
135
|
+
# Python 示例
|
|
136
|
+
from opentelemetry import trace
|
|
137
|
+
from opentelemetry.sdk.trace import TracerProvider
|
|
138
|
+
from opentelemetry.sdk.trace.export import BatchSpanProcessor
|
|
139
|
+
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
|
|
140
|
+
|
|
141
|
+
provider = TracerProvider()
|
|
142
|
+
processor = BatchSpanProcessor(OTLPSpanExporter(endpoint="http://collector:4317"))
|
|
143
|
+
provider.add_span_processor(processor)
|
|
144
|
+
trace.set_tracer_provider(provider)
|
|
145
|
+
|
|
146
|
+
tracer = trace.get_tracer(__name__)
|
|
147
|
+
|
|
148
|
+
@tracer.start_as_current_span("process_order")
|
|
149
|
+
def process_order(order_id: str):
|
|
150
|
+
span = trace.get_current_span()
|
|
151
|
+
span.set_attribute("order.id", order_id)
|
|
152
|
+
# 业务逻辑...
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
### 追踪架构
|
|
156
|
+
|
|
157
|
+
```
|
|
158
|
+
Service-A → Service-B → Service-C
|
|
159
|
+
│ │ │
|
|
160
|
+
└── Span ────┴── Span ────┴── Span
|
|
161
|
+
│
|
|
162
|
+
Trace (trace_id 贯穿全链路)
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
### Context Propagation
|
|
166
|
+
|
|
167
|
+
```
|
|
168
|
+
HTTP Header: traceparent: 00-{trace_id}-{span_id}-{flags}
|
|
169
|
+
gRPC Metadata: 自动传播
|
|
170
|
+
Message Queue: 消息头注入 trace context
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
---
|
|
174
|
+
|
|
175
|
+
## 告警设计
|
|
176
|
+
|
|
177
|
+
### 告警分级
|
|
178
|
+
|
|
179
|
+
| 级别 | 响应时间 | 通知方式 | 示例 |
|
|
180
|
+
|------|----------|----------|------|
|
|
181
|
+
| P0 Critical | 立即 | 电话 + PagerDuty | 服务完全不可用 |
|
|
182
|
+
| P1 High | 15 min | Slack + 短信 | 错误率 > 5% |
|
|
183
|
+
| P2 Medium | 1 hour | Slack | 延迟 P99 > 阈值 |
|
|
184
|
+
| P3 Low | 次日 | 邮件/工单 | 磁盘使用 > 70% |
|
|
185
|
+
|
|
186
|
+
### 告警规则示例
|
|
187
|
+
|
|
188
|
+
```yaml
|
|
189
|
+
# Prometheus 告警规则(由 Prometheus 评估,触发后交给 Alertmanager 路由通知)
|
|
190
|
+
groups:
|
|
191
|
+
- name: service-alerts
|
|
192
|
+
rules:
|
|
193
|
+
- alert: HighErrorRate
|
|
194
|
+
expr: sum by (instance) (rate(http_requests_total{status=~"5.."}[5m])) / sum by (instance) (rate(http_requests_total[5m])) > 0.05
|
|
195
|
+
for: 5m
|
|
196
|
+
labels:
|
|
197
|
+
severity: critical
|
|
198
|
+
annotations:
|
|
199
|
+
summary: "High error rate on {{ $labels.instance }}"
|
|
200
|
+
|
|
201
|
+
- alert: HighLatency
|
|
202
|
+
expr: histogram_quantile(0.99, rate(http_request_duration_seconds_bucket[5m])) > 1
|
|
203
|
+
for: 5m
|
|
204
|
+
labels:
|
|
205
|
+
severity: warning
|
|
206
|
+
```
|
|
207
|
+
|
|
208
|
+
### 告警最佳实践
|
|
209
|
+
|
|
210
|
+
- ✅ 基于 SLO 告警,而非资源指标
|
|
211
|
+
- ✅ 设置合理的 `for` 持续时间,避免抖动
|
|
212
|
+
- ✅ 告警必须可操作(收到告警知道该做什么)
|
|
213
|
+
- ✅ 定期审查告警,清理无效告警
|
|
214
|
+
- ❌ 不对每个指标都告警(告警疲劳)
|
|
215
|
+
- ❌ 不设过低阈值(噪音)
|
|
216
|
+
|
|
217
|
+
---
|
|
218
|
+
|
|
219
|
+
## SLI / SLO / SLA
|
|
220
|
+
|
|
221
|
+
### 定义
|
|
222
|
+
|
|
223
|
+
| 概念 | 含义 | 示例 |
|
|
224
|
+
|------|------|------|
|
|
225
|
+
| SLI (指标) | 服务质量的量化度量 | 请求成功率、P99 延迟 |
|
|
226
|
+
| SLO (目标) | SLI 的目标值 | 可用性 99.9%、P99 < 200ms |
|
|
227
|
+
| SLA (协议) | 对外承诺 + 违约后果 | 99.9% 可用,否则赔偿 |
|
|
228
|
+
|
|
229
|
+
### Error Budget
|
|
230
|
+
|
|
231
|
+
```
|
|
232
|
+
SLO = 99.9% 可用性
|
|
233
|
+
Error Budget = 1 - 0.999 = 0.1%
|
|
234
|
+
每月 Error Budget = 30天 × 24小时 × 60分钟 × 0.001 = 43.2 分钟
|
|
235
|
+
|
|
236
|
+
已消耗: 15 分钟
|
|
237
|
+
剩余: 28.2 分钟
|
|
238
|
+
```
|
|
239
|
+
|
|
240
|
+
### SLO Dashboard
|
|
241
|
+
|
|
242
|
+
```yaml
|
|
243
|
+
SLO Dashboard:
|
|
244
|
+
- 当前 SLI 值 vs SLO 目标
|
|
245
|
+
- Error Budget 剩余百分比
|
|
246
|
+
- Error Budget 消耗速率
|
|
247
|
+
- 30天滚动窗口趋势
|
|
248
|
+
- Burn Rate 告警状态
|
|
249
|
+
```
|
|
250
|
+
|
|
251
|
+
---
|
|
252
|
+
|
|
253
|
+
## 可观测性清单
|
|
254
|
+
|
|
255
|
+
```yaml
|
|
256
|
+
日志:
|
|
257
|
+
- [ ] 结构化 JSON 格式
|
|
258
|
+
- [ ] trace_id 关联
|
|
259
|
+
- [ ] 敏感数据脱敏
|
|
260
|
+
- [ ] 保留策略配置
|
|
261
|
+
|
|
262
|
+
指标:
|
|
263
|
+
- [ ] 四大黄金信号覆盖
|
|
264
|
+
- [ ] 自定义业务指标
|
|
265
|
+
- [ ] Dashboard 就绪
|
|
266
|
+
- [ ] 告警规则配置
|
|
267
|
+
|
|
268
|
+
追踪:
|
|
269
|
+
- [ ] OpenTelemetry 集成
|
|
270
|
+
- [ ] 跨服务 Context Propagation
|
|
271
|
+
- [ ] 采样策略配置
|
|
272
|
+
- [ ] 关键路径标注
|
|
273
|
+
|
|
274
|
+
告警:
|
|
275
|
+
- [ ] 基于 SLO 的告警
|
|
276
|
+
- [ ] 分级通知渠道
|
|
277
|
+
- [ ] Runbook 关联
|
|
278
|
+
- [ ] 定期审查机制
|
|
279
|
+
```
|
|
280
|
+
|
|
@@ -0,0 +1,273 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: performance
|
|
3
|
+
description: 性能优化秘典。性能分析方法论、Profiling、火焰图、基准测试、瓶颈优化。当用户提到性能、延迟、吞吐、Profiling、火焰图、基准测试时路由到此。
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# 🔧 炼器秘典 · 性能优化
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
## 性能分析方法论
|
|
10
|
+
|
|
11
|
+
### USE 方法 (Utilization, Saturation, Errors)
|
|
12
|
+
|
|
13
|
+
对每个资源检查三个维度:
|
|
14
|
+
|
|
15
|
+
| 维度 | 含义 | 工具 |
|
|
16
|
+
|------|------|------|
|
|
17
|
+
| Utilization | 资源繁忙时间占比 | `top`, `vmstat`, `iostat` |
|
|
18
|
+
| Saturation | 排队等待的工作量 | `vmstat`(r列), `iostat`(avgqu-sz,新版 sysstat 中为 aqu-sz) |
|
|
19
|
+
| Errors | 错误事件计数 | `dmesg`, 应用日志 |
|
|
20
|
+
|
|
21
|
+
```bash
|
|
22
|
+
# CPU USE
|
|
23
|
+
mpstat -P ALL 1 # Utilization per core
|
|
24
|
+
vmstat 1 # Saturation (r > CPU count)
|
|
25
|
+
dmesg | grep -i error # Errors
|
|
26
|
+
|
|
27
|
+
# Memory USE
|
|
28
|
+
free -m # Utilization
|
|
29
|
+
vmstat 1 | awk '{print $7,$8}'  # Saturation (si/so > 0 = swapping)
|
|
30
|
+
|
|
31
|
+
# Disk USE
|
|
32
|
+
iostat -xz 1                    # Utilization (%util), Saturation (avgqu-sz;新版 sysstat 中为 aqu-sz)
|
|
33
|
+
|
|
34
|
+
# Network USE
|
|
35
|
+
sar -n DEV 1 # Utilization
|
|
36
|
+
netstat -s | grep -i error # Errors
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
### RED 方法 (Rate, Errors, Duration)
|
|
40
|
+
|
|
41
|
+
面向服务的性能指标:
|
|
42
|
+
|
|
43
|
+
| 维度 | 含义 | 示例 |
|
|
44
|
+
|------|------|------|
|
|
45
|
+
| Rate | 每秒请求数 | QPS/RPS |
|
|
46
|
+
| Errors | 每秒错误数 | 5xx/s |
|
|
47
|
+
| Duration | 请求延迟分布 | P50/P95/P99 |
|
|
48
|
+
|
|
49
|
+
```promql
|
|
50
|
+
# Prometheus PromQL 示例
|
|
51
|
+
rate(http_requests_total[5m]) # Rate
|
|
52
|
+
rate(http_requests_total{status=~"5.."}[5m]) # Errors
|
|
53
|
+
histogram_quantile(0.99, rate(http_request_duration_seconds_bucket[5m])) # P99
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
---
|
|
57
|
+
|
|
58
|
+
## Profiling 工具
|
|
59
|
+
|
|
60
|
+
### CPU Profiling
|
|
61
|
+
|
|
62
|
+
| 语言 | 工具 | 命令 |
|
|
63
|
+
|------|------|------|
|
|
64
|
+
| Python | cProfile / py-spy | `py-spy record -o profile.svg -- python app.py` |
|
|
65
|
+
| Go | pprof | `go tool pprof http://localhost:6060/debug/pprof/profile` |
|
|
66
|
+
| Java | async-profiler | `./profiler.sh -d 30 -f flame.html <pid>` |
|
|
67
|
+
| Node.js | clinic.js | `clinic flame -- node app.js` |
|
|
68
|
+
| Rust | cargo-flamegraph | `cargo flamegraph` |
|
|
69
|
+
| 系统级 | perf | `perf record -g -p <pid> -- sleep 30` |
|
|
70
|
+
|
|
71
|
+
### Memory Profiling
|
|
72
|
+
|
|
73
|
+
```bash
|
|
74
|
+
# Python
|
|
75
|
+
python -m memory_profiler script.py
|
|
76
|
+
# 或使用 tracemalloc
|
|
77
|
+
python -c "import tracemalloc; tracemalloc.start(); ..."
|
|
78
|
+
|
|
79
|
+
# Go
|
|
80
|
+
go tool pprof http://localhost:6060/debug/pprof/heap
|
|
81
|
+
|
|
82
|
+
# Java
|
|
83
|
+
jmap -dump:format=b,file=heap.hprof <pid>
|
|
84
|
+
jhat heap.hprof  # 仅适用于 JDK 8 及更早版本(JDK 9 起 jhat 已移除);新版本请用 MAT/VisualVM 分析
|
|
85
|
+
|
|
86
|
+
# 系统级
|
|
87
|
+
valgrind --tool=massif ./program
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
### I/O Profiling
|
|
91
|
+
|
|
92
|
+
```bash
|
|
93
|
+
# 磁盘 I/O
|
|
94
|
+
iostat -xz 1
|
|
95
|
+
iotop -oP
|
|
96
|
+
strace -e trace=read,write -p <pid>
|
|
97
|
+
|
|
98
|
+
# 网络 I/O
|
|
99
|
+
ss -tnp # 连接状态
|
|
100
|
+
tcpdump -i eth0 -w cap.pcap # 抓包
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
---
|
|
104
|
+
|
|
105
|
+
## 火焰图
|
|
106
|
+
|
|
107
|
+
### 生成流程
|
|
108
|
+
|
|
109
|
+
```bash
|
|
110
|
+
# 1. 采集数据
|
|
111
|
+
perf record -F 99 -g -p <pid> -- sleep 30
|
|
112
|
+
|
|
113
|
+
# 2. 生成火焰图
|
|
114
|
+
perf script | stackcollapse-perf.pl | flamegraph.pl > flame.svg
|
|
115
|
+
|
|
116
|
+
# 3. 解读
|
|
117
|
+
# X轴:函数在采样中出现的比例(越宽=越耗时)
|
|
118
|
+
# Y轴:调用栈深度
|
|
119
|
+
# 颜色:随机,无特殊含义
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
### 解读要点
|
|
123
|
+
|
|
124
|
+
| 特征 | 含义 | 行动 |
|
|
125
|
+
|------|------|------|
|
|
126
|
+
| 宽平顶 | 该函数自身耗时大 | 优化该函数逻辑 |
|
|
127
|
+
| 宽塔形 | 调用链深但每层都耗时 | 减少调用层级 |
|
|
128
|
+
| 多个窄尖峰 | 多处小开销累积 | 关注热路径 |
|
|
129
|
+
|
|
130
|
+
---
|
|
131
|
+
|
|
132
|
+
## 基准测试
|
|
133
|
+
|
|
134
|
+
### HTTP 基准测试
|
|
135
|
+
|
|
136
|
+
```bash
|
|
137
|
+
# wrk (推荐)
|
|
138
|
+
wrk -t12 -c400 -d30s http://localhost:8080/api
|
|
139
|
+
|
|
140
|
+
# ab (Apache Bench)
|
|
141
|
+
ab -n 10000 -c 100 http://localhost:8080/api
|
|
142
|
+
|
|
143
|
+
# hey
|
|
144
|
+
hey -n 10000 -c 100 http://localhost:8080/api
|
|
145
|
+
|
|
146
|
+
# k6 (脚本化)
|
|
147
|
+
k6 run --vus 100 --duration 30s script.js
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
### 代码级基准测试
|
|
151
|
+
|
|
152
|
+
```python
|
|
153
|
+
# Python - pytest-benchmark
|
|
154
|
+
def test_sort_benchmark(benchmark):
|
|
155
|
+
data = list(range(1000, 0, -1))
|
|
156
|
+
benchmark(sorted, data)
|
|
157
|
+
|
|
158
|
+
# Go
|
|
159
|
+
func BenchmarkSort(b *testing.B) {
|
|
160
|
+
for i := 0; i < b.N; i++ {
|
|
161
|
+
sort.Ints(generateData())
|
|
162
|
+
}
|
|
163
|
+
}
|
|
164
|
+
|
|
165
|
+
# Rust
|
|
166
|
+
#[bench]
|
|
167
|
+
fn bench_sort(b: &mut Bencher) {
|
|
168
|
+
b.iter(|| sort_data(test::black_box(generate_data())));
|
|
169
|
+
}
|
|
170
|
+
```
|
|
171
|
+
|
|
172
|
+
### 基准测试原则
|
|
173
|
+
|
|
174
|
+
1. **隔离环境** — 独占机器,关闭无关进程
|
|
175
|
+
2. **预热** — 丢弃前 N 次结果
|
|
176
|
+
3. **统计显著** — 多次运行取中位数
|
|
177
|
+
4. **对比基线** — 优化前后对比,而非绝对值
|
|
178
|
+
|
|
179
|
+
---
|
|
180
|
+
|
|
181
|
+
## 常见瓶颈优化
|
|
182
|
+
|
|
183
|
+
### CPU 密集型
|
|
184
|
+
|
|
185
|
+
| 问题 | 优化 |
|
|
186
|
+
|------|------|
|
|
187
|
+
| 热循环 | 算法优化、减少分支 |
|
|
188
|
+
| 序列化/反序列化 | 换用高效格式(protobuf/msgpack) |
|
|
189
|
+
| 正则表达式 | 预编译、简化模式 |
|
|
190
|
+
| 加密运算 | 硬件加速(AES-NI) |
|
|
191
|
+
|
|
192
|
+
### I/O 密集型
|
|
193
|
+
|
|
194
|
+
| 问题 | 优化 |
|
|
195
|
+
|------|------|
|
|
196
|
+
| 同步阻塞 I/O | 异步 I/O (asyncio/epoll) |
|
|
197
|
+
| 频繁小文件读写 | 批量合并、缓冲区 |
|
|
198
|
+
| 网络往返 | 连接池、批量请求、Pipeline |
|
|
199
|
+
| DNS 解析 | 本地缓存 |
|
|
200
|
+
|
|
201
|
+
### 内存相关
|
|
202
|
+
|
|
203
|
+
| 问题 | 优化 |
|
|
204
|
+
|------|------|
|
|
205
|
+
| 内存泄漏 | Profiling 定位 + 修复引用 |
|
|
206
|
+
| GC 压力 | 减少分配、对象池 |
|
|
207
|
+
| 缓存未命中 | 数据局部性、紧凑布局 |
|
|
208
|
+
| 大对象 | 流式处理、分片 |
|
|
209
|
+
|
|
210
|
+
---
|
|
211
|
+
|
|
212
|
+
## 数据库性能
|
|
213
|
+
|
|
214
|
+
### 查询优化
|
|
215
|
+
|
|
216
|
+
```sql
|
|
217
|
+
-- 1. EXPLAIN 分析
|
|
218
|
+
EXPLAIN ANALYZE SELECT * FROM orders WHERE user_id = 123;
|
|
219
|
+
|
|
220
|
+
-- 2. 索引优化
|
|
221
|
+
CREATE INDEX idx_orders_user_id ON orders(user_id);
|
|
222
|
+
CREATE INDEX idx_orders_composite ON orders(user_id, created_at DESC);
|
|
223
|
+
|
|
224
|
+
-- 3. 避免 N+1
|
|
225
|
+
-- 差:循环查询
|
|
226
|
+
-- 好:JOIN 或 IN 批量查询
|
|
227
|
+
SELECT o.*, u.name FROM orders o JOIN users u ON o.user_id = u.id;
|
|
228
|
+
|
|
229
|
+
-- 4. 分页优化
|
|
230
|
+
-- 差:OFFSET 大数值
|
|
231
|
+
SELECT * FROM orders ORDER BY id LIMIT 20 OFFSET 100000;
|
|
232
|
+
-- 好:游标分页
|
|
233
|
+
SELECT * FROM orders WHERE id > 100000 ORDER BY id LIMIT 20;
|
|
234
|
+
```
|
|
235
|
+
|
|
236
|
+
### 连接池配置
|
|
237
|
+
|
|
238
|
+
```yaml
|
|
239
|
+
# HikariCP (Java)
|
|
240
|
+
maximumPoolSize: 10 # CPU核数 * 2 + 磁盘数
|
|
241
|
+
minimumIdle: 5
|
|
242
|
+
connectionTimeout: 30000
|
|
243
|
+
idleTimeout: 600000
|
|
244
|
+
|
|
245
|
+
# 通用公式
|
|
246
|
+
pool_size = (core_count * 2) + effective_spindle_count
|
|
247
|
+
```
|
|
248
|
+
|
|
249
|
+
---
|
|
250
|
+
|
|
251
|
+
## 性能优化清单
|
|
252
|
+
|
|
253
|
+
```yaml
|
|
254
|
+
应用层:
|
|
255
|
+
- [ ] 热路径 Profiling 完成
|
|
256
|
+
- [ ] 算法复杂度 ≤ O(n log n)
|
|
257
|
+
- [ ] 无 N+1 查询
|
|
258
|
+
- [ ] 连接池配置合理
|
|
259
|
+
- [ ] 异步 I/O 用于 I/O 密集操作
|
|
260
|
+
|
|
261
|
+
数据库:
|
|
262
|
+
- [ ] 慢查询 < 100ms (P95)
|
|
263
|
+
- [ ] 索引覆盖高频查询
|
|
264
|
+
- [ ] 无全表扫描
|
|
265
|
+
- [ ] 连接池大小合理
|
|
266
|
+
|
|
267
|
+
基础设施:
|
|
268
|
+
- [ ] CPU 利用率 < 70% (P95)
|
|
269
|
+
- [ ] 内存利用率 < 80%
|
|
270
|
+
- [ ] 磁盘 I/O 无饱和
|
|
271
|
+
- [ ] 网络无丢包
|
|
272
|
+
```
|
|
273
|
+
|