code-abyss 1.6.16 → 1.7.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +8 -6
- package/bin/install.js +59 -163
- package/bin/lib/ccline.js +82 -0
- package/bin/lib/utils.js +61 -0
- package/package.json +5 -2
- package/skills/SKILL.md +24 -16
- package/skills/domains/ai/SKILL.md +2 -2
- package/skills/domains/ai/prompt-and-eval.md +279 -0
- package/skills/domains/architecture/SKILL.md +2 -3
- package/skills/domains/architecture/security-arch.md +87 -0
- package/skills/domains/data-engineering/SKILL.md +188 -26
- package/skills/domains/development/SKILL.md +1 -4
- package/skills/domains/devops/SKILL.md +3 -5
- package/skills/domains/devops/performance.md +63 -0
- package/skills/domains/devops/testing.md +97 -0
- package/skills/domains/frontend-design/SKILL.md +12 -3
- package/skills/domains/frontend-design/claymorphism/SKILL.md +117 -0
- package/skills/domains/frontend-design/claymorphism/references/tokens.css +52 -0
- package/skills/domains/frontend-design/engineering.md +287 -0
- package/skills/domains/frontend-design/glassmorphism/SKILL.md +138 -0
- package/skills/domains/frontend-design/glassmorphism/references/tokens.css +32 -0
- package/skills/domains/frontend-design/liquid-glass/SKILL.md +135 -0
- package/skills/domains/frontend-design/liquid-glass/references/tokens.css +81 -0
- package/skills/domains/frontend-design/neubrutalism/SKILL.md +141 -0
- package/skills/domains/frontend-design/neubrutalism/references/tokens.css +44 -0
- package/skills/domains/infrastructure/SKILL.md +174 -34
- package/skills/domains/mobile/SKILL.md +211 -21
- package/skills/domains/orchestration/SKILL.md +1 -0
- package/skills/domains/security/SKILL.md +4 -6
- package/skills/domains/security/blue-team.md +57 -0
- package/skills/domains/security/red-team.md +54 -0
- package/skills/domains/security/threat-intel.md +50 -0
- package/skills/orchestration/multi-agent/SKILL.md +195 -46
- package/skills/run_skill.js +139 -0
- package/skills/tools/gen-docs/SKILL.md +6 -4
- package/skills/tools/gen-docs/scripts/doc_generator.js +363 -0
- package/skills/tools/lib/shared.js +98 -0
- package/skills/tools/verify-change/SKILL.md +8 -6
- package/skills/tools/verify-change/scripts/change_analyzer.js +289 -0
- package/skills/tools/verify-module/SKILL.md +6 -4
- package/skills/tools/verify-module/scripts/module_scanner.js +171 -0
- package/skills/tools/verify-quality/SKILL.md +5 -3
- package/skills/tools/verify-quality/scripts/quality_checker.js +337 -0
- package/skills/tools/verify-security/SKILL.md +7 -5
- package/skills/tools/verify-security/scripts/security_scanner.js +283 -0
- package/skills/__pycache__/run_skill.cpython-312.pyc +0 -0
- package/skills/domains/COVERAGE_PLAN.md +0 -232
- package/skills/domains/ai/model-evaluation.md +0 -790
- package/skills/domains/ai/prompt-engineering.md +0 -703
- package/skills/domains/architecture/compliance.md +0 -299
- package/skills/domains/architecture/data-security.md +0 -184
- package/skills/domains/data-engineering/data-pipeline.md +0 -762
- package/skills/domains/data-engineering/data-quality.md +0 -894
- package/skills/domains/data-engineering/stream-processing.md +0 -791
- package/skills/domains/development/dart.md +0 -963
- package/skills/domains/development/kotlin.md +0 -834
- package/skills/domains/development/php.md +0 -659
- package/skills/domains/development/swift.md +0 -755
- package/skills/domains/devops/e2e-testing.md +0 -914
- package/skills/domains/devops/performance-testing.md +0 -734
- package/skills/domains/devops/testing-strategy.md +0 -667
- package/skills/domains/frontend-design/build-tools.md +0 -743
- package/skills/domains/frontend-design/performance.md +0 -734
- package/skills/domains/frontend-design/testing.md +0 -699
- package/skills/domains/infrastructure/gitops.md +0 -735
- package/skills/domains/infrastructure/iac.md +0 -855
- package/skills/domains/infrastructure/kubernetes.md +0 -1018
- package/skills/domains/mobile/android-dev.md +0 -979
- package/skills/domains/mobile/cross-platform.md +0 -795
- package/skills/domains/mobile/ios-dev.md +0 -931
- package/skills/domains/security/secrets-management.md +0 -834
- package/skills/domains/security/supply-chain.md +0 -931
- package/skills/domains/security/threat-modeling.md +0 -828
- package/skills/run_skill.py +0 -153
- package/skills/tests/README.md +0 -225
- package/skills/tests/SUMMARY.md +0 -362
- package/skills/tests/__init__.py +0 -3
- package/skills/tests/__pycache__/test_change_analyzer.cpython-312.pyc +0 -0
- package/skills/tests/__pycache__/test_doc_generator.cpython-312.pyc +0 -0
- package/skills/tests/__pycache__/test_module_scanner.cpython-312.pyc +0 -0
- package/skills/tests/__pycache__/test_quality_checker.cpython-312.pyc +0 -0
- package/skills/tests/__pycache__/test_security_scanner.cpython-312.pyc +0 -0
- package/skills/tests/test_change_analyzer.py +0 -558
- package/skills/tests/test_doc_generator.py +0 -538
- package/skills/tests/test_module_scanner.py +0 -376
- package/skills/tests/test_quality_checker.py +0 -516
- package/skills/tests/test_security_scanner.py +0 -426
- package/skills/tools/gen-docs/scripts/__pycache__/doc_generator.cpython-312.pyc +0 -0
- package/skills/tools/gen-docs/scripts/doc_generator.py +0 -520
- package/skills/tools/verify-change/scripts/__pycache__/change_analyzer.cpython-312.pyc +0 -0
- package/skills/tools/verify-change/scripts/change_analyzer.py +0 -529
- package/skills/tools/verify-module/scripts/__pycache__/module_scanner.cpython-312.pyc +0 -0
- package/skills/tools/verify-module/scripts/module_scanner.py +0 -321
- package/skills/tools/verify-quality/scripts/__pycache__/quality_checker.cpython-312.pyc +0 -0
- package/skills/tools/verify-quality/scripts/quality_checker.py +0 -481
- package/skills/tools/verify-security/scripts/__pycache__/security_scanner.cpython-312.pyc +0 -0
- package/skills/tools/verify-security/scripts/security_scanner.py +0 -374
@@ -1,791 +0,0 @@
---
name: stream-processing
description: Stream processing. Kafka Streams, Flink, real-time processing, streaming computation, window functions, state management. Use when the user mentions stream processing, Kafka Streams, Flink, real-time processing, or streaming computation.
---

# 🌊 Stream Processing Codex

## Stream Processing Architecture

```
Source → Ingest → Process → Aggregate → Output
  │        │         │          │          │
  └─ Kafka ─┴─ Transform ─┴─ Window ─┴─ Sink
```

## Kafka Streams Basics

### Building a Topology

```java
import org.apache.kafka.common.serialization.Serdes;
import org.apache.kafka.streams.KafkaStreams;
import org.apache.kafka.streams.StreamsBuilder;
import org.apache.kafka.streams.StreamsConfig;
import org.apache.kafka.streams.kstream.*;

import java.util.Properties;

public class StreamProcessor {
    public static void main(String[] args) {
        Properties props = new Properties();
        props.put(StreamsConfig.APPLICATION_ID_CONFIG, "stream-processor");
        props.put(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092");
        props.put(StreamsConfig.DEFAULT_KEY_SERDE_CLASS_CONFIG,
                  Serdes.String().getClass());
        props.put(StreamsConfig.DEFAULT_VALUE_SERDE_CLASS_CONFIG,
                  Serdes.String().getClass());

        StreamsBuilder builder = new StreamsBuilder();

        // Build the topology
        KStream<String, String> source = builder.stream("input-topic");

        KStream<String, String> processed = source
            .filter((key, value) -> value != null)
            .mapValues(value -> value.toUpperCase())
            .peek((key, value) ->
                System.out.println("Processed: " + key + " -> " + value));

        processed.to("output-topic");

        KafkaStreams streams = new KafkaStreams(builder.build(), props);
        streams.start();

        Runtime.getRuntime().addShutdownHook(new Thread(streams::close));
    }
}
```

### Stream Transformations

```java
// Map: transform each value
KStream<String, Integer> lengths = stream
    .mapValues(value -> value.length());

// FlatMap: expand one record into many
KStream<String, String> words = stream
    .flatMapValues(value -> Arrays.asList(value.split("\\s+")));

// Filter: keep matching records
KStream<String, String> filtered = stream
    .filter((key, value) -> value.length() > 10);

// Branch: split into sub-streams (deprecated in newer releases in favor of split())
KStream<String, String>[] branches = stream.branch(
    (key, value) -> value.startsWith("A"),
    (key, value) -> value.startsWith("B"),
    (key, value) -> true  // default branch
);

// Merge two streams
KStream<String, String> merged = stream1.merge(stream2);
```

### State Stores

```java
import org.apache.kafka.streams.KeyValue;
import org.apache.kafka.streams.kstream.Transformer;
import org.apache.kafka.streams.processor.ProcessorContext;
import org.apache.kafka.streams.state.KeyValueStore;
import org.apache.kafka.streams.state.StoreBuilder;
import org.apache.kafka.streams.state.Stores;

// Create a state store
StoreBuilder<KeyValueStore<String, Long>> storeBuilder =
    Stores.keyValueStoreBuilder(
        Stores.persistentKeyValueStore("counts-store"),
        Serdes.String(),
        Serdes.Long()
    );

builder.addStateStore(storeBuilder);

// Use the state store
stream.transform(() -> new Transformer<String, String, KeyValue<String, Long>>() {
    private KeyValueStore<String, Long> stateStore;

    @Override
    public void init(ProcessorContext context) {
        this.stateStore = context.getStateStore("counts-store");
    }

    @Override
    public KeyValue<String, Long> transform(String key, String value) {
        Long count = stateStore.get(key);
        if (count == null) count = 0L;
        count++;
        stateStore.put(key, count);
        return KeyValue.pair(key, count);
    }

    @Override
    public void close() {}
}, "counts-store");
```

### Aggregations

```java
// Count
KTable<String, Long> counts = stream
    .groupByKey()
    .count(Materialized.as("counts-store"));

// Aggregate: running average
KTable<String, Double> averages = stream
    .groupByKey()
    .aggregate(
        () -> new AggregateValue(0.0, 0L),  // initializer
        (key, value, aggregate) -> {
            aggregate.sum += Double.parseDouble(value);
            aggregate.count++;
            return aggregate;
        },
        Materialized.with(Serdes.String(), aggregateSerde)
    )
    .mapValues(agg -> agg.sum / agg.count);

// Reduce
KTable<String, String> reduced = stream
    .groupByKey()
    .reduce((value1, value2) -> value1 + "," + value2);
```
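The aggregate example above references an `AggregateValue` holder and an `aggregateSerde` that it never defines. A minimal sketch of what they might look like, assuming a mutable POJO and choosing Jackson-based JSON serialization as one arbitrary option:

```java
import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.kafka.common.serialization.Serde;
import org.apache.kafka.common.serialization.Serdes;

// Hypothetical accumulator assumed by the aggregate example
class AggregateValue {
    public double sum;
    public long count;

    public AggregateValue() {}  // required for JSON deserialization

    public AggregateValue(double sum, long count) {
        this.sum = sum;
        this.count = count;
    }
}

// One way to build the matching Serde: serialize the POJO as JSON
ObjectMapper mapper = new ObjectMapper();
Serde<AggregateValue> aggregateSerde = Serdes.serdeFrom(
    (topic, agg) -> {
        try { return mapper.writeValueAsBytes(agg); }
        catch (Exception e) { throw new RuntimeException(e); }
    },
    (topic, bytes) -> {
        try { return mapper.readValue(bytes, AggregateValue.class); }
        catch (Exception e) { throw new RuntimeException(e); }
    }
);
```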
### Joins

```java
// Stream-stream join (windowed)
KStream<String, String> joined = stream1.join(
    stream2,
    (value1, value2) -> value1 + "-" + value2,
    JoinWindows.ofTimeDifferenceWithNoGrace(Duration.ofMinutes(5)),
    StreamJoined.with(Serdes.String(), Serdes.String(), Serdes.String())
);

// Stream-table join
KStream<String, String> enriched = stream.join(
    table,
    (streamValue, tableValue) -> streamValue + "-" + tableValue
);

// Table-table join
KTable<String, String> tableJoined = table1.join(
    table2,
    (value1, value2) -> value1 + "-" + value2
);
```

## Flink DataStream API

### Basic Stream Processing

```java
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaProducer;

import java.util.Properties;

public class FlinkStreamProcessor {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env =
            StreamExecutionEnvironment.getExecutionEnvironment();

        Properties properties = new Properties();
        properties.setProperty("bootstrap.servers", "localhost:9092");

        // Read from Kafka
        DataStream<String> stream = env
            .addSource(new FlinkKafkaConsumer<>(
                "input-topic",
                new SimpleStringSchema(),
                properties
            ));

        // Transform
        DataStream<String> processed = stream
            .filter(value -> value != null)
            .map(new MapFunction<String, String>() {
                @Override
                public String map(String value) {
                    return value.toUpperCase();
                }
            });

        // Write to Kafka
        processed.addSink(new FlinkKafkaProducer<>(
            "output-topic",
            new SimpleStringSchema(),
            properties
        ));

        env.execute("Flink Stream Processor");
    }
}
```

### Window Functions

```java
import org.apache.flink.streaming.api.windowing.assigners.GlobalWindows;
import org.apache.flink.streaming.api.windowing.assigners.ProcessingTimeSessionWindows;
import org.apache.flink.streaming.api.windowing.assigners.SlidingProcessingTimeWindows;
import org.apache.flink.streaming.api.windowing.assigners.TumblingProcessingTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.triggers.CountTrigger;

// Tumbling window
DataStream<Tuple2<String, Long>> tumblingCounts = stream
    .keyBy(value -> value.getKey())
    .window(TumblingProcessingTimeWindows.of(Time.minutes(5)))
    .sum(1);

// Sliding window
DataStream<Tuple2<String, Long>> slidingCounts = stream
    .keyBy(value -> value.getKey())
    .window(SlidingProcessingTimeWindows.of(
        Time.minutes(10),  // window size
        Time.minutes(5)    // slide interval
    ))
    .sum(1);

// Session window
DataStream<Tuple2<String, Long>> sessionCounts = stream
    .keyBy(value -> value.getKey())
    .window(ProcessingTimeSessionWindows.withGap(Time.minutes(10)))
    .sum(1);

// Global window
DataStream<Tuple2<String, Long>> globalCounts = stream
    .keyBy(value -> value.getKey())
    .window(GlobalWindows.create())
    .trigger(CountTrigger.of(100))  // fire every 100 records
    .sum(1);
```

### Window Aggregation

```java
import org.apache.flink.api.common.functions.AggregateFunction;
import org.apache.flink.api.java.tuple.Tuple2;

// Incremental aggregation + full-window function
DataStream<String> result = stream
    .keyBy(value -> value.getKey())
    .window(TumblingProcessingTimeWindows.of(Time.minutes(5)))
    .aggregate(
        new AverageAggregate(),      // incremental pre-aggregation
        new WindowResultFunction()   // per-window post-processing
    );

// AverageAggregate implementation
class AverageAggregate implements AggregateFunction<
        Tuple2<String, Double>,
        Tuple2<Double, Long>,
        Double> {

    @Override
    public Tuple2<Double, Long> createAccumulator() {
        return new Tuple2<>(0.0, 0L);
    }

    @Override
    public Tuple2<Double, Long> add(
            Tuple2<String, Double> value,
            Tuple2<Double, Long> accumulator) {
        return new Tuple2<>(
            accumulator.f0 + value.f1,
            accumulator.f1 + 1L
        );
    }

    @Override
    public Double getResult(Tuple2<Double, Long> accumulator) {
        return accumulator.f0 / accumulator.f1;
    }

    @Override
    public Tuple2<Double, Long> merge(
            Tuple2<Double, Long> a,
            Tuple2<Double, Long> b) {
        return new Tuple2<>(a.f0 + b.f0, a.f1 + b.f1);
    }
}
```
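The `WindowResultFunction` referenced above is never defined in this document; a minimal version, assuming String keys and sketched as a `ProcessWindowFunction` that tags the pre-aggregated average with its key and window, could be:

```java
import org.apache.flink.streaming.api.functions.windowing.ProcessWindowFunction;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.util.Collector;

// Hypothetical definition of the WindowResultFunction used above
class WindowResultFunction
        extends ProcessWindowFunction<Double, String, String, TimeWindow> {
    @Override
    public void process(String key,
                        Context context,
                        Iterable<Double> averages,
                        Collector<String> out) {
        // With an upstream AggregateFunction, the iterable holds exactly
        // one element: the window's final aggregate.
        Double avg = averages.iterator().next();
        out.collect(key + " @ " + context.window().getEnd() + " avg=" + avg);
    }
}
```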
### ProcessFunction

```java
import org.apache.flink.streaming.api.functions.ProcessFunction;
import org.apache.flink.util.Collector;

// Low-level API with full control over elements, state, and timers
DataStream<String> processed = stream.process(
    new ProcessFunction<String, String>() {
        @Override
        public void processElement(
                String value,
                Context ctx,
                Collector<String> out) throws Exception {

            // Access the element's timestamp
            long timestamp = ctx.timestamp();

            // Register a timer one minute ahead
            ctx.timerService().registerProcessingTimeTimer(
                timestamp + 60000
            );

            // Emit a result
            out.collect(value.toUpperCase());
        }

        @Override
        public void onTimer(
                long timestamp,
                OnTimerContext ctx,
                Collector<String> out) throws Exception {
            // Timer fired
            out.collect("Timer fired at " + timestamp);
        }
    }
);
```

### State Management

```java
import org.apache.flink.api.common.functions.RichFlatMapFunction;
import org.apache.flink.api.common.functions.RichMapFunction;
import org.apache.flink.api.common.state.*;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.util.Collector;

// ValueState: a single value per key
class StatefulMapFunction extends RichMapFunction<String, String> {
    private transient ValueState<Long> countState;

    @Override
    public void open(Configuration parameters) {
        ValueStateDescriptor<Long> descriptor =
            new ValueStateDescriptor<>("count", Long.class, 0L);
        countState = getRuntimeContext().getState(descriptor);
    }

    @Override
    public String map(String value) throws Exception {
        Long count = countState.value();
        count++;
        countState.update(count);
        return value + " (count: " + count + ")";
    }
}

// ListState: a list per key
class ListStateFunction extends RichFlatMapFunction<String, String> {
    private transient ListState<String> listState;

    @Override
    public void open(Configuration parameters) {
        ListStateDescriptor<String> descriptor =
            new ListStateDescriptor<>("list", String.class);
        listState = getRuntimeContext().getListState(descriptor);
    }

    @Override
    public void flatMap(String value, Collector<String> out) throws Exception {
        listState.add(value);

        // Emit every value seen so far
        for (String item : listState.get()) {
            out.collect(item);
        }
    }
}

// MapState: a map per key
class MapStateFunction extends RichFlatMapFunction<
        Tuple2<String, String>, String> {

    private transient MapState<String, Long> mapState;

    @Override
    public void open(Configuration parameters) {
        MapStateDescriptor<String, Long> descriptor =
            new MapStateDescriptor<>("map", String.class, Long.class);
        mapState = getRuntimeContext().getMapState(descriptor);
    }

    @Override
    public void flatMap(
            Tuple2<String, String> value,
            Collector<String> out) throws Exception {

        String key = value.f1;
        Long count = mapState.get(key);
        if (count == null) count = 0L;
        count++;
        mapState.put(key, count);

        out.collect(key + ": " + count);
    }
}
```

### Checkpoints and Savepoints

```java
// Enable checkpointing
env.enableCheckpointing(60000); // every 60 seconds

// Checkpoint configuration
env.getCheckpointConfig().setCheckpointingMode(
    CheckpointingMode.EXACTLY_ONCE
);
env.getCheckpointConfig().setMinPauseBetweenCheckpoints(30000);
env.getCheckpointConfig().setCheckpointTimeout(600000);
env.getCheckpointConfig().setMaxConcurrentCheckpoints(1);

// Externalized checkpoints: keep checkpoint data when the job is cancelled
env.getCheckpointConfig().enableExternalizedCheckpoints(
    ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION
);

// Restore from a savepoint:
// flink run -s /path/to/savepoint your-job.jar
```
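The snippet leaves the state backend and checkpoint storage location implicit; both are worth setting explicitly for anything beyond local testing. A sketch for Flink 1.13+, with a placeholder storage path:

```java
import org.apache.flink.contrib.streaming.state.EmbeddedRocksDBStateBackend;

// Keep operator state in RocksDB, which spills to disk and suits large state
env.setStateBackend(new EmbeddedRocksDBStateBackend());

// Write checkpoint data to durable storage (the path is a placeholder)
env.getCheckpointConfig().setCheckpointStorage("s3://my-bucket/flink-checkpoints");
```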
## Window Type Comparison

| Window Type | Characteristics | Typical Use Cases |
|-------------|-----------------|-------------------|
| Tumbling window | Fixed size, no overlap | Hourly stats, daily reports |
| Sliding window | Fixed size, overlapping | Moving averages, trend analysis |
| Session window | Dynamic size, gap-based | User sessions, activity detection |
| Global window | No time bound | Custom trigger logic |
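The Flink window examples earlier all use processing-time assigners; each row of this table also has an event-time counterpart that respects watermarks. A sketch, assuming a keyed stream that already carries timestamps and watermarks:

```java
import org.apache.flink.streaming.api.windowing.assigners.EventTimeSessionWindows;
import org.apache.flink.streaming.api.windowing.assigners.SlidingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;

// Event-time counterparts of the processing-time assigners used above
stream.keyBy(value -> value.getKey())
      .window(TumblingEventTimeWindows.of(Time.minutes(5)))
      .sum(1);

stream.keyBy(value -> value.getKey())
      .window(SlidingEventTimeWindows.of(Time.minutes(10), Time.minutes(5)))
      .sum(1);

stream.keyBy(value -> value.getKey())
      .window(EventTimeSessionWindows.withGap(Time.minutes(10)))
      .sum(1);
```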
## Time Semantics

### Event Time vs. Processing Time

```java
// Event time (setStreamTimeCharacteristic is deprecated since Flink 1.12,
// where event time is already the default)
env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);

DataStream<Event> events = env
    .addSource(new EventSource())
    .assignTimestampsAndWatermarks(
        WatermarkStrategy
            .<Event>forBoundedOutOfOrderness(Duration.ofSeconds(10))
            .withTimestampAssigner((event, timestamp) -> event.getTimestamp())
    );

// Processing time
env.setStreamTimeCharacteristic(TimeCharacteristic.ProcessingTime);

DataStream<Event> stream = env.addSource(new EventSource());
stream
    .keyBy(Event::getKey)  // windows require a keyed stream
    .window(TumblingProcessingTimeWindows.of(Time.minutes(5)));
```

### Watermark Generation

```java
// Periodic watermark generator
class PeriodicWatermarkGenerator implements WatermarkGenerator<Event> {
    private long maxTimestamp = Long.MIN_VALUE;
    private final long maxOutOfOrderness = 5000;

    @Override
    public void onEvent(Event event, long eventTimestamp, WatermarkOutput output) {
        maxTimestamp = Math.max(maxTimestamp, eventTimestamp);
    }

    @Override
    public void onPeriodicEmit(WatermarkOutput output) {
        output.emitWatermark(new Watermark(maxTimestamp - maxOutOfOrderness));
    }
}

// Punctuated watermark generator
class PunctuatedWatermarkGenerator implements WatermarkGenerator<Event> {
    @Override
    public void onEvent(Event event, long eventTimestamp, WatermarkOutput output) {
        if (event.hasWatermarkMarker()) {
            output.emitWatermark(new Watermark(eventTimestamp));
        }
    }

    @Override
    public void onPeriodicEmit(WatermarkOutput output) {
        // nothing to emit periodically
    }
}
```
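The two generators above are defined but never wired into a job; they plug in through `WatermarkStrategy.forGenerator`. A sketch reusing the periodic generator:

```java
// Wire a custom generator into a WatermarkStrategy
WatermarkStrategy<Event> strategy =
    WatermarkStrategy.forGenerator(ctx -> new PeriodicWatermarkGenerator());

DataStream<Event> watermarked = env
    .addSource(new EventSource())
    .assignTimestampsAndWatermarks(
        strategy.withTimestampAssigner((event, ts) -> event.getTimestamp()));
```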
## Backpressure Handling

### Kafka Streams Backpressure

```java
// Consumer tuning
props.put(ConsumerConfig.MAX_POLL_RECORDS_CONFIG, 500);
props.put(ConsumerConfig.FETCH_MIN_BYTES_CONFIG, 1024);
props.put(ConsumerConfig.FETCH_MAX_WAIT_MS_CONFIG, 500);

// Producer tuning
props.put(ProducerConfig.BUFFER_MEMORY_CONFIG, 33554432);
props.put(ProducerConfig.BATCH_SIZE_CONFIG, 16384);
props.put(ProducerConfig.LINGER_MS_CONFIG, 10);
```

### Flink Backpressure Monitoring

```java
// Buffer flush timeout
env.setBufferTimeout(100);

// Monitor backpressure:
// Web UI -> Job -> BackPressure

// Adjust operator parallelism
stream.map(new MyMapFunction()).setParallelism(4);
```

## Fault Tolerance

### Exactly-Once Semantics

```java
// Kafka Streams exactly-once
props.put(StreamsConfig.PROCESSING_GUARANTEE_CONFIG,
          StreamsConfig.EXACTLY_ONCE_V2);

// Flink exactly-once
env.enableCheckpointing(60000);
env.getCheckpointConfig().setCheckpointingMode(
    CheckpointingMode.EXACTLY_ONCE
);

// Exactly-once Kafka sink
FlinkKafkaProducer<String> producer = new FlinkKafkaProducer<>(
    "output-topic",
    new SimpleStringSchema(),
    properties,
    FlinkKafkaProducer.Semantic.EXACTLY_ONCE
);
```

### Failure Recovery

```java
// Kafka Streams recovers automatically:
// state stores are rebuilt from their changelog topics

// Flink restarts automatically from the most recent checkpoint

// Manual restore from a savepoint:
// flink run -s /path/to/savepoint your-job.jar
```
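How aggressively Flink retries after a failure is also configurable; a minimal sketch using the classic restart-strategy API (newer releases prefer setting this via configuration):

```java
import org.apache.flink.api.common.restartstrategy.RestartStrategies;
import org.apache.flink.api.common.time.Time;

import java.util.concurrent.TimeUnit;

// Retry the job up to 3 times, waiting 10 seconds between attempts
env.setRestartStrategy(RestartStrategies.fixedDelayRestart(
    3,                             // max restart attempts
    Time.of(10, TimeUnit.SECONDS)  // delay between attempts
));
```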
## Performance Optimization

### Kafka Streams Tuning

```java
// Increase parallelism
props.put(StreamsConfig.NUM_STREAM_THREADS_CONFIG, 4);

// Tune record caching and commit interval
props.put(StreamsConfig.CACHE_MAX_BYTES_BUFFERING_CONFIG, 10 * 1024 * 1024);
props.put(StreamsConfig.COMMIT_INTERVAL_MS_CONFIG, 1000);

// Custom RocksDB configuration
props.put(StreamsConfig.ROCKSDB_CONFIG_SETTER_CLASS_CONFIG,
          CustomRocksDBConfig.class);
```

### Flink Tuning

```java
// Adjust parallelism
env.setParallelism(8);

// Enable object reuse to cut serialization overhead
env.getConfig().enableObjectReuse();

// TaskManager memory and network buffers are cluster-level settings,
// configured in flink-conf.yaml rather than on the environment, e.g.:
// taskmanager.memory.process.size: 2048m
// taskmanager.memory.network.min: 64mb
// taskmanager.memory.network.max: 64mb
```

## Python API

### Kafka Consumer/Producer in Python (kafka-python)

```python
from kafka import KafkaConsumer, KafkaProducer
import json

consumer = KafkaConsumer(
    'input-topic',
    bootstrap_servers=['localhost:9092'],
    value_deserializer=lambda m: json.loads(m.decode('utf-8'))
)

producer = KafkaProducer(
    bootstrap_servers=['localhost:9092'],
    value_serializer=lambda v: json.dumps(v).encode('utf-8')
)

for message in consumer:
    value = message.value

    # Processing logic
    processed = {
        'key': value['key'],
        'value': value['value'].upper()
    }

    producer.send('output-topic', processed)
    producer.flush()
```

### PyFlink

```python
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.datastream.connectors import FlinkKafkaConsumer, FlinkKafkaProducer
from pyflink.common.serialization import SimpleStringSchema

env = StreamExecutionEnvironment.get_execution_environment()

# Read from Kafka
kafka_consumer = FlinkKafkaConsumer(
    topics='input-topic',
    deserialization_schema=SimpleStringSchema(),
    properties={'bootstrap.servers': 'localhost:9092'}
)

stream = env.add_source(kafka_consumer)

# Transform
processed = stream \
    .filter(lambda x: x is not None) \
    .map(lambda x: x.upper())

# Write to Kafka
kafka_producer = FlinkKafkaProducer(
    topic='output-topic',
    serialization_schema=SimpleStringSchema(),
    producer_config={'bootstrap.servers': 'localhost:9092'}
)

processed.add_sink(kafka_producer)

env.execute("PyFlink Stream Processor")
```
## Monitoring Metrics

### Kafka Streams Metrics

```java
// JMX metrics
// kafka.streams:type=stream-metrics,client-id=*
// - commit-latency-avg
// - poll-latency-avg
// - process-latency-avg

// Custom metric (inside a Processor, via the ProcessorContext)
StreamsMetrics metrics = context.metrics();
Sensor sensor = metrics.addLatencyRateTotalSensor(
    "scope", "my-processor", "process",
    Sensor.RecordingLevel.INFO
);
```

### Flink Metrics

```java
// Register a custom counter
public class MyMapFunction extends RichMapFunction<String, String> {
    private transient Counter counter;

    @Override
    public void open(Configuration parameters) {
        this.counter = getRuntimeContext()
            .getMetricGroup()
            .counter("myCounter");
    }

    @Override
    public String map(String value) {
        counter.inc();
        return value.toUpperCase();
    }
}
```

## Best Practices

### Keeping State Bounded

```java
// Use TTL to expire stale state
StateTtlConfig ttlConfig = StateTtlConfig
    .newBuilder(Time.hours(24))
    .setUpdateType(StateTtlConfig.UpdateType.OnCreateAndWrite)
    .setStateVisibility(StateTtlConfig.StateVisibility.NeverReturnExpired)
    .build();

ValueStateDescriptor<Long> descriptor =
    new ValueStateDescriptor<>("count", Long.class);
descriptor.enableTimeToLive(ttlConfig);
```

### Handling Data Skew

```java
// Stage 1: add a random prefix to spread a hot key across subtasks,
// producing partial counts per prefixed key (recombined in the sketch below)
DataStream<Tuple2<String, Long>> prefixedCounts = stream
    .map(value -> {
        String randomPrefix = String.valueOf(new Random().nextInt(10));
        return Tuple2.of(randomPrefix + "-" + value.getKey(), 1L);
    })
    .returns(Types.TUPLE(Types.STRING, Types.LONG))  // type hint for the lambda
    .keyBy(tuple -> tuple.f0)
    .window(TumblingProcessingTimeWindows.of(Time.minutes(5)))
    .sum(1);
```
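The random prefix is only half of the pattern: the per-prefix partial counts must be re-keyed by the original key and combined. A sketch continuing from `prefixedCounts` above:

```java
// Stage 2: strip the random prefix and combine the partial counts
DataStream<Tuple2<String, Long>> totals = prefixedCounts
    .map(t -> Tuple2.of(t.f0.substring(t.f0.indexOf('-') + 1), t.f1))
    .returns(Types.TUPLE(Types.STRING, Types.LONG))
    .keyBy(t -> t.f0)
    .window(TumblingProcessingTimeWindows.of(Time.minutes(5)))
    .sum(1);
```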
### Handling Late Data

```java
// Route late records to a side output
OutputTag<Event> lateOutputTag = new OutputTag<Event>("late-data"){};

SingleOutputStreamOperator<Event> result = stream
    .keyBy(Event::getKey)
    .window(TumblingEventTimeWindows.of(Time.minutes(5)))
    .allowedLateness(Time.minutes(1))
    .sideOutputLateData(lateOutputTag)
    .sum("value");

DataStream<Event> lateStream = result.getSideOutput(lateOutputTag);
```

## Framework Comparison

| Feature | Kafka Streams | Flink | Spark Streaming |
|---------|---------------|-------|-----------------|
| Deployment model | Embedded library | Standalone cluster | Standalone cluster |
| State management | RocksDB | Memory / RocksDB | Memory |
| Exactly-once | ✅ | ✅ | ✅ |
| Window types | Rich | Richest | Basic |
| Learning curve | Gentle | Steep | Moderate |
| Ecosystem integration | Kafka ecosystem | Broad | Spark ecosystem |

## Tool Checklist

| Tool | Purpose | Recommended For |
|------|---------|-----------------|
| Kafka Streams | Lightweight stream processing | Kafka ecosystem, simple transformations |
| Apache Flink | Distributed stream processing | Complex windows, state management |
| Spark Streaming | Unified batch and streaming | Spark ecosystem, mixed batch/stream workloads |
| Apache Storm | Real-time computation | Low latency, simple topologies |
| Apache Samza | Stream processing from LinkedIn | Kafka + YARN |
| Pulsar Functions | Pulsar-native stream processing | Pulsar ecosystem |

## Trigger Words

stream processing, Kafka Streams, Flink, real-time processing, streaming computation, window functions, state management, Checkpoint, Watermark, backpressure