ethan-skill 1.11.0 → 1.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli/index.js +1019 -1
- package/dist/cli/index.js.map +1 -1
- package/dist/mcp/server.d.ts.map +1 -1
- package/dist/mcp/server.js +206 -1
- package/dist/mcp/server.js.map +1 -1
- package/dist/skills/27-tech-debt.d.ts +3 -0
- package/dist/skills/27-tech-debt.d.ts.map +1 -0
- package/dist/skills/27-tech-debt.js +149 -0
- package/dist/skills/27-tech-debt.js.map +1 -0
- package/dist/skills/28-api-mock.d.ts +3 -0
- package/dist/skills/28-api-mock.d.ts.map +1 -0
- package/dist/skills/28-api-mock.js +272 -0
- package/dist/skills/28-api-mock.js.map +1 -0
- package/dist/skills/29-data-migration.d.ts +3 -0
- package/dist/skills/29-data-migration.d.ts.map +1 -0
- package/dist/skills/29-data-migration.js +331 -0
- package/dist/skills/29-data-migration.js.map +1 -0
- package/dist/skills/30-llm-feature.d.ts +3 -0
- package/dist/skills/30-llm-feature.d.ts.map +1 -0
- package/dist/skills/30-llm-feature.js +328 -0
- package/dist/skills/30-llm-feature.js.map +1 -0
- package/dist/skills/31-threat-model.d.ts +3 -0
- package/dist/skills/31-threat-model.d.ts.map +1 -0
- package/dist/skills/31-threat-model.js +240 -0
- package/dist/skills/31-threat-model.js.map +1 -0
- package/dist/skills/32-green-code.d.ts +3 -0
- package/dist/skills/32-green-code.d.ts.map +1 -0
- package/dist/skills/32-green-code.js +346 -0
- package/dist/skills/32-green-code.js.map +1 -0
- package/dist/skills/33-service-catalog.d.ts +3 -0
- package/dist/skills/33-service-catalog.d.ts.map +1 -0
- package/dist/skills/33-service-catalog.js +334 -0
- package/dist/skills/33-service-catalog.js.map +1 -0
- package/dist/skills/34-mobile-review.d.ts +3 -0
- package/dist/skills/34-mobile-review.d.ts.map +1 -0
- package/dist/skills/34-mobile-review.js +390 -0
- package/dist/skills/34-mobile-review.js.map +1 -0
- package/dist/skills/35-data-pipeline.d.ts +3 -0
- package/dist/skills/35-data-pipeline.d.ts.map +1 -0
- package/dist/skills/35-data-pipeline.js +392 -0
- package/dist/skills/35-data-pipeline.js.map +1 -0
- package/dist/skills/36-ml-experiment.d.ts +3 -0
- package/dist/skills/36-ml-experiment.d.ts.map +1 -0
- package/dist/skills/36-ml-experiment.js +415 -0
- package/dist/skills/36-ml-experiment.js.map +1 -0
- package/dist/skills/index.d.ts +10 -0
- package/dist/skills/index.d.ts.map +1 -1
- package/dist/skills/index.js +41 -1
- package/dist/skills/index.js.map +1 -1
- package/dist/skills/pipeline.d.ts.map +1 -1
- package/dist/skills/pipeline.js +35 -0
- package/dist/skills/pipeline.js.map +1 -1
- package/dist/skills/skills.test.js +3 -3
- package/dist/skills/skills.test.js.map +1 -1
- package/package.json +1 -1
- package/rules/claude-code/CLAUDE.md +2963 -3
- package/rules/cline/.clinerules +2805 -2
- package/rules/codebuddy/CODEBUDDY.md +2913 -2
- package/rules/continue/.continuerules +2805 -2
- package/rules/copilot/copilot-instructions.md +2883 -2
- package/rules/cursor/.cursorrules +2952 -2
- package/rules/cursor/smart-flow.mdc +2952 -2
- package/rules/jetbrains/smart-flow.md +2883 -2
- package/rules/lingma/smart-flow.md +2904 -3
- package/rules/windsurf/.windsurf/rules/smart-flow.md +2884 -3
- package/rules/zed/smart-flow.rules +2794 -1
|
@@ -0,0 +1,392 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.dataPipelineSkill = void 0;
|
|
4
|
+
exports.dataPipelineSkill = {
|
|
5
|
+
id: 'data-pipeline',
|
|
6
|
+
name: '数据管道设计',
|
|
7
|
+
nameEn: 'data_pipeline',
|
|
8
|
+
order: 35,
|
|
9
|
+
description: '设计 Batch/Streaming 数据管道,制定数据质量规则、容错策略与血缘追踪方案',
|
|
10
|
+
descriptionEn: 'Design Batch/Streaming data pipelines with data quality rules, fault tolerance strategies and lineage tracking',
|
|
11
|
+
detailDescription: `数据管道是数据驱动业务的基础设施,但设计不当会导致数据丢失、重复和质量下降。
|
|
12
|
+
本 Skill 覆盖数据管道设计全流程:从 Batch/Streaming/Lambda/Kappa 架构选型,
|
|
13
|
+
到数据质量规则(完整性/准确性/一致性/及时性),再到幂等容错、死信队列和断点续传设计,
|
|
14
|
+
最终建立数据血缘(Data Lineage)和可观测性体系,让数据流动可靠、可追溯。`,
|
|
15
|
+
triggers: [
|
|
16
|
+
'数据管道',
|
|
17
|
+
'data pipeline',
|
|
18
|
+
'etl',
|
|
19
|
+
'elt',
|
|
20
|
+
'data engineering',
|
|
21
|
+
'数据工程',
|
|
22
|
+
'airflow',
|
|
23
|
+
'kafka pipeline',
|
|
24
|
+
'streaming pipeline',
|
|
25
|
+
'@ethan data-pipeline',
|
|
26
|
+
'/data-pipeline',
|
|
27
|
+
],
|
|
28
|
+
steps: [
|
|
29
|
+
{
|
|
30
|
+
title: '1. 架构选型',
|
|
31
|
+
content: `根据数据特征和业务需求选择合适的管道架构:
|
|
32
|
+
|
|
33
|
+
**四大架构模式决策矩阵**
|
|
34
|
+
|
|
35
|
+
| 架构 | 延迟 | 复杂度 | 适用场景 | 代表工具 |
|
|
36
|
+
|------|------|--------|---------|---------|
|
|
37
|
+
| **Batch(批处理)** | 小时~天 | 低 | 日报、数据仓库 ETL、离线训练 | Airflow + Spark |
|
|
38
|
+
| **Streaming(流处理)** | 秒~毫秒 | 高 | 实时监控、欺诈检测、事件驱动 | Kafka + Flink |
|
|
39
|
+
| **Lambda(λ)** | 两套 | 极高 | 需要批量精确性+流式时效性 | Kafka + Spark + Hive |
|
|
40
|
+
| **Kappa(κ)** | 秒 | 中 | 一切皆流,重播历史数据 | Kafka + Flink |
|
|
41
|
+
|
|
42
|
+
**决策树**
|
|
43
|
+
\`\`\`
|
|
44
|
+
数据新鲜度要求 < 1分钟?
|
|
45
|
+
├── YES → Streaming(Kafka + Flink)
|
|
46
|
+
└── NO
|
|
47
|
+
└── 需要精确的历史重算?
|
|
48
|
+
├── YES + 实时结果 → Lambda 架构
|
|
49
|
+
├── YES + 可接受重播 → Kappa 架构
|
|
50
|
+
└── NO → Batch(Airflow + Spark/dbt)
|
|
51
|
+
\`\`\`
|
|
52
|
+
|
|
53
|
+
**Source → Transform → Sink 数据流图**
|
|
54
|
+
\`\`\`
|
|
55
|
+
数据源(Sources):
|
|
56
|
+
MySQL / PostgreSQL(CDC)
|
|
57
|
+
Kafka Topics
|
|
58
|
+
REST API / Webhook
|
|
59
|
+
文件系统(S3/GCS)
|
|
60
|
+
│
|
|
61
|
+
▼
|
|
62
|
+
变换层(Transform):
|
|
63
|
+
数据清洗(去重/标准化/脱敏)
|
|
64
|
+
业务逻辑计算
|
|
65
|
+
聚合/Join
|
|
66
|
+
│
|
|
67
|
+
▼
|
|
68
|
+
目标层(Sinks):
|
|
69
|
+
数据仓库(BigQuery/Snowflake/Redshift)
|
|
70
|
+
数据湖(S3/GCS Parquet)
|
|
71
|
+
搜索引擎(Elasticsearch)
|
|
72
|
+
特征存储(Feature Store)
|
|
73
|
+
\`\`\`
|
|
74
|
+
|
|
75
|
+
**技术栈推荐**
|
|
76
|
+
\`\`\`
|
|
77
|
+
小团队/初创:
|
|
78
|
+
Airflow + dbt + BigQuery(低运维成本)
|
|
79
|
+
|
|
80
|
+
中大型团队:
|
|
81
|
+
Kafka + Flink + Iceberg + Trino(高吞吐实时)
|
|
82
|
+
|
|
83
|
+
全托管方案:
|
|
84
|
+
Fivetran(Ingestion)+ dbt Cloud(Transform)+ Snowflake(Warehouse)
|
|
85
|
+
\`\`\`
|
|
86
|
+
|
|
87
|
+
**输出**:架构选型报告 + 数据流图`,
|
|
88
|
+
},
|
|
89
|
+
{
|
|
90
|
+
title: '2. 数据质量规则',
|
|
91
|
+
content: `定义并实施数据质量检查规则:
|
|
92
|
+
|
|
93
|
+
**数据质量四维度**
|
|
94
|
+
|
|
95
|
+
| 维度 | 定义 | 检查方式 | 阈值示例 |
|
|
96
|
+
|------|------|---------|---------|
|
|
97
|
+
| **完整性** | 必填字段不为空 | NULL 率检查 | < 0.1% |
|
|
98
|
+
| **准确性** | 值在合理范围内 | 范围/格式校验 | 异常值率 < 1% |
|
|
99
|
+
| **一致性** | 跨系统数据一致 | 对账/交叉校验 | 差异率 < 0.01% |
|
|
100
|
+
| **及时性** | 数据按时到达 | 延迟监控 | 最大延迟 < 2h |
|
|
101
|
+
|
|
102
|
+
**dbt 数据质量测试(推荐)**
|
|
103
|
+
\`\`\`yaml
|
|
104
|
+
# models/orders.yml
|
|
105
|
+
version: 2
|
|
106
|
+
models:
|
|
107
|
+
- name: orders
|
|
108
|
+
columns:
|
|
109
|
+
- name: order_id
|
|
110
|
+
tests:
|
|
111
|
+
- not_null
|
|
112
|
+
- unique
|
|
113
|
+
- name: user_id
|
|
114
|
+
tests:
|
|
115
|
+
- not_null
|
|
116
|
+
- relationships:
|
|
117
|
+
to: ref('users')
|
|
118
|
+
field: id
|
|
119
|
+
- name: status
|
|
120
|
+
tests:
|
|
121
|
+
- accepted_values:
|
|
122
|
+
values: ['pending', 'paid', 'shipped', 'completed', 'cancelled']
|
|
123
|
+
- name: amount
|
|
124
|
+
tests:
|
|
125
|
+
- not_null
|
|
126
|
+
- dbt_expectations.expect_column_values_to_be_between:
|
|
127
|
+
min_value: 0
|
|
128
|
+
max_value: 1000000
|
|
129
|
+
\`\`\`
|
|
130
|
+
|
|
131
|
+
**Great Expectations 数据期望(Python)**
|
|
132
|
+
\`\`\`python
|
|
133
|
+
import great_expectations as gx
|
|
134
|
+
|
|
135
|
+
context = gx.get_context()
|
|
136
|
+
suite = context.add_expectation_suite("orders_suite")
|
|
137
|
+
|
|
138
|
+
# 定义数据期望
|
|
139
|
+
suite.add_expectation(
|
|
140
|
+
gx.expectations.ExpectColumnValuesToNotBeNull(column="order_id")
|
|
141
|
+
)
|
|
142
|
+
suite.add_expectation(
|
|
143
|
+
gx.expectations.ExpectColumnValuesToBeBetween(
|
|
144
|
+
column="amount", min_value=0, max_value=1_000_000
|
|
145
|
+
)
|
|
146
|
+
)
|
|
147
|
+
suite.add_expectation(
|
|
148
|
+
gx.expectations.ExpectTableRowCountToBeBetween(
|
|
149
|
+
min_value=1000, # 每日订单不少于1000
|
|
150
|
+
max_value=1_000_000
|
|
151
|
+
)
|
|
152
|
+
)
|
|
153
|
+
|
|
154
|
+
# 运行验证
|
|
155
|
+
result = context.run_checkpoint("daily_orders_checkpoint")
|
|
156
|
+
if not result.success:
|
|
157
|
+
raise DataQualityError("数据质量检查未通过!")
|
|
158
|
+
\`\`\`
|
|
159
|
+
|
|
160
|
+
**数据质量告警**
|
|
161
|
+
\`\`\`yaml
|
|
162
|
+
# 数据质量 SLA
|
|
163
|
+
- NULL 率超过 1%:🔴 阻断管道,Slack 告警
|
|
164
|
+
- 行数下降 > 20%(对比昨日):🟠 告警,人工核查
|
|
165
|
+
- P99 延迟 > 2 小时:🟡 警告,关注
|
|
166
|
+
\`\`\`
|
|
167
|
+
|
|
168
|
+
**输出**:数据质量规则文档 + dbt/GE 测试配置`,
|
|
169
|
+
},
|
|
170
|
+
{
|
|
171
|
+
title: '3. 管道设计与代码模板',
|
|
172
|
+
content: `生成可复用的管道代码模板:
|
|
173
|
+
|
|
174
|
+
**Airflow DAG 模板(Batch)**
|
|
175
|
+
\`\`\`python
|
|
176
|
+
# dags/daily_orders_etl.py
|
|
177
|
+
from airflow import DAG
|
|
178
|
+
from airflow.operators.python import PythonOperator
|
|
179
|
+
from airflow.providers.postgres.hooks.postgres import PostgresHook
|
|
180
|
+
from datetime import datetime, timedelta
|
|
181
|
+
|
|
182
|
+
default_args = {
|
|
183
|
+
'owner': 'data-team',
|
|
184
|
+
'retries': 3,
|
|
185
|
+
'retry_delay': timedelta(minutes=5),
|
|
186
|
+
'on_failure_callback': alert_slack,
|
|
187
|
+
}
|
|
188
|
+
|
|
189
|
+
with DAG(
|
|
190
|
+
'daily_orders_etl',
|
|
191
|
+
default_args=default_args,
|
|
192
|
+
schedule_interval='0 2 * * *', # 每日凌晨2点
|
|
193
|
+
start_date=datetime(2024, 1, 1),
|
|
194
|
+
catchup=False, # 不回填历史
|
|
195
|
+
tags=['orders', 'daily'],
|
|
196
|
+
) as dag:
|
|
197
|
+
|
|
198
|
+
extract = PythonOperator(task_id='extract', python_callable=extract_orders)
|
|
199
|
+
validate = PythonOperator(task_id='validate', python_callable=run_quality_checks)
|
|
200
|
+
transform = PythonOperator(task_id='transform', python_callable=transform_orders)
|
|
201
|
+
load = PythonOperator(task_id='load', python_callable=load_to_warehouse)
|
|
202
|
+
notify = PythonOperator(task_id='notify', python_callable=send_completion_report)
|
|
203
|
+
|
|
204
|
+
extract >> validate >> transform >> load >> notify
|
|
205
|
+
\`\`\`
|
|
206
|
+
|
|
207
|
+
**Kafka + Flink 流处理模板**
|
|
208
|
+
\`\`\`python
|
|
209
|
+
# Flink Python API
|
|
210
|
+
from pyflink.datastream import StreamExecutionEnvironment
|
|
211
|
+
from pyflink.datastream.connectors.kafka import KafkaSource, KafkaSink
|
|
212
|
+
|
|
213
|
+
env = StreamExecutionEnvironment.get_execution_environment()
|
|
214
|
+
env.set_parallelism(4)
|
|
215
|
+
|
|
216
|
+
# Source
|
|
217
|
+
kafka_source = KafkaSource.builder() \\
|
|
218
|
+
.set_bootstrap_servers("kafka:9092") \\
|
|
219
|
+
.set_topics("orders") \\
|
|
220
|
+
.set_group_id("flink-order-processor") \\
|
|
221
|
+
.set_value_only_deserializer(JsonRowDeserializationSchema()) \\
|
|
222
|
+
.build()
|
|
223
|
+
|
|
224
|
+
stream = env.from_source(kafka_source, WatermarkStrategy.no_watermarks(), "Kafka Source")
|
|
225
|
+
|
|
226
|
+
# Transform(滚动窗口:每分钟统计)
|
|
227
|
+
result = stream \\
|
|
228
|
+
.key_by(lambda x: x['region']) \\
|
|
229
|
+
.window(TumblingEventTimeWindows.of(Time.minutes(1))) \\
|
|
230
|
+
.aggregate(OrderAggregateFunction())
|
|
231
|
+
|
|
232
|
+
# Sink
|
|
233
|
+
result.sink_to(KafkaSink.builder()
|
|
234
|
+
.set_bootstrap_servers("kafka:9092")
|
|
235
|
+
.set_record_serializer(JsonRowSerializationSchema())
|
|
236
|
+
.build())
|
|
237
|
+
|
|
238
|
+
env.execute("Order Stream Processing")
|
|
239
|
+
\`\`\`
|
|
240
|
+
|
|
241
|
+
**输出**:Airflow DAG 模板 + Flink/Spark 流处理模板`,
|
|
242
|
+
},
|
|
243
|
+
{
|
|
244
|
+
title: '4. 容错与重试策略',
|
|
245
|
+
content: `确保管道在故障时可靠恢复:
|
|
246
|
+
|
|
247
|
+
**核心容错原则**
|
|
248
|
+
|
|
249
|
+
**① 幂等性设计(最重要)**
|
|
250
|
+
\`\`\`python
|
|
251
|
+
def load_orders_idempotent(batch_date: str, orders: list):
|
|
252
|
+
"""幂等加载:同一批次多次执行结果相同"""
|
|
253
|
+
# 使用 INSERT ... ON CONFLICT DO UPDATE
|
|
254
|
+
# 或 MERGE(UPSERT)语义
|
|
255
|
+
pg_hook.run("""
|
|
256
|
+
INSERT INTO orders_dw (order_id, batch_date, amount, status)
|
|
257
|
+
VALUES %s
|
|
258
|
+
ON CONFLICT (order_id) DO UPDATE SET
|
|
259
|
+
amount = EXCLUDED.amount,
|
|
260
|
+
status = EXCLUDED.status,
|
|
261
|
+
updated_at = NOW()
|
|
262
|
+
""", [(o['id'], batch_date, o['amount'], o['status']) for o in orders])
|
|
263
|
+
\`\`\`
|
|
264
|
+
|
|
265
|
+
**② 死信队列(DLQ)**
|
|
266
|
+
\`\`\`python
|
|
267
|
+
# Kafka — 处理失败消息路由到 DLQ
|
|
268
|
+
def process_message(msg):
|
|
269
|
+
try:
|
|
270
|
+
transform_and_load(msg)
|
|
271
|
+
except Exception as e:
|
|
272
|
+
# 路由到死信队列,保留原始消息 + 错误信息
|
|
273
|
+
dlq_producer.send('orders-dlq', {
|
|
274
|
+
'original_message': msg,
|
|
275
|
+
'error': str(e),
|
|
276
|
+
'failed_at': datetime.utcnow().isoformat(),
|
|
277
|
+
'retry_count': msg.get('retry_count', 0) + 1
|
|
278
|
+
})
|
|
279
|
+
logger.error(f"Message sent to DLQ: {e}")
|
|
280
|
+
\`\`\`
|
|
281
|
+
|
|
282
|
+
**③ 断点续传(Checkpoint)**
|
|
283
|
+
\`\`\`python
|
|
284
|
+
def process_large_dataset():
|
|
285
|
+
checkpoint_file = '.etl_checkpoint'
|
|
286
|
+
last_id = load_checkpoint(checkpoint_file)
|
|
287
|
+
|
|
288
|
+
for batch in fetch_in_batches(after_id=last_id, batch_size=1000):
|
|
289
|
+
process_batch(batch)
|
|
290
|
+
save_checkpoint(checkpoint_file, batch[-1]['id']) # 每批保存进度
|
|
291
|
+
\`\`\`
|
|
292
|
+
|
|
293
|
+
**④ 指数退避重试**
|
|
294
|
+
\`\`\`python
|
|
295
|
+
import tenacity
|
|
296
|
+
|
|
297
|
+
@tenacity.retry(
|
|
298
|
+
wait=tenacity.wait_exponential(multiplier=1, min=4, max=60),
|
|
299
|
+
stop=tenacity.stop_after_attempt(5),
|
|
300
|
+
retry=tenacity.retry_if_exception_type(TransientError),
|
|
301
|
+
before_sleep=tenacity.before_sleep_log(logger, logging.WARNING)
|
|
302
|
+
)
|
|
303
|
+
def call_external_api(data):
|
|
304
|
+
return requests.post(API_URL, json=data, timeout=30)
|
|
305
|
+
\`\`\`
|
|
306
|
+
|
|
307
|
+
**输出**:容错策略文档 + 幂等加载模板 + DLQ 配置`,
|
|
308
|
+
},
|
|
309
|
+
{
|
|
310
|
+
title: '5. 数据血缘与可观测性',
|
|
311
|
+
content: `建立数据血缘追踪和管道监控体系:
|
|
312
|
+
|
|
313
|
+
**数据血缘(Data Lineage)**
|
|
314
|
+
\`\`\`python
|
|
315
|
+
# OpenLineage 标准(Marquez/Atlan/DataHub 支持)
|
|
316
|
+
from openlineage.client import OpenLineageClient
|
|
317
|
+
from openlineage.client.run import RunEvent, Job, Run, Dataset
|
|
318
|
+
|
|
319
|
+
client = OpenLineageClient.from_environment()
|
|
320
|
+
|
|
321
|
+
# 记录数据流转关系
|
|
322
|
+
client.emit(RunEvent(
|
|
323
|
+
eventType="COMPLETE",
|
|
324
|
+
job=Job(namespace="etl", name="daily_orders_transform"),
|
|
325
|
+
run=Run(runId=str(uuid4())),
|
|
326
|
+
inputs=[Dataset(namespace="postgres", name="raw_orders")],
|
|
327
|
+
outputs=[Dataset(namespace="bigquery", name="orders_dw.fact_orders")]
|
|
328
|
+
))
|
|
329
|
+
\`\`\`
|
|
330
|
+
|
|
331
|
+
**管道健康仪表盘指标**
|
|
332
|
+
\`\`\`
|
|
333
|
+
关键指标(Grafana 面板):
|
|
334
|
+
─────────────────────────────
|
|
335
|
+
延迟指标:
|
|
336
|
+
- 数据新鲜度(最新记录时间戳)
|
|
337
|
+
- Pipeline P95 执行时长
|
|
338
|
+
- Kafka Consumer Lag
|
|
339
|
+
|
|
340
|
+
质量指标:
|
|
341
|
+
- 每日 NULL 率趋势
|
|
342
|
+
- 行数异常检测(±30% 告警)
|
|
343
|
+
- 数据质量测试通过率
|
|
344
|
+
|
|
345
|
+
运维指标:
|
|
346
|
+
- 管道成功率(目标 > 99%)
|
|
347
|
+
- 重试次数分布
|
|
348
|
+
- DLQ 消息积压
|
|
349
|
+
\`\`\`
|
|
350
|
+
|
|
351
|
+
**Airflow SLA 监控**
|
|
352
|
+
\`\`\`python
|
|
353
|
+
with DAG(
|
|
354
|
+
'daily_orders_etl',
|
|
355
|
+
sla_miss_callback=sla_miss_alert, # SLA 超时回调
|
|
356
|
+
...
|
|
357
|
+
) as dag:
|
|
358
|
+
load_task = PythonOperator(
|
|
359
|
+
task_id='load',
|
|
360
|
+
python_callable=load_to_warehouse,
|
|
361
|
+
sla=timedelta(hours=4), # 必须在4小时内完成
|
|
362
|
+
)
|
|
363
|
+
\`\`\`
|
|
364
|
+
|
|
365
|
+
**数据目录集成**
|
|
366
|
+
\`\`\`yaml
|
|
367
|
+
# dbt docs(自动生成)
|
|
368
|
+
# 运行后访问:dbt docs serve
|
|
369
|
+
# 包含:模型血缘图、列描述、测试结果
|
|
370
|
+
|
|
371
|
+
# 推荐工具
|
|
372
|
+
生产级:DataHub, Atlan, Alation
|
|
373
|
+
开源轻量:Marquez, OpenMetadata
|
|
374
|
+
dbt 生态:dbt docs + Elementary
|
|
375
|
+
\`\`\`
|
|
376
|
+
|
|
377
|
+
**输出**:数据血缘配置 + 监控仪表盘指标定义 + 数据目录方案`,
|
|
378
|
+
},
|
|
379
|
+
],
|
|
380
|
+
outputFormat: '架构选型文档 + 数据流图 + 质量规则配置(dbt/GE)+ 管道代码模板 + 容错策略 + 血缘追踪方案',
|
|
381
|
+
examples: [],
|
|
382
|
+
notes: [
|
|
383
|
+
'幂等性是数据管道的第一原则——任何任务必须可以安全重试而不产生重复数据',
|
|
384
|
+
'从 Batch 开始,在有真实需求时再迁移到 Streaming,避免过早的复杂度',
|
|
385
|
+
'数据质量检查应与管道强耦合(而非事后补救),质量不达标应阻断下游加载',
|
|
386
|
+
'死信队列不是垃圾桶——DLQ 中的消息代表业务异常,必须定期审查和处理',
|
|
387
|
+
'dbt + Airflow + BigQuery 是目前最常见的现代数据栈,对大多数团队足够用',
|
|
388
|
+
],
|
|
389
|
+
category: '执行侧',
|
|
390
|
+
nextSkill: 'database-optimize',
|
|
391
|
+
};
|
|
392
|
+
//# sourceMappingURL=35-data-pipeline.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"35-data-pipeline.js","sourceRoot":"","sources":["../../src/skills/35-data-pipeline.ts"],"names":[],"mappings":";;;AAEa,QAAA,iBAAiB,GAAoB;IAChD,EAAE,EAAE,eAAe;IACnB,IAAI,EAAE,QAAQ;IACd,MAAM,EAAE,eAAe;IACvB,KAAK,EAAE,EAAE;IACT,WAAW,EAAE,8CAA8C;IAC3D,aAAa,EAAE,gHAAgH;IAC/H,iBAAiB,EAAE;;;2CAGsB;IACzC,QAAQ,EAAE;QACR,MAAM;QACN,eAAe;QACf,KAAK;QACL,KAAK;QACL,kBAAkB;QAClB,MAAM;QACN,SAAS;QACT,gBAAgB;QAChB,oBAAoB;QACpB,sBAAsB;QACtB,gBAAgB;KACjB;IACD,KAAK,EAAE;QACL;YACE,KAAK,EAAE,SAAS;YAChB,OAAO,EAAE;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;qBAwDM;SAChB;QACD;YACE,KAAK,EAAE,WAAW;YAClB,OAAO,EAAE;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;8BA6Ee;SACzB;QACD;YACE,KAAK,EAAE,cAAc;YACrB,OAAO,EAAE;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;0CAqE2B;SACrC;QACD;YACE,KAAK,EAAE,YAAY;YACnB,OAAO,EAAE;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;gCA8DiB;SAC3B;QACD;YACE,KAAK,EAAE,cAAc;YACrB,OAAO,EAAE;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;mCAkEoB;SAC9B;KACF;IACD,YAAY,EAAE,wDAAwD;IACtE,QAAQ,EAAE,EAAE;IACZ,KAAK,EAAE;QACL,qCAAqC;QACrC,2CAA2C;QAC3C,oCAAoC;QACpC,qCAAqC;QACrC,iDAAiD;KAClD;IACD,QAAQ,EAAE,KAAK;IACf,SAAS,EAAE,mBAAmB;CAC/B,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"36-ml-experiment.d.ts","sourceRoot":"","sources":["../../src/skills/36-ml-experiment.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,SAAS,CAAC;AAE/C,eAAO,MAAM,iBAAiB,EAAE,eA0Z/B,CAAC"}
|