ethan-skill 1.11.0 → 1.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66) hide show
  1. package/dist/cli/index.js +1019 -1
  2. package/dist/cli/index.js.map +1 -1
  3. package/dist/mcp/server.d.ts.map +1 -1
  4. package/dist/mcp/server.js +206 -1
  5. package/dist/mcp/server.js.map +1 -1
  6. package/dist/skills/27-tech-debt.d.ts +3 -0
  7. package/dist/skills/27-tech-debt.d.ts.map +1 -0
  8. package/dist/skills/27-tech-debt.js +149 -0
  9. package/dist/skills/27-tech-debt.js.map +1 -0
  10. package/dist/skills/28-api-mock.d.ts +3 -0
  11. package/dist/skills/28-api-mock.d.ts.map +1 -0
  12. package/dist/skills/28-api-mock.js +272 -0
  13. package/dist/skills/28-api-mock.js.map +1 -0
  14. package/dist/skills/29-data-migration.d.ts +3 -0
  15. package/dist/skills/29-data-migration.d.ts.map +1 -0
  16. package/dist/skills/29-data-migration.js +331 -0
  17. package/dist/skills/29-data-migration.js.map +1 -0
  18. package/dist/skills/30-llm-feature.d.ts +3 -0
  19. package/dist/skills/30-llm-feature.d.ts.map +1 -0
  20. package/dist/skills/30-llm-feature.js +328 -0
  21. package/dist/skills/30-llm-feature.js.map +1 -0
  22. package/dist/skills/31-threat-model.d.ts +3 -0
  23. package/dist/skills/31-threat-model.d.ts.map +1 -0
  24. package/dist/skills/31-threat-model.js +240 -0
  25. package/dist/skills/31-threat-model.js.map +1 -0
  26. package/dist/skills/32-green-code.d.ts +3 -0
  27. package/dist/skills/32-green-code.d.ts.map +1 -0
  28. package/dist/skills/32-green-code.js +346 -0
  29. package/dist/skills/32-green-code.js.map +1 -0
  30. package/dist/skills/33-service-catalog.d.ts +3 -0
  31. package/dist/skills/33-service-catalog.d.ts.map +1 -0
  32. package/dist/skills/33-service-catalog.js +334 -0
  33. package/dist/skills/33-service-catalog.js.map +1 -0
  34. package/dist/skills/34-mobile-review.d.ts +3 -0
  35. package/dist/skills/34-mobile-review.d.ts.map +1 -0
  36. package/dist/skills/34-mobile-review.js +390 -0
  37. package/dist/skills/34-mobile-review.js.map +1 -0
  38. package/dist/skills/35-data-pipeline.d.ts +3 -0
  39. package/dist/skills/35-data-pipeline.d.ts.map +1 -0
  40. package/dist/skills/35-data-pipeline.js +392 -0
  41. package/dist/skills/35-data-pipeline.js.map +1 -0
  42. package/dist/skills/36-ml-experiment.d.ts +3 -0
  43. package/dist/skills/36-ml-experiment.d.ts.map +1 -0
  44. package/dist/skills/36-ml-experiment.js +415 -0
  45. package/dist/skills/36-ml-experiment.js.map +1 -0
  46. package/dist/skills/index.d.ts +10 -0
  47. package/dist/skills/index.d.ts.map +1 -1
  48. package/dist/skills/index.js +41 -1
  49. package/dist/skills/index.js.map +1 -1
  50. package/dist/skills/pipeline.d.ts.map +1 -1
  51. package/dist/skills/pipeline.js +35 -0
  52. package/dist/skills/pipeline.js.map +1 -1
  53. package/dist/skills/skills.test.js +3 -3
  54. package/dist/skills/skills.test.js.map +1 -1
  55. package/package.json +1 -1
  56. package/rules/claude-code/CLAUDE.md +2963 -3
  57. package/rules/cline/.clinerules +2805 -2
  58. package/rules/codebuddy/CODEBUDDY.md +2913 -2
  59. package/rules/continue/.continuerules +2805 -2
  60. package/rules/copilot/copilot-instructions.md +2883 -2
  61. package/rules/cursor/.cursorrules +2952 -2
  62. package/rules/cursor/smart-flow.mdc +2952 -2
  63. package/rules/jetbrains/smart-flow.md +2883 -2
  64. package/rules/lingma/smart-flow.md +2904 -3
  65. package/rules/windsurf/.windsurf/rules/smart-flow.md +2884 -3
  66. package/rules/zed/smart-flow.rules +2794 -1
@@ -0,0 +1,392 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.dataPipelineSkill = void 0;
4
+ exports.dataPipelineSkill = {
5
+ id: 'data-pipeline',
6
+ name: '数据管道设计',
7
+ nameEn: 'data_pipeline',
8
+ order: 35,
9
+ description: '设计 Batch/Streaming 数据管道,制定数据质量规则、容错策略与血缘追踪方案',
10
+ descriptionEn: 'Design Batch/Streaming data pipelines with data quality rules, fault tolerance strategies and lineage tracking',
11
+ detailDescription: `数据管道是数据驱动业务的基础设施,但设计不当会导致数据丢失、重复和质量下降。
12
+ 本 Skill 覆盖数据管道设计全流程:从 Batch/Streaming/Lambda/Kappa 架构选型,
13
+ 到数据质量规则(完整性/准确性/及时性),再到幂等容错、死信队列和断点续传设计,
14
+ 最终建立数据血缘(Data Lineage)和可观测性体系,让数据流动可靠、可追溯。`,
15
+ triggers: [
16
+ '数据管道',
17
+ 'data pipeline',
18
+ 'etl',
19
+ 'elt',
20
+ 'data engineering',
21
+ '数据工程',
22
+ 'airflow',
23
+ 'kafka pipeline',
24
+ 'streaming pipeline',
25
+ '@ethan data-pipeline',
26
+ '/data-pipeline',
27
+ ],
28
+ steps: [
29
+ {
30
+ title: '1. 架构选型',
31
+ content: `根据数据特征和业务需求选择合适的管道架构:
32
+
33
+ **四大架构模式决策矩阵**
34
+
35
+ | 架构 | 延迟 | 复杂度 | 适用场景 | 代表工具 |
36
+ |------|------|--------|---------|---------|
37
+ | **Batch(批处理)** | 小时~天 | 低 | 日报、数据仓库 ETL、离线训练 | Airflow + Spark |
38
+ | **Streaming(流处理)** | 秒~毫秒 | 高 | 实时监控、欺诈检测、事件驱动 | Kafka + Flink |
39
+ | **Lambda(λ)** | 两套 | 极高 | 需要批量精确性+流式时效性 | Kafka + Spark + Hive |
40
+ | **Kappa(κ)** | 秒 | 中 | 一切皆流,重播历史数据 | Kafka + Flink |
41
+
42
+ **决策树**
43
+ \`\`\`
44
+ 数据新鲜度要求 < 1分钟?
45
+ ├── YES → Streaming(Kafka + Flink)
46
+ └── NO
47
+ └── 需要精确的历史重算?
48
+ ├── YES + 实时结果 → Lambda 架构
49
+ ├── YES + 可接受重播 → Kappa 架构
50
+ └── NO → Batch(Airflow + Spark/dbt)
51
+ \`\`\`
52
+
53
+ **Source → Transform → Sink 数据流图**
54
+ \`\`\`
55
+ 数据源(Sources):
56
+ MySQL / PostgreSQL(CDC)
57
+ Kafka Topics
58
+ REST API / Webhook
59
+ 文件系统(S3/GCS)
60
+
61
+
62
+ 变换层(Transform):
63
+ 数据清洗(去重/标准化/脱敏)
64
+ 业务逻辑计算
65
+ 聚合/Join
66
+
67
+
68
+ 目标层(Sinks):
69
+ 数据仓库(BigQuery/Snowflake/Redshift)
70
+ 数据湖(S3/GCS Parquet)
71
+ 搜索引擎(Elasticsearch)
72
+ 特征存储(Feature Store)
73
+ \`\`\`
74
+
75
+ **技术栈推荐**
76
+ \`\`\`
77
+ 小团队/初创:
78
+ Airflow + dbt + BigQuery(低运维成本)
79
+
80
+ 中大型团队:
81
+ Kafka + Flink + Iceberg + Trino(高吞吐实时)
82
+
83
+ 全托管方案:
84
+ Fivetran(Ingestion)+ dbt Cloud(Transform)+ Snowflake(Warehouse)
85
+ \`\`\`
86
+
87
+ **输出**:架构选型报告 + 数据流图`,
88
+ },
89
+ {
90
+ title: '2. 数据质量规则',
91
+ content: `定义并实施数据质量检查规则:
92
+
93
+ **数据质量四维度**
94
+
95
+ | 维度 | 定义 | 检查方式 | 阈值示例 |
96
+ |------|------|---------|---------|
97
+ | **完整性** | 必填字段不为空 | NULL 率检查 | < 0.1% |
98
+ | **准确性** | 值在合理范围内 | 范围/格式校验 | 异常值率 < 1% |
99
+ | **一致性** | 跨系统数据一致 | 对账/交叉校验 | 差异率 < 0.01% |
100
+ | **及时性** | 数据按时到达 | 延迟监控 | 最大延迟 < 2h |
101
+
102
+ **dbt 数据质量测试(推荐)**
103
+ \`\`\`yaml
104
+ # models/orders.yml
105
+ version: 2
106
+ models:
107
+ - name: orders
108
+ columns:
109
+ - name: order_id
110
+ tests:
111
+ - not_null
112
+ - unique
113
+ - name: user_id
114
+ tests:
115
+ - not_null
116
+ - relationships:
117
+ to: ref('users')
118
+ field: id
119
+ - name: status
120
+ tests:
121
+ - accepted_values:
122
+ values: ['pending', 'paid', 'shipped', 'completed', 'cancelled']
123
+ - name: amount
124
+ tests:
125
+ - not_null
126
+ - dbt_expectations.expect_column_values_to_be_between:
127
+ min_value: 0
128
+ max_value: 1000000
129
+ \`\`\`
130
+
131
+ **Great Expectations 数据期望(Python)**
132
+ \`\`\`python
133
+ import great_expectations as gx
134
+
135
+ context = gx.get_context()
136
+ suite = context.add_expectation_suite("orders_suite")
137
+
138
+ # 定义数据期望
139
+ suite.add_expectation(
140
+ gx.expectations.ExpectColumnValuesToNotBeNull(column="order_id")
141
+ )
142
+ suite.add_expectation(
143
+ gx.expectations.ExpectColumnValuesToBeBetween(
144
+ column="amount", min_value=0, max_value=1_000_000
145
+ )
146
+ )
147
+ suite.add_expectation(
148
+ gx.expectations.ExpectTableRowCountToBeBetween(
149
+ min_value=1000, # 每日订单不少于1000
150
+ max_value=1_000_000
151
+ )
152
+ )
153
+
154
+ # 运行验证
155
+ result = context.run_checkpoint("daily_orders_checkpoint")
156
+ if not result.success:
157
+ raise DataQualityError("数据质量检查未通过!")
158
+ \`\`\`
159
+
160
+ **数据质量告警**
161
+ \`\`\`yaml
162
+ # 数据质量 SLA
163
+ - NULL 率超过 1%:🔴 阻断管道,Slack 告警
164
+ - 行数下降 > 20%(对比昨日):🟠 告警,人工核查
165
+ - P99 延迟 > 2 小时:🟡 警告,关注
166
+ \`\`\`
167
+
168
+ **输出**:数据质量规则文档 + dbt/GE 测试配置`,
169
+ },
170
+ {
171
+ title: '3. 管道设计与代码模板',
172
+ content: `生成可复用的管道代码模板:
173
+
174
+ **Airflow DAG 模板(Batch)**
175
+ \`\`\`python
176
+ # dags/daily_orders_etl.py
177
+ from airflow import DAG
178
+ from airflow.operators.python import PythonOperator
179
+ from airflow.providers.postgres.hooks.postgres import PostgresHook
180
+ from datetime import datetime, timedelta
181
+
182
+ default_args = {
183
+ 'owner': 'data-team',
184
+ 'retries': 3,
185
+ 'retry_delay': timedelta(minutes=5),
186
+ 'on_failure_callback': alert_slack,
187
+ }
188
+
189
+ with DAG(
190
+ 'daily_orders_etl',
191
+ default_args=default_args,
192
+ schedule_interval='0 2 * * *', # 每日凌晨2点
193
+ start_date=datetime(2024, 1, 1),
194
+ catchup=False, # 不回填历史
195
+ tags=['orders', 'daily'],
196
+ ) as dag:
197
+
198
+ extract = PythonOperator(task_id='extract', python_callable=extract_orders)
199
+ validate = PythonOperator(task_id='validate', python_callable=run_quality_checks)
200
+ transform = PythonOperator(task_id='transform', python_callable=transform_orders)
201
+ load = PythonOperator(task_id='load', python_callable=load_to_warehouse)
202
+ notify = PythonOperator(task_id='notify', python_callable=send_completion_report)
203
+
204
+ extract >> validate >> transform >> load >> notify
205
+ \`\`\`
206
+
207
+ **Kafka + Flink 流处理模板**
208
+ \`\`\`python
209
+ # Flink Python API
210
+ from pyflink.datastream import StreamExecutionEnvironment
211
+ from pyflink.datastream.connectors.kafka import KafkaSource, KafkaSink
212
+
213
+ env = StreamExecutionEnvironment.get_execution_environment()
214
+ env.set_parallelism(4)
215
+
216
+ # Source
217
+ kafka_source = KafkaSource.builder() \
218
+ .set_bootstrap_servers("kafka:9092") \
219
+ .set_topics("orders") \
220
+ .set_group_id("flink-order-processor") \
221
+ .set_value_only_deserializer(JsonRowDeserializationSchema()) \
222
+ .build()
223
+
224
+ stream = env.from_source(kafka_source, WatermarkStrategy.no_watermarks(), "Kafka Source")
225
+
226
+ # Transform(滚动窗口:每分钟统计)
227
+ result = stream \
228
+ .key_by(lambda x: x['region']) \
229
+ .window(TumblingEventTimeWindows.of(Time.minutes(1))) \
230
+ .aggregate(OrderAggregateFunction())
231
+
232
+ # Sink
233
+ result.sink_to(KafkaSink.builder()
234
+ .set_bootstrap_servers("kafka:9092")
235
+ .set_record_serializer(JsonRowSerializationSchema())
236
+ .build())
237
+
238
+ env.execute("Order Stream Processing")
239
+ \`\`\`
240
+
241
+ **输出**:Airflow DAG 模板 + Flink/Spark 流处理模板`,
242
+ },
243
+ {
244
+ title: '4. 容错与重试策略',
245
+ content: `确保管道在故障时可靠恢复:
246
+
247
+ **核心容错原则**
248
+
249
+ **① 幂等性设计(最重要)**
250
+ \`\`\`python
251
+ def load_orders_idempotent(batch_date: str, orders: list):
252
+ """幂等加载:同一批次多次执行结果相同"""
253
+ # 使用 INSERT ... ON CONFLICT DO NOTHING
254
+ # 或 MERGE(UPSERT)语义
255
+ pg_hook.run("""
256
+ INSERT INTO orders_dw (order_id, batch_date, amount, status)
257
+ VALUES %s
258
+ ON CONFLICT (order_id) DO UPDATE SET
259
+ amount = EXCLUDED.amount,
260
+ status = EXCLUDED.status,
261
+ updated_at = NOW()
262
+ """, [(o['id'], batch_date, o['amount'], o['status']) for o in orders])
263
+ \`\`\`
264
+
265
+ **② 死信队列(DLQ)**
266
+ \`\`\`python
267
+ # Kafka — 处理失败消息路由到 DLQ
268
+ def process_message(msg):
269
+ try:
270
+ transform_and_load(msg)
271
+ except Exception as e:
272
+ # 路由到死信队列,保留原始消息 + 错误信息
273
+ dlq_producer.send('orders-dlq', {
274
+ 'original_message': msg,
275
+ 'error': str(e),
276
+ 'failed_at': datetime.utcnow().isoformat(),
277
+ 'retry_count': msg.get('retry_count', 0) + 1
278
+ })
279
+ logger.error(f"Message sent to DLQ: {e}")
280
+ \`\`\`
281
+
282
+ **③ 断点续传(Checkpoint)**
283
+ \`\`\`python
284
+ def process_large_dataset():
285
+ checkpoint_file = '.etl_checkpoint'
286
+ last_id = load_checkpoint(checkpoint_file)
287
+
288
+ for batch in fetch_in_batches(after_id=last_id, batch_size=1000):
289
+ process_batch(batch)
290
+ save_checkpoint(checkpoint_file, batch[-1]['id']) # 每批保存进度
291
+ \`\`\`
292
+
293
+ **④ 指数退避重试**
294
+ \`\`\`python
295
+ import tenacity
296
+
297
+ @tenacity.retry(
298
+ wait=tenacity.wait_exponential(multiplier=1, min=4, max=60),
299
+ stop=tenacity.stop_after_attempt(5),
300
+ retry=tenacity.retry_if_exception_type(TransientError),
301
+ before_sleep=tenacity.before_sleep_log(logger, logging.WARNING)
302
+ )
303
+ def call_external_api(data):
304
+ return requests.post(API_URL, json=data, timeout=30)
305
+ \`\`\`
306
+
307
+ **输出**:容错策略文档 + 幂等加载模板 + DLQ 配置`,
308
+ },
309
+ {
310
+ title: '5. 数据血缘与可观测性',
311
+ content: `建立数据血缘追踪和管道监控体系:
312
+
313
+ **数据血缘(Data Lineage)**
314
+ \`\`\`python
315
+ # OpenLineage 标准(Marquez/Atlan/DataHub 支持)
316
+ from openlineage.client import OpenLineageClient
317
+ from openlineage.client.run import RunEvent, Job, Run, Dataset
318
+
319
+ client = OpenLineageClient.from_environment()
320
+
321
+ # 记录数据流转关系
322
+ client.emit(RunEvent(
323
+ eventType="COMPLETE",
324
+ job=Job(namespace="etl", name="daily_orders_transform"),
325
+ run=Run(runId=str(uuid4())),
326
+ inputs=[Dataset(namespace="postgres", name="raw_orders")],
327
+ outputs=[Dataset(namespace="bigquery", name="orders_dw.fact_orders")]
328
+ ))
329
+ \`\`\`
330
+
331
+ **管道健康仪表盘指标**
332
+ \`\`\`
333
+ 关键指标(Grafana 面板):
334
+ ─────────────────────────────
335
+ 延迟指标:
336
+ - 数据新鲜度(最新记录时间戳)
337
+ - Pipeline P95 执行时长
338
+ - Kafka Consumer Lag
339
+
340
+ 质量指标:
341
+ - 每日 NULL 率趋势
342
+ - 行数异常检测(±30% 告警)
343
+ - 数据质量测试通过率
344
+
345
+ 运维指标:
346
+ - 管道成功率(目标 > 99%)
347
+ - 重试次数分布
348
+ - DLQ 消息积压
349
+ \`\`\`
350
+
351
+ **Airflow SLA 监控**
352
+ \`\`\`python
353
+ with DAG(
354
+ 'daily_orders_etl',
355
+ sla_miss_callback=sla_miss_alert, # SLA 超时回调
356
+ ...
357
+ ) as dag:
358
+ load_task = PythonOperator(
359
+ task_id='load',
360
+ python_callable=load_to_warehouse,
361
+ sla=timedelta(hours=4), # 必须在4小时内完成
362
+ )
363
+ \`\`\`
364
+
365
+ **数据目录集成**
366
+ \`\`\`yaml
367
+ # dbt docs(自动生成)
368
+ # 运行后访问:dbt docs serve
369
+ # 包含:模型血缘图、列描述、测试结果
370
+
371
+ # 推荐工具
372
+ 生产级:DataHub, Atlan, Alation
373
+ 开源轻量:Marquez, OpenMetadata
374
+ dbt 生态:dbt docs + Elementary
375
+ \`\`\`
376
+
377
+ **输出**:数据血缘配置 + 监控仪表盘指标定义 + 数据目录方案`,
378
+ },
379
+ ],
380
+ outputFormat: '架构选型文档 + 数据流图 + 质量规则配置(dbt/GE)+ 管道代码模板 + 容错策略 + 血缘追踪方案',
381
+ examples: [],
382
+ notes: [
383
+ '幂等性是数据管道的第一原则——任何任务必须可以安全重试而不产生重复数据',
384
+ '从 Batch 开始,在有真实需求时再迁移到 Streaming,避免过早的复杂度',
385
+ '数据质量检查应与管道强耦合(而非事后补救),质量不达标应阻断下游加载',
386
+ '死信队列不是垃圾桶——DLQ 中的消息代表业务异常,必须定期审查和处理',
387
+ 'dbt + Airflow + BigQuery 是目前最常见的现代数据栈,对大多数团队足够用',
388
+ ],
389
+ category: '执行侧',
390
+ nextSkill: 'database-optimize',
391
+ };
392
+ //# sourceMappingURL=35-data-pipeline.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"35-data-pipeline.js","sourceRoot":"","sources":["../../src/skills/35-data-pipeline.ts"],"names":[],"mappings":";;;AAEa,QAAA,iBAAiB,GAAoB;IAChD,EAAE,EAAE,eAAe;IACnB,IAAI,EAAE,QAAQ;IACd,MAAM,EAAE,eAAe;IACvB,KAAK,EAAE,EAAE;IACT,WAAW,EAAE,8CAA8C;IAC3D,aAAa,EAAE,gHAAgH;IAC/H,iBAAiB,EAAE;;;2CAGsB;IACzC,QAAQ,EAAE;QACR,MAAM;QACN,eAAe;QACf,KAAK;QACL,KAAK;QACL,kBAAkB;QAClB,MAAM;QACN,SAAS;QACT,gBAAgB;QAChB,oBAAoB;QACpB,sBAAsB;QACtB,gBAAgB;KACjB;IACD,KAAK,EAAE;QACL;YACE,KAAK,EAAE,SAAS;YAChB,OAAO,EAAE;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;qBAwDM;SAChB;QACD;YACE,KAAK,EAAE,WAAW;YAClB,OAAO,EAAE;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;8BA6Ee;SACzB;QACD;YACE,KAAK,EAAE,cAAc;YACrB,OAAO,EAAE;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;0CAqE2B;SACrC;QACD;YACE,KAAK,EAAE,YAAY;YACnB,OAAO,EAAE;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;gCA8DiB;SAC3B;QACD;YACE,KAAK,EAAE,cAAc;YACrB,OAAO,EAAE;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;mCAkEoB;SAC9B;KACF;IACD,YAAY,EAAE,wDAAwD;IACtE,QAAQ,EAAE,EAAE;IACZ,KAAK,EAAE;QACL,qCAAqC;QACrC,2CAA2C;QAC3C,oCAAoC;QACpC,qCAAqC;QACrC,iDAAiD;KAClD;IACD,QAAQ,EAAE,KAAK;IACf,SAAS,EAAE,mBAAmB;CAC/B,CAAC"}
@@ -0,0 +1,3 @@
1
+ import type { SkillDefinition } from './types';
2
+ export declare const mlExperimentSkill: SkillDefinition;
3
+ //# sourceMappingURL=36-ml-experiment.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"36-ml-experiment.d.ts","sourceRoot":"","sources":["../../src/skills/36-ml-experiment.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,SAAS,CAAC;AAE/C,eAAO,MAAM,iBAAiB,EAAE,eA0Z/B,CAAC"}