code-abyss 1.6.16 → 1.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92)
  1. package/package.json +2 -2
  2. package/skills/SKILL.md +24 -16
  3. package/skills/domains/ai/SKILL.md +2 -2
  4. package/skills/domains/ai/prompt-and-eval.md +279 -0
  5. package/skills/domains/architecture/SKILL.md +2 -3
  6. package/skills/domains/architecture/security-arch.md +87 -0
  7. package/skills/domains/data-engineering/SKILL.md +188 -26
  8. package/skills/domains/development/SKILL.md +1 -4
  9. package/skills/domains/devops/SKILL.md +3 -5
  10. package/skills/domains/devops/performance.md +63 -0
  11. package/skills/domains/devops/testing.md +97 -0
  12. package/skills/domains/frontend-design/SKILL.md +12 -3
  13. package/skills/domains/frontend-design/claymorphism/SKILL.md +117 -0
  14. package/skills/domains/frontend-design/claymorphism/references/tokens.css +52 -0
  15. package/skills/domains/frontend-design/engineering.md +287 -0
  16. package/skills/domains/frontend-design/glassmorphism/SKILL.md +138 -0
  17. package/skills/domains/frontend-design/glassmorphism/references/tokens.css +32 -0
  18. package/skills/domains/frontend-design/liquid-glass/SKILL.md +135 -0
  19. package/skills/domains/frontend-design/liquid-glass/references/tokens.css +81 -0
  20. package/skills/domains/frontend-design/neubrutalism/SKILL.md +141 -0
  21. package/skills/domains/frontend-design/neubrutalism/references/tokens.css +44 -0
  22. package/skills/domains/infrastructure/SKILL.md +174 -34
  23. package/skills/domains/mobile/SKILL.md +211 -21
  24. package/skills/domains/orchestration/SKILL.md +1 -0
  25. package/skills/domains/security/SKILL.md +4 -6
  26. package/skills/domains/security/blue-team.md +57 -0
  27. package/skills/domains/security/red-team.md +54 -0
  28. package/skills/domains/security/threat-intel.md +50 -0
  29. package/skills/orchestration/multi-agent/SKILL.md +195 -46
  30. package/skills/run_skill.js +134 -0
  31. package/skills/tools/gen-docs/SKILL.md +6 -4
  32. package/skills/tools/gen-docs/scripts/doc_generator.js +349 -0
  33. package/skills/tools/verify-change/SKILL.md +8 -6
  34. package/skills/tools/verify-change/scripts/change_analyzer.js +270 -0
  35. package/skills/tools/verify-module/SKILL.md +6 -4
  36. package/skills/tools/verify-module/scripts/module_scanner.js +145 -0
  37. package/skills/tools/verify-quality/SKILL.md +5 -3
  38. package/skills/tools/verify-quality/scripts/quality_checker.js +276 -0
  39. package/skills/tools/verify-security/SKILL.md +7 -5
  40. package/skills/tools/verify-security/scripts/security_scanner.js +133 -0
  41. package/skills/__pycache__/run_skill.cpython-312.pyc +0 -0
  42. package/skills/domains/COVERAGE_PLAN.md +0 -232
  43. package/skills/domains/ai/model-evaluation.md +0 -790
  44. package/skills/domains/ai/prompt-engineering.md +0 -703
  45. package/skills/domains/architecture/compliance.md +0 -299
  46. package/skills/domains/architecture/data-security.md +0 -184
  47. package/skills/domains/data-engineering/data-pipeline.md +0 -762
  48. package/skills/domains/data-engineering/data-quality.md +0 -894
  49. package/skills/domains/data-engineering/stream-processing.md +0 -791
  50. package/skills/domains/development/dart.md +0 -963
  51. package/skills/domains/development/kotlin.md +0 -834
  52. package/skills/domains/development/php.md +0 -659
  53. package/skills/domains/development/swift.md +0 -755
  54. package/skills/domains/devops/e2e-testing.md +0 -914
  55. package/skills/domains/devops/performance-testing.md +0 -734
  56. package/skills/domains/devops/testing-strategy.md +0 -667
  57. package/skills/domains/frontend-design/build-tools.md +0 -743
  58. package/skills/domains/frontend-design/performance.md +0 -734
  59. package/skills/domains/frontend-design/testing.md +0 -699
  60. package/skills/domains/infrastructure/gitops.md +0 -735
  61. package/skills/domains/infrastructure/iac.md +0 -855
  62. package/skills/domains/infrastructure/kubernetes.md +0 -1018
  63. package/skills/domains/mobile/android-dev.md +0 -979
  64. package/skills/domains/mobile/cross-platform.md +0 -795
  65. package/skills/domains/mobile/ios-dev.md +0 -931
  66. package/skills/domains/security/secrets-management.md +0 -834
  67. package/skills/domains/security/supply-chain.md +0 -931
  68. package/skills/domains/security/threat-modeling.md +0 -828
  69. package/skills/run_skill.py +0 -153
  70. package/skills/tests/README.md +0 -225
  71. package/skills/tests/SUMMARY.md +0 -362
  72. package/skills/tests/__init__.py +0 -3
  73. package/skills/tests/__pycache__/test_change_analyzer.cpython-312.pyc +0 -0
  74. package/skills/tests/__pycache__/test_doc_generator.cpython-312.pyc +0 -0
  75. package/skills/tests/__pycache__/test_module_scanner.cpython-312.pyc +0 -0
  76. package/skills/tests/__pycache__/test_quality_checker.cpython-312.pyc +0 -0
  77. package/skills/tests/__pycache__/test_security_scanner.cpython-312.pyc +0 -0
  78. package/skills/tests/test_change_analyzer.py +0 -558
  79. package/skills/tests/test_doc_generator.py +0 -538
  80. package/skills/tests/test_module_scanner.py +0 -376
  81. package/skills/tests/test_quality_checker.py +0 -516
  82. package/skills/tests/test_security_scanner.py +0 -426
  83. package/skills/tools/gen-docs/scripts/__pycache__/doc_generator.cpython-312.pyc +0 -0
  84. package/skills/tools/gen-docs/scripts/doc_generator.py +0 -520
  85. package/skills/tools/verify-change/scripts/__pycache__/change_analyzer.cpython-312.pyc +0 -0
  86. package/skills/tools/verify-change/scripts/change_analyzer.py +0 -529
  87. package/skills/tools/verify-module/scripts/__pycache__/module_scanner.cpython-312.pyc +0 -0
  88. package/skills/tools/verify-module/scripts/module_scanner.py +0 -321
  89. package/skills/tools/verify-quality/scripts/__pycache__/quality_checker.cpython-312.pyc +0 -0
  90. package/skills/tools/verify-quality/scripts/quality_checker.py +0 -481
  91. package/skills/tools/verify-security/scripts/__pycache__/security_scanner.cpython-312.pyc +0 -0
  92. package/skills/tools/verify-security/scripts/security_scanner.py +0 -374
@@ -1,762 +0,0 @@
1
- ---
2
- name: data-pipeline
3
- description: 数据管道编排。Airflow、Dagster、Prefect、ETL、数据编排、调度策略。当用户提到数据管道、Airflow、Dagster、Prefect、ETL、数据编排时使用。
4
- ---
5
-
6
- # 🔄 数据管道秘典 · Data Pipeline
7
-
8
- ## 管道架构
9
-
10
- ```
11
- 数据源 → 提取 → 转换 → 加载 → 目标
12
- │ │ │ │ │
13
- └─ API ─┴─ 清洗 ─┴─ 聚合 ─┴─ 存储
14
- ```
15
-
16
- ## Airflow DAG 开发
17
-
18
- ### 基础 DAG 结构
19
-
20
- ```python
21
- from airflow import DAG
22
- from airflow.operators.python import PythonOperator
23
- from airflow.operators.bash import BashOperator
24
- from datetime import datetime, timedelta
25
-
26
- default_args = {
27
- 'owner': 'data-team',
28
- 'depends_on_past': False,
29
- 'start_date': datetime(2024, 1, 1),
30
- 'email_on_failure': True,
31
- 'email_on_retry': False,
32
- 'retries': 3,
33
- 'retry_delay': timedelta(minutes=5),
34
- }
35
-
36
- with DAG(
37
- 'etl_pipeline',
38
- default_args=default_args,
39
- description='ETL pipeline for user data',
40
- schedule_interval='0 2 * * *', # 每天凌晨2点
41
- catchup=False,
42
- tags=['etl', 'production'],
43
- ) as dag:
44
-
45
- def extract_data(**context):
46
- """提取数据"""
47
- execution_date = context['execution_date']
48
- # 提取逻辑
49
- return {'records': 1000}
50
-
51
- def transform_data(**context):
52
- """转换数据"""
53
- ti = context['ti']
54
- data = ti.xcom_pull(task_ids='extract')
55
- # 转换逻辑
56
- return {'processed': data['records']}
57
-
58
- def load_data(**context):
59
- """加载数据"""
60
- ti = context['ti']
61
- data = ti.xcom_pull(task_ids='transform')
62
- # 加载逻辑
63
- print(f"Loaded {data['processed']} records")
64
-
65
- extract = PythonOperator(
66
- task_id='extract',
67
- python_callable=extract_data,
68
- )
69
-
70
- transform = PythonOperator(
71
- task_id='transform',
72
- python_callable=transform_data,
73
- )
74
-
75
- load = PythonOperator(
76
- task_id='load',
77
- python_callable=load_data,
78
- )
79
-
80
- extract >> transform >> load
81
- ```
82
-
83
- ### Operators 使用
84
-
85
- ```python
86
- from airflow.providers.postgres.operators.postgres import PostgresOperator
87
- from airflow.providers.http.operators.http import SimpleHttpOperator
88
- from airflow.providers.amazon.aws.operators.s3 import S3CreateBucketOperator
89
-
90
- # SQL Operator
91
- create_table = PostgresOperator(
92
- task_id='create_table',
93
- postgres_conn_id='postgres_default',
94
- sql="""
95
- CREATE TABLE IF NOT EXISTS user_stats (
96
- date DATE,
97
- user_count INT,
98
- active_count INT
99
- );
100
- """,
101
- )
102
-
103
- # HTTP Operator
104
- fetch_api = SimpleHttpOperator(
105
- task_id='fetch_api',
106
- http_conn_id='api_default',
107
- endpoint='/users',
108
- method='GET',
109
- headers={'Authorization': 'Bearer {{ var.value.api_token }}'},
110
- response_filter=lambda response: response.json(),
111
- )
112
-
113
- # S3 Operator
114
- upload_to_s3 = S3CreateBucketOperator(
115
- task_id='upload_to_s3',
116
- bucket_name='data-lake-{{ ds_nodash }}',
117
- aws_conn_id='aws_default',
118
- )
119
- ```
120
-
121
- ### Sensors 使用
122
-
123
- ```python
124
- from airflow.sensors.filesystem import FileSensor
125
- from airflow.providers.http.sensors.http import HttpSensor
126
- from airflow.sensors.external_task import ExternalTaskSensor
127
-
128
- # 文件传感器
129
- wait_for_file = FileSensor(
130
- task_id='wait_for_file',
131
- filepath='/data/input/{{ ds }}/data.csv',
132
- poke_interval=60, # 每60秒检查一次
133
- timeout=3600, # 1小时超时
134
- mode='poke',
135
- )
136
-
137
- # HTTP 传感器
138
- wait_for_api = HttpSensor(
139
- task_id='wait_for_api',
140
- http_conn_id='api_default',
141
- endpoint='/health',
142
- request_params={},
143
- response_check=lambda response: response.status_code == 200,
144
- poke_interval=30,
145
- )
146
-
147
- # 外部任务传感器
148
- wait_for_upstream = ExternalTaskSensor(
149
- task_id='wait_for_upstream',
150
- external_dag_id='upstream_dag',
151
- external_task_id='final_task',
152
- execution_delta=timedelta(hours=1),
153
- )
154
- ```
155
-
156
- ### XCom 数据传递
157
-
158
- ```python
159
- from airflow.decorators import task
160
-
161
- @task
162
- def extract_data():
163
- """使用 TaskFlow API"""
164
- data = {'users': [1, 2, 3], 'count': 3}
165
- return data
166
-
167
- @task
168
- def transform_data(data: dict):
169
- """自动接收上游数据"""
170
- transformed = {
171
- 'users': [u * 2 for u in data['users']],
172
- 'count': data['count']
173
- }
174
- return transformed
175
-
176
- @task
177
- def load_data(data: dict):
178
- """加载数据"""
179
- print(f"Loading {data['count']} users")
180
-
181
- # 链式调用
182
- data = extract_data()
183
- transformed = transform_data(data)
184
- load_data(transformed)
185
- ```
186
-
187
- ### 动态任务生成
188
-
189
- ```python
190
- from airflow.decorators import task
191
-
192
- @task
193
- def get_partitions():
194
- """获取分区列表"""
195
- return ['2024-01', '2024-02', '2024-03']
196
-
197
- @task
198
- def process_partition(partition: str):
199
- """处理单个分区"""
200
- print(f"Processing {partition}")
201
-
202
- # 动态生成任务
203
- partitions = get_partitions()
204
- process_partition.expand(partition=partitions)
205
- ```
206
-
207
- ## Dagster 资源管理
208
-
209
- ### Assets 定义
210
-
211
- ```python
212
- from dagster import asset, AssetExecutionContext, MaterializeResult
213
- import pandas as pd
214
-
215
- @asset(
216
- description="Raw user data from API",
217
- group_name="ingestion",
218
- compute_kind="python",
219
- )
220
- def raw_users(context: AssetExecutionContext) -> pd.DataFrame:
221
- """提取原始用户数据"""
222
- context.log.info("Fetching users from API")
223
- df = pd.DataFrame({
224
- 'user_id': [1, 2, 3],
225
- 'name': ['Alice', 'Bob', 'Charlie']
226
- })
227
- return df
228
-
229
- @asset(
230
- description="Cleaned user data",
231
- group_name="transformation",
232
- deps=[raw_users],
233
- )
234
- def cleaned_users(context: AssetExecutionContext, raw_users: pd.DataFrame) -> pd.DataFrame:
235
- """清洗用户数据"""
236
- context.log.info(f"Cleaning {len(raw_users)} users")
237
- df = raw_users.dropna()
238
- df['name'] = df['name'].str.upper()
239
- return df
240
-
241
- @asset(
242
- description="User statistics",
243
- group_name="analytics",
244
- deps=[cleaned_users],
245
- )
246
- def user_stats(context: AssetExecutionContext, cleaned_users: pd.DataFrame) -> MaterializeResult:
247
- """计算用户统计"""
248
- count = len(cleaned_users)
249
- context.log.info(f"Total users: {count}")
250
-
251
- return MaterializeResult(
252
- metadata={
253
- "user_count": count,
254
- "preview": cleaned_users.head().to_markdown(),
255
- }
256
- )
257
- ```
258
-
259
- ### Resources 配置
260
-
261
- ```python
262
- from dagster import resource, ConfigurableResource
263
- from pydantic import Field
264
- import psycopg2
265
-
266
- class PostgresResource(ConfigurableResource):
267
- """Postgres 资源"""
268
- host: str = Field(description="Database host")
269
- port: int = Field(default=5432)
270
- database: str
271
- user: str
272
- password: str
273
-
274
- def get_connection(self):
275
- return psycopg2.connect(
276
- host=self.host,
277
- port=self.port,
278
- database=self.database,
279
- user=self.user,
280
- password=self.password,
281
- )
282
-
283
- @asset
284
- def users_from_db(postgres: PostgresResource) -> pd.DataFrame:
285
- """从数据库读取用户"""
286
- conn = postgres.get_connection()
287
- df = pd.read_sql("SELECT * FROM users", conn)
288
- conn.close()
289
- return df
290
- ```
291
-
292
- ### Jobs 和 Schedules
293
-
294
- ```python
295
- from dagster import define_asset_job, ScheduleDefinition, AssetSelection
296
-
297
- # 定义 Job
298
- etl_job = define_asset_job(
299
- name="etl_job",
300
- selection=AssetSelection.groups("ingestion", "transformation"),
301
- description="ETL pipeline job",
302
- )
303
-
304
- analytics_job = define_asset_job(
305
- name="analytics_job",
306
- selection=AssetSelection.groups("analytics"),
307
- )
308
-
309
- # 定义 Schedule
310
- daily_schedule = ScheduleDefinition(
311
- job=etl_job,
312
- cron_schedule="0 2 * * *", # 每天凌晨2点
313
- )
314
-
315
- hourly_schedule = ScheduleDefinition(
316
- job=analytics_job,
317
- cron_schedule="0 * * * *", # 每小时
318
- )
319
- ```
320
-
321
- ### Sensors 监听
322
-
323
- ```python
324
- from dagster import sensor, RunRequest, SensorEvaluationContext
325
- import os
326
-
327
- @sensor(
328
- job=etl_job,
329
- minimum_interval_seconds=60,
330
- )
331
- def file_sensor(context: SensorEvaluationContext):
332
- """监听文件到达"""
333
- files = os.listdir('/data/input')
334
- for file in files:
335
- if file.endswith('.csv'):
336
- yield RunRequest(
337
- run_key=file,
338
- run_config={
339
- "ops": {
340
- "process_file": {
341
- "config": {"filename": file}
342
- }
343
- }
344
- }
345
- )
346
- ```
347
-
348
- ### Partitions 分区
349
-
350
- ```python
351
- from dagster import DailyPartitionsDefinition, asset
352
-
353
- daily_partitions = DailyPartitionsDefinition(start_date="2024-01-01")
354
-
355
- @asset(
356
- partitions_def=daily_partitions,
357
- )
358
- def daily_users(context: AssetExecutionContext) -> pd.DataFrame:
359
- """按日分区的用户数据"""
360
- partition_date = context.partition_key
361
- context.log.info(f"Processing partition: {partition_date}")
362
- # 处理特定日期的数据
363
- return pd.DataFrame()
364
- ```
365
-
366
- ## Prefect 工作流
367
-
368
- ### Tasks 和 Flows
369
-
370
- ```python
371
- from prefect import task, flow
372
- from prefect.tasks import task_input_hash
373
- from datetime import timedelta
374
-
375
- @task(
376
- retries=3,
377
- retry_delay_seconds=60,
378
- cache_key_fn=task_input_hash,
379
- cache_expiration=timedelta(hours=1),
380
- )
381
- def extract_data(source: str) -> dict:
382
- """提取数据任务"""
383
- print(f"Extracting from {source}")
384
- return {'records': 1000}
385
-
386
- @task
387
- def transform_data(data: dict) -> dict:
388
- """转换数据任务"""
389
- print(f"Transforming {data['records']} records")
390
- return {'processed': data['records']}
391
-
392
- @task
393
- def load_data(data: dict):
394
- """加载数据任务"""
395
- print(f"Loading {data['processed']} records")
396
-
397
- @flow(name="ETL Pipeline", log_prints=True)
398
- def etl_flow(source: str = "api"):
399
- """ETL 工作流"""
400
- raw_data = extract_data(source)
401
- transformed = transform_data(raw_data)
402
- load_data(transformed)
403
- ```
404
-
405
- ### 并发控制
406
-
407
- ```python
408
- from prefect import flow, task
409
- from prefect.task_runners import ConcurrentTaskRunner
410
-
411
- @task
412
- def process_item(item: int) -> int:
413
- """处理单个项目"""
414
- return item * 2
415
-
416
- @flow(task_runner=ConcurrentTaskRunner())
417
- def parallel_flow():
418
- """并发执行任务"""
419
- items = range(10)
420
- results = process_item.map(items)
421
- return results
422
- ```
423
-
424
- ### Deployments 部署
425
-
426
- ```python
427
- from prefect.deployments import Deployment
428
- from prefect.server.schemas.schedules import CronSchedule
429
-
430
- deployment = Deployment.build_from_flow(
431
- flow=etl_flow,
432
- name="etl-production",
433
- schedule=CronSchedule(cron="0 2 * * *"),
434
- work_queue_name="production",
435
- parameters={"source": "database"},
436
- tags=["production", "etl"],
437
- )
438
-
439
- deployment.apply()
440
- ```
441
-
442
- ### Blocks 配置
443
-
444
- ```python
445
- from prefect.blocks.system import Secret, JSON
446
-
447
- # 存储密钥
448
- secret = Secret(value="my-secret-key")
449
- secret.save("api-key")
450
-
451
- # 存储配置
452
- config = JSON(value={"host": "localhost", "port": 5432})
453
- config.save("db-config")
454
-
455
- # 使用 Block
456
- @task
457
- def connect_db():
458
- """连接数据库"""
459
- config = JSON.load("db-config")
460
- api_key = Secret.load("api-key")
461
- print(f"Connecting to {config.value['host']}")
462
- ```
463
-
464
- ## 调度策略
465
-
466
- ### Cron 表达式
467
-
468
- | 表达式 | 说明 | 示例 |
469
- |--------|------|------|
470
- | `0 2 * * *` | 每天凌晨2点 | 日批处理 |
471
- | `0 */4 * * *` | 每4小时 | 增量同步 |
472
- | `0 0 * * 0` | 每周日午夜 | 周报生成 |
473
- | `0 0 1 * *` | 每月1号 | 月度汇总 |
474
- | `*/15 * * * *` | 每15分钟 | 准实时监控 |
475
-
476
- ### 事件驱动调度
477
-
478
- ```python
479
- # Airflow 文件触发
480
- from airflow.sensors.filesystem import FileSensor
481
-
482
- wait_for_file = FileSensor(
483
- task_id='wait_for_file',
484
- filepath='/data/trigger.flag',
485
- poke_interval=10,
486
- )
487
-
488
- # Dagster 传感器触发
489
- from dagster import sensor, RunRequest
490
-
491
- @sensor(job=my_job)
492
- def s3_sensor(context):
493
- """S3 文件到达触发"""
494
- new_files = check_s3_bucket()
495
- for file in new_files:
496
- yield RunRequest(run_key=file)
497
-
498
- # Prefect 自动化触发
499
- from prefect.events import DeploymentEventTrigger
500
-
501
- trigger = DeploymentEventTrigger(
502
- expect={"resource.id": "s3://bucket/data"},
503
- match_related={"resource.type": "file"},
504
- )
505
- ```
506
-
507
- ### 依赖调度
508
-
509
- ```python
510
- # Airflow 跨 DAG 依赖
511
- from airflow.sensors.external_task import ExternalTaskSensor
512
-
513
- wait_upstream = ExternalTaskSensor(
514
- task_id='wait_upstream',
515
- external_dag_id='upstream_dag',
516
- external_task_id='final_task',
517
- )
518
-
519
- # Dagster 资产依赖
520
- @asset(deps=[upstream_asset])
521
- def downstream_asset():
522
- pass
523
-
524
- # Prefect 子流程
525
- @flow
526
- def parent_flow():
527
- child_flow()
528
- ```
529
-
530
- ## 错误处理
531
-
532
- ### 重试策略
533
-
534
- ```python
535
- # Airflow 重试
536
- default_args = {
537
- 'retries': 3,
538
- 'retry_delay': timedelta(minutes=5),
539
- 'retry_exponential_backoff': True,
540
- 'max_retry_delay': timedelta(hours=1),
541
- }
542
-
543
- # Dagster 重试
544
- from dagster import RetryPolicy
545
-
546
- @asset(
547
- retry_policy=RetryPolicy(
548
- max_retries=3,
549
- delay=60,
550
- )
551
- )
552
- def my_asset():
553
- pass
554
-
555
- # Prefect 重试
556
- @task(
557
- retries=3,
558
- retry_delay_seconds=60,
559
- retry_jitter_factor=0.5,
560
- )
561
- def my_task():
562
- pass
563
- ```
564
-
565
- ### 失败回调
566
-
567
- ```python
568
- # Airflow 回调
569
- def on_failure_callback(context):
570
- """失败回调"""
571
- task = context['task_instance']
572
- send_alert(f"Task {task.task_id} failed")
573
-
574
- task = PythonOperator(
575
- task_id='my_task',
576
- python_callable=my_func,
577
- on_failure_callback=on_failure_callback,
578
- )
579
-
580
- # Dagster 钩子
581
- from dagster import failure_hook
582
-
583
- @failure_hook
584
- def slack_on_failure(context):
585
- """失败通知"""
586
- send_slack_message(f"Asset {context.asset_key} failed")
587
-
588
- @asset(hooks={slack_on_failure})
589
- def my_asset():
590
- pass
591
- ```
592
-
593
- ## 监控告警
594
-
595
- ### 指标收集
596
-
597
- ```python
598
- # Airflow 指标
599
- from airflow.metrics import Stats
600
-
601
- def my_task():
602
- Stats.incr('custom.task.count')
603
- Stats.timing('custom.task.duration', 100)
604
- Stats.gauge('custom.task.records', 1000)
605
-
606
- # Dagster 元数据
607
- from dagster import MaterializeResult
608
-
609
- @asset
610
- def my_asset():
611
- return MaterializeResult(
612
- metadata={
613
- "records_processed": 1000,
614
- "duration_seconds": 45.2,
615
- }
616
- )
617
- ```
618
-
619
- ### SLA 监控
620
-
621
- ```python
622
- # Airflow SLA
623
- with DAG(
624
- 'my_dag',
625
- default_args={
626
- 'sla': timedelta(hours=2),
627
- 'sla_miss_callback': sla_miss_alert,
628
- }
629
- ) as dag:
630
- task = PythonOperator(task_id='task')
631
-
632
- # Dagster 资产检查
633
- from dagster import asset_check, AssetCheckResult
634
-
635
- @asset_check(asset=my_asset)
636
- def check_freshness():
637
- """检查数据新鲜度"""
638
- age = get_data_age()
639
- return AssetCheckResult(
640
- passed=age < timedelta(hours=2),
641
- metadata={"age_hours": age.total_seconds() / 3600}
642
- )
643
- ```
644
-
645
- ## 数据血缘
646
-
647
- ### Airflow Lineage
648
-
649
- ```python
650
- from airflow.lineage import AUTO
651
- from airflow.lineage.entities import File
652
-
653
- input_file = File("/data/input.csv")
654
- output_file = File("/data/output.csv")
655
-
656
- task = PythonOperator(
657
- task_id='transform',
658
- python_callable=transform_func,
659
- inlets={"auto": AUTO, "datasets": [input_file]},
660
- outlets={"datasets": [output_file]},
661
- )
662
- ```
663
-
664
- ### Dagster 血缘追踪
665
-
666
- ```python
667
- from dagster import AssetIn, asset
668
-
669
- @asset
670
- def source_data():
671
- """源数据"""
672
- return pd.DataFrame()
673
-
674
- @asset(
675
- ins={"source": AssetIn("source_data")},
676
- )
677
- def transformed_data(source: pd.DataFrame):
678
- """转换数据 - 自动追踪血缘"""
679
- return source.copy()
680
- ```
681
-
682
- ## 最佳实践
683
-
684
- ### 幂等性设计
685
-
686
- ```python
687
- # 幂等写入:此处为整表替换(if_exists='replace');增量写入应使用 UPSERT(ON CONFLICT)而非普通 INSERT
688
- def load_data(df: pd.DataFrame):
689
- """幂等加载"""
690
- df.to_sql(
691
- 'users',
692
- engine,
693
- if_exists='replace', # 或使用 ON CONFLICT
694
- index=False,
695
- )
696
-
697
- # 使用分区覆盖
698
- def write_partition(df: pd.DataFrame, date: str):
699
- """分区覆盖写入"""
700
- path = f"s3://bucket/data/date={date}/"
701
- df.to_parquet(path, mode='overwrite')
702
- ```
703
-
704
- ### 增量处理
705
-
706
- ```python
707
- @task
708
- def incremental_extract(last_run: datetime):
709
- """增量提取"""
710
- query = f"""
711
- SELECT * FROM users
712
- WHERE updated_at > '{last_run}'
713
- """
714
- return pd.read_sql(query, engine)
715
-
716
- @flow
717
- def incremental_flow():
718
- """增量流程"""
719
- last_run = get_last_run_time()
720
- new_data = incremental_extract(last_run)
721
- if not new_data.empty:
722
- transform_and_load(new_data)
723
- ```
724
-
725
- ### 数据验证
726
-
727
- ```python
728
- @task
729
- def validate_data(df: pd.DataFrame):
730
- """数据验证"""
731
- assert not df.empty, "DataFrame is empty"
732
- assert df['user_id'].is_unique, "Duplicate user_id"
733
- assert df['email'].notna().all(), "Null emails found"
734
- assert df['age'].between(0, 120).all(), "Invalid age"
735
- ```
736
-
737
- ## 框架对比
738
-
739
- | 特性 | Airflow | Dagster | Prefect |
740
- |------|---------|---------|---------|
741
- | 学习曲线 | 陡峭 | 中等 | 平缓 |
742
- | 资产管理 | ❌ | ✅ | ❌ |
743
- | 动态任务 | ✅ | ✅ | ✅ |
744
- | 本地开发 | 复杂 | 简单 | 简单 |
745
- | UI 体验 | 传统 | 现代 | 现代 |
746
- | 社区生态 | 最大 | 成长中 | 成长中 |
747
- | 企业支持 | Astronomer | Dagster+ | Prefect Cloud |
748
-
749
- ## 工具清单
750
-
751
- | 工具 | 用途 | 推荐场景 |
752
- |------|------|----------|
753
- | Apache Airflow | 批处理编排 | 复杂 DAG、成熟生态 |
754
- | Dagster | 资产管理 | 数据资产、血缘追踪 |
755
- | Prefect | 现代工作流 | 快速开发、动态流程 |
756
- | Luigi | 轻量编排 | 简单管道、Python 原生 |
757
- | Argo Workflows | K8s 编排 | 云原生、容器化 |
758
- | Temporal | 持久化工作流 | 长时任务、状态管理 |
759
-
760
- ## 触发词
761
-
762
- 数据管道、Airflow、Dagster、Prefect、ETL、数据编排、DAG、调度、工作流、数据血缘