@sylix/coworker 2.0.11 → 2.0.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/commands/slash/config.d.ts.map +1 -1
- package/dist/commands/slash/config.js +22 -4
- package/dist/commands/slash/config.js.map +1 -1
- package/dist/core/CoWorkerAgent.d.ts.map +1 -1
- package/dist/core/CoWorkerAgent.js +6 -3
- package/dist/core/CoWorkerAgent.js.map +1 -1
- package/dist/skills/defaults/accessibility/screen-reader-testing.md +545 -0
- package/dist/skills/defaults/accessibility/wcag-audit-patterns.md +555 -0
- package/dist/skills/defaults/ai-ml/rag.md +276 -0
- package/dist/skills/defaults/backend-development/api-design-principles.md +528 -0
- package/dist/skills/defaults/backend-development/api-design.md +285 -0
- package/dist/skills/defaults/backend-development/architecture-patterns.md +494 -0
- package/dist/skills/defaults/backend-development/async-python.md +237 -0
- package/dist/skills/defaults/backend-development/auth-implementation-patterns.md +638 -0
- package/dist/skills/defaults/backend-development/bazel-build-optimization.md +387 -0
- package/dist/skills/defaults/backend-development/billing-automation/SKILL.md +566 -0
- package/dist/skills/defaults/backend-development/code-review-excellence.md +538 -0
- package/dist/skills/defaults/backend-development/cqrs-implementation.md +554 -0
- package/dist/skills/defaults/backend-development/database-design.md +305 -0
- package/dist/skills/defaults/backend-development/debugging-strategies.md +536 -0
- package/dist/skills/defaults/backend-development/e2e-testing-patterns.md +544 -0
- package/dist/skills/defaults/backend-development/error-handling-patterns.md +641 -0
- package/dist/skills/defaults/backend-development/fastapi-templates.md +559 -0
- package/dist/skills/defaults/backend-development/fastapi.md +309 -0
- package/dist/skills/defaults/backend-development/git-advanced-workflows.md +405 -0
- package/dist/skills/defaults/backend-development/microservices-patterns.md +595 -0
- package/dist/skills/defaults/backend-development/microservices.md +284 -0
- package/dist/skills/defaults/backend-development/monorepo-management.md +623 -0
- package/dist/skills/defaults/backend-development/nodejs-backend-patterns.md +1048 -0
- package/dist/skills/defaults/backend-development/nx-workspace-patterns.md +457 -0
- package/dist/skills/defaults/backend-development/paypal-integration/SKILL.md +478 -0
- package/dist/skills/defaults/backend-development/pci-compliance/SKILL.md +480 -0
- package/dist/skills/defaults/backend-development/python-anti-patterns.md +349 -0
- package/dist/skills/defaults/backend-development/python-background-jobs.md +364 -0
- package/dist/skills/defaults/backend-development/python-code-style.md +360 -0
- package/dist/skills/defaults/backend-development/python-configuration.md +368 -0
- package/dist/skills/defaults/backend-development/python-design-patterns.md +296 -0
- package/dist/skills/defaults/backend-development/python-error-handling.md +323 -0
- package/dist/skills/defaults/backend-development/python-packaging.md +887 -0
- package/dist/skills/defaults/backend-development/python-performance-optimization.md +874 -0
- package/dist/skills/defaults/backend-development/python-project-structure.md +252 -0
- package/dist/skills/defaults/backend-development/python-resilience.md +376 -0
- package/dist/skills/defaults/backend-development/python-resource-management.md +421 -0
- package/dist/skills/defaults/backend-development/python-type-safety.md +428 -0
- package/dist/skills/defaults/backend-development/sql-optimization-patterns.md +509 -0
- package/dist/skills/defaults/backend-development/stripe-integration/SKILL.md +522 -0
- package/dist/skills/defaults/backend-development/turborepo-caching.md +376 -0
- package/dist/skills/defaults/blockchain/defi-protocol-templates.md +430 -0
- package/dist/skills/defaults/blockchain/nft-standards.md +364 -0
- package/dist/skills/defaults/blockchain/solidity-security.md +514 -0
- package/dist/skills/defaults/blockchain/web3-testing.md +360 -0
- package/dist/skills/defaults/business/competitive-landscape/SKILL.md +527 -0
- package/dist/skills/defaults/business/market-sizing-analysis/SKILL.md +451 -0
- package/dist/skills/defaults/business/startup-financial-modeling/SKILL.md +494 -0
- package/dist/skills/defaults/business/startup-metrics-framework/SKILL.md +564 -0
- package/dist/skills/defaults/business/team-composition-analysis.md +437 -0
- package/dist/skills/defaults/compliance/employment-contract-templates/SKILL.md +527 -0
- package/dist/skills/defaults/compliance/gdpr-data-handling/SKILL.md +630 -0
- package/dist/skills/defaults/data-engineering/airflow-dag-patterns.md +436 -0
- package/dist/skills/defaults/data-engineering/airflow.md +519 -0
- package/dist/skills/defaults/data-engineering/data-quality.md +583 -0
- package/dist/skills/defaults/data-engineering/dbt-transformation-patterns.md +482 -0
- package/dist/skills/defaults/data-engineering/dbt.md +556 -0
- package/dist/skills/defaults/data-engineering/ml-pipeline-workflow/SKILL.md +247 -0
- package/dist/skills/defaults/data-engineering/spark-optimization.md +348 -0
- package/dist/skills/defaults/data-engineering/spark.md +411 -0
- package/dist/skills/defaults/database/postgresql.md +202 -0
- package/dist/skills/defaults/debugging/systematic-debugging.md +249 -0
- package/dist/skills/defaults/devops/architecture-decision-records.md +448 -0
- package/dist/skills/defaults/devops/changelog-automation.md +580 -0
- package/dist/skills/defaults/devops/cicd.md +314 -0
- package/dist/skills/defaults/devops/cloud.md +263 -0
- package/dist/skills/defaults/devops/code-review-excellence.md +299 -0
- package/dist/skills/defaults/devops/cost-optimization.md +295 -0
- package/dist/skills/defaults/devops/deployment-pipeline-design.md +356 -0
- package/dist/skills/defaults/devops/docker.md +281 -0
- package/dist/skills/defaults/devops/git-workflows.md +205 -0
- package/dist/skills/defaults/devops/github-actions.md +311 -0
- package/dist/skills/defaults/devops/gitlab-ci-patterns.md +266 -0
- package/dist/skills/defaults/devops/hybrid-cloud-networking.md +241 -0
- package/dist/skills/defaults/devops/istio-traffic-management.md +327 -0
- package/dist/skills/defaults/devops/kubernetes.md +339 -0
- package/dist/skills/defaults/devops/linkerd-patterns.md +311 -0
- package/dist/skills/defaults/devops/multi-cloud-architecture.md +181 -0
- package/dist/skills/defaults/devops/observability.md +243 -0
- package/dist/skills/defaults/devops/openapi-spec-generation.md +1024 -0
- package/dist/skills/defaults/devops/postmortem-writing.md +396 -0
- package/dist/skills/defaults/devops/prometheus-configuration.md +265 -0
- package/dist/skills/defaults/devops/secrets-management.md +341 -0
- package/dist/skills/defaults/devops/service-mesh-observability.md +385 -0
- package/dist/skills/defaults/devops/terraform-module-library.md +244 -0
- package/dist/skills/defaults/finance/backtesting-frameworks/SKILL.md +663 -0
- package/dist/skills/defaults/finance/risk-metrics-calculation/SKILL.md +557 -0
- package/dist/skills/defaults/frontend/accessibility-compliance.md +420 -0
- package/dist/skills/defaults/frontend/design-system-patterns.md +337 -0
- package/dist/skills/defaults/frontend/interaction-design.md +327 -0
- package/dist/skills/defaults/frontend/javascript.md +311 -0
- package/dist/skills/defaults/frontend/modern-javascript-patterns.md +927 -0
- package/dist/skills/defaults/frontend/react-native-design.md +440 -0
- package/dist/skills/defaults/frontend/react.md +345 -0
- package/dist/skills/defaults/frontend/responsive-design.md +472 -0
- package/dist/skills/defaults/frontend/tailwind-design-system.md +337 -0
- package/dist/skills/defaults/frontend/typescript-advanced-types.md +724 -0
- package/dist/skills/defaults/frontend/typescript.md +334 -0
- package/dist/skills/defaults/frontend/visual-design-foundations.md +326 -0
- package/dist/skills/defaults/frontend/web-component-design.md +279 -0
- package/dist/skills/defaults/game-development/godot-gdscript-patterns.md +188 -0
- package/dist/skills/defaults/game-development/unity-ecs-patterns.md +594 -0
- package/dist/skills/defaults/kubernetes/gitops-workflow.md +285 -0
- package/dist/skills/defaults/kubernetes/gitops.md +280 -0
- package/dist/skills/defaults/kubernetes/helm-chart-scaffolding.md +553 -0
- package/dist/skills/defaults/kubernetes/helm.md +343 -0
- package/dist/skills/defaults/kubernetes/k8s-manifest-generator.md +501 -0
- package/dist/skills/defaults/kubernetes/k8s-security-policies.md +342 -0
- package/dist/skills/defaults/kubernetes/manifests.md +330 -0
- package/dist/skills/defaults/kubernetes/security.md +337 -0
- package/dist/skills/defaults/llm-application/embedding-strategies.md +608 -0
- package/dist/skills/defaults/llm-application/hybrid-search-implementation.md +570 -0
- package/dist/skills/defaults/llm-application/hybrid-search.md +570 -0
- package/dist/skills/defaults/llm-application/langchain-architecture.md +666 -0
- package/dist/skills/defaults/llm-application/langchain.md +259 -0
- package/dist/skills/defaults/llm-application/llm-evaluation.md +695 -0
- package/dist/skills/defaults/llm-application/prompt-engineering-patterns.md +449 -0
- package/dist/skills/defaults/llm-application/prompt-engineering.md +219 -0
- package/dist/skills/defaults/llm-application/rag-implementation.md +434 -0
- package/dist/skills/defaults/llm-application/similarity-search-patterns.md +560 -0
- package/dist/skills/defaults/llm-application/similarity-search.md +560 -0
- package/dist/skills/defaults/llm-application/vector-index-tuning.md +523 -0
- package/dist/skills/defaults/mobile/mobile-android-design.md +440 -0
- package/dist/skills/defaults/mobile/mobile-ios-design.md +266 -0
- package/dist/skills/defaults/monitoring/distributed-tracing.md +436 -0
- package/dist/skills/defaults/monitoring/grafana-dashboards.md +370 -0
- package/dist/skills/defaults/monitoring/prometheus-configuration.md +379 -0
- package/dist/skills/defaults/monitoring/slo-implementation.md +323 -0
- package/dist/skills/defaults/refactoring/code-refactoring.md +349 -0
- package/dist/skills/defaults/security/anti-reversing-techniques/SKILL.md +559 -0
- package/dist/skills/defaults/security/auditor.md +168 -0
- package/dist/skills/defaults/security/binary-analysis-patterns/SKILL.md +438 -0
- package/dist/skills/defaults/security/memory-forensics/SKILL.md +483 -0
- package/dist/skills/defaults/security/mtls-configuration.md +349 -0
- package/dist/skills/defaults/security/protocol-reverse-engineering/SKILL.md +520 -0
- package/dist/skills/defaults/security/sast-configuration.md +182 -0
- package/dist/skills/defaults/security/security.md +313 -0
- package/dist/skills/defaults/security/stride-analysis.md +273 -0
- package/dist/skills/defaults/security/threat-mitigation-mapping.md +290 -0
- package/dist/skills/defaults/systems/bash-defensive-patterns/SKILL.md +539 -0
- package/dist/skills/defaults/systems/bats-testing-patterns/SKILL.md +631 -0
- package/dist/skills/defaults/systems/go-concurrency-patterns.md +657 -0
- package/dist/skills/defaults/systems/memory-safety-patterns.md +605 -0
- package/dist/skills/defaults/systems/rust-async-patterns.md +519 -0
- package/dist/skills/defaults/systems/shellcheck-configuration/SKILL.md +456 -0
- package/dist/skills/defaults/team-collaboration/multi-reviewer-patterns.md +126 -0
- package/dist/skills/defaults/team-collaboration/parallel-feature-development.md +151 -0
- package/dist/skills/defaults/testing/javascript-testing-patterns.md +1021 -0
- package/dist/skills/defaults/testing/python-testing-patterns.md +351 -0
- package/dist/skills/defaults/testing/testing.md +332 -0
- package/dist/skills/defaults/workflows/context-driven-development.md +384 -0
- package/dist/skills/defaults/workflows/track-management.md +592 -0
- package/dist/skills/defaults/workflows/workflow-patterns.md +622 -0
- package/dist/skills/index.d.ts +11 -0
- package/dist/skills/index.d.ts.map +1 -0
- package/dist/skills/index.js +129 -0
- package/dist/skills/index.js.map +1 -0
- package/dist/utils/character.js +4 -4
- package/dist/utils/character.js.map +1 -1
- package/dist/utils/inputbar.d.ts.map +1 -1
- package/dist/utils/inputbar.js +7 -0
- package/dist/utils/inputbar.js.map +1 -1
- package/package.json +1 -1
|
@@ -0,0 +1,436 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: airflow-dag-patterns
|
|
3
|
+
description: Build production Apache Airflow DAGs with best practices for operators, sensors, testing, and deployment
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# Apache Airflow DAG Patterns
|
|
7
|
+
|
|
8
|
+
Production-ready patterns for Apache Airflow including DAG design, operators, sensors, testing, and deployment strategies.
|
|
9
|
+
|
|
10
|
+
## When to Use This Skill
|
|
11
|
+
|
|
12
|
+
- Creating data pipeline orchestration with Airflow
|
|
13
|
+
- Designing DAG structures and dependencies
|
|
14
|
+
- Implementing custom operators and sensors
|
|
15
|
+
- Testing Airflow DAGs locally
|
|
16
|
+
- Setting up Airflow in production
|
|
17
|
+
- Debugging failed DAG runs
|
|
18
|
+
|
|
19
|
+
## Core Concepts
|
|
20
|
+
|
|
21
|
+
### DAG Design Principles
|
|
22
|
+
|
|
23
|
+
| Principle | Description |
|
|
24
|
+
| --------------- | ----------------------------------- |
|
|
25
|
+
| **Idempotent** | Running twice produces same result |
|
|
26
|
+
| **Atomic** | Tasks succeed or fail completely |
|
|
27
|
+
| **Incremental** | Process only new/changed data |
|
|
28
|
+
| **Observable** | Logs, metrics, alerts at every step |
|
|
29
|
+
|
|
30
|
+
### Task Dependencies
|
|
31
|
+
|
|
32
|
+
```python
|
|
33
|
+
# Linear
|
|
34
|
+
task1 >> task2 >> task3
|
|
35
|
+
|
|
36
|
+
# Fan-out
|
|
37
|
+
task1 >> [task2, task3, task4]
|
|
38
|
+
|
|
39
|
+
# Fan-in
|
|
40
|
+
[task1, task2, task3] >> task4
|
|
41
|
+
|
|
42
|
+
# Complex
|
|
43
|
+
task1 >> task2 >> task4
|
|
44
|
+
task1 >> task3 >> task4
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
## Quick Start
|
|
48
|
+
|
|
49
|
+
```python
|
|
50
|
+
from datetime import datetime, timedelta
|
|
51
|
+
from airflow import DAG
|
|
52
|
+
from airflow.operators.python import PythonOperator
|
|
53
|
+
from airflow.operators.empty import EmptyOperator
|
|
54
|
+
|
|
55
|
+
default_args = {
|
|
56
|
+
'owner': 'data-team',
|
|
57
|
+
'depends_on_past': False,
|
|
58
|
+
'email_on_failure': True,
|
|
59
|
+
'email_on_retry': False,
|
|
60
|
+
'retries': 3,
|
|
61
|
+
'retry_delay': timedelta(minutes=5),
|
|
62
|
+
'retry_exponential_backoff': True,
|
|
63
|
+
'max_retry_delay': timedelta(hours=1),
|
|
64
|
+
}
|
|
65
|
+
|
|
66
|
+
with DAG(
|
|
67
|
+
dag_id='example_etl',
|
|
68
|
+
default_args=default_args,
|
|
69
|
+
description='Example ETL pipeline',
|
|
70
|
+
schedule='0 6 * * *',
|
|
71
|
+
start_date=datetime(2024, 1, 1),
|
|
72
|
+
catchup=False,
|
|
73
|
+
tags=['etl', 'example'],
|
|
74
|
+
max_active_runs=1,
|
|
75
|
+
) as dag:
|
|
76
|
+
|
|
77
|
+
start = EmptyOperator(task_id='start')
|
|
78
|
+
|
|
79
|
+
def extract_data(**context):
|
|
80
|
+
execution_date = context['ds']
|
|
81
|
+
return {'records': 1000}
|
|
82
|
+
|
|
83
|
+
extract = PythonOperator(
|
|
84
|
+
task_id='extract',
|
|
85
|
+
python_callable=extract_data,
|
|
86
|
+
)
|
|
87
|
+
|
|
88
|
+
end = EmptyOperator(task_id='end')
|
|
89
|
+
|
|
90
|
+
start >> extract >> end
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
## Patterns
|
|
94
|
+
|
|
95
|
+
### Pattern 1: TaskFlow API (Airflow 2.0+)
|
|
96
|
+
|
|
97
|
+
```python
|
|
98
|
+
from datetime import datetime
|
|
99
|
+
from airflow.decorators import dag, task
|
|
100
|
+
from airflow.models import Variable
|
|
101
|
+
|
|
102
|
+
@dag(
|
|
103
|
+
dag_id='taskflow_etl',
|
|
104
|
+
schedule='@daily',
|
|
105
|
+
start_date=datetime(2024, 1, 1),
|
|
106
|
+
catchup=False,
|
|
107
|
+
tags=['etl', 'taskflow'],
|
|
108
|
+
)
|
|
109
|
+
def taskflow_etl():
|
|
110
|
+
|
|
111
|
+
@task()
|
|
112
|
+
def extract(source: str) -> dict:
|
|
113
|
+
import pandas as pd
|
|
114
|
+
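from airflow.operators.python import get_current_context
ds = get_current_context()['ds']  # Jinja is not rendered inside a task body; '{{ ds }}' in an f-string is just the literal '{ ds }'
df = pd.read_csv(f's3://bucket/{source}/{ds}.csv')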
|
|
115
|
+
return {'data': df.to_dict(), 'rows': len(df)}
|
|
116
|
+
|
|
117
|
+
@task()
|
|
118
|
+
def transform(extracted: dict) -> dict:
|
|
119
|
+
import pandas as pd
|
|
120
|
+
df = pd.DataFrame(extracted['data'])
|
|
121
|
+
df['processed_at'] = datetime.now()
|
|
122
|
+
df = df.dropna()
|
|
123
|
+
return {'data': df.to_dict(), 'rows': len(df)}
|
|
124
|
+
|
|
125
|
+
@task()
|
|
126
|
+
def load(transformed: dict, target: str):
|
|
127
|
+
import pandas as pd
|
|
128
|
+
df = pd.DataFrame(transformed['data'])
|
|
129
|
+
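from airflow.operators.python import get_current_context
ds = get_current_context()['ds']  # Jinja is not rendered inside a task body; '{{ ds }}' in an f-string is just the literal '{ ds }'
df.to_parquet(f's3://bucket/{target}/{ds}.parquet')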
|
|
130
|
+
return transformed['rows']
|
|
131
|
+
|
|
132
|
+
@task()
|
|
133
|
+
def notify(rows_loaded: int):
|
|
134
|
+
print(f'Loaded {rows_loaded} rows')
|
|
135
|
+
|
|
136
|
+
extracted = extract(source='raw_data')
|
|
137
|
+
transformed = transform(extracted)
|
|
138
|
+
loaded = load(transformed, target='processed_data')
|
|
139
|
+
notify(loaded)
|
|
140
|
+
|
|
141
|
+
taskflow_etl()
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
### Pattern 2: Dynamic DAG Generation
|
|
145
|
+
|
|
146
|
+
```python
|
|
147
|
+
from datetime import datetime, timedelta
|
|
148
|
+
from airflow import DAG
|
|
149
|
+
from airflow.operators.python import PythonOperator
|
|
150
|
+
|
|
151
|
+
PIPELINE_CONFIGS = [
|
|
152
|
+
{'name': 'customers', 'schedule': '@daily', 'source': 's3://raw/customers'},
|
|
153
|
+
{'name': 'orders', 'schedule': '@hourly', 'source': 's3://raw/orders'},
|
|
154
|
+
{'name': 'products', 'schedule': '@weekly', 'source': 's3://raw/products'},
|
|
155
|
+
]
|
|
156
|
+
|
|
157
|
+
def create_dag(config: dict) -> DAG:
|
|
158
|
+
dag_id = f"etl_{config['name']}"
|
|
159
|
+
|
|
160
|
+
default_args = {
|
|
161
|
+
'owner': 'data-team',
|
|
162
|
+
'retries': 3,
|
|
163
|
+
'retry_delay': timedelta(minutes=5),
|
|
164
|
+
}
|
|
165
|
+
|
|
166
|
+
dag = DAG(
|
|
167
|
+
dag_id=dag_id,
|
|
168
|
+
default_args=default_args,
|
|
169
|
+
schedule=config['schedule'],
|
|
170
|
+
start_date=datetime(2024, 1, 1),
|
|
171
|
+
catchup=False,
|
|
172
|
+
tags=['etl', 'dynamic', config['name']],
|
|
173
|
+
)
|
|
174
|
+
|
|
175
|
+
with dag:
|
|
176
|
+
def extract_fn(source, **context):
|
|
177
|
+
print(f"Extracting from {source} for {context['ds']}")
|
|
178
|
+
|
|
179
|
+
def transform_fn(**context):
|
|
180
|
+
print(f"Transforming data for {context['ds']}")
|
|
181
|
+
|
|
182
|
+
def load_fn(table_name, **context):
|
|
183
|
+
print(f"Loading to {table_name} for {context['ds']}")
|
|
184
|
+
|
|
185
|
+
extract = PythonOperator(task_id='extract', python_callable=extract_fn, op_kwargs={'source': config['source']})
|
|
186
|
+
transform = PythonOperator(task_id='transform', python_callable=transform_fn)
|
|
187
|
+
load = PythonOperator(task_id='load', python_callable=load_fn, op_kwargs={'table_name': config['name']})
|
|
188
|
+
|
|
189
|
+
extract >> transform >> load
|
|
190
|
+
|
|
191
|
+
return dag
|
|
192
|
+
|
|
193
|
+
for config in PIPELINE_CONFIGS:
|
|
194
|
+
globals()[f"dag_{config['name']}"] = create_dag(config)
|
|
195
|
+
```
|
|
196
|
+
|
|
197
|
+
### Pattern 3: Branching and Conditional Logic
|
|
198
|
+
|
|
199
|
+
```python
|
|
200
|
+
from datetime import datetime
from airflow.decorators import dag, task
|
|
201
|
+
from airflow.operators.python import BranchPythonOperator
|
|
202
|
+
from airflow.operators.empty import EmptyOperator
|
|
203
|
+
from airflow.utils.trigger_rule import TriggerRule
|
|
204
|
+
|
|
205
|
+
@dag(dag_id='branching_pipeline', schedule='@daily', start_date=datetime(2024, 1, 1), catchup=False)
|
|
206
|
+
def branching_pipeline():
|
|
207
|
+
|
|
208
|
+
@task()
|
|
209
|
+
def check_data_quality() -> dict:
|
|
210
|
+
quality_score = 0.95
|
|
211
|
+
return {'score': quality_score, 'rows': 10000}
|
|
212
|
+
|
|
213
|
+
def choose_branch(**context) -> str:
|
|
214
|
+
ti = context['ti']
|
|
215
|
+
metrics = ti.xcom_pull(task_ids='check_data_quality')
|
|
216
|
+
|
|
217
|
+
if metrics['score'] >= 0.9:
|
|
218
|
+
return 'high_quality_path'
|
|
219
|
+
elif metrics['score'] >= 0.7:
|
|
220
|
+
return 'medium_quality_path'
|
|
221
|
+
else:
|
|
222
|
+
return 'low_quality_path'
|
|
223
|
+
|
|
224
|
+
quality_check = check_data_quality()
|
|
225
|
+
|
|
226
|
+
branch = BranchPythonOperator(task_id='branch', python_callable=choose_branch)
|
|
227
|
+
|
|
228
|
+
high_quality = EmptyOperator(task_id='high_quality_path')
|
|
229
|
+
medium_quality = EmptyOperator(task_id='medium_quality_path')
|
|
230
|
+
low_quality = EmptyOperator(task_id='low_quality_path')
|
|
231
|
+
|
|
232
|
+
join = EmptyOperator(
|
|
233
|
+
task_id='join',
|
|
234
|
+
trigger_rule=TriggerRule.NONE_FAILED_MIN_ONE_SUCCESS,
|
|
235
|
+
)
|
|
236
|
+
|
|
237
|
+
quality_check >> branch >> [high_quality, medium_quality, low_quality] >> join
|
|
238
|
+
|
|
239
|
+
branching_pipeline()
|
|
240
|
+
```
|
|
241
|
+
|
|
242
|
+
### Pattern 4: Sensors and External Dependencies
|
|
243
|
+
|
|
244
|
+
```python
|
|
245
|
+
from datetime import datetime
|
|
246
|
+
from airflow import DAG
|
|
247
|
+
from airflow.sensors.filesystem import FileSensor
|
|
248
|
+
from airflow.providers.amazon.aws.sensors.s3 import S3KeySensor
|
|
249
|
+
from airflow.sensors.external_task import ExternalTaskSensor
|
|
250
|
+
from airflow.operators.python import PythonOperator
from airflow.decorators import task
from airflow.sensors.base import PokeReturnValue
|
|
251
|
+
|
|
252
|
+
with DAG(dag_id='sensor_example', schedule='@daily', start_date=datetime(2024, 1, 1), catchup=False) as dag:
|
|
253
|
+
|
|
254
|
+
wait_for_file = S3KeySensor(
|
|
255
|
+
task_id='wait_for_s3_file',
|
|
256
|
+
bucket_name='data-lake',
|
|
257
|
+
bucket_key='raw/{{ ds }}/data.parquet',
|
|
258
|
+
aws_conn_id='aws_default',
|
|
259
|
+
timeout=60 * 60 * 2,
|
|
260
|
+
poke_interval=60 * 5,
|
|
261
|
+
mode='reschedule',
|
|
262
|
+
)
|
|
263
|
+
|
|
264
|
+
wait_for_upstream = ExternalTaskSensor(
|
|
265
|
+
task_id='wait_for_upstream_dag',
|
|
266
|
+
external_dag_id='upstream_etl',
|
|
267
|
+
external_task_id='final_task',
|
|
268
|
+
execution_date_fn=lambda dt: dt,
|
|
269
|
+
timeout=60 * 60 * 3,
|
|
270
|
+
mode='reschedule',
|
|
271
|
+
)
|
|
272
|
+
|
|
273
|
+
@task.sensor(poke_interval=60, timeout=3600, mode='reschedule')
|
|
274
|
+
def wait_for_api() -> PokeReturnValue:
|
|
275
|
+
import requests
|
|
276
|
+
response = requests.get('https://api.example.com/health', timeout=10)  # bound each poke so a hung connection cannot block the sensor
|
|
277
|
+
is_done = response.status_code == 200
|
|
278
|
+
return PokeReturnValue(is_done=is_done, xcom_value=response.json() if is_done else None)  # only parse the body once the endpoint reports healthy
|
|
279
|
+
|
|
280
|
+
api_ready = wait_for_api()
|
|
281
|
+
|
|
282
|
+
def process_data(**context):
|
|
283
|
+
api_result = context['ti'].xcom_pull(task_ids='wait_for_api')
|
|
284
|
+
print(f"API returned: {api_result}")
|
|
285
|
+
|
|
286
|
+
process = PythonOperator(task_id='process', python_callable=process_data)
|
|
287
|
+
|
|
288
|
+
[wait_for_file, wait_for_upstream, api_ready] >> process
|
|
289
|
+
```
|
|
290
|
+
|
|
291
|
+
### Pattern 5: Error Handling and Alerts
|
|
292
|
+
|
|
293
|
+
```python
|
|
294
|
+
from datetime import datetime, timedelta
|
|
295
|
+
from airflow import DAG
|
|
296
|
+
from airflow.operators.python import PythonOperator
|
|
297
|
+
from airflow.utils.trigger_rule import TriggerRule
|
|
298
|
+
|
|
299
|
+
def task_failure_callback(context):
|
|
300
|
+
task_instance = context['task_instance']
|
|
301
|
+
exception = context.get('exception')
|
|
302
|
+
|
|
303
|
+
message = f"""
|
|
304
|
+
Task Failed!
|
|
305
|
+
DAG: {task_instance.dag_id}
|
|
306
|
+
Task: {task_instance.task_id}
|
|
307
|
+
Execution Date: {context['ds']}
|
|
308
|
+
Error: {exception}
|
|
309
|
+
Log URL: {task_instance.log_url}
|
|
310
|
+
"""
|
|
311
|
+
print(message)
|
|
312
|
+
|
|
313
|
+
def dag_failure_callback(context):
|
|
314
|
+
pass
|
|
315
|
+
|
|
316
|
+
with DAG(
|
|
317
|
+
dag_id='error_handling_example',
|
|
318
|
+
schedule='@daily',
|
|
319
|
+
start_date=datetime(2024, 1, 1),
|
|
320
|
+
catchup=False,
|
|
321
|
+
on_failure_callback=dag_failure_callback,
|
|
322
|
+
default_args={
|
|
323
|
+
'on_failure_callback': task_failure_callback,
|
|
324
|
+
'retries': 3,
|
|
325
|
+
'retry_delay': timedelta(minutes=5),
|
|
326
|
+
},
|
|
327
|
+
) as dag:
|
|
328
|
+
|
|
329
|
+
def might_fail(**context):
|
|
330
|
+
import random
|
|
331
|
+
if random.random() < 0.3:
|
|
332
|
+
raise ValueError("Random failure!")
|
|
333
|
+
return "Success"
|
|
334
|
+
|
|
335
|
+
risky_task = PythonOperator(task_id='risky_task', python_callable=might_fail)
|
|
336
|
+
|
|
337
|
+
def cleanup(**context):
|
|
338
|
+
print("Cleaning up...")
|
|
339
|
+
|
|
340
|
+
cleanup_task = PythonOperator(
|
|
341
|
+
task_id='cleanup',
|
|
342
|
+
python_callable=cleanup,
|
|
343
|
+
trigger_rule=TriggerRule.ALL_DONE,
|
|
344
|
+
)
|
|
345
|
+
|
|
346
|
+
def notify_success(**context):
|
|
347
|
+
print("All tasks succeeded!")
|
|
348
|
+
|
|
349
|
+
success_notification = PythonOperator(
|
|
350
|
+
task_id='notify_success',
|
|
351
|
+
python_callable=notify_success,
|
|
352
|
+
trigger_rule=TriggerRule.ALL_SUCCESS,
|
|
353
|
+
)
|
|
354
|
+
|
|
355
|
+
risky_task >> [cleanup_task, success_notification]
|
|
356
|
+
```
|
|
357
|
+
|
|
358
|
+
### Pattern 6: Testing DAGs
|
|
359
|
+
|
|
360
|
+
```python
|
|
361
|
+
import pytest
|
|
362
|
+
from datetime import datetime
|
|
363
|
+
from airflow.models import DagBag
|
|
364
|
+
|
|
365
|
+
@pytest.fixture
|
|
366
|
+
def dagbag():
|
|
367
|
+
return DagBag(dag_folder='dags/', include_examples=False)
|
|
368
|
+
|
|
369
|
+
def test_dag_loaded(dagbag):
|
|
370
|
+
assert len(dagbag.import_errors) == 0, f"DAG import errors: {dagbag.import_errors}"
|
|
371
|
+
|
|
372
|
+
def test_dag_structure(dagbag):
|
|
373
|
+
dag = dagbag.get_dag('example_etl')
|
|
374
|
+
assert dag is not None
|
|
375
|
+
assert len(dag.tasks) == 3
|
|
376
|
+
assert dag.schedule_interval == '0 6 * * *'
|
|
377
|
+
|
|
378
|
+
def test_task_dependencies(dagbag):
|
|
379
|
+
dag = dagbag.get_dag('example_etl')
|
|
380
|
+
extract_task = dag.get_task('extract')
|
|
381
|
+
assert 'start' in [t.task_id for t in extract_task.upstream_list]
|
|
382
|
+
assert 'end' in [t.task_id for t in extract_task.downstream_list]
|
|
383
|
+
|
|
384
|
+
def test_dag_integrity(dagbag):
|
|
385
|
+
for dag_id, dag in dagbag.dags.items():
|
|
386
|
+
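from airflow.utils.dag_cycle_tester import check_cycle  # DAG.test_cycle() was removed in Airflow 2.x
check_cycle(dag)  # raises AirflowDagCycleException if the DAG identified by dag_id contains a cycle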
|
|
387
|
+
|
|
388
|
+
def test_extract_function():
|
|
389
|
+
from dags.example_dag import extract_data
|
|
390
|
+
result = extract_data(ds='2024-01-01')
|
|
391
|
+
assert 'records' in result
|
|
392
|
+
assert isinstance(result['records'], int)
|
|
393
|
+
```
|
|
394
|
+
|
|
395
|
+
## Project Structure
|
|
396
|
+
|
|
397
|
+
```
|
|
398
|
+
airflow/
|
|
399
|
+
├── dags/
|
|
400
|
+
│ ├── __init__.py
|
|
401
|
+
│ ├── common/
|
|
402
|
+
│ │ ├── operators.py
|
|
403
|
+
│ │ ├── sensors.py
|
|
404
|
+
│ │ └── callbacks.py
|
|
405
|
+
│ ├── etl/
|
|
406
|
+
│ │ ├── customers.py
|
|
407
|
+
│ │ └── orders.py
|
|
408
|
+
│ └── ml/
|
|
409
|
+
│ └── training.py
|
|
410
|
+
├── plugins/
|
|
411
|
+
│ └── custom_plugin.py
|
|
412
|
+
├── tests/
|
|
413
|
+
│ ├── __init__.py
|
|
414
|
+
│ ├── test_dags.py
|
|
415
|
+
│ └── test_operators.py
|
|
416
|
+
├── docker-compose.yml
|
|
417
|
+
└── requirements.txt
|
|
418
|
+
```
|
|
419
|
+
|
|
420
|
+
## Best Practices
|
|
421
|
+
|
|
422
|
+
### Do's
|
|
423
|
+
|
|
424
|
+
- **Use TaskFlow API** - Cleaner code, automatic XCom
|
|
425
|
+
- **Set timeouts** - Prevent zombie tasks
|
|
426
|
+
- **Use `mode='reschedule'`** - For sensors, free up workers
|
|
427
|
+
- **Test DAGs** - Unit tests and integration tests
|
|
428
|
+
- **Idempotent tasks** - Safe to retry
|
|
429
|
+
|
|
430
|
+
### Don'ts
|
|
431
|
+
|
|
432
|
+
- **Don't use `depends_on_past=True`** - Creates bottlenecks
|
|
433
|
+
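- **Don't hardcode dates** - Use `{{ ds }}` macros in templated operator fields; inside Python callables read `ds` from the task context instead (Jinja is not rendered in function bodies or f-strings)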
|
|
434
|
+
- **Don't use global state** - Tasks should be stateless
|
|
435
|
+
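- **Don't skip catchup blindly** - With `catchup=False`, intervals missed while the DAG was paused or not yet deployed are never run; backfill them explicitly if the data is needed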
|
|
436
|
+
- **Don't put heavy logic in DAG file** - Import from modules
|