agentic-team-templates 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +280 -0
- package/bin/cli.js +5 -0
- package/package.json +47 -0
- package/src/index.js +521 -0
- package/templates/_shared/code-quality.md +162 -0
- package/templates/_shared/communication.md +114 -0
- package/templates/_shared/core-principles.md +62 -0
- package/templates/_shared/git-workflow.md +165 -0
- package/templates/_shared/security-fundamentals.md +173 -0
- package/templates/blockchain/.cursorrules/defi-patterns.md +520 -0
- package/templates/blockchain/.cursorrules/gas-optimization.md +339 -0
- package/templates/blockchain/.cursorrules/overview.md +130 -0
- package/templates/blockchain/.cursorrules/security.md +318 -0
- package/templates/blockchain/.cursorrules/smart-contracts.md +364 -0
- package/templates/blockchain/.cursorrules/testing.md +415 -0
- package/templates/blockchain/.cursorrules/web3-integration.md +538 -0
- package/templates/blockchain/CLAUDE.md +389 -0
- package/templates/cli-tools/.cursorrules/architecture.md +412 -0
- package/templates/cli-tools/.cursorrules/arguments.md +406 -0
- package/templates/cli-tools/.cursorrules/distribution.md +546 -0
- package/templates/cli-tools/.cursorrules/error-handling.md +455 -0
- package/templates/cli-tools/.cursorrules/overview.md +136 -0
- package/templates/cli-tools/.cursorrules/testing.md +537 -0
- package/templates/cli-tools/.cursorrules/user-experience.md +545 -0
- package/templates/cli-tools/CLAUDE.md +356 -0
- package/templates/data-engineering/.cursorrules/data-modeling.md +367 -0
- package/templates/data-engineering/.cursorrules/data-quality.md +455 -0
- package/templates/data-engineering/.cursorrules/overview.md +85 -0
- package/templates/data-engineering/.cursorrules/performance.md +339 -0
- package/templates/data-engineering/.cursorrules/pipeline-design.md +280 -0
- package/templates/data-engineering/.cursorrules/security.md +460 -0
- package/templates/data-engineering/.cursorrules/testing.md +452 -0
- package/templates/data-engineering/CLAUDE.md +974 -0
- package/templates/devops-sre/.cursorrules/capacity-planning.md +653 -0
- package/templates/devops-sre/.cursorrules/change-management.md +584 -0
- package/templates/devops-sre/.cursorrules/chaos-engineering.md +651 -0
- package/templates/devops-sre/.cursorrules/disaster-recovery.md +641 -0
- package/templates/devops-sre/.cursorrules/incident-management.md +565 -0
- package/templates/devops-sre/.cursorrules/observability.md +714 -0
- package/templates/devops-sre/.cursorrules/overview.md +230 -0
- package/templates/devops-sre/.cursorrules/postmortems.md +588 -0
- package/templates/devops-sre/.cursorrules/runbooks.md +760 -0
- package/templates/devops-sre/.cursorrules/slo-sli.md +617 -0
- package/templates/devops-sre/.cursorrules/toil-reduction.md +567 -0
- package/templates/devops-sre/CLAUDE.md +1007 -0
- package/templates/documentation/.cursorrules/adr.md +277 -0
- package/templates/documentation/.cursorrules/api-documentation.md +411 -0
- package/templates/documentation/.cursorrules/code-comments.md +253 -0
- package/templates/documentation/.cursorrules/maintenance.md +260 -0
- package/templates/documentation/.cursorrules/overview.md +82 -0
- package/templates/documentation/.cursorrules/readme-standards.md +306 -0
- package/templates/documentation/CLAUDE.md +120 -0
- package/templates/fullstack/.cursorrules/api-contracts.md +331 -0
- package/templates/fullstack/.cursorrules/architecture.md +298 -0
- package/templates/fullstack/.cursorrules/overview.md +109 -0
- package/templates/fullstack/.cursorrules/shared-types.md +348 -0
- package/templates/fullstack/.cursorrules/testing.md +386 -0
- package/templates/fullstack/CLAUDE.md +349 -0
- package/templates/ml-ai/.cursorrules/data-engineering.md +483 -0
- package/templates/ml-ai/.cursorrules/deployment.md +601 -0
- package/templates/ml-ai/.cursorrules/model-development.md +538 -0
- package/templates/ml-ai/.cursorrules/monitoring.md +658 -0
- package/templates/ml-ai/.cursorrules/overview.md +131 -0
- package/templates/ml-ai/.cursorrules/security.md +637 -0
- package/templates/ml-ai/.cursorrules/testing.md +678 -0
- package/templates/ml-ai/CLAUDE.md +1136 -0
- package/templates/mobile/.cursorrules/navigation.md +246 -0
- package/templates/mobile/.cursorrules/offline-first.md +302 -0
- package/templates/mobile/.cursorrules/overview.md +71 -0
- package/templates/mobile/.cursorrules/performance.md +345 -0
- package/templates/mobile/.cursorrules/testing.md +339 -0
- package/templates/mobile/CLAUDE.md +233 -0
- package/templates/platform-engineering/.cursorrules/ci-cd.md +778 -0
- package/templates/platform-engineering/.cursorrules/developer-experience.md +632 -0
- package/templates/platform-engineering/.cursorrules/infrastructure-as-code.md +600 -0
- package/templates/platform-engineering/.cursorrules/kubernetes.md +710 -0
- package/templates/platform-engineering/.cursorrules/observability.md +747 -0
- package/templates/platform-engineering/.cursorrules/overview.md +215 -0
- package/templates/platform-engineering/.cursorrules/security.md +855 -0
- package/templates/platform-engineering/.cursorrules/testing.md +878 -0
- package/templates/platform-engineering/CLAUDE.md +850 -0
- package/templates/utility-agent/.cursorrules/action-control.md +284 -0
- package/templates/utility-agent/.cursorrules/context-management.md +186 -0
- package/templates/utility-agent/.cursorrules/hallucination-prevention.md +253 -0
- package/templates/utility-agent/.cursorrules/overview.md +78 -0
- package/templates/utility-agent/.cursorrules/token-optimization.md +369 -0
- package/templates/utility-agent/CLAUDE.md +513 -0
- package/templates/web-backend/.cursorrules/api-design.md +255 -0
- package/templates/web-backend/.cursorrules/authentication.md +309 -0
- package/templates/web-backend/.cursorrules/database-patterns.md +298 -0
- package/templates/web-backend/.cursorrules/error-handling.md +366 -0
- package/templates/web-backend/.cursorrules/overview.md +69 -0
- package/templates/web-backend/.cursorrules/security.md +358 -0
- package/templates/web-backend/.cursorrules/testing.md +395 -0
- package/templates/web-backend/CLAUDE.md +366 -0
- package/templates/web-frontend/.cursorrules/accessibility.md +296 -0
- package/templates/web-frontend/.cursorrules/component-patterns.md +204 -0
- package/templates/web-frontend/.cursorrules/overview.md +72 -0
- package/templates/web-frontend/.cursorrules/performance.md +325 -0
- package/templates/web-frontend/.cursorrules/state-management.md +227 -0
- package/templates/web-frontend/.cursorrules/styling.md +271 -0
- package/templates/web-frontend/.cursorrules/testing.md +311 -0
- package/templates/web-frontend/CLAUDE.md +399 -0
package/templates/data-engineering/.cursorrules/performance.md
@@ -0,0 +1,339 @@
# Performance Optimization

Patterns for building efficient, cost-effective data pipelines.

## Partitioning

### Choose Partition Columns Wisely

Partition by columns used in most query filters (usually date/time).

```python
# Good: Partition by date - most queries filter by date
(orders_df
    .write
    .partitionBy("order_date")
    .saveAsTable("curated.orders"))

# Query with partition pruning
spark.sql("SELECT * FROM curated.orders WHERE order_date = '2024-01-15'")

# Bad: Over-partitioning creates too many small files
(orders_df
    .write
    .partitionBy("order_date", "customer_id", "product_id")  # Millions of partitions!
    .saveAsTable("curated.orders"))
```

### Partition Size Guidelines

| Records per Partition | Assessment |
|-----------------------|------------|
| < 100,000 | Too small - consider fewer partitions |
| 100K - 10M | Optimal range |
| > 10M | Consider sub-partitioning |

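To see where a table falls in this range, check the row count per partition value; a minimal sketch (the table and partition column names are illustrative):

```python
# Minimal sketch: compare per-partition row counts against the guidelines above.
# Table and partition column names are illustrative.
from pyspark.sql import functions as F

partition_counts = (
    spark.read.table("curated.orders")
    .groupBy("order_date")
    .agg(F.count("*").alias("records"))
)

partition_counts.filter("records < 100000").show()     # candidates for a coarser partition key
partition_counts.filter("records > 10000000").show()   # candidates for sub-partitioning
```
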
### Optimize File Sizes

```python
# Set target file size (Delta Lake)
spark.conf.set("spark.databricks.delta.optimizeWrite.fileSize", "128mb")

# Compact small files
spark.sql("OPTIMIZE curated.orders")

# Set max records per file
(df.write
    .option("maxRecordsPerFile", 1_000_000)
    .saveAsTable("curated.orders"))
```

## Query Optimization

### Predicate Pushdown

Ensure filters are pushed down to the storage layer.

```python
# Good: Predicate pushdown works
orders = spark.read.table("curated.orders").filter("order_date = '2024-01-15'")

# Bad: UDF blocks pushdown - full table scan!
@udf(returnType=BooleanType())
def is_recent(d):
    return d > datetime.now() - timedelta(days=7)

orders = spark.read.table("curated.orders").filter(is_recent(F.col("order_date")))

# Good: Use native Spark functions
orders = spark.read.table("curated.orders").filter(
    F.col("order_date") > F.date_sub(F.current_date(), 7)
)
```

### Column Pruning

Select only needed columns early.

```python
# Good: Select needed columns early
(spark.read.table("curated.orders")
    .select("order_id", "customer_id", "total_amount")  # Prune early
    .filter("total_amount > 100")
    .groupBy("customer_id")
    .agg(F.sum("total_amount")))

# Bad: Select all, filter later
(spark.read.table("curated.orders")  # Reads all 50 columns
    .filter("total_amount > 100")
    .select("order_id", "customer_id", "total_amount")  # Too late!
    .groupBy("customer_id")
    .agg(F.sum("total_amount")))
```

### Broadcast Joins

Broadcast small tables to avoid shuffle.

```python
# Good: Broadcast small dimension table
from pyspark.sql.functions import broadcast

orders = spark.table("curated.orders")   # 1B rows
products = spark.table("dims.products")  # 10K rows

joined = orders.join(broadcast(products), "product_id")

# Configure broadcast threshold
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", "100mb")
```

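To confirm the broadcast actually happens, inspect the physical plan of the join above; a minimal sketch:

```python
# Look for BroadcastHashJoin (rather than SortMergeJoin) in the physical plan
joined.explain()

# Setting the threshold to -1 disables automatic broadcasting,
# so only explicit broadcast() hints are used
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", "-1")
```
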
### Avoid Expensive Operations

```python
# Bad: Distinct before join (expensive shuffle)
orders.select("customer_id").distinct().join(customers, "customer_id")

# Good: Use a left semi join to keep only customers that appear in orders
customers.join(orders, "customer_id", "left_semi")

# Bad: Order by on large dataset
spark.table("curated.orders").orderBy("order_date")

# Good: Only sort when necessary, limit first if possible
(spark.table("curated.orders")
    .filter("customer_id = 'C123'")
    .orderBy(F.desc("order_date"))
    .limit(10))
```

## Caching

### When to Cache

Cache intermediate results that are:
1. Used multiple times
2. Expensive to compute
3. Small enough to fit in memory

```python
def process_with_caching():
    # Read and filter once
    base_orders = (
        spark.read.table("curated.orders")
        .filter("order_date >= '2024-01-01'")
        .cache()  # Cache in memory
    )

    try:
        # Multiple aggregations on same filtered data
        daily_totals = base_orders.groupBy("order_date").agg(F.sum("total"))
        customer_totals = base_orders.groupBy("customer_id").agg(F.sum("total"))
        product_totals = base_orders.groupBy("product_id").agg(F.sum("total"))

        # Write all
        daily_totals.write.saveAsTable("marts.daily_totals")
        customer_totals.write.saveAsTable("marts.customer_totals")
        product_totals.write.saveAsTable("marts.product_totals")
    finally:
        base_orders.unpersist()  # Always clean up
```

### Cache Levels

```python
from pyspark import StorageLevel

# DataFrame default - same as persist(StorageLevel.MEMORY_AND_DISK);
# evicted partitions spill to disk instead of being recomputed
df.cache()

# Memory only - fastest, but evicted partitions must be recomputed
df.persist(StorageLevel.MEMORY_ONLY)

# Disk only - for very large datasets
df.persist(StorageLevel.DISK_ONLY)

# Note: the serialized levels (e.g. MEMORY_ONLY_SER - more compact, slower access)
# are Scala/Java API options and are not exposed in pyspark.StorageLevel
```

## Shuffle Optimization

### Reduce Shuffle Size

```python
# Good: Aggregate before join
customer_totals = orders.groupBy("customer_id").agg(F.sum("amount").alias("total"))
result = customer_totals.join(customers, "customer_id")

# Bad: Join then aggregate (shuffles full orders table)
result = orders.join(customers, "customer_id").groupBy("customer_id").agg(F.sum("amount"))
```

### Partition Count

```python
# Check partition count
print(df.rdd.getNumPartitions())

# Reduce partitions before write (coalesce doesn't shuffle)
df.coalesce(100).write.saveAsTable("output")

# Increase partitions for parallelism (repartition shuffles)
df.repartition(200).write.saveAsTable("output")

# Repartition by key for co-located data
df.repartition("customer_id").write.partitionBy("customer_id").saveAsTable("output")
```

### Configure Shuffle Partitions

```python
# Default is 200 - tune based on data size
spark.conf.set("spark.sql.shuffle.partitions", "auto")  # "auto" is Databricks-specific
# Or let AQE coalesce shuffle partitions automatically (Spark 3.0+)
spark.conf.set("spark.sql.adaptive.enabled", "true")
# Or set explicitly
spark.conf.set("spark.sql.shuffle.partitions", "400")
```

## Z-Ordering (Delta Lake)

Co-locate related data for faster queries on multiple columns.

```sql
-- Z-order by commonly filtered/joined columns
OPTIMIZE curated.orders
ZORDER BY (customer_id, product_id)

-- Queries on these columns will be faster
SELECT * FROM curated.orders WHERE customer_id = 'C123'
SELECT * FROM curated.orders WHERE product_id = 'P456'
SELECT * FROM curated.orders WHERE customer_id = 'C123' AND product_id = 'P456'
```

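The same compaction can be triggered from Python; a sketch, assuming delta-spark 2.0+ is installed:

```python
# Sketch: Z-order via the DeltaTable API (requires delta-spark 2.0+)
from delta.tables import DeltaTable

(DeltaTable.forName(spark, "curated.orders")
    .optimize()
    .executeZOrderBy("customer_id", "product_id"))
```
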
## Incremental Processing

Process only what changed.

```python
def incremental_pipeline(source: str, target: str, watermark_col: str):
    # Get last processed watermark
    last_watermark = spark.sql(f"""
        SELECT MAX({watermark_col}) FROM {target}
    """).collect()[0][0]

    # Read only new data
    new_data = (
        spark.read.table(source)
        .filter(F.col(watermark_col) > last_watermark)
    )

    if new_data.isEmpty():
        logger.info("No new data to process")
        return

    # Process and write
    processed = transform(new_data)
    processed.write.mode("append").saveAsTable(target)
```

## Cost Management

### Monitor Compute Costs

```sql
-- Track pipeline costs over time
SELECT
    pipeline_name,
    DATE(run_date) as run_date,
    SUM(total_dbu) as daily_dbu,
    SUM(bytes_scanned) / 1e12 as tb_scanned,
    AVG(duration_seconds) as avg_duration
FROM pipeline_metrics
WHERE run_date >= CURRENT_DATE - 30
GROUP BY pipeline_name, DATE(run_date)
ORDER BY daily_dbu DESC;
```

### Identify Expensive Queries

```sql
-- Find most expensive queries
SELECT
    query_id,
    user,
    LEFT(query_text, 100) as query_preview,
    bytes_scanned / 1e9 as gb_scanned,
    duration_ms / 1000 as duration_seconds
FROM query_history
WHERE timestamp >= CURRENT_DATE - 7
ORDER BY bytes_scanned DESC
LIMIT 20;
```

### Optimize Storage Costs

```python
# Remove old partitions
spark.sql("""
    DELETE FROM curated.orders
    WHERE order_date < DATE_SUB(CURRENT_DATE, 365)
""")

# Vacuum deleted files (Delta Lake)
spark.sql("VACUUM curated.orders RETAIN 168 HOURS")

# Convert to more efficient format
(spark.read.table("legacy.orders")
    .write
    .format("delta")
    .option("compression", "zstd")  # Better compression
    .saveAsTable("curated.orders"))
```

## Performance Checklist

Before deploying a pipeline, verify:

### Query Optimization
- [ ] Filters use partition columns
- [ ] Only needed columns are selected
- [ ] Small tables are broadcast in joins
- [ ] No unnecessary shuffles

### File Optimization
- [ ] Partition column is appropriate
- [ ] Files are right-sized (100MB-1GB)
- [ ] Z-ordering on common filter columns
- [ ] No excessive small files

### Resource Optimization
- [ ] Cluster size matches workload
- [ ] Shuffle partitions configured
- [ ] Caching used for reused DataFrames
- [ ] Memory settings appropriate

### Cost Optimization
- [ ] Incremental processing where possible
- [ ] Data retention policy applied
- [ ] Query costs monitored
- [ ] Unused tables/data removed

package/templates/data-engineering/.cursorrules/pipeline-design.md
@@ -0,0 +1,280 @@
# Pipeline Design

Patterns and practices for building reliable data pipelines.

## Core Principles

### 1. Idempotency

Every pipeline run must produce identical results for the same inputs.

```python
# Good: Delete-insert pattern ensures idempotency
def process_daily_orders(execution_date: date) -> None:
    partition = execution_date.strftime("%Y-%m-%d")

    # Clear target partition first
    spark.sql(f"DELETE FROM curated.orders WHERE order_date = '{partition}'")

    # Then insert
    orders_df.write.mode("append").saveAsTable("curated.orders")

# Bad: Append without clearing creates duplicates on re-run
orders_df.write.mode("append").saveAsTable("curated.orders")
```

### 2. Determinism

Same inputs must always produce same outputs. Avoid:
- `current_timestamp()` in transformations (use execution_date)
- Random sampling without seeds
- Order-dependent operations on unordered data

```python
# Good: Use execution_date for reproducibility
df.withColumn("processed_at", F.lit(execution_date))

# Bad: Non-deterministic timestamp
df.withColumn("processed_at", F.current_timestamp())
```

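The block above covers the timestamp case; for the other two bullets, a minimal sketch (column names are illustrative):

```python
# Sketch: deterministic sampling and explicit ordering (illustrative column names)
from pyspark.sql import Window
from pyspark.sql import functions as F

# Good: Seeded sample is reproducible across runs
sample = df.sample(fraction=0.1, seed=42)

# Good: Make ordering explicit before order-dependent deduplication
latest = (
    df.withColumn(
        "rn",
        F.row_number().over(
            Window.partitionBy("order_id").orderBy(F.col("updated_at").desc())
        ),
    )
    .filter("rn = 1")
    .drop("rn")
)
```
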
### 3. Atomicity

Pipeline outputs should be all-or-nothing. Partial writes corrupt data.

```python
# Good: Write to staging, then atomic swap
# (SWAP WITH is warehouse-specific SQL, e.g. Snowflake; adapt to your engine)
df.write.mode("overwrite").saveAsTable("staging.orders_temp")
spark.sql("ALTER TABLE curated.orders SWAP WITH staging.orders_temp")

# Good: Use Delta Lake transactions
df.write.format("delta").mode("overwrite").saveAsTable("curated.orders")
```

## Pipeline Patterns

### Batch Full Refresh

Use when: Source doesn't support incremental, data is small, or simplicity matters.

```python
def full_refresh_pipeline(source: str, target: str) -> None:
    df = spark.read.table(source)
    df = transform(df)
    df.write.mode("overwrite").saveAsTable(target)
```

### Batch Incremental

Use when: Data volume is large, source supports watermarks.

```python
def incremental_pipeline(source: str, target: str, watermark_col: str) -> None:
    # Get high watermark from previous run
    last_watermark = get_watermark(target, watermark_col)

    # Read only new/changed records
    df = spark.read.table(source).filter(F.col(watermark_col) > last_watermark)

    if df.isEmpty():
        return

    # Merge into target
    target_table = DeltaTable.forName(spark, target)
    (target_table.alias("t")
        .merge(df.alias("s"), "t.id = s.id")
        .whenMatchedUpdateAll()
        .whenNotMatchedInsertAll()
        .execute())
```

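`get_watermark` is assumed to return the highest watermark already loaded into the target; a minimal sketch, including a fallback for the first run:

```python
# Hypothetical helper assumed above: read the high watermark from the target table
from datetime import datetime

def get_watermark(target: str, watermark_col: str):
    row = spark.sql(f"SELECT MAX({watermark_col}) AS wm FROM {target}").collect()[0]
    # Fall back to an early timestamp when the target is still empty
    return row["wm"] or datetime(1970, 1, 1)
```
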
### Change Data Capture (CDC)

Use when: Need to track all changes, support point-in-time queries.

```python
def cdc_pipeline(cdc_events: DataFrame, target: str) -> None:
    """Process CDC events (insert, update, delete operations)."""

    target_table = DeltaTable.forName(spark, target)

    (target_table.alias("t")
        .merge(cdc_events.alias("s"), "t.id = s.id")
        .whenMatchedDelete(condition="s.operation = 'DELETE'")
        .whenMatchedUpdateAll(condition="s.operation = 'UPDATE'")
        .whenNotMatchedInsertAll(condition="s.operation = 'INSERT'")
        .execute())
```

### Streaming

Use when: Low latency required, source is event stream.

```python
def streaming_pipeline() -> None:
    events = (
        spark.readStream
        .format("kafka")
        .option("kafka.bootstrap.servers", KAFKA_BOOTSTRAP_SERVERS)  # required by the Kafka source
        .option("subscribe", "events")
        .load()
    )

    processed = events.transform(process_events)

    (processed
        .writeStream
        .format("delta")
        .option("checkpointLocation", CHECKPOINT_PATH)
        .outputMode("append")
        .trigger(processingTime="1 minute")
        .toTable("curated.events"))
```

## Handling Late Data

### Reprocessing Window

Reprocess recent partitions to catch late arrivals.

```python
def process_with_late_data_handling(execution_date: date) -> None:
    # Reprocess last 3 days to catch late arrivals
    start_date = execution_date - timedelta(days=3)

    for day in date_range(start_date, execution_date):
        process_partition(day)
```

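`date_range` is assumed to yield each day in the window, inclusive; a minimal sketch:

```python
# Hypothetical helper assumed above: yield each date in [start, end]
from datetime import date, timedelta
from typing import Iterator

def date_range(start: date, end: date) -> Iterator[date]:
    current = start
    while current <= end:
        yield current
        current += timedelta(days=1)
```
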
### Watermarking (Streaming)

Define how long to wait for late data.

```python
events_with_watermark = (
    events
    .withWatermark("event_time", "1 hour")  # Wait up to 1 hour for late events
    .groupBy(F.window("event_time", "5 minutes"))
    .count()
)
```

## Orchestration Patterns

### DAG Design

```
[extract_orders] --> [validate_orders] --> [transform_orders] --> [load_orders]
                                                                        |
[extract_products] --> [validate_products] --> [transform_products] ---+
                                                                        |
                                                                        v
                                                               [build_order_mart]
```

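As a sketch, this DAG might be wired up in Airflow 2.4+ as follows (task names mirror the diagram; `EmptyOperator` placeholders stand in for the real work):

```python
# Sketch only: dependency wiring for the DAG above (assumes Airflow 2.4+)
from datetime import datetime

from airflow import DAG
from airflow.operators.empty import EmptyOperator

with DAG("order_mart", start_date=datetime(2024, 1, 1), schedule="@daily", catchup=False):
    extract_orders = EmptyOperator(task_id="extract_orders")
    validate_orders = EmptyOperator(task_id="validate_orders")
    transform_orders = EmptyOperator(task_id="transform_orders")
    load_orders = EmptyOperator(task_id="load_orders")

    extract_products = EmptyOperator(task_id="extract_products")
    validate_products = EmptyOperator(task_id="validate_products")
    transform_products = EmptyOperator(task_id="transform_products")

    build_order_mart = EmptyOperator(task_id="build_order_mart")

    extract_orders >> validate_orders >> transform_orders >> load_orders
    extract_products >> validate_products >> transform_products
    [load_orders, transform_products] >> build_order_mart
```
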
### Retry Strategy

```python
default_args = {
    'retries': 3,
    'retry_delay': timedelta(minutes=5),
    'retry_exponential_backoff': True,
    'max_retry_delay': timedelta(hours=1),
}
```

### Backfill Strategy

```python
# Support backfill via parameterized execution date
@task
def process_orders(execution_date: date = None):
    if execution_date is None:
        execution_date = date.today() - timedelta(days=1)

    # Use execution_date, not current date
    process_partition(execution_date)
```

## Error Handling

### Fail Fast

```python
def process_orders(df: DataFrame) -> DataFrame:
    # Validate critical assumptions early
    if df.filter("order_id IS NULL").count() > 0:
        raise DataQualityError("Found null order_ids")

    if df.count() == 0:
        raise EmptyDataError("No orders to process")

    return transform(df)
```

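`DataQualityError` and `EmptyDataError` are assumed to be project-defined exceptions; a minimal sketch:

```python
# Hypothetical exception types assumed above
class DataQualityError(Exception):
    """Raised when a critical data quality check fails."""

class EmptyDataError(Exception):
    """Raised when an expected input contains no records."""
```
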
### Dead Letter Queue

```python
def process_with_dlq(df: DataFrame) -> DataFrame:
    # Separate valid and invalid records
    valid = df.filter(is_valid_record)
    invalid = df.filter(~is_valid_record)

    # Write invalid records to DLQ for investigation
    invalid_count = invalid.count()  # count once - each count triggers a job
    if invalid_count > 0:
        invalid.write.mode("append").saveAsTable("dlq.orders")
        logger.warning(f"Sent {invalid_count} records to DLQ")

    return valid
```

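`is_valid_record` is assumed to be a boolean Column expression; a minimal sketch (column names are illustrative):

```python
# Hypothetical validity predicate assumed above (illustrative column names)
from pyspark.sql import functions as F

is_valid_record = (
    F.col("order_id").isNotNull()
    & F.col("total_amount").isNotNull()
    & (F.col("total_amount") >= 0)
)
```
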
## Best Practices

### Explicit Dependencies

```python
# Good: Explicit data dependencies
orders = spark.read.table("raw.orders")
products = spark.read.table("raw.products")
result = orders.join(products, "product_id")

# Bad: Hidden dependencies via side effects
process_orders()    # Reads from orders table
process_products()  # Reads from products table
build_mart()        # What does this depend on?
```

### Parameterize Everything

```python
# Good: Parameterized and testable
def process_orders(
    source_table: str,
    target_table: str,
    execution_date: date,
) -> None:
    ...

# Bad: Hardcoded values
def process_orders():
    df = spark.read.table("prod.orders")  # Hardcoded!
    df.write.saveAsTable("prod.curated_orders")
```

### Logging and Metrics

```python
import time

def process_orders(execution_date: date) -> None:
    logger.info(f"Starting order processing for {execution_date}")
    start = time.monotonic()

    df = spark.read.table("raw.orders").filter(f"order_date = '{execution_date}'")
    logger.info(f"Read {df.count()} orders")

    result = transform(df)
    row_count = result.count()  # count once and reuse - each count triggers a job
    logger.info(f"Writing {row_count} records")

    result.write.saveAsTable("curated.orders")

    # Emit metrics
    metrics.gauge("orders.processed", row_count)
    metrics.gauge("orders.processing_time_seconds", time.monotonic() - start)
```