agentic-team-templates 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (103)
  1. package/README.md +280 -0
  2. package/bin/cli.js +5 -0
  3. package/package.json +47 -0
  4. package/src/index.js +521 -0
  5. package/templates/_shared/code-quality.md +162 -0
  6. package/templates/_shared/communication.md +114 -0
  7. package/templates/_shared/core-principles.md +62 -0
  8. package/templates/_shared/git-workflow.md +165 -0
  9. package/templates/_shared/security-fundamentals.md +173 -0
  10. package/templates/blockchain/.cursorrules/defi-patterns.md +520 -0
  11. package/templates/blockchain/.cursorrules/gas-optimization.md +339 -0
  12. package/templates/blockchain/.cursorrules/overview.md +130 -0
  13. package/templates/blockchain/.cursorrules/security.md +318 -0
  14. package/templates/blockchain/.cursorrules/smart-contracts.md +364 -0
  15. package/templates/blockchain/.cursorrules/testing.md +415 -0
  16. package/templates/blockchain/.cursorrules/web3-integration.md +538 -0
  17. package/templates/blockchain/CLAUDE.md +389 -0
  18. package/templates/cli-tools/.cursorrules/architecture.md +412 -0
  19. package/templates/cli-tools/.cursorrules/arguments.md +406 -0
  20. package/templates/cli-tools/.cursorrules/distribution.md +546 -0
  21. package/templates/cli-tools/.cursorrules/error-handling.md +455 -0
  22. package/templates/cli-tools/.cursorrules/overview.md +136 -0
  23. package/templates/cli-tools/.cursorrules/testing.md +537 -0
  24. package/templates/cli-tools/.cursorrules/user-experience.md +545 -0
  25. package/templates/cli-tools/CLAUDE.md +356 -0
  26. package/templates/data-engineering/.cursorrules/data-modeling.md +367 -0
  27. package/templates/data-engineering/.cursorrules/data-quality.md +455 -0
  28. package/templates/data-engineering/.cursorrules/overview.md +85 -0
  29. package/templates/data-engineering/.cursorrules/performance.md +339 -0
  30. package/templates/data-engineering/.cursorrules/pipeline-design.md +280 -0
  31. package/templates/data-engineering/.cursorrules/security.md +460 -0
  32. package/templates/data-engineering/.cursorrules/testing.md +452 -0
  33. package/templates/data-engineering/CLAUDE.md +974 -0
  34. package/templates/devops-sre/.cursorrules/capacity-planning.md +653 -0
  35. package/templates/devops-sre/.cursorrules/change-management.md +584 -0
  36. package/templates/devops-sre/.cursorrules/chaos-engineering.md +651 -0
  37. package/templates/devops-sre/.cursorrules/disaster-recovery.md +641 -0
  38. package/templates/devops-sre/.cursorrules/incident-management.md +565 -0
  39. package/templates/devops-sre/.cursorrules/observability.md +714 -0
  40. package/templates/devops-sre/.cursorrules/overview.md +230 -0
  41. package/templates/devops-sre/.cursorrules/postmortems.md +588 -0
  42. package/templates/devops-sre/.cursorrules/runbooks.md +760 -0
  43. package/templates/devops-sre/.cursorrules/slo-sli.md +617 -0
  44. package/templates/devops-sre/.cursorrules/toil-reduction.md +567 -0
  45. package/templates/devops-sre/CLAUDE.md +1007 -0
  46. package/templates/documentation/.cursorrules/adr.md +277 -0
  47. package/templates/documentation/.cursorrules/api-documentation.md +411 -0
  48. package/templates/documentation/.cursorrules/code-comments.md +253 -0
  49. package/templates/documentation/.cursorrules/maintenance.md +260 -0
  50. package/templates/documentation/.cursorrules/overview.md +82 -0
  51. package/templates/documentation/.cursorrules/readme-standards.md +306 -0
  52. package/templates/documentation/CLAUDE.md +120 -0
  53. package/templates/fullstack/.cursorrules/api-contracts.md +331 -0
  54. package/templates/fullstack/.cursorrules/architecture.md +298 -0
  55. package/templates/fullstack/.cursorrules/overview.md +109 -0
  56. package/templates/fullstack/.cursorrules/shared-types.md +348 -0
  57. package/templates/fullstack/.cursorrules/testing.md +386 -0
  58. package/templates/fullstack/CLAUDE.md +349 -0
  59. package/templates/ml-ai/.cursorrules/data-engineering.md +483 -0
  60. package/templates/ml-ai/.cursorrules/deployment.md +601 -0
  61. package/templates/ml-ai/.cursorrules/model-development.md +538 -0
  62. package/templates/ml-ai/.cursorrules/monitoring.md +658 -0
  63. package/templates/ml-ai/.cursorrules/overview.md +131 -0
  64. package/templates/ml-ai/.cursorrules/security.md +637 -0
  65. package/templates/ml-ai/.cursorrules/testing.md +678 -0
  66. package/templates/ml-ai/CLAUDE.md +1136 -0
  67. package/templates/mobile/.cursorrules/navigation.md +246 -0
  68. package/templates/mobile/.cursorrules/offline-first.md +302 -0
  69. package/templates/mobile/.cursorrules/overview.md +71 -0
  70. package/templates/mobile/.cursorrules/performance.md +345 -0
  71. package/templates/mobile/.cursorrules/testing.md +339 -0
  72. package/templates/mobile/CLAUDE.md +233 -0
  73. package/templates/platform-engineering/.cursorrules/ci-cd.md +778 -0
  74. package/templates/platform-engineering/.cursorrules/developer-experience.md +632 -0
  75. package/templates/platform-engineering/.cursorrules/infrastructure-as-code.md +600 -0
  76. package/templates/platform-engineering/.cursorrules/kubernetes.md +710 -0
  77. package/templates/platform-engineering/.cursorrules/observability.md +747 -0
  78. package/templates/platform-engineering/.cursorrules/overview.md +215 -0
  79. package/templates/platform-engineering/.cursorrules/security.md +855 -0
  80. package/templates/platform-engineering/.cursorrules/testing.md +878 -0
  81. package/templates/platform-engineering/CLAUDE.md +850 -0
  82. package/templates/utility-agent/.cursorrules/action-control.md +284 -0
  83. package/templates/utility-agent/.cursorrules/context-management.md +186 -0
  84. package/templates/utility-agent/.cursorrules/hallucination-prevention.md +253 -0
  85. package/templates/utility-agent/.cursorrules/overview.md +78 -0
  86. package/templates/utility-agent/.cursorrules/token-optimization.md +369 -0
  87. package/templates/utility-agent/CLAUDE.md +513 -0
  88. package/templates/web-backend/.cursorrules/api-design.md +255 -0
  89. package/templates/web-backend/.cursorrules/authentication.md +309 -0
  90. package/templates/web-backend/.cursorrules/database-patterns.md +298 -0
  91. package/templates/web-backend/.cursorrules/error-handling.md +366 -0
  92. package/templates/web-backend/.cursorrules/overview.md +69 -0
  93. package/templates/web-backend/.cursorrules/security.md +358 -0
  94. package/templates/web-backend/.cursorrules/testing.md +395 -0
  95. package/templates/web-backend/CLAUDE.md +366 -0
  96. package/templates/web-frontend/.cursorrules/accessibility.md +296 -0
  97. package/templates/web-frontend/.cursorrules/component-patterns.md +204 -0
  98. package/templates/web-frontend/.cursorrules/overview.md +72 -0
  99. package/templates/web-frontend/.cursorrules/performance.md +325 -0
  100. package/templates/web-frontend/.cursorrules/state-management.md +227 -0
  101. package/templates/web-frontend/.cursorrules/styling.md +271 -0
  102. package/templates/web-frontend/.cursorrules/testing.md +311 -0
  103. package/templates/web-frontend/CLAUDE.md +399 -0
package/templates/data-engineering/.cursorrules/performance.md
@@ -0,0 +1,339 @@
# Performance Optimization

Patterns for building efficient, cost-effective data pipelines.

## Partitioning

### Choose Partition Columns Wisely

Partition by columns used in most query filters (usually date/time).

```python
# Good: Partition by date - most queries filter by date
(orders_df
    .write
    .partitionBy("order_date")
    .saveAsTable("curated.orders"))

# Query with partition pruning
spark.sql("SELECT * FROM curated.orders WHERE order_date = '2024-01-15'")

# Bad: Over-partitioning creates too many small files
(orders_df
    .write
    .partitionBy("order_date", "customer_id", "product_id")  # Millions of partitions!
    .saveAsTable("curated.orders"))
```

### Partition Size Guidelines

| Records per Partition | Assessment |
|-----------------------|------------|
| < 100,000 | Too small - consider fewer partitions |
| 100K - 10M | Optimal range |
| > 10M | Consider sub-partitioning |

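To see where an existing table falls in this range, count records per partition value; a minimal sketch (table and partition column names are illustrative):

```python
from pyspark.sql import functions as F

# Records per partition value for an existing table
per_partition = (
    spark.table("curated.orders")
    .groupBy("order_date")   # the partition column
    .count()
)
per_partition.orderBy("count").show(10)           # smallest partitions
per_partition.orderBy(F.desc("count")).show(10)   # largest partitions
```
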
### Optimize File Sizes

```python
# Set target file size (Delta Lake)
spark.conf.set("spark.databricks.delta.optimizeWrite.fileSize", "128mb")

# Compact small files
spark.sql("OPTIMIZE curated.orders")

# Set max records per file
(df.write
    .option("maxRecordsPerFile", 1_000_000)
    .saveAsTable("curated.orders"))
```

## Query Optimization

### Predicate Pushdown

Ensure filters are pushed down to the storage layer so only the relevant files are read.

```python
from datetime import date, timedelta

from pyspark.sql import functions as F
from pyspark.sql.functions import udf
from pyspark.sql.types import BooleanType

# Good: Predicate pushdown works
orders = spark.read.table("curated.orders").filter("order_date = '2024-01-15'")

# Bad: UDF blocks pushdown - full table scan!
@udf(returnType=BooleanType())
def is_recent(d):
    return d > date.today() - timedelta(days=7)

orders = spark.read.table("curated.orders").filter(is_recent(F.col("order_date")))

# Good: Use native Spark functions
orders = spark.read.table("curated.orders").filter(
    F.col("order_date") > F.date_sub(F.current_date(), 7)
)
```

### Column Pruning

Select only needed columns early.

```python
# Good: Select needed columns early
(spark.read.table("curated.orders")
    .select("order_id", "customer_id", "total_amount")  # Prune early
    .filter("total_amount > 100")
    .groupBy("customer_id")
    .agg(F.sum("total_amount")))

# Bad: Select all, filter later
(spark.read.table("curated.orders")  # Reads all 50 columns
    .filter("total_amount > 100")
    .select("order_id", "customer_id", "total_amount")  # Too late!
    .groupBy("customer_id")
    .agg(F.sum("total_amount")))
```

### Broadcast Joins

Broadcast small tables to avoid shuffle.

```python
# Good: Broadcast small dimension table
from pyspark.sql.functions import broadcast

orders = spark.table("curated.orders")   # 1B rows
products = spark.table("dims.products")  # 10K rows

joined = orders.join(broadcast(products), "product_id")

# Configure broadcast threshold
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", "100mb")
```
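
A quick way to verify the broadcast actually happened is to inspect the physical plan; a minimal check using the `joined` DataFrame from the example above:

```python
# Look for BroadcastHashJoin (rather than SortMergeJoin) in the plan
joined.explain()
```
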

### Avoid Expensive Operations

```python
# Bad: Distinct before join (expensive shuffle just to deduplicate)
orders.select("customer_id").distinct().join(customers, "customer_id")

# Good: Left-semi join returns each matching customer once - no distinct needed
customers.join(orders, "customer_id", "left_semi")

# Bad: Order by on large dataset
spark.table("curated.orders").orderBy("order_date")

# Good: Only sort when necessary, limit first if possible
(spark.table("curated.orders")
    .filter("customer_id = 'C123'")
    .orderBy(F.desc("order_date"))
    .limit(10))
```

## Caching

### When to Cache

Cache intermediate results that are:
1. Used multiple times
2. Expensive to compute
3. Small enough to fit in memory

```python
def process_with_caching():
    # Read and filter once
    base_orders = (
        spark.read.table("curated.orders")
        .filter("order_date >= '2024-01-01'")
        .cache()  # Cache in memory
    )

    try:
        # Multiple aggregations on same filtered data
        daily_totals = base_orders.groupBy("order_date").agg(F.sum("total"))
        customer_totals = base_orders.groupBy("customer_id").agg(F.sum("total"))
        product_totals = base_orders.groupBy("product_id").agg(F.sum("total"))

        # Write all
        daily_totals.write.saveAsTable("marts.daily_totals")
        customer_totals.write.saveAsTable("marts.customer_totals")
        product_totals.write.saveAsTable("marts.product_totals")
    finally:
        base_orders.unpersist()  # Always clean up
```

### Cache Levels

```python
from pyspark import StorageLevel

# Default for DataFrames: cache() is persist(StorageLevel.MEMORY_AND_DISK)
df.cache()

# Memory only - fastest, but partitions that don't fit are recomputed
df.persist(StorageLevel.MEMORY_ONLY)

# Memory and disk - spills to disk instead of recomputing when evicted
df.persist(StorageLevel.MEMORY_AND_DISK)

# Disk only - for very large datasets
df.persist(StorageLevel.DISK_ONLY)

# Note: serialized (_SER) levels are JVM-only; PySpark data is always stored serialized
```

## Shuffle Optimization

### Reduce Shuffle Size

```python
# Good: Aggregate before join
customer_totals = orders.groupBy("customer_id").agg(F.sum("amount").alias("total"))
result = customer_totals.join(customers, "customer_id")

# Bad: Join then aggregate (shuffles full orders table)
result = orders.join(customers, "customer_id").groupBy("customer_id").agg(F.sum("amount"))
```

### Partition Count

```python
# Check partition count
print(df.rdd.getNumPartitions())

# Reduce partitions before write (coalesce doesn't shuffle)
df.coalesce(100).write.saveAsTable("output")

# Increase partitions for parallelism (repartition shuffles)
df.repartition(200).write.saveAsTable("output")

# Repartition by key for co-located data
df.repartition("customer_id").write.partitionBy("customer_id").saveAsTable("output")
```

### Configure Shuffle Partitions

```python
# Default is 200 - tune based on data volume
spark.conf.set("spark.sql.shuffle.partitions", "400")

# On Databricks, "auto" lets the platform choose the value
spark.conf.set("spark.sql.shuffle.partitions", "auto")
```
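
On open-source Spark 3.x, adaptive query execution (AQE) can size shuffle partitions at runtime instead of relying on a fixed number; a minimal sketch of the relevant settings (the advisory size is an illustrative value):

```python
# Let AQE coalesce small shuffle partitions at runtime (Spark 3.x)
spark.conf.set("spark.sql.adaptive.enabled", "true")
spark.conf.set("spark.sql.adaptive.coalescePartitions.enabled", "true")
spark.conf.set("spark.sql.adaptive.advisoryPartitionSizeInBytes", "128MB")
```
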

## Z-Ordering (Delta Lake)

Co-locate related data for faster queries on multiple columns.

```sql
-- Z-order by commonly filtered/joined columns
OPTIMIZE curated.orders
ZORDER BY (customer_id, product_id)

-- Queries on these columns will be faster
SELECT * FROM curated.orders WHERE customer_id = 'C123'
SELECT * FROM curated.orders WHERE product_id = 'P456'
SELECT * FROM curated.orders WHERE customer_id = 'C123' AND product_id = 'P456'
```
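
The same operation is available from Python through the Delta Lake table API; a minimal sketch, assuming delta-spark 2.0+ is installed:

```python
from delta.tables import DeltaTable

# Programmatic equivalent of OPTIMIZE ... ZORDER BY
(DeltaTable.forName(spark, "curated.orders")
    .optimize()
    .executeZOrderBy("customer_id", "product_id"))
```
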

## Incremental Processing

Process only what changed.

```python
def incremental_pipeline(source: str, target: str, watermark_col: str):
    # Get last processed watermark
    last_watermark = spark.sql(f"""
        SELECT MAX({watermark_col}) FROM {target}
    """).collect()[0][0]

    # Read only new data (first run: watermark is NULL, so take everything)
    new_data = spark.read.table(source)
    if last_watermark is not None:
        new_data = new_data.filter(F.col(watermark_col) > last_watermark)

    if new_data.isEmpty():
        logger.info("No new data to process")
        return

    # Process and write
    processed = transform(new_data)
    processed.write.mode("append").saveAsTable(target)
```

## Cost Management

### Monitor Compute Costs

```sql
-- Track pipeline costs over time
SELECT
    pipeline_name,
    DATE(run_date) as run_date,
    SUM(total_dbu) as daily_dbu,
    SUM(bytes_scanned) / 1e12 as tb_scanned,
    AVG(duration_seconds) as avg_duration
FROM pipeline_metrics
WHERE run_date >= CURRENT_DATE - 30
GROUP BY pipeline_name, DATE(run_date)
ORDER BY daily_dbu DESC;
```

### Identify Expensive Queries

```sql
-- Find most expensive queries
SELECT
    query_id,
    user,
    LEFT(query_text, 100) as query_preview,
    bytes_scanned / 1e9 as gb_scanned,
    duration_ms / 1000 as duration_seconds
FROM query_history
WHERE timestamp >= CURRENT_DATE - 7
ORDER BY bytes_scanned DESC
LIMIT 20;
```

### Optimize Storage Costs

```python
# Remove old partitions
spark.sql("""
    DELETE FROM curated.orders
    WHERE order_date < DATE_SUB(CURRENT_DATE, 365)
""")

# Vacuum deleted files (Delta Lake)
spark.sql("VACUUM curated.orders RETAIN 168 HOURS")

# Convert to more efficient format
(spark.read.table("legacy.orders")
    .write
    .format("delta")
    .option("compression", "zstd")  # Better compression
    .saveAsTable("curated.orders"))
```

## Performance Checklist

Before deploying a pipeline, verify:

### Query Optimization
- [ ] Filters use partition columns
- [ ] Only needed columns are selected
- [ ] Small tables are broadcast in joins
- [ ] No unnecessary shuffles

### File Optimization
- [ ] Partition column is appropriate
- [ ] Files are right-sized (100MB-1GB)
- [ ] Z-ordering on common filter columns
- [ ] No excessive small files

### Resource Optimization
- [ ] Cluster size matches workload
- [ ] Shuffle partitions configured
- [ ] Caching used for reused DataFrames
- [ ] Memory settings appropriate

### Cost Optimization
- [ ] Incremental processing where possible
- [ ] Data retention policy applied
- [ ] Query costs monitored
- [ ] Unused tables/data removed
package/templates/data-engineering/.cursorrules/pipeline-design.md
@@ -0,0 +1,280 @@
# Pipeline Design

Patterns and practices for building reliable data pipelines.

## Core Principles

### 1. Idempotency

Every pipeline run must produce identical results for the same inputs.

```python
# Good: Delete-insert pattern ensures idempotency
def process_daily_orders(execution_date: date) -> None:
    partition = execution_date.strftime("%Y-%m-%d")

    # Clear target partition first
    spark.sql(f"DELETE FROM curated.orders WHERE order_date = '{partition}'")

    # Then insert
    orders_df.write.mode("append").saveAsTable("curated.orders")

# Bad: Append without clearing creates duplicates on re-run
orders_df.write.mode("append").saveAsTable("curated.orders")
```

### 2. Determinism

Same inputs must always produce same outputs. Avoid:
- `current_timestamp()` in transformations (use execution_date)
- Random sampling without seeds
- Order-dependent operations on unordered data

```python
# Good: Use execution_date for reproducibility
df.withColumn("processed_at", F.lit(execution_date))

# Bad: Non-deterministic timestamp
df.withColumn("processed_at", F.current_timestamp())
```

### 3. Atomicity

Pipeline outputs should be all-or-nothing. Partial writes corrupt data.

```python
# Good: Write to staging, then atomic swap (SWAP WITH is warehouse-specific, e.g. Snowflake)
df.write.mode("overwrite").saveAsTable("staging.orders_temp")
spark.sql("ALTER TABLE curated.orders SWAP WITH staging.orders_temp")

# Good: Use Delta Lake transactions
df.write.format("delta").mode("overwrite").saveAsTable("curated.orders")
```
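
For partition-level loads, Delta Lake's `replaceWhere` option gives an atomic, idempotent overwrite of just the affected partition; a minimal sketch (table and predicate are illustrative):

```python
# Atomically replace one partition's worth of data
(df.write
    .format("delta")
    .mode("overwrite")
    .option("replaceWhere", "order_date = '2024-01-15'")
    .saveAsTable("curated.orders"))
```
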

## Pipeline Patterns

### Batch Full Refresh

Use when: Source doesn't support incremental, data is small, or simplicity matters.

```python
def full_refresh_pipeline(source: str, target: str) -> None:
    df = spark.read.table(source)
    df = transform(df)
    df.write.mode("overwrite").saveAsTable(target)
```

### Batch Incremental

Use when: Data volume is large, source supports watermarks.

```python
from delta.tables import DeltaTable

def incremental_pipeline(source: str, target: str, watermark_col: str) -> None:
    # Get high watermark from previous run
    last_watermark = get_watermark(target, watermark_col)

    # Read only new/changed records
    df = spark.read.table(source).filter(F.col(watermark_col) > last_watermark)

    if df.isEmpty():
        return

    # Merge into target
    target_table = DeltaTable.forName(spark, target)
    (target_table.alias("t")
        .merge(df.alias("s"), "t.id = s.id")
        .whenMatchedUpdateAll()
        .whenNotMatchedInsertAll()
        .execute())
```

### Change Data Capture (CDC)

Use when: Need to track all changes, support point-in-time queries.

```python
def cdc_pipeline(cdc_events: DataFrame, target: str) -> None:
    """Process CDC events (insert, update, delete operations)."""

    target_table = DeltaTable.forName(spark, target)

    (target_table.alias("t")
        .merge(cdc_events.alias("s"), "t.id = s.id")
        .whenMatchedDelete(condition="s.operation = 'DELETE'")
        .whenMatchedUpdateAll(condition="s.operation = 'UPDATE'")
        .whenNotMatchedInsertAll(condition="s.operation = 'INSERT'")
        .execute())
```

### Streaming

Use when: Low latency required, source is event stream.

```python
def streaming_pipeline() -> None:
    events = (
        spark.readStream
        .format("kafka")
        .option("kafka.bootstrap.servers", KAFKA_BOOTSTRAP_SERVERS)
        .option("subscribe", "events")
        .load()
    )

    processed = events.transform(process_events)

    (processed
        .writeStream
        .format("delta")
        .option("checkpointLocation", CHECKPOINT_PATH)
        .outputMode("append")
        .trigger(processingTime="1 minute")
        .toTable("curated.events"))
```

## Handling Late Data

### Reprocessing Window

Reprocess recent partitions to catch late arrivals.

```python
def process_with_late_data_handling(execution_date: date) -> None:
    # Reprocess last 3 days to catch late arrivals
    start_date = execution_date - timedelta(days=3)

    for day in date_range(start_date, execution_date):
        process_partition(day)
```

### Watermarking (Streaming)

Define how long to wait for late data.

```python
events_with_watermark = (
    events
    .withWatermark("event_time", "1 hour")  # Wait up to 1 hour for late events
    .groupBy(F.window("event_time", "5 minutes"))
    .count()
)
```

## Orchestration Patterns

### DAG Design

```
[extract_orders]   --> [validate_orders]   --> [transform_orders]   --> [load_orders]
                                                                              |
[extract_products] --> [validate_products] --> [transform_products] ---------+
                                                                              |
                                                                              v
                                                                    [build_order_mart]
```
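
Expressed with Airflow's TaskFlow API, the same shape might look like the sketch below (Airflow 2.4+ assumed; the dag_id, schedule, and task bodies are illustrative placeholders, not part of the template):

```python
from datetime import datetime

from airflow.decorators import dag, task


@dag(dag_id="order_mart", schedule="@daily",
     start_date=datetime(2024, 1, 1), catchup=False)
def order_mart():
    @task
    def extract_orders(): ...            # pull raw orders

    @task
    def validate_orders(raw): ...        # data quality checks

    @task
    def transform_orders(validated): ...

    @task
    def load_orders(transformed): ...

    @task
    def extract_products(): ...

    @task
    def validate_products(raw): ...

    @task
    def transform_products(validated): ...

    @task
    def build_order_mart(orders, products): ...

    # Wire up the same dependencies as the diagram
    orders = load_orders(transform_orders(validate_orders(extract_orders())))
    products = transform_products(validate_products(extract_products()))
    build_order_mart(orders, products)


order_mart()
```
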

### Retry Strategy

```python
default_args = {
    'retries': 3,
    'retry_delay': timedelta(minutes=5),
    'retry_exponential_backoff': True,
    'max_retry_delay': timedelta(hours=1),
}
```

### Backfill Strategy

```python
# Support backfill via parameterized execution date
@task
def process_orders(execution_date: date = None):
    if execution_date is None:
        execution_date = date.today() - timedelta(days=1)

    # Use execution_date, not current date
    process_partition(execution_date)
```

## Error Handling

### Fail Fast

```python
def process_orders(df: DataFrame) -> DataFrame:
    # Validate critical assumptions early
    if df.filter("order_id IS NULL").count() > 0:
        raise DataQualityError("Found null order_ids")

    if df.count() == 0:
        raise EmptyDataError("No orders to process")

    return transform(df)
```

### Dead Letter Queue

```python
def process_with_dlq(df: DataFrame) -> DataFrame:
    # Separate valid and invalid records
    valid = df.filter(is_valid_record)
    invalid = df.filter(~is_valid_record)

    # Write invalid records to DLQ for investigation
    invalid_count = invalid.count()  # count once, reuse below
    if invalid_count > 0:
        invalid.write.mode("append").saveAsTable("dlq.orders")
        logger.warning(f"Sent {invalid_count} records to DLQ")

    return valid
```

## Best Practices

### Explicit Dependencies

```python
# Good: Explicit data dependencies
orders = spark.read.table("raw.orders")
products = spark.read.table("raw.products")
result = orders.join(products, "product_id")

# Bad: Hidden dependencies via side effects
process_orders()    # Reads from orders table
process_products()  # Reads from products table
build_mart()        # What does this depend on?
```

### Parameterize Everything

```python
# Good: Parameterized and testable
def process_orders(
    source_table: str,
    target_table: str,
    execution_date: date,
) -> None:
    ...

# Bad: Hardcoded values
def process_orders():
    df = spark.read.table("prod.orders")  # Hardcoded!
    df.write.saveAsTable("prod.curated_orders")
```

### Logging and Metrics

```python
def process_orders(execution_date: date) -> None:
    logger.info(f"Starting order processing for {execution_date}")

    df = spark.read.table("raw.orders").filter(f"order_date = '{execution_date}'")
    logger.info(f"Read {df.count()} orders")

    result = transform(df)
    logger.info(f"Writing {result.count()} records")

    result.write.saveAsTable("curated.orders")

    # Emit metrics
    metrics.gauge("orders.processed", result.count())
    metrics.gauge("orders.processing_time_seconds", elapsed_time)
```