agentic-team-templates 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (103)
  1. package/README.md +280 -0
  2. package/bin/cli.js +5 -0
  3. package/package.json +47 -0
  4. package/src/index.js +521 -0
  5. package/templates/_shared/code-quality.md +162 -0
  6. package/templates/_shared/communication.md +114 -0
  7. package/templates/_shared/core-principles.md +62 -0
  8. package/templates/_shared/git-workflow.md +165 -0
  9. package/templates/_shared/security-fundamentals.md +173 -0
  10. package/templates/blockchain/.cursorrules/defi-patterns.md +520 -0
  11. package/templates/blockchain/.cursorrules/gas-optimization.md +339 -0
  12. package/templates/blockchain/.cursorrules/overview.md +130 -0
  13. package/templates/blockchain/.cursorrules/security.md +318 -0
  14. package/templates/blockchain/.cursorrules/smart-contracts.md +364 -0
  15. package/templates/blockchain/.cursorrules/testing.md +415 -0
  16. package/templates/blockchain/.cursorrules/web3-integration.md +538 -0
  17. package/templates/blockchain/CLAUDE.md +389 -0
  18. package/templates/cli-tools/.cursorrules/architecture.md +412 -0
  19. package/templates/cli-tools/.cursorrules/arguments.md +406 -0
  20. package/templates/cli-tools/.cursorrules/distribution.md +546 -0
  21. package/templates/cli-tools/.cursorrules/error-handling.md +455 -0
  22. package/templates/cli-tools/.cursorrules/overview.md +136 -0
  23. package/templates/cli-tools/.cursorrules/testing.md +537 -0
  24. package/templates/cli-tools/.cursorrules/user-experience.md +545 -0
  25. package/templates/cli-tools/CLAUDE.md +356 -0
  26. package/templates/data-engineering/.cursorrules/data-modeling.md +367 -0
  27. package/templates/data-engineering/.cursorrules/data-quality.md +455 -0
  28. package/templates/data-engineering/.cursorrules/overview.md +85 -0
  29. package/templates/data-engineering/.cursorrules/performance.md +339 -0
  30. package/templates/data-engineering/.cursorrules/pipeline-design.md +280 -0
  31. package/templates/data-engineering/.cursorrules/security.md +460 -0
  32. package/templates/data-engineering/.cursorrules/testing.md +452 -0
  33. package/templates/data-engineering/CLAUDE.md +974 -0
  34. package/templates/devops-sre/.cursorrules/capacity-planning.md +653 -0
  35. package/templates/devops-sre/.cursorrules/change-management.md +584 -0
  36. package/templates/devops-sre/.cursorrules/chaos-engineering.md +651 -0
  37. package/templates/devops-sre/.cursorrules/disaster-recovery.md +641 -0
  38. package/templates/devops-sre/.cursorrules/incident-management.md +565 -0
  39. package/templates/devops-sre/.cursorrules/observability.md +714 -0
  40. package/templates/devops-sre/.cursorrules/overview.md +230 -0
  41. package/templates/devops-sre/.cursorrules/postmortems.md +588 -0
  42. package/templates/devops-sre/.cursorrules/runbooks.md +760 -0
  43. package/templates/devops-sre/.cursorrules/slo-sli.md +617 -0
  44. package/templates/devops-sre/.cursorrules/toil-reduction.md +567 -0
  45. package/templates/devops-sre/CLAUDE.md +1007 -0
  46. package/templates/documentation/.cursorrules/adr.md +277 -0
  47. package/templates/documentation/.cursorrules/api-documentation.md +411 -0
  48. package/templates/documentation/.cursorrules/code-comments.md +253 -0
  49. package/templates/documentation/.cursorrules/maintenance.md +260 -0
  50. package/templates/documentation/.cursorrules/overview.md +82 -0
  51. package/templates/documentation/.cursorrules/readme-standards.md +306 -0
  52. package/templates/documentation/CLAUDE.md +120 -0
  53. package/templates/fullstack/.cursorrules/api-contracts.md +331 -0
  54. package/templates/fullstack/.cursorrules/architecture.md +298 -0
  55. package/templates/fullstack/.cursorrules/overview.md +109 -0
  56. package/templates/fullstack/.cursorrules/shared-types.md +348 -0
  57. package/templates/fullstack/.cursorrules/testing.md +386 -0
  58. package/templates/fullstack/CLAUDE.md +349 -0
  59. package/templates/ml-ai/.cursorrules/data-engineering.md +483 -0
  60. package/templates/ml-ai/.cursorrules/deployment.md +601 -0
  61. package/templates/ml-ai/.cursorrules/model-development.md +538 -0
  62. package/templates/ml-ai/.cursorrules/monitoring.md +658 -0
  63. package/templates/ml-ai/.cursorrules/overview.md +131 -0
  64. package/templates/ml-ai/.cursorrules/security.md +637 -0
  65. package/templates/ml-ai/.cursorrules/testing.md +678 -0
  66. package/templates/ml-ai/CLAUDE.md +1136 -0
  67. package/templates/mobile/.cursorrules/navigation.md +246 -0
  68. package/templates/mobile/.cursorrules/offline-first.md +302 -0
  69. package/templates/mobile/.cursorrules/overview.md +71 -0
  70. package/templates/mobile/.cursorrules/performance.md +345 -0
  71. package/templates/mobile/.cursorrules/testing.md +339 -0
  72. package/templates/mobile/CLAUDE.md +233 -0
  73. package/templates/platform-engineering/.cursorrules/ci-cd.md +778 -0
  74. package/templates/platform-engineering/.cursorrules/developer-experience.md +632 -0
  75. package/templates/platform-engineering/.cursorrules/infrastructure-as-code.md +600 -0
  76. package/templates/platform-engineering/.cursorrules/kubernetes.md +710 -0
  77. package/templates/platform-engineering/.cursorrules/observability.md +747 -0
  78. package/templates/platform-engineering/.cursorrules/overview.md +215 -0
  79. package/templates/platform-engineering/.cursorrules/security.md +855 -0
  80. package/templates/platform-engineering/.cursorrules/testing.md +878 -0
  81. package/templates/platform-engineering/CLAUDE.md +850 -0
  82. package/templates/utility-agent/.cursorrules/action-control.md +284 -0
  83. package/templates/utility-agent/.cursorrules/context-management.md +186 -0
  84. package/templates/utility-agent/.cursorrules/hallucination-prevention.md +253 -0
  85. package/templates/utility-agent/.cursorrules/overview.md +78 -0
  86. package/templates/utility-agent/.cursorrules/token-optimization.md +369 -0
  87. package/templates/utility-agent/CLAUDE.md +513 -0
  88. package/templates/web-backend/.cursorrules/api-design.md +255 -0
  89. package/templates/web-backend/.cursorrules/authentication.md +309 -0
  90. package/templates/web-backend/.cursorrules/database-patterns.md +298 -0
  91. package/templates/web-backend/.cursorrules/error-handling.md +366 -0
  92. package/templates/web-backend/.cursorrules/overview.md +69 -0
  93. package/templates/web-backend/.cursorrules/security.md +358 -0
  94. package/templates/web-backend/.cursorrules/testing.md +395 -0
  95. package/templates/web-backend/CLAUDE.md +366 -0
  96. package/templates/web-frontend/.cursorrules/accessibility.md +296 -0
  97. package/templates/web-frontend/.cursorrules/component-patterns.md +204 -0
  98. package/templates/web-frontend/.cursorrules/overview.md +72 -0
  99. package/templates/web-frontend/.cursorrules/performance.md +325 -0
  100. package/templates/web-frontend/.cursorrules/state-management.md +227 -0
  101. package/templates/web-frontend/.cursorrules/styling.md +271 -0
  102. package/templates/web-frontend/.cursorrules/testing.md +311 -0
  103. package/templates/web-frontend/CLAUDE.md +399 -0
@@ -0,0 +1,974 @@
# Data Engineering Development Guide

Staff-level guidelines for building robust, scalable data platforms and pipelines.

---

## Overview

This guide applies to:

- Batch and streaming data pipelines
- Data warehouses and lakehouses
- ETL/ELT orchestration
- Real-time data processing
- Data platform infrastructure
- Analytics engineering

### Key Principles

1. **Idempotency Is Non-Negotiable** - Every pipeline must produce the same result on re-run
2. **Data Quality Is a Feature** - Validate early, monitor continuously, alert proactively
3. **Schema Is a Contract** - Breaking changes require coordination and versioning
4. **Observability Over Debugging** - Instrument everything, debug nothing in production
5. **Cost-Aware Engineering** - Compute and storage have real costs; optimize deliberately

### Technology Stack

| Layer | Technologies |
|-------|--------------|
| Orchestration | Airflow, Dagster, Prefect, Temporal |
| Batch Processing | Spark, DBT, Pandas, Polars |
| Stream Processing | Kafka, Flink, Spark Streaming, Pulsar |
| Storage | Delta Lake, Iceberg, Parquet, S3/GCS/ADLS |
| Warehouses | Snowflake, BigQuery, Redshift, Databricks |
| Quality | Great Expectations, Soda, DBT Tests, Monte Carlo |
| Metadata | DataHub, Atlan, OpenMetadata, Unity Catalog |

---

## Project Structure

```
data-platform/
├── pipelines/               # Pipeline definitions
│   ├── ingestion/           # Source → Raw layer
│   ├── transformation/      # Raw → Curated layer
│   └── serving/             # Curated → Consumption layer
├── models/                  # DBT or Spark SQL models
│   ├── staging/             # 1:1 source mappings
│   ├── intermediate/        # Business logic transforms
│   └── marts/               # Consumption-ready tables
├── schemas/                 # Schema definitions & contracts
│   ├── avro/
│   ├── protobuf/
│   └── json-schema/
├── quality/                 # Data quality checks
│   ├── expectations/        # Great Expectations suites
│   └── tests/               # DBT tests
├── infrastructure/          # IaC for data platform
│   ├── terraform/
│   └── kubernetes/
├── scripts/                 # Utility scripts
├── tests/                   # Pipeline tests
│   ├── unit/
│   └── integration/
└── docs/                    # Documentation
    └── data-dictionary/
```

---

## Pipeline Design Patterns

### Idempotent Pipeline Pattern

```python
def process_daily_orders(execution_date: date) -> None:
    """
    Idempotent pipeline: safe to re-run any number of times.

    Key principles:
    1. Delete-then-insert for the partition being processed
    2. Use execution_date, not current timestamp
    3. No side effects outside the target partition
    """
    partition = execution_date.strftime("%Y-%m-%d")

    # 1. Clear target partition (idempotency)
    spark.sql(f"""
        DELETE FROM curated.orders
        WHERE order_date = '{partition}'
    """)

    # 2. Process source data for this partition only
    orders_df = (
        spark.read.table("raw.orders")
        .filter(F.col("order_date") == partition)
        .transform(validate_orders)
        .transform(enrich_orders)
        .transform(apply_business_rules)
    )

    # 3. Write to target partition
    (orders_df
        .write
        .mode("append")
        .partitionBy("order_date")
        .saveAsTable("curated.orders"))
```
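
Because everything is keyed off `execution_date`, backfills reduce to calling the pipeline over a date range. A minimal sketch, assuming `process_daily_orders` from above:

```python
# Minimal backfill sketch: each call rewrites only its own partition,
# so re-running any slice of dates is safe.
from datetime import date, timedelta

def backfill_daily_orders(start: date, end: date) -> None:
    day = start
    while day <= end:
        process_daily_orders(day)  # idempotent: safe to repeat
        day += timedelta(days=1)

backfill_daily_orders(date(2024, 1, 1), date(2024, 1, 31))
```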

### Streaming with Exactly-Once Semantics

```python
def process_events_stream() -> None:
    """
    Streaming pipeline with exactly-once guarantees.

    Key principles:
    1. Checkpoint for fault tolerance
    2. Idempotent sink operations
    3. Watermarking for late data handling
    """
    events = (
        spark.readStream
        .format("kafka")
        .option("kafka.bootstrap.servers", KAFKA_BROKERS)
        .option("subscribe", "user-events")
        .option("startingOffsets", "earliest")
        .load()
    )

    processed = (
        events
        .select(F.from_json(F.col("value").cast("string"), schema).alias("data"))
        .select("data.*")
        .withWatermark("event_time", "1 hour")  # Handle late arrivals
        .groupBy(
            F.window("event_time", "5 minutes"),
            "user_id"
        )
        .agg(F.count("*").alias("event_count"))
    )

    (processed
        .writeStream
        .format("delta")
        .outputMode("append")
        .option("checkpointLocation", CHECKPOINT_PATH)
        .trigger(processingTime="1 minute")
        .toTable("curated.user_activity"))
```
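
If the sink is not transactional, or offsets can be replayed from upstream, a `foreachBatch` merge keeps the write idempotent. A sketch of that alternative sink, assuming the `processed` stream above and a Delta target keyed on `user_id` and the aggregation window:

```python
# Sketch of an idempotent sink via foreachBatch: each micro-batch is merged
# by key, so replayed batches update existing rows instead of duplicating them.
from delta.tables import DeltaTable

def upsert_batch(batch_df, batch_id: int) -> None:
    target = DeltaTable.forName(spark, "curated.user_activity")
    (target.alias("t")
        .merge(
            batch_df.alias("s"),
            "t.user_id = s.user_id AND t.window = s.window"
        )
        .whenMatchedUpdateAll()
        .whenNotMatchedInsertAll()
        .execute())

(processed.writeStream
    .foreachBatch(upsert_batch)
    .option("checkpointLocation", CHECKPOINT_PATH)
    .trigger(processingTime="1 minute")
    .start())
```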

### Incremental Processing Pattern

```python
def incremental_load(
    source_table: str,
    target_table: str,
    watermark_column: str,
) -> None:
    """
    Efficient incremental loads using a high watermark.

    Key principles:
    1. Track the last processed watermark
    2. Process only new/changed records
    3. Handle both inserts and updates (CDC)
    """
    # Get high watermark from the previous run
    last_watermark = get_watermark(target_table, watermark_column)

    # Read only new records
    new_records = (
        spark.read.table(source_table)
        .filter(F.col(watermark_column) > last_watermark)
    )

    if new_records.isEmpty():
        logger.info("No new records to process")
        return

    # Merge into target (upsert pattern)
    target = DeltaTable.forName(spark, target_table)

    (target.alias("target")
        .merge(
            new_records.alias("source"),
            "target.id = source.id"
        )
        .whenMatchedUpdateAll()
        .whenNotMatchedInsertAll()
        .execute())

    # Update watermark with the max value actually processed
    new_watermark = new_records.agg(F.max(watermark_column)).collect()[0][0]
    set_watermark(target_table, watermark_column, new_watermark)
```
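
`get_watermark` and `set_watermark` are assumed above. One minimal way to back them, sketched with a hypothetical `etl_control.watermarks` Delta table:

```python
# Hypothetical watermark store backed by a small control table
# (etl_control.watermarks is an assumption, not defined elsewhere in this guide).
def get_watermark(target_table: str, watermark_column: str):
    rows = (
        spark.table("etl_control.watermarks")
        .filter(
            (F.col("target_table") == target_table)
            & (F.col("watermark_column") == watermark_column)
        )
        .select("watermark_value")
        .collect()
    )
    # Sentinel value so the first run loads everything
    return rows[0]["watermark_value"] if rows else "1970-01-01 00:00:00"

def set_watermark(target_table: str, watermark_column: str, value) -> None:
    spark.createDataFrame(
        [(target_table, watermark_column, str(value))],
        "target_table STRING, watermark_column STRING, watermark_value STRING",
    ).createOrReplaceTempView("new_watermark")
    spark.sql("""
        MERGE INTO etl_control.watermarks t
        USING new_watermark s
        ON t.target_table = s.target_table AND t.watermark_column = s.watermark_column
        WHEN MATCHED THEN UPDATE SET t.watermark_value = s.watermark_value
        WHEN NOT MATCHED THEN INSERT *
    """)
```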

---

## Data Modeling

### Layered Architecture (Medallion)

| Layer | Purpose | SLA | Example |
|-------|---------|-----|---------|
| **Bronze/Raw** | Exact copy of source | Minutes | `raw.salesforce_accounts` |
| **Silver/Curated** | Cleaned, validated, typed | Hours | `curated.accounts` |
| **Gold/Marts** | Business-ready aggregates | Daily | `marts.account_metrics` |
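
The layer boundaries map directly onto pipeline stages. A compressed sketch of one record's path through the three layers, reusing the table names from the examples above (the landing path and cleaning helpers are hypothetical):

```python
# Bronze: land the source as-is, plus load metadata
raw = (spark.read.format("json").load("s3://landing/salesforce/accounts/")
       .withColumn("_loaded_at", F.current_timestamp()))
raw.write.mode("append").saveAsTable("raw.salesforce_accounts")

# Silver: clean, type, and validate
curated = (spark.table("raw.salesforce_accounts")
           .transform(standardize_types)     # hypothetical helpers
           .transform(validate_accounts))
curated.write.mode("overwrite").saveAsTable("curated.accounts")

# Gold: business-ready aggregate
(spark.table("curated.accounts")
    .groupBy("segment")
    .agg(F.count("*").alias("account_count"))
    .write.mode("overwrite")
    .saveAsTable("marts.account_metrics"))
```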

### Dimensional Modeling

```sql
-- Fact table: Immutable events with foreign keys
CREATE TABLE facts.orders (
    order_id       STRING NOT NULL,
    order_date     DATE NOT NULL,
    customer_key   BIGINT NOT NULL,        -- FK to dimension
    product_key    BIGINT NOT NULL,        -- FK to dimension
    quantity       INT NOT NULL,
    unit_price     DECIMAL(10,2) NOT NULL,
    total_amount   DECIMAL(12,2) NOT NULL,
    -- Metadata
    _loaded_at     TIMESTAMP NOT NULL,
    _source_file   STRING NOT NULL
)
USING DELTA
PARTITIONED BY (order_date)
TBLPROPERTIES ('delta.autoOptimize.optimizeWrite' = 'true');

-- Dimension table: Type 2 SCD for history tracking
CREATE TABLE dims.customers (
    customer_key   BIGINT GENERATED ALWAYS AS IDENTITY,
    customer_id    STRING NOT NULL,
    name           STRING NOT NULL,
    email          STRING,
    segment        STRING,
    -- SCD Type 2 columns
    effective_from DATE NOT NULL,
    effective_to   DATE,
    is_current     BOOLEAN NOT NULL,
    -- Metadata
    _loaded_at     TIMESTAMP NOT NULL
)
USING DELTA;
```
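
A typical consumer query joins the fact to its dimension via the surrogate key; the key already pins each fact to the dimension version that was current at load time. A sketch with `spark.sql` against the tables defined above:

```python
# Revenue by customer segment, point-in-time correct via the surrogate key.
daily_revenue_by_segment = spark.sql("""
    SELECT
        f.order_date,
        c.segment,
        SUM(f.total_amount) AS revenue
    FROM facts.orders f
    JOIN dims.customers c
        ON f.customer_key = c.customer_key
    WHERE f.order_date >= '2024-01-01'
    GROUP BY f.order_date, c.segment
""")
```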

### Type 2 Slowly Changing Dimension

```python
def apply_scd_type_2(
    spark: SparkSession,
    source_df: DataFrame,
    target_table: str,
    key_columns: list[str],
    tracked_columns: list[str],
) -> None:
    """
    Implement Type 2 SCD: Track full history of changes.

    - New records: Insert with is_current=True
    - Changed records: Close old record, insert new
    - Unchanged records: No action
    """
    target = DeltaTable.forName(spark, target_table)

    # Identify changes
    changes = (
        source_df.alias("source")
        .join(
            target.toDF().filter("is_current = true").alias("target"),
            on=key_columns,
            how="left"
        )
        .withColumn(
            "_action",
            F.when(F.col("target.customer_key").isNull(), "INSERT")
            .when(
                F.concat_ws("|", *[F.col(f"source.{c}") for c in tracked_columns]) !=
                F.concat_ws("|", *[F.col(f"target.{c}") for c in tracked_columns]),
                "UPDATE"
            )
            .otherwise("NONE")
        )
        .filter("_action != 'NONE'")
    )

    # Close old records
    (target.alias("target")
        .merge(
            changes.filter("_action = 'UPDATE'").alias("updates"),
            " AND ".join([f"target.{c} = updates.{c}" for c in key_columns]) +
            " AND target.is_current = true"
        )
        .whenMatchedUpdate(set={
            "effective_to": "current_date()",
            "is_current": "false"
        })
        .execute())

    # Insert new/changed records
    new_records = (
        changes
        .filter("_action IN ('INSERT', 'UPDATE')")
        .select(*[F.col(f"source.{c}") for c in source_df.columns])
        .withColumn("effective_from", F.current_date())
        .withColumn("effective_to", F.lit(None).cast("date"))
        .withColumn("is_current", F.lit(True))
        .withColumn("_loaded_at", F.current_timestamp())
    )

    new_records.write.mode("append").saveAsTable(target_table)
```
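
Called for the `dims.customers` table from the DDL above, the invocation might look like this (the staged `customer_updates` DataFrame is hypothetical):

```python
# Key on the natural key; track the attributes whose changes
# should open a new history row.
apply_scd_type_2(
    spark,
    source_df=customer_updates,
    target_table="dims.customers",
    key_columns=["customer_id"],
    tracked_columns=["name", "email", "segment"],
)
```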

---

## Data Quality

### Validation Framework

```python
from great_expectations.dataset import SparkDFDataset

def validate_orders(df: DataFrame) -> DataFrame:
    """
    Apply data quality checks. Fail fast on critical issues.
    """
    ge_df = SparkDFDataset(df)

    # Critical checks - pipeline fails if violated
    critical_results = [
        ge_df.expect_column_values_to_not_be_null("order_id"),
        ge_df.expect_column_values_to_not_be_null("customer_id"),
        ge_df.expect_column_values_to_be_between("quantity", min_value=1),
        ge_df.expect_column_values_to_be_between("unit_price", min_value=0, max_value=100000),
    ]

    failures = [r for r in critical_results if not r.success]
    if failures:
        raise DataQualityError(f"Critical validation failed: {failures}")

    # Warning checks - log but continue
    warning_results = [
        ge_df.expect_column_values_to_match_regex("email", r"^[\w.-]+@[\w.-]+\.\w+$"),
        ge_df.expect_column_values_to_be_in_set("status", ["pending", "shipped", "delivered"]),
    ]

    for result in warning_results:
        if not result.success:
            logger.warning(f"Data quality warning: {result}")
            metrics.increment("data_quality.warnings")

    return df
```
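
`DataQualityError` is referenced above but not defined in this guide; a minimal sketch is enough to make the fail-fast path concrete:

```python
# Minimal custom exception so critical validation failures stop the run
# (and surface in the orchestrator) instead of being silently logged.
class DataQualityError(Exception):
    """Raised when a critical data quality expectation fails."""
```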

### Data Freshness Monitoring

```python
@dataclass
class FreshnessCheck:
    table: str
    timestamp_column: str
    max_delay_hours: int
    severity: str  # "critical" | "warning"

FRESHNESS_CHECKS = [
    FreshnessCheck("curated.orders", "order_date", max_delay_hours=2, severity="critical"),
    FreshnessCheck("curated.inventory", "updated_at", max_delay_hours=1, severity="critical"),
    FreshnessCheck("marts.daily_sales", "report_date", max_delay_hours=24, severity="warning"),
]

def check_data_freshness() -> list[Alert]:
    """
    Monitor data freshness and alert on SLA violations.
    """
    alerts = []

    for check in FRESHNESS_CHECKS:
        max_timestamp = spark.sql(f"""
            SELECT MAX({check.timestamp_column}) as max_ts
            FROM {check.table}
        """).collect()[0]["max_ts"]

        delay_hours = (datetime.now() - max_timestamp).total_seconds() / 3600

        if delay_hours > check.max_delay_hours:
            alerts.append(Alert(
                severity=check.severity,
                message=f"Table {check.table} is {delay_hours:.1f}h stale (SLA: {check.max_delay_hours}h)",
                metric_name="data_freshness_delay_hours",
                metric_value=delay_hours,
            ))

    return alerts
```

### Anomaly Detection

```python
def detect_volume_anomalies(
    table: str,
    partition_column: str,
    lookback_days: int = 30,
    threshold_std: float = 3.0,
) -> Optional[Alert]:
    """
    Detect unusual record counts that may indicate pipeline issues.
    """
    stats = spark.sql(f"""
        WITH daily_counts AS (
            SELECT
                {partition_column},
                COUNT(*) as record_count
            FROM {table}
            WHERE {partition_column} >= current_date() - INTERVAL {lookback_days} DAYS
            GROUP BY {partition_column}
        ),
        statistics AS (
            SELECT
                AVG(record_count) as mean_count,
                STDDEV(record_count) as std_count
            FROM daily_counts
            WHERE {partition_column} < current_date()  -- Exclude today for baseline
        )
        SELECT
            dc.record_count as today_count,
            s.mean_count,
            s.std_count,
            ABS(dc.record_count - s.mean_count) / NULLIF(s.std_count, 0) as z_score
        FROM daily_counts dc, statistics s
        WHERE dc.{partition_column} = current_date()
    """).collect()[0]

    if stats["z_score"] and stats["z_score"] > threshold_std:
        direction = "high" if stats["today_count"] > stats["mean_count"] else "low"
        return Alert(
            severity="warning",
            message=f"Anomaly in {table}: {stats['today_count']} records ({direction}), "
                    f"expected ~{stats['mean_count']:.0f} ± {stats['std_count']:.0f}",
        )

    return None
```

---

## Testing Strategy

### Unit Tests for Transformations

```python
import pytest
from pyspark.sql import SparkSession
from chispa import assert_df_equality

@pytest.fixture(scope="session")
def spark():
    return SparkSession.builder.master("local[*]").getOrCreate()

class TestOrderTransformations:

    def test_calculate_order_total(self, spark):
        """Test that order totals are calculated correctly."""
        input_df = spark.createDataFrame([
            {"order_id": "1", "quantity": 2, "unit_price": 10.00},
            {"order_id": "2", "quantity": 3, "unit_price": 5.50},
        ])

        expected_df = spark.createDataFrame([
            {"order_id": "1", "quantity": 2, "unit_price": 10.00, "total": 20.00},
            {"order_id": "2", "quantity": 3, "unit_price": 5.50, "total": 16.50},
        ])

        result_df = calculate_order_total(input_df)

        assert_df_equality(result_df, expected_df, ignore_row_order=True)

    def test_filter_valid_orders(self, spark):
        """Test that invalid orders are filtered out."""
        input_df = spark.createDataFrame([
            {"order_id": "1", "quantity": 2, "status": "confirmed"},
            {"order_id": "2", "quantity": 0, "status": "confirmed"},  # Invalid: zero quantity
            {"order_id": "3", "quantity": 1, "status": "cancelled"},  # Invalid: cancelled
        ])

        result_df = filter_valid_orders(input_df)

        assert result_df.count() == 1
        assert result_df.collect()[0]["order_id"] == "1"

    def test_handles_null_values(self, spark):
        """Test graceful handling of null values."""
        input_df = spark.createDataFrame([
            {"order_id": "1", "customer_email": None},
            {"order_id": "2", "customer_email": "test@example.com"},
        ])

        result_df = enrich_customer_data(input_df)

        # Should not raise, should handle nulls gracefully
        assert result_df.filter("order_id = '1'").collect()[0]["email_domain"] is None
```
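
The transformations under test (`calculate_order_total` and friends) live in the pipeline code, not in this guide. A minimal sketch of one of them shows the shape these tests assume: a pure DataFrame-in, DataFrame-out function that runs in local mode.

```python
# Hypothetical transformation matching test_calculate_order_total above.
from pyspark.sql import DataFrame, functions as F

def calculate_order_total(df: DataFrame) -> DataFrame:
    # Derive the line total; no I/O, so the function is trivially unit-testable.
    return df.withColumn("total", F.col("quantity") * F.col("unit_price"))
```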

### Integration Tests for Pipelines

```python
@pytest.fixture(scope="class")
def test_database(spark):
    """Set up isolated test database."""
    spark.sql("CREATE DATABASE IF NOT EXISTS test_data_platform")
    yield "test_data_platform"
    spark.sql("DROP DATABASE test_data_platform CASCADE")

class TestOrdersPipeline:

    def test_end_to_end_pipeline(self, spark, test_database):
        """Test full pipeline from raw to mart."""
        # Arrange: Create test data in raw layer
        raw_orders = spark.createDataFrame([
            {"id": "1", "customer_id": "C1", "amount": 100.0, "order_date": "2024-01-15"},
            {"id": "2", "customer_id": "C1", "amount": 50.0, "order_date": "2024-01-15"},
            {"id": "3", "customer_id": "C2", "amount": 200.0, "order_date": "2024-01-15"},
        ])
        raw_orders.write.mode("overwrite").saveAsTable(f"{test_database}.raw_orders")

        # Act: Run pipeline
        run_orders_pipeline(
            source_table=f"{test_database}.raw_orders",
            target_table=f"{test_database}.curated_orders",
            execution_date=date(2024, 1, 15),
        )

        # Assert: Verify output
        result = spark.table(f"{test_database}.curated_orders")

        assert result.count() == 3
        assert result.filter("customer_id = 'C1'").count() == 2

        # Verify data quality columns added
        assert "_loaded_at" in result.columns
        assert "_source_file" in result.columns

    def test_idempotency(self, spark, test_database):
        """Verify pipeline produces same result on re-run."""
        # Run pipeline twice
        for _ in range(2):
            run_orders_pipeline(
                source_table=f"{test_database}.raw_orders",
                target_table=f"{test_database}.curated_orders",
                execution_date=date(2024, 1, 15),
            )

        # Should have same count, not doubled
        result = spark.table(f"{test_database}.curated_orders")
        assert result.count() == 3
```

### Data Contract Tests

```python
from pyspark.sql.types import DateType, DecimalType, StringType

def test_schema_compatibility():
    """Ensure schema changes don't break downstream consumers."""
    current_schema = spark.table("curated.orders").schema

    # Required columns that consumers depend on
    required_columns = {
        "order_id": StringType(),
        "customer_id": StringType(),
        "order_date": DateType(),
        "total_amount": DecimalType(12, 2),
    }

    for col_name, expected_type in required_columns.items():
        assert col_name in [f.name for f in current_schema.fields], \
            f"Required column {col_name} missing from schema"

        actual_type = current_schema[col_name].dataType
        assert actual_type == expected_type, \
            f"Column {col_name} type changed: {actual_type} != {expected_type}"
```

---

## Performance Optimization

### Partitioning Strategy

```python
# Good: Partition by query patterns
(orders_df
    .write
    .partitionBy("order_date")  # Most queries filter by date
    .option("maxRecordsPerFile", 1_000_000)
    .saveAsTable("curated.orders"))

# Bad: Over-partitioning creates small files
(orders_df
    .write
    .partitionBy("order_date", "customer_id", "product_id")  # Too many partitions!
    .saveAsTable("curated.orders"))

# Optimize file sizes for Delta
spark.sql("""
    OPTIMIZE curated.orders
    ZORDER BY (customer_id)  -- Co-locate data for common join key
""")
```

### Query Optimization

```python
# Good: Predicate pushdown works
orders = spark.read.table("curated.orders").filter("order_date = '2024-01-15'")

# Bad: Predicate pushdown blocked by UDF
@udf(returnType=BooleanType())
def is_recent(date):
    return date > datetime.now() - timedelta(days=7)

orders = spark.read.table("curated.orders").filter(is_recent(F.col("order_date")))  # Full scan!

# Good: Use native functions instead
orders = spark.read.table("curated.orders").filter(
    F.col("order_date") > F.date_sub(F.current_date(), 7)
)
```

### Caching Strategy

```python
def process_with_caching(spark: SparkSession) -> None:
    """
    Cache intermediate results that are reused multiple times.
    """
    # Read once, use multiple times
    base_orders = (
        spark.read.table("curated.orders")
        .filter("order_date >= '2024-01-01'")
        .cache()  # Cache in memory
    )

    try:
        # Multiple aggregations on same data
        daily_totals = base_orders.groupBy("order_date").agg(F.sum("total_amount"))
        customer_totals = base_orders.groupBy("customer_id").agg(F.sum("total_amount"))
        product_totals = base_orders.groupBy("product_id").agg(F.sum("total_amount"))

        # Write all outputs
        daily_totals.write.mode("overwrite").saveAsTable("marts.daily_totals")
        customer_totals.write.mode("overwrite").saveAsTable("marts.customer_totals")
        product_totals.write.mode("overwrite").saveAsTable("marts.product_totals")
    finally:
        base_orders.unpersist()  # Always clean up
```

### Cost Management

```sql
-- Monitor compute costs by pipeline
SELECT
    pipeline_name,
    SUM(total_task_duration_ms) / 1000 / 60 as compute_minutes,
    SUM(bytes_spilled_to_disk) / 1e9 as disk_spill_gb,
    COUNT(*) as runs
FROM pipeline_metrics
WHERE run_date >= current_date - 7
GROUP BY pipeline_name
ORDER BY compute_minutes DESC;

-- Identify expensive queries
SELECT
    query_hash,
    AVG(execution_time_ms) as avg_time_ms,
    AVG(bytes_scanned) / 1e9 as avg_gb_scanned,
    COUNT(*) as executions
FROM query_history
WHERE timestamp >= current_date - 7
GROUP BY query_hash
ORDER BY avg_gb_scanned DESC
LIMIT 20;
```

---

## Security & Governance

### PII Handling

```python
from cryptography.fernet import Fernet

class PIIHandler:
    """Handle PII data securely."""

    ENCRYPTION_KEY = os.environ["PII_ENCRYPTION_KEY"]

    PII_COLUMNS = {
        "email": "hash",      # One-way hash for matching
        "phone": "encrypt",   # Reversible encryption
        "ssn": "encrypt",
        "name": "tokenize",   # Replace with token
    }

    @classmethod
    def process_pii(cls, df: DataFrame) -> DataFrame:
        """Apply appropriate PII handling to each column."""
        for column, method in cls.PII_COLUMNS.items():
            if column in df.columns:
                if method == "hash":
                    df = df.withColumn(column, F.sha2(F.col(column), 256))
                elif method == "encrypt":
                    df = df.withColumn(column, cls._encrypt_udf(F.col(column)))
                elif method == "tokenize":
                    df = df.withColumn(column, cls._tokenize_udf(F.col(column)))
        return df

    @staticmethod
    @udf(returnType=StringType())
    def _encrypt_udf(value: str) -> Optional[str]:
        if value is None:
            return None
        cipher = Fernet(PIIHandler.ENCRYPTION_KEY.encode())
        return cipher.encrypt(value.encode()).decode()
```
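
`_tokenize_udf` is referenced above but not shown. One minimal sketch, assuming a keyed HMAC is acceptable as a deterministic, non-reversible token (it would sit next to `_encrypt_udf` as another `@staticmethod`; the `PII_TOKEN_KEY` variable is an assumption):

```python
# Hypothetical tokenizer: deterministic token derived from a secret key, so
# joins on tokenized columns still line up across tables without exposing PII.
import hashlib
import hmac
import os
from typing import Optional

from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

@udf(returnType=StringType())
def _tokenize_udf(value: Optional[str]) -> Optional[str]:
    if value is None:
        return None
    key = os.environ["PII_TOKEN_KEY"].encode()  # assumed secret, distinct from the Fernet key
    digest = hmac.new(key, value.encode(), hashlib.sha256).hexdigest()
    return f"tok_{digest[:32]}"
```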

### Row-Level Security

```sql
-- Create view with row-level security
CREATE OR REPLACE VIEW secure_views.orders AS
SELECT *
FROM curated.orders
WHERE
    -- Admins see all
    IS_ACCOUNT_GROUP_MEMBER('data_admins')
    OR
    -- Regional managers see their region only
    (IS_ACCOUNT_GROUP_MEMBER('regional_managers')
        AND region = CURRENT_USER_ATTRIBUTE('region'))
    OR
    -- Analysts see all rows; PII columns are masked by separate column-level controls
    (IS_ACCOUNT_GROUP_MEMBER('analysts'));
```

### Audit Logging

```python
def log_data_access(
    user: str,
    table: str,
    operation: str,
    row_count: int,
    filters: dict,
) -> None:
    """
    Log all data access for compliance and security.
    """
    audit_record = {
        "timestamp": datetime.utcnow().isoformat(),
        "user": user,
        "table": table,
        "operation": operation,
        "row_count": row_count,
        "filters": json.dumps(filters),
        "client_ip": get_client_ip(),
        "session_id": get_session_id(),
    }

    spark.createDataFrame([audit_record]).write.mode("append").saveAsTable("audit.data_access_log")
```

### Data Classification

```python
DATA_CLASSIFICATION = {
    "public": {
        "description": "Non-sensitive, can be shared externally",
        "retention_days": None,
        "encryption": False,
    },
    "internal": {
        "description": "Business data, internal use only",
        "retention_days": 365 * 7,
        "encryption": False,
    },
    "confidential": {
        "description": "Sensitive business data",
        "retention_days": 365 * 3,
        "encryption": True,
    },
    "restricted": {
        "description": "PII, financial, or regulated data",
        "retention_days": 365,  # Or as required by regulation
        "encryption": True,
        "access_logging": True,
        "masking_required": True,
    },
}
```
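
The classification map only has teeth if something enforces it. A sketch of a retention sweep, assuming each table's classification and timestamp column are tracked in a hypothetical `governance.table_catalog` table:

```python
# Hypothetical retention sweep: delete rows older than the retention window
# defined by each table's classification level.
def enforce_retention() -> None:
    catalog = spark.table("governance.table_catalog").collect()  # table, classification, timestamp_column
    for entry in catalog:
        policy = DATA_CLASSIFICATION[entry["classification"]]
        retention_days = policy["retention_days"]
        if retention_days is None:
            continue  # e.g. "public" data has no retention limit
        spark.sql(f"""
            DELETE FROM {entry['table']}
            WHERE {entry['timestamp_column']} < current_date() - INTERVAL {retention_days} DAYS
        """)
```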

---

## Observability

### Pipeline Metrics

```python
@dataclass
class PipelineMetrics:
    pipeline_name: str
    run_id: str
    start_time: datetime
    end_time: datetime
    status: str  # "success" | "failed" | "skipped"
    records_read: int
    records_written: int
    bytes_processed: int
    error_message: Optional[str] = None

def emit_metrics(metrics: PipelineMetrics) -> None:
    """Send metrics to monitoring system."""
    # To Prometheus/StatsD
    statsd.gauge(f"pipeline.duration_seconds.{metrics.pipeline_name}",
                 (metrics.end_time - metrics.start_time).total_seconds())
    statsd.gauge(f"pipeline.records_written.{metrics.pipeline_name}",
                 metrics.records_written)

    # To data catalog/lineage
    spark.createDataFrame([asdict(metrics)]).write.mode("append").saveAsTable("metrics.pipeline_runs")
```
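
One convenient way to guarantee metrics are emitted for every run, success or failure, is a small context manager around the pipeline body. A sketch using the `PipelineMetrics` dataclass above (record counters are stubbed to zero here):

```python
# Sketch: wrap a pipeline run so emit_metrics() always fires, even on failure.
import uuid
from contextlib import contextmanager
from datetime import date, datetime

@contextmanager
def track_pipeline(pipeline_name: str):
    start = datetime.utcnow()
    status, error = "success", None
    try:
        yield
    except Exception as exc:
        status, error = "failed", str(exc)
        raise
    finally:
        emit_metrics(PipelineMetrics(
            pipeline_name=pipeline_name,
            run_id=str(uuid.uuid4()),
            start_time=start,
            end_time=datetime.utcnow(),
            status=status,
            records_read=0,       # wire real counters in practice
            records_written=0,
            bytes_processed=0,
            error_message=error,
        ))

with track_pipeline("daily_orders"):
    process_daily_orders(date(2024, 1, 15))
```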

### Alerting Rules

```yaml
# alerts.yaml
alerts:
  - name: pipeline_failure
    condition: status == "failed"
    severity: critical
    channels: [pagerduty, slack]
    message: "Pipeline {pipeline_name} failed: {error_message}"

  - name: data_freshness_sla
    condition: freshness_hours > sla_hours
    severity: high
    channels: [slack]
    message: "Table {table} is {freshness_hours}h stale (SLA: {sla_hours}h)"

  - name: volume_anomaly
    condition: abs(z_score) > 3
    severity: warning
    channels: [slack]
    message: "Unusual volume in {table}: {record_count} records (expected: {expected})"

  - name: cost_spike
    condition: daily_cost > 1.5 * avg_daily_cost
    severity: warning
    channels: [slack]
    message: "Cost spike detected: ${daily_cost} (avg: ${avg_daily_cost})"
```
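
The YAML is declarative; something still has to load it, evaluate each condition against current metric values, and fan out to channels. A minimal sketch of that loop (the channel router is hypothetical, and `eval` is only tolerable because `alerts.yaml` is trusted, reviewed config):

```python
# Sketch of an alert evaluator for alerts.yaml: conditions are evaluated
# against a dict of current metric values, messages are formatted from it.
import yaml

def evaluate_alerts(metric_values: dict) -> None:
    with open("alerts.yaml") as fh:
        rules = yaml.safe_load(fh)["alerts"]

    for rule in rules:
        # Conditions reference metric names directly (status, z_score, daily_cost, ...)
        if eval(rule["condition"], {"abs": abs}, metric_values):
            message = rule["message"].format(**metric_values)
            for channel in rule["channels"]:
                send_to_channel(channel, rule["severity"], message)  # hypothetical router
```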

---

## Definition of Done

A data pipeline is complete when:

### Functionality
- [ ] Pipeline produces correct output for all test cases
- [ ] Idempotency verified (re-run produces same result)
- [ ] Handles edge cases (nulls, empty batches, duplicates)
- [ ] Incremental logic works correctly
- [ ] Backfill capability tested

### Data Quality
- [ ] Schema documented and versioned
- [ ] Validation rules implemented
- [ ] Data quality checks pass
- [ ] Freshness SLA defined and monitored
- [ ] Anomaly detection configured

### Testing
- [ ] Unit tests for transformations (>80% coverage)
- [ ] Integration tests for end-to-end flow
- [ ] Data contract tests for schema
- [ ] Performance benchmarks documented

### Observability
- [ ] Logging implemented (start, end, errors, metrics)
- [ ] Metrics emitted to monitoring system
- [ ] Alerts configured for failures and SLA breaches
- [ ] Runbook/playbook documented

### Security & Compliance
- [ ] PII handled appropriately
- [ ] Access controls configured
- [ ] Audit logging enabled for sensitive data
- [ ] Retention policy applied

### Operations
- [ ] Pipeline registered in orchestrator
- [ ] Dependencies documented
- [ ] Recovery procedure tested
- [ ] Cost estimate documented

---

## Common Pitfalls

### 1. Non-Idempotent Pipelines

```python
# Bad: Appends every run, creates duplicates
df.write.mode("append").saveAsTable("target")

# Good: Delete-insert or merge for idempotency
spark.sql(f"DELETE FROM target WHERE date = '{execution_date}'")
df.write.mode("append").saveAsTable("target")
```

### 2. Ignoring Late-Arriving Data

```python
# Bad: Only process today's data
df.filter("event_date = current_date()")

# Good: Reprocess recent window for late arrivals
df.filter("event_date >= current_date() - INTERVAL 3 DAYS")
```

### 3. Schema Evolution Without Contracts

```python
# Bad: No schema enforcement
df.write.mode("overwrite").saveAsTable("output")

# Good: Enforce schema, fail on unexpected changes
df.write.option("mergeSchema", "false").mode("overwrite").saveAsTable("output")
```

### 4. Missing Partition Pruning

```sql
-- Bad: Filter on derived column prevents pruning
SELECT * FROM orders WHERE YEAR(order_date) = 2024

-- Good: Filter directly on partition column
SELECT * FROM orders WHERE order_date >= '2024-01-01' AND order_date < '2025-01-01'
```

### 5. Inadequate Testing

```python
# Bad: Only happy path
def test_pipeline():
    result = run_pipeline(sample_data)
    assert result.count() > 0

# Good: Test edge cases
def test_pipeline_handles_nulls(): ...
def test_pipeline_handles_duplicates(): ...
def test_pipeline_handles_empty_input(): ...
def test_pipeline_is_idempotent(): ...
```

---

## Resources

- [Delta Lake Documentation](https://docs.delta.io/)
- [Apache Spark Best Practices](https://spark.apache.org/docs/latest/sql-performance-tuning.html)
- [DBT Best Practices](https://docs.getdbt.com/guides/best-practices)
- [Great Expectations](https://docs.greatexpectations.io/)
- [Data Engineering Patterns](https://www.dedp.online/)
- [The Data Warehouse Toolkit (Kimball)](https://www.kimballgroup.com/data-warehouse-business-intelligence-resources/kimball-techniques/)