agentic-team-templates 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +280 -0
- package/bin/cli.js +5 -0
- package/package.json +47 -0
- package/src/index.js +521 -0
- package/templates/_shared/code-quality.md +162 -0
- package/templates/_shared/communication.md +114 -0
- package/templates/_shared/core-principles.md +62 -0
- package/templates/_shared/git-workflow.md +165 -0
- package/templates/_shared/security-fundamentals.md +173 -0
- package/templates/blockchain/.cursorrules/defi-patterns.md +520 -0
- package/templates/blockchain/.cursorrules/gas-optimization.md +339 -0
- package/templates/blockchain/.cursorrules/overview.md +130 -0
- package/templates/blockchain/.cursorrules/security.md +318 -0
- package/templates/blockchain/.cursorrules/smart-contracts.md +364 -0
- package/templates/blockchain/.cursorrules/testing.md +415 -0
- package/templates/blockchain/.cursorrules/web3-integration.md +538 -0
- package/templates/blockchain/CLAUDE.md +389 -0
- package/templates/cli-tools/.cursorrules/architecture.md +412 -0
- package/templates/cli-tools/.cursorrules/arguments.md +406 -0
- package/templates/cli-tools/.cursorrules/distribution.md +546 -0
- package/templates/cli-tools/.cursorrules/error-handling.md +455 -0
- package/templates/cli-tools/.cursorrules/overview.md +136 -0
- package/templates/cli-tools/.cursorrules/testing.md +537 -0
- package/templates/cli-tools/.cursorrules/user-experience.md +545 -0
- package/templates/cli-tools/CLAUDE.md +356 -0
- package/templates/data-engineering/.cursorrules/data-modeling.md +367 -0
- package/templates/data-engineering/.cursorrules/data-quality.md +455 -0
- package/templates/data-engineering/.cursorrules/overview.md +85 -0
- package/templates/data-engineering/.cursorrules/performance.md +339 -0
- package/templates/data-engineering/.cursorrules/pipeline-design.md +280 -0
- package/templates/data-engineering/.cursorrules/security.md +460 -0
- package/templates/data-engineering/.cursorrules/testing.md +452 -0
- package/templates/data-engineering/CLAUDE.md +974 -0
- package/templates/devops-sre/.cursorrules/capacity-planning.md +653 -0
- package/templates/devops-sre/.cursorrules/change-management.md +584 -0
- package/templates/devops-sre/.cursorrules/chaos-engineering.md +651 -0
- package/templates/devops-sre/.cursorrules/disaster-recovery.md +641 -0
- package/templates/devops-sre/.cursorrules/incident-management.md +565 -0
- package/templates/devops-sre/.cursorrules/observability.md +714 -0
- package/templates/devops-sre/.cursorrules/overview.md +230 -0
- package/templates/devops-sre/.cursorrules/postmortems.md +588 -0
- package/templates/devops-sre/.cursorrules/runbooks.md +760 -0
- package/templates/devops-sre/.cursorrules/slo-sli.md +617 -0
- package/templates/devops-sre/.cursorrules/toil-reduction.md +567 -0
- package/templates/devops-sre/CLAUDE.md +1007 -0
- package/templates/documentation/.cursorrules/adr.md +277 -0
- package/templates/documentation/.cursorrules/api-documentation.md +411 -0
- package/templates/documentation/.cursorrules/code-comments.md +253 -0
- package/templates/documentation/.cursorrules/maintenance.md +260 -0
- package/templates/documentation/.cursorrules/overview.md +82 -0
- package/templates/documentation/.cursorrules/readme-standards.md +306 -0
- package/templates/documentation/CLAUDE.md +120 -0
- package/templates/fullstack/.cursorrules/api-contracts.md +331 -0
- package/templates/fullstack/.cursorrules/architecture.md +298 -0
- package/templates/fullstack/.cursorrules/overview.md +109 -0
- package/templates/fullstack/.cursorrules/shared-types.md +348 -0
- package/templates/fullstack/.cursorrules/testing.md +386 -0
- package/templates/fullstack/CLAUDE.md +349 -0
- package/templates/ml-ai/.cursorrules/data-engineering.md +483 -0
- package/templates/ml-ai/.cursorrules/deployment.md +601 -0
- package/templates/ml-ai/.cursorrules/model-development.md +538 -0
- package/templates/ml-ai/.cursorrules/monitoring.md +658 -0
- package/templates/ml-ai/.cursorrules/overview.md +131 -0
- package/templates/ml-ai/.cursorrules/security.md +637 -0
- package/templates/ml-ai/.cursorrules/testing.md +678 -0
- package/templates/ml-ai/CLAUDE.md +1136 -0
- package/templates/mobile/.cursorrules/navigation.md +246 -0
- package/templates/mobile/.cursorrules/offline-first.md +302 -0
- package/templates/mobile/.cursorrules/overview.md +71 -0
- package/templates/mobile/.cursorrules/performance.md +345 -0
- package/templates/mobile/.cursorrules/testing.md +339 -0
- package/templates/mobile/CLAUDE.md +233 -0
- package/templates/platform-engineering/.cursorrules/ci-cd.md +778 -0
- package/templates/platform-engineering/.cursorrules/developer-experience.md +632 -0
- package/templates/platform-engineering/.cursorrules/infrastructure-as-code.md +600 -0
- package/templates/platform-engineering/.cursorrules/kubernetes.md +710 -0
- package/templates/platform-engineering/.cursorrules/observability.md +747 -0
- package/templates/platform-engineering/.cursorrules/overview.md +215 -0
- package/templates/platform-engineering/.cursorrules/security.md +855 -0
- package/templates/platform-engineering/.cursorrules/testing.md +878 -0
- package/templates/platform-engineering/CLAUDE.md +850 -0
- package/templates/utility-agent/.cursorrules/action-control.md +284 -0
- package/templates/utility-agent/.cursorrules/context-management.md +186 -0
- package/templates/utility-agent/.cursorrules/hallucination-prevention.md +253 -0
- package/templates/utility-agent/.cursorrules/overview.md +78 -0
- package/templates/utility-agent/.cursorrules/token-optimization.md +369 -0
- package/templates/utility-agent/CLAUDE.md +513 -0
- package/templates/web-backend/.cursorrules/api-design.md +255 -0
- package/templates/web-backend/.cursorrules/authentication.md +309 -0
- package/templates/web-backend/.cursorrules/database-patterns.md +298 -0
- package/templates/web-backend/.cursorrules/error-handling.md +366 -0
- package/templates/web-backend/.cursorrules/overview.md +69 -0
- package/templates/web-backend/.cursorrules/security.md +358 -0
- package/templates/web-backend/.cursorrules/testing.md +395 -0
- package/templates/web-backend/CLAUDE.md +366 -0
- package/templates/web-frontend/.cursorrules/accessibility.md +296 -0
- package/templates/web-frontend/.cursorrules/component-patterns.md +204 -0
- package/templates/web-frontend/.cursorrules/overview.md +72 -0
- package/templates/web-frontend/.cursorrules/performance.md +325 -0
- package/templates/web-frontend/.cursorrules/state-management.md +227 -0
- package/templates/web-frontend/.cursorrules/styling.md +271 -0
- package/templates/web-frontend/.cursorrules/testing.md +311 -0
- package/templates/web-frontend/CLAUDE.md +399 -0
+++ package/templates/data-engineering/CLAUDE.md
@@ -0,0 +1,974 @@
# Data Engineering Development Guide

Staff-level guidelines for building robust, scalable data platforms and pipelines.

---

## Overview

This guide applies to:

- Batch and streaming data pipelines
- Data warehouses and lakehouses
- ETL/ELT orchestration
- Real-time data processing
- Data platform infrastructure
- Analytics engineering

### Key Principles

1. **Idempotency Is Non-Negotiable** - Every pipeline must produce the same result on re-run
2. **Data Quality Is a Feature** - Validate early, monitor continuously, alert proactively
3. **Schema Is a Contract** - Breaking changes require coordination and versioning
4. **Observability Over Debugging** - Instrument everything, debug nothing in production
5. **Cost-Aware Engineering** - Compute and storage have real costs; optimize deliberately

### Technology Stack

| Layer | Technologies |
|-------|--------------|
| Orchestration | Airflow, Dagster, Prefect, Temporal |
| Batch Processing | Spark, DBT, Pandas, Polars |
| Stream Processing | Kafka, Flink, Spark Streaming, Pulsar |
| Storage | Delta Lake, Iceberg, Parquet, S3/GCS/ADLS |
| Warehouses | Snowflake, BigQuery, Redshift, Databricks |
| Quality | Great Expectations, Soda, DBT Tests, Monte Carlo |
| Metadata | DataHub, Atlan, OpenMetadata, Unity Catalog |

---

## Project Structure

```
data-platform/
├── pipelines/              # Pipeline definitions
│   ├── ingestion/          # Source → Raw layer
│   ├── transformation/     # Raw → Curated layer
│   └── serving/            # Curated → Consumption layer
├── models/                 # DBT or Spark SQL models
│   ├── staging/            # 1:1 source mappings
│   ├── intermediate/       # Business logic transforms
│   └── marts/              # Consumption-ready tables
├── schemas/                # Schema definitions & contracts
│   ├── avro/
│   ├── protobuf/
│   └── json-schema/
├── quality/                # Data quality checks
│   ├── expectations/       # Great Expectations suites
│   └── tests/              # DBT tests
├── infrastructure/         # IaC for data platform
│   ├── terraform/
│   └── kubernetes/
├── scripts/                # Utility scripts
├── tests/                  # Pipeline tests
│   ├── unit/
│   └── integration/
└── docs/                   # Documentation
    └── data-dictionary/
```

---

## Pipeline Design Patterns

### Idempotent Pipeline Pattern

```python
def process_daily_orders(execution_date: date) -> None:
    """
    Idempotent pipeline: safe to re-run any number of times.

    Key principles:
    1. Delete-then-insert for the partition being processed
    2. Use execution_date, not current timestamp
    3. No side effects outside the target partition
    """
    partition = execution_date.strftime("%Y-%m-%d")

    # 1. Clear target partition (idempotency)
    spark.sql(f"""
        DELETE FROM curated.orders
        WHERE order_date = '{partition}'
    """)

    # 2. Process source data for this partition only
    orders_df = (
        spark.read.table("raw.orders")
        .filter(F.col("order_date") == partition)
        .transform(validate_orders)
        .transform(enrich_orders)
        .transform(apply_business_rules)
    )

    # 3. Write to target partition
    (orders_df
        .write
        .mode("append")
        .partitionBy("order_date")
        .saveAsTable("curated.orders"))
```
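
If the target is a Delta table, the delete-then-insert pair can often be collapsed into a single atomic write with `replaceWhere`. A minimal sketch, assuming the same table and partition column as above:

```python
# Sketch only: Delta's replaceWhere overwrites just the partition being
# processed in one atomic operation, keeping re-runs idempotent without a
# separate DELETE step.
def overwrite_partition(orders_df: DataFrame, partition: str) -> None:
    (orders_df
        .write
        .format("delta")
        .mode("overwrite")
        .option("replaceWhere", f"order_date = '{partition}'")
        .saveAsTable("curated.orders"))
```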

### Streaming with Exactly-Once Semantics

```python
def process_events_stream() -> None:
    """
    Streaming pipeline with exactly-once guarantees.

    Key principles:
    1. Checkpoint for fault tolerance
    2. Idempotent sink operations
    3. Watermarking for late data handling
    """
    events = (
        spark.readStream
        .format("kafka")
        .option("kafka.bootstrap.servers", KAFKA_BROKERS)
        .option("subscribe", "user-events")
        .option("startingOffsets", "earliest")
        .load()
    )

    processed = (
        events
        .select(F.from_json(F.col("value").cast("string"), schema).alias("data"))
        .select("data.*")
        .withWatermark("event_time", "1 hour")  # Handle late arrivals
        .groupBy(
            F.window("event_time", "5 minutes"),
            "user_id"
        )
        .agg(F.count("*").alias("event_count"))
    )

    (processed
        .writeStream
        .format("delta")
        .outputMode("append")
        .option("checkpointLocation", CHECKPOINT_PATH)
        .trigger(processingTime="1 minute")
        .toTable("curated.user_activity"))
```
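
The streaming example assumes a few module-level definitions (`KAFKA_BROKERS`, `CHECKPOINT_PATH`, and the event `schema`). A minimal sketch of what they might look like; the broker list, path, and field names are placeholders, not part of the original guide:

```python
from pyspark.sql.types import StructField, StructType, StringType, TimestampType

KAFKA_BROKERS = "kafka-1:9092,kafka-2:9092"                      # placeholder broker list
CHECKPOINT_PATH = "s3://data-platform/checkpoints/user-events"   # must be durable storage

# event_time drives the watermark; the other fields are illustrative
schema = StructType([
    StructField("user_id", StringType(), nullable=False),
    StructField("event_type", StringType(), nullable=True),
    StructField("event_time", TimestampType(), nullable=False),
])
```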

### Incremental Processing Pattern

```python
def incremental_load(
    source_table: str,
    target_table: str,
    watermark_column: str,
) -> None:
    """
    Efficient incremental loads using a high watermark.

    Key principles:
    1. Track last processed watermark
    2. Process only new/changed records
    3. Handle both inserts and updates (CDC)
    """
    # Get high watermark from previous run
    last_watermark = get_watermark(target_table, watermark_column)

    # Read only new records
    new_records = (
        spark.read.table(source_table)
        .filter(F.col(watermark_column) > last_watermark)
    )

    if new_records.isEmpty():
        logger.info("No new records to process")
        return

    # Merge into target (upsert pattern)
    target = DeltaTable.forName(spark, target_table)

    (target.alias("target")
        .merge(
            new_records.alias("source"),
            "target.id = source.id"
        )
        .whenMatchedUpdateAll()
        .whenNotMatchedInsertAll()
        .execute())

    # Update watermark with the max value actually processed
    new_watermark = new_records.agg(F.max(watermark_column)).collect()[0][0]
    set_watermark(target_table, watermark_column, new_watermark)
```
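
`get_watermark` and `set_watermark` are left undefined above. One possible implementation, assuming a small Delta control table `_meta.watermarks(table_name, column_name, value)`; the table name and string-typed value column are assumptions:

```python
DEFAULT_WATERMARK = "1970-01-01 00:00:00"  # process everything on the first run

def get_watermark(table: str, column: str) -> str:
    rows = (spark.table("_meta.watermarks")
            .filter((F.col("table_name") == table) & (F.col("column_name") == column))
            .select("value")
            .collect())
    return rows[0]["value"] if rows else DEFAULT_WATERMARK

def set_watermark(table: str, column: str, value) -> None:
    update = spark.createDataFrame(
        [(table, column, str(value))],
        "table_name string, column_name string, value string",
    )
    update.createOrReplaceTempView("watermark_update")
    spark.sql("""
        MERGE INTO _meta.watermarks AS t
        USING watermark_update AS s
        ON t.table_name = s.table_name AND t.column_name = s.column_name
        WHEN MATCHED THEN UPDATE SET t.value = s.value
        WHEN NOT MATCHED THEN INSERT *
    """)
```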

---

## Data Modeling

### Layered Architecture (Medallion)

| Layer | Purpose | Freshness SLA | Example |
|-------|---------|---------------|---------|
| **Bronze/Raw** | Exact copy of source | Minutes | `raw.salesforce_accounts` |
| **Silver/Curated** | Cleaned, validated, typed | Hours | `curated.accounts` |
| **Gold/Marts** | Business-ready aggregates | Daily | `marts.account_metrics` |
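
A minimal sketch of one source moving through the three layers, using the table names from the example column above; the cleaning and aggregation logic is illustrative only:

```python
# Bronze: land the source as-is, plus load metadata
raw = (spark.read.format("json").load("s3://landing/salesforce/accounts/")
       .withColumn("_loaded_at", F.current_timestamp()))
raw.write.mode("append").saveAsTable("raw.salesforce_accounts")

# Silver: enforce types, keep valid rows, deduplicate on the business key
curated = (spark.table("raw.salesforce_accounts")
           .withColumn("created_at", F.to_timestamp("created_at"))
           .filter(F.col("account_id").isNotNull())
           .dropDuplicates(["account_id"]))
curated.write.mode("overwrite").saveAsTable("curated.accounts")

# Gold: business-ready aggregate for consumption
(spark.table("curated.accounts")
    .groupBy("segment")
    .agg(F.count("*").alias("account_count"))
    .write.mode("overwrite").saveAsTable("marts.account_metrics"))
```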

### Dimensional Modeling

```sql
-- Fact table: Immutable events with foreign keys
CREATE TABLE facts.orders (
    order_id        STRING NOT NULL,
    order_date      DATE NOT NULL,
    customer_key    BIGINT NOT NULL,      -- FK to dimension
    product_key     BIGINT NOT NULL,      -- FK to dimension
    quantity        INT NOT NULL,
    unit_price      DECIMAL(10,2) NOT NULL,
    total_amount    DECIMAL(12,2) NOT NULL,
    -- Metadata
    _loaded_at      TIMESTAMP NOT NULL,
    _source_file    STRING NOT NULL
)
USING DELTA
PARTITIONED BY (order_date)
TBLPROPERTIES ('delta.autoOptimize.optimizeWrite' = 'true');

-- Dimension table: Type 2 SCD for history tracking
CREATE TABLE dims.customers (
    customer_key    BIGINT GENERATED ALWAYS AS IDENTITY,
    customer_id     STRING NOT NULL,
    name            STRING NOT NULL,
    email           STRING,
    segment         STRING,
    -- SCD Type 2 columns
    effective_from  DATE NOT NULL,
    effective_to    DATE,
    is_current      BOOLEAN NOT NULL,
    -- Metadata
    _loaded_at      TIMESTAMP NOT NULL
)
USING DELTA;
```

### Type 2 Slowly Changing Dimension

```python
def apply_scd_type_2(
    spark: SparkSession,
    source_df: DataFrame,
    target_table: str,
    key_columns: list[str],
    tracked_columns: list[str],
) -> None:
    """
    Implement Type 2 SCD: Track full history of changes.

    - New records: Insert with is_current=True
    - Changed records: Close old record, insert new
    - Unchanged records: No action
    """
    target = DeltaTable.forName(spark, target_table)

    # Identify changes
    changes = (
        source_df.alias("source")
        .join(
            target.toDF().filter("is_current = true").alias("target"),
            on=key_columns,
            how="left"
        )
        .withColumn("_action",
            F.when(F.col("target.is_current").isNull(), "INSERT")  # no current row => new key
            .when(
                F.concat_ws("|", *[F.col(f"source.{c}") for c in tracked_columns]) !=
                F.concat_ws("|", *[F.col(f"target.{c}") for c in tracked_columns]),
                "UPDATE"
            )
            .otherwise("NONE")
        )
        .filter("_action != 'NONE'")
    )

    # Close old records
    (target.alias("target")
        .merge(
            changes.filter("_action = 'UPDATE'").alias("updates"),
            " AND ".join([f"target.{c} = updates.{c}" for c in key_columns]) +
            " AND target.is_current = true"
        )
        .whenMatchedUpdate(set={
            "effective_to": "current_date()",
            "is_current": "false"
        })
        .execute())

    # Insert new/changed records
    new_records = (
        changes
        .filter("_action IN ('INSERT', 'UPDATE')")
        .select(*[F.col(f"source.{c}") for c in source_df.columns])
        .withColumn("effective_from", F.current_date())
        .withColumn("effective_to", F.lit(None).cast("date"))
        .withColumn("is_current", F.lit(True))
        .withColumn("_loaded_at", F.current_timestamp())
    )

    new_records.write.mode("append").saveAsTable(target_table)
```
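
With the dimension maintained this way, fact loads can resolve the surrogate key with an "as of" join on the natural key. A sketch, assuming the incoming orders carry `customer_id` and `order_date`:

```python
def resolve_customer_key(orders_df: DataFrame) -> DataFrame:
    """Attach the customer_key that was current on each order's order_date."""
    dim = spark.table("dims.customers").alias("d")
    return (orders_df.alias("o")
        .join(
            dim,
            (F.col("o.customer_id") == F.col("d.customer_id")) &
            (F.col("o.order_date") >= F.col("d.effective_from")) &
            ((F.col("o.order_date") < F.col("d.effective_to")) | F.col("d.effective_to").isNull()),
            "left",
        )
        .select("o.*", F.col("d.customer_key")))
```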

---

## Data Quality

### Validation Framework

```python
from great_expectations.core import ExpectationSuite
from great_expectations.dataset import SparkDFDataset

def validate_orders(df: DataFrame) -> DataFrame:
    """
    Apply data quality checks. Fail fast on critical issues.
    """
    ge_df = SparkDFDataset(df)

    # Critical checks - pipeline fails if violated
    critical_results = [
        ge_df.expect_column_values_to_not_be_null("order_id"),
        ge_df.expect_column_values_to_not_be_null("customer_id"),
        ge_df.expect_column_values_to_be_between("quantity", min_value=1),  # quantity must be positive
        ge_df.expect_column_values_to_be_between("unit_price", min_value=0, max_value=100000),
    ]

    failures = [r for r in critical_results if not r.success]
    if failures:
        raise DataQualityError(f"Critical validation failed: {failures}")

    # Warning checks - log but continue
    warning_results = [
        ge_df.expect_column_values_to_match_regex("email", r"^[\w.-]+@[\w.-]+\.\w+$"),
        ge_df.expect_column_values_to_be_in_set("status", ["pending", "shipped", "delivered"]),
    ]

    for result in warning_results:
        if not result.success:
            logger.warning(f"Data quality warning: {result}")
            metrics.increment("data_quality.warnings")

    return df
```
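
The validation example assumes a `DataQualityError` exception and a `metrics` client with an `increment` method. Minimal stand-ins could look like the sketch below; the logger name and the shim itself are assumptions:

```python
import logging

logger = logging.getLogger("data_quality")

class DataQualityError(Exception):
    """Raised when a critical data quality expectation fails."""

class _LoggingMetrics:
    """Tiny shim; in practice this would forward to StatsD/Prometheus."""
    def increment(self, name: str, value: int = 1) -> None:
        logger.info("metric %s += %d", name, value)

metrics = _LoggingMetrics()
```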

### Data Freshness Monitoring

```python
@dataclass
class FreshnessCheck:
    table: str
    timestamp_column: str
    max_delay_hours: int
    severity: str  # "critical" | "warning"

FRESHNESS_CHECKS = [
    FreshnessCheck("curated.orders", "order_date", max_delay_hours=2, severity="critical"),
    FreshnessCheck("curated.inventory", "updated_at", max_delay_hours=1, severity="critical"),
    FreshnessCheck("marts.daily_sales", "report_date", max_delay_hours=24, severity="warning"),
]

def check_data_freshness() -> list[Alert]:
    """
    Monitor data freshness and alert on SLA violations.
    """
    alerts = []

    for check in FRESHNESS_CHECKS:
        max_timestamp = spark.sql(f"""
            SELECT MAX({check.timestamp_column}) as max_ts
            FROM {check.table}
        """).collect()[0]["max_ts"]

        delay_hours = (datetime.now() - max_timestamp).total_seconds() / 3600

        if delay_hours > check.max_delay_hours:
            alerts.append(Alert(
                severity=check.severity,
                message=f"Table {check.table} is {delay_hours:.1f}h stale (SLA: {check.max_delay_hours}h)",
                metric_name="data_freshness_delay_hours",
                metric_value=delay_hours,
            ))

    return alerts
```

### Anomaly Detection

```python
def detect_volume_anomalies(
    table: str,
    partition_column: str,
    lookback_days: int = 30,
    threshold_std: float = 3.0,
) -> Optional[Alert]:
    """
    Detect unusual record counts that may indicate pipeline issues.
    """
    stats = spark.sql(f"""
        WITH daily_counts AS (
            SELECT
                {partition_column},
                COUNT(*) as record_count
            FROM {table}
            WHERE {partition_column} >= current_date() - INTERVAL {lookback_days} DAYS
            GROUP BY {partition_column}
        ),
        statistics AS (
            SELECT
                AVG(record_count) as mean_count,
                STDDEV(record_count) as std_count
            FROM daily_counts
            WHERE {partition_column} < current_date()  -- Exclude today for baseline
        )
        SELECT
            dc.record_count as today_count,
            s.mean_count,
            s.std_count,
            ABS(dc.record_count - s.mean_count) / NULLIF(s.std_count, 0) as z_score
        FROM daily_counts dc, statistics s
        WHERE dc.{partition_column} = current_date()
    """).collect()[0]

    if stats["z_score"] and stats["z_score"] > threshold_std:
        direction = "high" if stats["today_count"] > stats["mean_count"] else "low"
        return Alert(
            severity="warning",
            message=f"Anomaly in {table}: {stats['today_count']} records ({direction}), "
                    f"expected ~{stats['mean_count']:.0f} ± {stats['std_count']:.0f}",
        )

    return None
```

---

## Testing Strategy

### Unit Tests for Transformations

```python
import pytest
from pyspark.sql import SparkSession
from chispa import assert_df_equality

@pytest.fixture(scope="session")
def spark():
    return SparkSession.builder.master("local[*]").getOrCreate()

class TestOrderTransformations:

    def test_calculate_order_total(self, spark):
        """Test that order totals are calculated correctly."""
        input_df = spark.createDataFrame([
            {"order_id": "1", "quantity": 2, "unit_price": 10.00},
            {"order_id": "2", "quantity": 3, "unit_price": 5.50},
        ])

        expected_df = spark.createDataFrame([
            {"order_id": "1", "quantity": 2, "unit_price": 10.00, "total": 20.00},
            {"order_id": "2", "quantity": 3, "unit_price": 5.50, "total": 16.50},
        ])

        result_df = calculate_order_total(input_df)

        assert_df_equality(result_df, expected_df, ignore_row_order=True)

    def test_filter_valid_orders(self, spark):
        """Test that invalid orders are filtered out."""
        input_df = spark.createDataFrame([
            {"order_id": "1", "quantity": 2, "status": "confirmed"},
            {"order_id": "2", "quantity": 0, "status": "confirmed"},   # Invalid: zero quantity
            {"order_id": "3", "quantity": 1, "status": "cancelled"},   # Invalid: cancelled
        ])

        result_df = filter_valid_orders(input_df)

        assert result_df.count() == 1
        assert result_df.collect()[0]["order_id"] == "1"

    def test_handles_null_values(self, spark):
        """Test graceful handling of null values."""
        input_df = spark.createDataFrame([
            {"order_id": "1", "customer_email": None},
            {"order_id": "2", "customer_email": "test@example.com"},
        ])

        result_df = enrich_customer_data(input_df)

        # Should not raise, should handle nulls gracefully
        assert result_df.filter("order_id = '1'").collect()[0]["email_domain"] is None
```
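
The transformations under test are not shown in this guide. One plausible shape, matching the column names in the fixtures above, is small pure functions over DataFrames so they compose with `.transform()`:

```python
from pyspark.sql import DataFrame
from pyspark.sql import functions as F

def calculate_order_total(df: DataFrame) -> DataFrame:
    return df.withColumn("total", F.col("quantity") * F.col("unit_price"))

def filter_valid_orders(df: DataFrame) -> DataFrame:
    return df.filter((F.col("quantity") > 0) & (F.col("status") != "cancelled"))

def enrich_customer_data(df: DataFrame) -> DataFrame:
    # split() on a NULL email yields a NULL domain, so nulls pass through safely
    return df.withColumn("email_domain", F.split(F.col("customer_email"), "@").getItem(1))
```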

### Integration Tests for Pipelines

```python
@pytest.fixture(scope="class")
def test_database(spark):
    """Set up isolated test database."""
    spark.sql("CREATE DATABASE IF NOT EXISTS test_data_platform")
    yield "test_data_platform"
    spark.sql("DROP DATABASE test_data_platform CASCADE")

class TestOrdersPipeline:

    def test_end_to_end_pipeline(self, spark, test_database):
        """Test full pipeline from raw to mart."""
        # Arrange: Create test data in raw layer
        raw_orders = spark.createDataFrame([
            {"id": "1", "customer_id": "C1", "amount": 100.0, "order_date": "2024-01-15"},
            {"id": "2", "customer_id": "C1", "amount": 50.0, "order_date": "2024-01-15"},
            {"id": "3", "customer_id": "C2", "amount": 200.0, "order_date": "2024-01-15"},
        ])
        raw_orders.write.mode("overwrite").saveAsTable(f"{test_database}.raw_orders")

        # Act: Run pipeline
        run_orders_pipeline(
            source_table=f"{test_database}.raw_orders",
            target_table=f"{test_database}.curated_orders",
            execution_date=date(2024, 1, 15),
        )

        # Assert: Verify output
        result = spark.table(f"{test_database}.curated_orders")

        assert result.count() == 3
        assert result.filter("customer_id = 'C1'").count() == 2

        # Verify data quality columns added
        assert "_loaded_at" in result.columns
        assert "_source_file" in result.columns

    def test_idempotency(self, spark, test_database):
        """Verify pipeline produces same result on re-run."""
        # Run pipeline twice
        for _ in range(2):
            run_orders_pipeline(
                source_table=f"{test_database}.raw_orders",
                target_table=f"{test_database}.curated_orders",
                execution_date=date(2024, 1, 15),
            )

        # Should have same count, not doubled
        result = spark.table(f"{test_database}.curated_orders")
        assert result.count() == 3
```

### Data Contract Tests

```python
def test_schema_compatibility():
    """Ensure schema changes don't break downstream consumers."""
    current_schema = spark.table("curated.orders").schema

    # Required columns that consumers depend on
    required_columns = {
        "order_id": StringType(),
        "customer_id": StringType(),
        "order_date": DateType(),
        "total_amount": DecimalType(12, 2),
    }

    for col_name, expected_type in required_columns.items():
        assert col_name in [f.name for f in current_schema.fields], \
            f"Required column {col_name} missing from schema"

        actual_type = current_schema[col_name].dataType
        assert actual_type == expected_type, \
            f"Column {col_name} type changed: {actual_type} != {expected_type}"
```

---

## Performance Optimization

### Partitioning Strategy

```python
# Good: Partition by query patterns
(orders_df
    .write
    .partitionBy("order_date")  # Most queries filter by date
    .option("maxRecordsPerFile", 1_000_000)
    .saveAsTable("curated.orders"))

# Bad: Over-partitioning creates small files
(orders_df
    .write
    .partitionBy("order_date", "customer_id", "product_id")  # Too many partitions!
    .saveAsTable("curated.orders"))

# Optimize file sizes for Delta
spark.sql("""
    OPTIMIZE curated.orders
    ZORDER BY (customer_id)  -- Co-locate data for common join key
""")
```

### Query Optimization

```python
# Good: Predicate pushdown works
orders = spark.read.table("curated.orders").filter("order_date = '2024-01-15'")

# Bad: Predicate pushdown blocked by UDF
@udf(returnType=BooleanType())
def is_recent(date):
    return date > datetime.now() - timedelta(days=7)

orders = spark.read.table("curated.orders").filter(is_recent(F.col("order_date")))  # Full scan!

# Good: Use native functions instead
orders = spark.read.table("curated.orders").filter(
    F.col("order_date") > F.current_date() - F.expr("INTERVAL 7 DAYS")
)
```

### Caching Strategy

```python
def process_with_caching(spark: SparkSession) -> None:
    """
    Cache intermediate results that are reused multiple times.
    """
    # Read once, use multiple times
    base_orders = (
        spark.read.table("curated.orders")
        .filter("order_date >= '2024-01-01'")
        .cache()  # Cache in memory
    )

    try:
        # Multiple aggregations on same data
        daily_totals = base_orders.groupBy("order_date").agg(F.sum("total_amount"))
        customer_totals = base_orders.groupBy("customer_id").agg(F.sum("total_amount"))
        product_totals = base_orders.groupBy("product_id").agg(F.sum("total_amount"))

        # Write all outputs
        daily_totals.write.mode("overwrite").saveAsTable("marts.daily_totals")
        customer_totals.write.mode("overwrite").saveAsTable("marts.customer_totals")
        product_totals.write.mode("overwrite").saveAsTable("marts.product_totals")
    finally:
        base_orders.unpersist()  # Always clean up
```

### Cost Management

```sql
-- Monitor compute costs by pipeline
SELECT
    pipeline_name,
    SUM(total_task_duration_ms) / 1000 / 60 as compute_minutes,
    SUM(bytes_spilled_to_disk) / 1e9 as disk_spill_gb,
    COUNT(*) as runs
FROM pipeline_metrics
WHERE run_date >= current_date - 7
GROUP BY pipeline_name
ORDER BY compute_minutes DESC;

-- Identify expensive queries
SELECT
    query_hash,
    AVG(execution_time_ms) as avg_time_ms,
    AVG(bytes_scanned) / 1e9 as avg_gb_scanned,
    COUNT(*) as executions
FROM query_history
WHERE timestamp >= current_date - 7
GROUP BY query_hash
ORDER BY avg_gb_scanned DESC
LIMIT 20;
```

---

## Security & Governance

### PII Handling

```python
import os

from cryptography.fernet import Fernet

class PIIHandler:
    """Handle PII data securely."""

    ENCRYPTION_KEY = os.environ["PII_ENCRYPTION_KEY"]

    PII_COLUMNS = {
        "email": "hash",      # One-way hash for matching
        "phone": "encrypt",   # Reversible encryption
        "ssn": "encrypt",
        "name": "tokenize",   # Replace with token
    }

    @classmethod
    def process_pii(cls, df: DataFrame) -> DataFrame:
        """Apply appropriate PII handling to each column."""
        for column, method in cls.PII_COLUMNS.items():
            if column in df.columns:
                if method == "hash":
                    df = df.withColumn(column, F.sha2(F.col(column), 256))
                elif method == "encrypt":
                    df = df.withColumn(column, cls._encrypt_udf(F.col(column)))
                elif method == "tokenize":
                    df = df.withColumn(column, cls._tokenize_udf(F.col(column)))
        return df

    @staticmethod
    @udf(returnType=StringType())
    def _encrypt_udf(value: str) -> Optional[str]:
        if value is None:
            return None
        cipher = Fernet(PIIHandler.ENCRYPTION_KEY.encode())
        return cipher.encrypt(value.encode()).decode()
```

### Row-Level Security

```sql
-- Create view with row-level security
CREATE OR REPLACE VIEW secure_views.orders AS
SELECT *
FROM curated.orders
WHERE
    -- Admins see all
    IS_ACCOUNT_GROUP_MEMBER('data_admins')
    OR
    -- Regional managers see their region only
    (IS_ACCOUNT_GROUP_MEMBER('regional_managers')
     AND region = CURRENT_USER_ATTRIBUTE('region'))
    OR
    -- Analysts see all rows; sensitive columns are masked separately
    (IS_ACCOUNT_GROUP_MEMBER('analysts'));
```
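
The row filter controls which rows analysts can read; the anonymization itself is column masking, which can be handled by an analyst-facing view with sensitive columns hashed or omitted. A sketch; the view and column names are assumptions:

```python
# Sketch: pseudonymize the join key and drop columns analysts should not see.
spark.sql("""
    CREATE OR REPLACE VIEW secure_views.orders_masked AS
    SELECT
        order_id,
        order_date,
        region,
        sha2(customer_email, 256) AS customer_email_hash,  -- pseudonymized join key
        total_amount
    FROM curated.orders
""")
```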

### Audit Logging

```python
def log_data_access(
    user: str,
    table: str,
    operation: str,
    row_count: int,
    filters: dict,
) -> None:
    """
    Log all data access for compliance and security.
    """
    audit_record = {
        "timestamp": datetime.utcnow().isoformat(),
        "user": user,
        "table": table,
        "operation": operation,
        "row_count": row_count,
        "filters": json.dumps(filters),
        "client_ip": get_client_ip(),
        "session_id": get_session_id(),
    }

    spark.createDataFrame([audit_record]).write.mode("append").saveAsTable("audit.data_access_log")
```

### Data Classification

```python
DATA_CLASSIFICATION = {
    "public": {
        "description": "Non-sensitive, can be shared externally",
        "retention_days": None,
        "encryption": False,
    },
    "internal": {
        "description": "Business data, internal use only",
        "retention_days": 365 * 7,
        "encryption": False,
    },
    "confidential": {
        "description": "Sensitive business data",
        "retention_days": 365 * 3,
        "encryption": True,
    },
    "restricted": {
        "description": "PII, financial, or regulated data",
        "retention_days": 365,  # Or as required by regulation
        "encryption": True,
        "access_logging": True,
        "masking_required": True,
    },
}
```
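
A sketch of turning the classification map into an enforced retention policy. It assumes each governed table is registered with its classification and a date partition column, and that the tables are Delta (so `DELETE` is supported); the registry below is illustrative:

```python
GOVERNED_TABLES = {
    "curated.orders": {"classification": "confidential", "partition_column": "order_date"},
    "raw.web_events": {"classification": "restricted", "partition_column": "event_date"},
}

def enforce_retention() -> None:
    for table, meta in GOVERNED_TABLES.items():
        retention_days = DATA_CLASSIFICATION[meta["classification"]]["retention_days"]
        if retention_days is None:
            continue  # e.g. "public": no expiry
        spark.sql(f"""
            DELETE FROM {table}
            WHERE {meta['partition_column']} < date_sub(current_date(), {retention_days})
        """)
```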

---

## Observability

### Pipeline Metrics

```python
@dataclass
class PipelineMetrics:
    pipeline_name: str
    run_id: str
    start_time: datetime
    end_time: datetime
    status: str  # "success" | "failed" | "skipped"
    records_read: int
    records_written: int
    bytes_processed: int
    error_message: Optional[str] = None

def emit_metrics(metrics: PipelineMetrics) -> None:
    """Send metrics to monitoring system."""
    # To Prometheus/StatsD
    statsd.gauge(f"pipeline.duration_seconds.{metrics.pipeline_name}",
                 (metrics.end_time - metrics.start_time).total_seconds())
    statsd.gauge(f"pipeline.records_written.{metrics.pipeline_name}",
                 metrics.records_written)

    # To data catalog/lineage
    spark.createDataFrame([asdict(metrics)]).write.mode("append").saveAsTable("metrics.pipeline_runs")
```

### Alerting Rules

```yaml
# alerts.yaml
alerts:
  - name: pipeline_failure
    condition: status == "failed"
    severity: critical
    channels: [pagerduty, slack]
    message: "Pipeline {pipeline_name} failed: {error_message}"

  - name: data_freshness_sla
    condition: freshness_hours > sla_hours
    severity: high
    channels: [slack]
    message: "Table {table} is {freshness_hours}h stale (SLA: {sla_hours}h)"

  - name: volume_anomaly
    condition: abs(z_score) > 3
    severity: warning
    channels: [slack]
    message: "Unusual volume in {table}: {record_count} records (expected: {expected})"

  - name: cost_spike
    condition: daily_cost > 1.5 * avg_daily_cost
    severity: warning
    channels: [slack]
    message: "Cost spike detected: ${daily_cost} (avg: ${avg_daily_cost})"
```
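
A minimal sketch of the dispatch side: look up a rule from `alerts.yaml`, format its message with the event's fields, and fan out to the configured channels. `send_to_channel` and the shape of `event` are assumptions:

```python
import yaml

def dispatch(event: dict, rule_name: str, path: str = "alerts.yaml") -> None:
    with open(path) as fh:
        rules = {rule["name"]: rule for rule in yaml.safe_load(fh)["alerts"]}
    rule = rules[rule_name]
    message = rule["message"].format(**event)  # fills {pipeline_name}, {table}, ...
    for channel in rule["channels"]:
        send_to_channel(channel, severity=rule["severity"], message=message)  # hypothetical sender
```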

---

## Definition of Done

A data pipeline is complete when:

### Functionality
- [ ] Pipeline produces correct output for all test cases
- [ ] Idempotency verified (re-run produces same result)
- [ ] Handles edge cases (nulls, empty batches, duplicates)
- [ ] Incremental logic works correctly
- [ ] Backfill capability tested

### Data Quality
- [ ] Schema documented and versioned
- [ ] Validation rules implemented
- [ ] Data quality checks pass
- [ ] Freshness SLA defined and monitored
- [ ] Anomaly detection configured

### Testing
- [ ] Unit tests for transformations (>80% coverage)
- [ ] Integration tests for end-to-end flow
- [ ] Data contract tests for schema
- [ ] Performance benchmarks documented

### Observability
- [ ] Logging implemented (start, end, errors, metrics)
- [ ] Metrics emitted to monitoring system
- [ ] Alerts configured for failures and SLA breaches
- [ ] Runbook/playbook documented

### Security & Compliance
- [ ] PII handled appropriately
- [ ] Access controls configured
- [ ] Audit logging enabled for sensitive data
- [ ] Retention policy applied

### Operations
- [ ] Pipeline registered in orchestrator
- [ ] Dependencies documented
- [ ] Recovery procedure tested
- [ ] Cost estimate documented

---

## Common Pitfalls

### 1. Non-Idempotent Pipelines

```python
# Bad: Appends every run, creates duplicates
df.write.mode("append").saveAsTable("target")

# Good: Delete-insert or merge for idempotency
spark.sql(f"DELETE FROM target WHERE date = '{execution_date}'")
df.write.mode("append").saveAsTable("target")
```

### 2. Ignoring Late-Arriving Data

```python
# Bad: Only process today's data
df.filter("event_date = current_date()")

# Good: Reprocess recent window for late arrivals
df.filter("event_date >= current_date() - INTERVAL 3 DAYS")
```

### 3. Schema Evolution Without Contracts

```python
# Bad: No schema enforcement
df.write.mode("overwrite").saveAsTable("output")

# Good: Enforce schema, fail on unexpected changes
df.write.option("mergeSchema", "false").mode("overwrite").saveAsTable("output")
```

### 4. Missing Partition Pruning

```sql
-- Bad: Filter on derived column prevents pruning
SELECT * FROM orders WHERE YEAR(order_date) = 2024

-- Good: Filter directly on partition column
SELECT * FROM orders WHERE order_date >= '2024-01-01' AND order_date < '2025-01-01'
```

### 5. Inadequate Testing

```python
# Bad: Only happy path
def test_pipeline():
    result = run_pipeline(sample_data)
    assert result.count() > 0

# Good: Test edge cases
def test_pipeline_handles_nulls(): ...
def test_pipeline_handles_duplicates(): ...
def test_pipeline_handles_empty_input(): ...
def test_pipeline_is_idempotent(): ...
```

---

## Resources

- [Delta Lake Documentation](https://docs.delta.io/)
- [Apache Spark Best Practices](https://spark.apache.org/docs/latest/sql-performance-tuning.html)
- [DBT Best Practices](https://docs.getdbt.com/guides/best-practices)
- [Great Expectations](https://docs.greatexpectations.io/)
- [Data Engineering Patterns](https://www.dedp.online/)
- [The Data Warehouse Toolkit (Kimball)](https://www.kimballgroup.com/data-warehouse-business-intelligence-resources/kimball-techniques/)