agentic-team-templates 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +280 -0
- package/bin/cli.js +5 -0
- package/package.json +47 -0
- package/src/index.js +521 -0
- package/templates/_shared/code-quality.md +162 -0
- package/templates/_shared/communication.md +114 -0
- package/templates/_shared/core-principles.md +62 -0
- package/templates/_shared/git-workflow.md +165 -0
- package/templates/_shared/security-fundamentals.md +173 -0
- package/templates/blockchain/.cursorrules/defi-patterns.md +520 -0
- package/templates/blockchain/.cursorrules/gas-optimization.md +339 -0
- package/templates/blockchain/.cursorrules/overview.md +130 -0
- package/templates/blockchain/.cursorrules/security.md +318 -0
- package/templates/blockchain/.cursorrules/smart-contracts.md +364 -0
- package/templates/blockchain/.cursorrules/testing.md +415 -0
- package/templates/blockchain/.cursorrules/web3-integration.md +538 -0
- package/templates/blockchain/CLAUDE.md +389 -0
- package/templates/cli-tools/.cursorrules/architecture.md +412 -0
- package/templates/cli-tools/.cursorrules/arguments.md +406 -0
- package/templates/cli-tools/.cursorrules/distribution.md +546 -0
- package/templates/cli-tools/.cursorrules/error-handling.md +455 -0
- package/templates/cli-tools/.cursorrules/overview.md +136 -0
- package/templates/cli-tools/.cursorrules/testing.md +537 -0
- package/templates/cli-tools/.cursorrules/user-experience.md +545 -0
- package/templates/cli-tools/CLAUDE.md +356 -0
- package/templates/data-engineering/.cursorrules/data-modeling.md +367 -0
- package/templates/data-engineering/.cursorrules/data-quality.md +455 -0
- package/templates/data-engineering/.cursorrules/overview.md +85 -0
- package/templates/data-engineering/.cursorrules/performance.md +339 -0
- package/templates/data-engineering/.cursorrules/pipeline-design.md +280 -0
- package/templates/data-engineering/.cursorrules/security.md +460 -0
- package/templates/data-engineering/.cursorrules/testing.md +452 -0
- package/templates/data-engineering/CLAUDE.md +974 -0
- package/templates/devops-sre/.cursorrules/capacity-planning.md +653 -0
- package/templates/devops-sre/.cursorrules/change-management.md +584 -0
- package/templates/devops-sre/.cursorrules/chaos-engineering.md +651 -0
- package/templates/devops-sre/.cursorrules/disaster-recovery.md +641 -0
- package/templates/devops-sre/.cursorrules/incident-management.md +565 -0
- package/templates/devops-sre/.cursorrules/observability.md +714 -0
- package/templates/devops-sre/.cursorrules/overview.md +230 -0
- package/templates/devops-sre/.cursorrules/postmortems.md +588 -0
- package/templates/devops-sre/.cursorrules/runbooks.md +760 -0
- package/templates/devops-sre/.cursorrules/slo-sli.md +617 -0
- package/templates/devops-sre/.cursorrules/toil-reduction.md +567 -0
- package/templates/devops-sre/CLAUDE.md +1007 -0
- package/templates/documentation/.cursorrules/adr.md +277 -0
- package/templates/documentation/.cursorrules/api-documentation.md +411 -0
- package/templates/documentation/.cursorrules/code-comments.md +253 -0
- package/templates/documentation/.cursorrules/maintenance.md +260 -0
- package/templates/documentation/.cursorrules/overview.md +82 -0
- package/templates/documentation/.cursorrules/readme-standards.md +306 -0
- package/templates/documentation/CLAUDE.md +120 -0
- package/templates/fullstack/.cursorrules/api-contracts.md +331 -0
- package/templates/fullstack/.cursorrules/architecture.md +298 -0
- package/templates/fullstack/.cursorrules/overview.md +109 -0
- package/templates/fullstack/.cursorrules/shared-types.md +348 -0
- package/templates/fullstack/.cursorrules/testing.md +386 -0
- package/templates/fullstack/CLAUDE.md +349 -0
- package/templates/ml-ai/.cursorrules/data-engineering.md +483 -0
- package/templates/ml-ai/.cursorrules/deployment.md +601 -0
- package/templates/ml-ai/.cursorrules/model-development.md +538 -0
- package/templates/ml-ai/.cursorrules/monitoring.md +658 -0
- package/templates/ml-ai/.cursorrules/overview.md +131 -0
- package/templates/ml-ai/.cursorrules/security.md +637 -0
- package/templates/ml-ai/.cursorrules/testing.md +678 -0
- package/templates/ml-ai/CLAUDE.md +1136 -0
- package/templates/mobile/.cursorrules/navigation.md +246 -0
- package/templates/mobile/.cursorrules/offline-first.md +302 -0
- package/templates/mobile/.cursorrules/overview.md +71 -0
- package/templates/mobile/.cursorrules/performance.md +345 -0
- package/templates/mobile/.cursorrules/testing.md +339 -0
- package/templates/mobile/CLAUDE.md +233 -0
- package/templates/platform-engineering/.cursorrules/ci-cd.md +778 -0
- package/templates/platform-engineering/.cursorrules/developer-experience.md +632 -0
- package/templates/platform-engineering/.cursorrules/infrastructure-as-code.md +600 -0
- package/templates/platform-engineering/.cursorrules/kubernetes.md +710 -0
- package/templates/platform-engineering/.cursorrules/observability.md +747 -0
- package/templates/platform-engineering/.cursorrules/overview.md +215 -0
- package/templates/platform-engineering/.cursorrules/security.md +855 -0
- package/templates/platform-engineering/.cursorrules/testing.md +878 -0
- package/templates/platform-engineering/CLAUDE.md +850 -0
- package/templates/utility-agent/.cursorrules/action-control.md +284 -0
- package/templates/utility-agent/.cursorrules/context-management.md +186 -0
- package/templates/utility-agent/.cursorrules/hallucination-prevention.md +253 -0
- package/templates/utility-agent/.cursorrules/overview.md +78 -0
- package/templates/utility-agent/.cursorrules/token-optimization.md +369 -0
- package/templates/utility-agent/CLAUDE.md +513 -0
- package/templates/web-backend/.cursorrules/api-design.md +255 -0
- package/templates/web-backend/.cursorrules/authentication.md +309 -0
- package/templates/web-backend/.cursorrules/database-patterns.md +298 -0
- package/templates/web-backend/.cursorrules/error-handling.md +366 -0
- package/templates/web-backend/.cursorrules/overview.md +69 -0
- package/templates/web-backend/.cursorrules/security.md +358 -0
- package/templates/web-backend/.cursorrules/testing.md +395 -0
- package/templates/web-backend/CLAUDE.md +366 -0
- package/templates/web-frontend/.cursorrules/accessibility.md +296 -0
- package/templates/web-frontend/.cursorrules/component-patterns.md +204 -0
- package/templates/web-frontend/.cursorrules/overview.md +72 -0
- package/templates/web-frontend/.cursorrules/performance.md +325 -0
- package/templates/web-frontend/.cursorrules/state-management.md +227 -0
- package/templates/web-frontend/.cursorrules/styling.md +271 -0
- package/templates/web-frontend/.cursorrules/testing.md +311 -0
- package/templates/web-frontend/CLAUDE.md +399 -0

@@ -0,0 +1,452 @@

# Data Pipeline Testing

Strategies for testing data pipelines effectively.

## Testing Pyramid

| Level | Scope | Speed | Purpose |
|-------|-------|-------|---------|
| **Unit** | Single transformation | Fast | Test logic in isolation |
| **Integration** | Pipeline end-to-end | Medium | Test data flow |
| **Contract** | Schema/interface | Fast | Prevent breaking changes |
| **Data Quality** | Production data | Slow | Validate real data |
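
The pyramid maps naturally onto pytest markers, so each level can run on its own cadence. A minimal sketch, assuming marker names that are illustrative and not part of this template:

```python
# Register one marker per pyramid level (e.g. in pyproject.toml) so slow suites
# can be excluded from fast local runs:
#
# [tool.pytest.ini_options]
# markers = [
#     "unit: fast, isolated transformation tests",
#     "integration: end-to-end pipeline tests",
#     "contract: schema compatibility tests",
#     "data_quality: checks that run against production data",
# ]

import pytest

@pytest.mark.integration  # hypothetical example test tagged with its pyramid level
def test_orders_pipeline_end_to_end():
    ...

# Select levels at run time:
#   pytest -m unit                        # fast feedback while developing
#   pytest -m "integration or contract"   # before merging
```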

## Unit Tests

Test individual transformations with small, controlled datasets.

### Setup

```python
import pytest
from pyspark.sql import SparkSession
from chispa import assert_df_equality
from decimal import Decimal

@pytest.fixture(scope="session")
def spark():
    """Create a Spark session for testing."""
    return (
        SparkSession.builder
        .master("local[*]")
        .appName("unit-tests")
        .config("spark.sql.shuffle.partitions", "1")  # Faster for small data
        .getOrCreate()
    )
```

### Testing Transformations

```python
class TestOrderTransformations:

    def test_calculate_order_total(self, spark):
        """Test order total calculation."""
        input_df = spark.createDataFrame([
            {"order_id": "1", "quantity": 2, "unit_price": Decimal("10.00")},
            {"order_id": "2", "quantity": 3, "unit_price": Decimal("5.50")},
        ])

        expected_df = spark.createDataFrame([
            {"order_id": "1", "quantity": 2, "unit_price": Decimal("10.00"), "total": Decimal("20.00")},
            {"order_id": "2", "quantity": 3, "unit_price": Decimal("5.50"), "total": Decimal("16.50")},
        ])

        result = calculate_order_total(input_df)

        assert_df_equality(result, expected_df, ignore_row_order=True)

    def test_filter_valid_orders(self, spark):
        """Test filtering of invalid orders."""
        input_df = spark.createDataFrame([
            {"order_id": "1", "quantity": 2, "status": "confirmed"},
            {"order_id": "2", "quantity": 0, "status": "confirmed"},  # Invalid
            {"order_id": "3", "quantity": 1, "status": "cancelled"},  # Invalid
        ])

        result = filter_valid_orders(input_df)

        assert result.count() == 1
        assert result.collect()[0]["order_id"] == "1"

    def test_handles_null_values(self, spark):
        """Test graceful null handling."""
        input_df = spark.createDataFrame([
            {"order_id": "1", "email": None},
            {"order_id": "2", "email": "test@example.com"},
        ])

        result = extract_email_domain(input_df)

        row1 = result.filter("order_id = '1'").collect()[0]
        row2 = result.filter("order_id = '2'").collect()[0]

        assert row1["email_domain"] is None
        assert row2["email_domain"] == "example.com"

    def test_handles_empty_dataframe(self, spark):
        """Test behavior with empty input."""
        empty_df = spark.createDataFrame([], schema="order_id STRING, quantity INT")

        result = process_orders(empty_df)

        assert result.count() == 0
```
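
For context, the transformation exercised above might look like the sketch below. This is only an illustration under the column names the tests assume, not the template's canonical implementation:

```python
from pyspark.sql import DataFrame, functions as F

def calculate_order_total(df: DataFrame) -> DataFrame:
    """Hypothetical transformation under test: derive a line total per order row."""
    return df.withColumn("total", F.col("quantity") * F.col("unit_price"))
```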

### Testing Aggregations

```python
def test_daily_aggregation(spark):
    """Test daily aggregation logic."""
    input_df = spark.createDataFrame([
        {"order_date": "2024-01-15", "amount": Decimal("100.00")},
        {"order_date": "2024-01-15", "amount": Decimal("50.00")},
        {"order_date": "2024-01-16", "amount": Decimal("200.00")},
    ])

    result = aggregate_daily_totals(input_df)

    jan15 = result.filter("order_date = '2024-01-15'").collect()[0]
    jan16 = result.filter("order_date = '2024-01-16'").collect()[0]

    assert jan15["total_amount"] == Decimal("150.00")
    assert jan15["order_count"] == 2
    assert jan16["total_amount"] == Decimal("200.00")
    assert jan16["order_count"] == 1
```

### Testing Edge Cases

```python
class TestEdgeCases:

    def test_duplicate_handling(self, spark):
        """Test deduplication logic."""
        input_df = spark.createDataFrame([
            {"order_id": "1", "amount": Decimal("100.00"), "updated_at": "2024-01-15 10:00:00"},
            {"order_id": "1", "amount": Decimal("150.00"), "updated_at": "2024-01-15 11:00:00"},  # Newer
        ])

        result = deduplicate_orders(input_df)

        assert result.count() == 1
        assert result.collect()[0]["amount"] == Decimal("150.00")

    def test_large_values(self, spark):
        """Test handling of large numeric values."""
        input_df = spark.createDataFrame([
            {"order_id": "1", "quantity": 999999, "unit_price": Decimal("99999.99")},
        ])

        result = calculate_order_total(input_df)

        # Should not overflow
        assert result.collect()[0]["total"] == Decimal("99999890000.01")

    def test_special_characters(self, spark):
        """Test handling of special characters in strings."""
        input_df = spark.createDataFrame([
            {"customer_name": "O'Brien"},
            {"customer_name": "Müller"},
            {"customer_name": "日本語"},
        ])

        result = normalize_names(input_df)

        assert result.count() == 3  # Should not fail
```

## Integration Tests

Test complete pipeline flows with realistic data.

### Test Database Setup

```python
import uuid

@pytest.fixture(scope="class")
def test_database(spark):
    """Create an isolated test database."""
    db_name = f"test_db_{uuid.uuid4().hex[:8]}"
    spark.sql(f"CREATE DATABASE {db_name}")
    yield db_name
    spark.sql(f"DROP DATABASE {db_name} CASCADE")
```

### End-to-End Pipeline Test

```python
from datetime import date

class TestOrdersPipeline:

    def test_end_to_end_flow(self, spark, test_database):
        """Test complete pipeline from raw to mart."""
        # Arrange: Create test data
        raw_orders = spark.createDataFrame([
            {"id": "1", "customer_id": "C1", "amount": 100.0, "order_date": "2024-01-15"},
            {"id": "2", "customer_id": "C1", "amount": 50.0, "order_date": "2024-01-15"},
            {"id": "3", "customer_id": "C2", "amount": 200.0, "order_date": "2024-01-15"},
        ])
        raw_orders.write.mode("overwrite").saveAsTable(f"{test_database}.raw_orders")

        # Act: Run pipeline
        run_orders_pipeline(
            source_table=f"{test_database}.raw_orders",
            target_table=f"{test_database}.curated_orders",
            execution_date=date(2024, 1, 15),
        )

        # Assert
        result = spark.table(f"{test_database}.curated_orders")

        assert result.count() == 3
        assert "_loaded_at" in result.columns  # Metadata added

    def test_idempotency(self, spark, test_database):
        """Pipeline produces same result on re-run."""
        # Run twice
        for _ in range(2):
            run_orders_pipeline(
                source_table=f"{test_database}.raw_orders",
                target_table=f"{test_database}.curated_orders",
                execution_date=date(2024, 1, 15),
            )

        result = spark.table(f"{test_database}.curated_orders")
        assert result.count() == 3  # Not doubled

    def test_incremental_processing(self, spark, test_database):
        """Incremental loads only process new data."""
        # Initial load
        run_orders_pipeline(execution_date=date(2024, 1, 15))

        # Add new data
        new_orders = spark.createDataFrame([
            {"id": "4", "customer_id": "C3", "amount": 300.0, "order_date": "2024-01-16"},
        ])
        new_orders.write.mode("append").saveAsTable(f"{test_database}.raw_orders")

        # Incremental load
        run_orders_pipeline(execution_date=date(2024, 1, 16))

        result = spark.table(f"{test_database}.curated_orders")
        assert result.count() == 4  # 3 original + 1 new
```

### Testing Error Conditions

```python
def test_handles_missing_source_gracefully(spark, test_database):
    """Pipeline fails gracefully when source is missing."""
    with pytest.raises(SourceNotFoundError):
        run_orders_pipeline(
            source_table=f"{test_database}.nonexistent_table",
            target_table=f"{test_database}.curated_orders",
        )

def test_handles_schema_mismatch(spark, test_database):
    """Pipeline fails on unexpected schema."""
    bad_schema_data = spark.createDataFrame([
        {"wrong_column": "value"},
    ])
    bad_schema_data.write.mode("overwrite").saveAsTable(f"{test_database}.raw_orders")

    with pytest.raises(SchemaValidationError):
        run_orders_pipeline(
            source_table=f"{test_database}.raw_orders",
            target_table=f"{test_database}.curated_orders",
        )
```
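
`SourceNotFoundError` and `SchemaValidationError` are assumed to be the pipeline's own exception types. A minimal sketch of where the pipeline might raise them; the helper name and checks below are illustrative, not part of this template:

```python
class SourceNotFoundError(Exception):
    """Raised when the configured source table does not exist."""

class SchemaValidationError(Exception):
    """Raised when the source schema does not match the expected contract."""

def validate_source(spark, source_table: str, required_columns: set[str]) -> None:
    # Hypothetical pre-flight checks run at the start of run_orders_pipeline().
    if not spark.catalog.tableExists(source_table):
        raise SourceNotFoundError(f"Source table not found: {source_table}")

    missing = required_columns - {f.name for f in spark.table(source_table).schema.fields}
    if missing:
        raise SchemaValidationError(f"Missing required columns: {missing}")
```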

## Contract Tests

Ensure schema compatibility between producers and consumers.

```python
from pyspark.sql.types import StringType, DateType, DecimalType

def test_schema_backward_compatible(spark):
    """Ensure current schema is backward compatible."""
    current_schema = spark.table("curated.orders").schema

    # Required columns that consumers depend on
    required_contract = {
        "order_id": StringType(),
        "customer_id": StringType(),
        "order_date": DateType(),
        "total_amount": DecimalType(12, 2),
    }

    for col_name, expected_type in required_contract.items():
        # Column must exist
        assert col_name in [f.name for f in current_schema.fields], \
            f"Breaking change: Required column '{col_name}' missing"

        # Type must match
        actual_type = current_schema[col_name].dataType
        assert actual_type == expected_type, \
            f"Breaking change: Column '{col_name}' type changed from {expected_type} to {actual_type}"

def test_no_accidental_column_removal(spark):
    """Ensure no columns were accidentally removed."""
    previous_columns = load_previous_schema("curated.orders")
    current_columns = {f.name for f in spark.table("curated.orders").schema.fields}

    removed = previous_columns - current_columns
    assert len(removed) == 0, f"Columns removed: {removed}"
```
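
`load_previous_schema` is left undefined above. One possible implementation compares against a schema snapshot committed alongside the code; the snapshot path and JSON layout here are assumptions for illustration:

```python
import json
from pathlib import Path

def load_previous_schema(table_name: str) -> set[str]:
    """Hypothetical helper: read column names from a committed schema snapshot.

    Assumes a file like schema_snapshots/<table_name>.json containing
    {"columns": ["order_id", ...]}, refreshed on each release.
    """
    snapshot_path = Path("schema_snapshots") / f"{table_name}.json"
    snapshot = json.loads(snapshot_path.read_text())
    return set(snapshot["columns"])
```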

## DBT Tests

### Model Tests

```yaml
# models/orders/schema.yml
version: 2

models:
  - name: orders
    description: Curated orders table
    columns:
      - name: order_id
        tests:
          - not_null
          - unique
      - name: customer_id
        tests:
          - not_null
          - relationships:
              to: ref('customers')
              field: customer_id
      - name: total_amount
        tests:
          - not_null
          - dbt_utils.accepted_range:
              min_value: 0
```

### Custom Tests

```sql
-- tests/orders_total_matches_line_items.sql
SELECT order_id
FROM {{ ref('orders') }} o
JOIN (
    SELECT order_id, SUM(quantity * unit_price) as calc_total
    FROM {{ ref('order_items') }}
    GROUP BY order_id
) li USING (order_id)
WHERE ABS(o.total_amount - li.calc_total) > 0.01
```

## Test Utilities

### Test Data Factories

```python
from dataclasses import dataclass
from faker import Faker
from pyspark.sql import DataFrame, SparkSession

fake = Faker()

@dataclass
class OrderFactory:
    """Generate test order data."""

    @staticmethod
    def create(spark: SparkSession, count: int = 10, **overrides) -> DataFrame:
        data = [
            {
                "order_id": overrides.get("order_id", f"ORD-{i}"),
                "customer_id": overrides.get("customer_id", f"CUST-{fake.random_int(1, 100)}"),
                "order_date": overrides.get("order_date", fake.date_between("-30d", "today").isoformat()),
                "total_amount": overrides.get("total_amount", float(fake.pydecimal(min_value=10, max_value=1000))),
                "status": overrides.get("status", fake.random_element(["pending", "confirmed", "shipped"])),
            }
            for i in range(count)
        ]
        return spark.createDataFrame(data)

# Usage
orders = OrderFactory.create(spark, count=100, status="confirmed")
```

### Assertion Helpers

```python
from pyspark.sql import DataFrame, functions as F

def assert_row_count(df: DataFrame, expected: int, message: str = ""):
    """Assert DataFrame has expected row count."""
    actual = df.count()
    assert actual == expected, f"{message}: Expected {expected} rows, got {actual}"

def assert_no_nulls(df: DataFrame, columns: list[str]):
    """Assert no null values in specified columns."""
    for col in columns:
        null_count = df.filter(F.col(col).isNull()).count()
        assert null_count == 0, f"Found {null_count} nulls in column '{col}'"

def assert_no_duplicates(df: DataFrame, key_columns: list[str]):
    """Assert no duplicate keys."""
    dup_count = df.groupBy(key_columns).count().filter("count > 1").count()
    assert dup_count == 0, f"Found {dup_count} duplicate keys on {key_columns}"
```
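
These helpers compose naturally inside pipeline tests. A short usage sketch; the table and key names are only examples:

```python
def test_curated_orders_quality(spark, test_database):
    # Hypothetical check on the output of an earlier pipeline run.
    curated = spark.table(f"{test_database}.curated_orders")

    assert_row_count(curated, 3, "curated_orders after initial load")
    assert_no_nulls(curated, ["order_id", "customer_id", "order_date"])
    assert_no_duplicates(curated, ["order_id"])
```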

## Best Practices

### Test Behavior, Not Implementation

```python
# Bad: Testing implementation details
def test_uses_left_join():
    # Don't test HOW it's done
    assert "LEFT JOIN" in get_query_plan()

# Good: Testing behavior
def test_preserves_all_orders():
    # Test WHAT it does
    result = join_orders_with_customers(orders, customers)
    assert result.count() == orders.count()
```

### One Assertion Per Concept

```python
# Good: Focused tests
def test_filters_cancelled_orders():
    result = filter_orders(orders_with_cancelled)
    assert result.filter("status = 'cancelled'").count() == 0

def test_preserves_confirmed_orders():
    result = filter_orders(orders_with_confirmed)
    assert result.filter("status = 'confirmed'").count() == confirmed_count
```

### Use Descriptive Names

```python
# Good: Clear what's being tested
def test_calculate_total_handles_zero_quantity(): ...
def test_calculate_total_handles_negative_discount(): ...
def test_calculate_total_rounds_to_two_decimals(): ...

# Bad: Vague names
def test_calculate_total(): ...
def test_calculate_total_2(): ...
```

### Isolate Tests

```python
# Good: Each test sets up its own data
def test_aggregation(spark):
    test_data = spark.createDataFrame([...])  # Local data
    result = aggregate(test_data)
    assert ...

# Bad: Tests depend on shared state
shared_df = None

def test_step_1():
    global shared_df
    shared_df = process_step_1(data)

def test_step_2():
    # Depends on test_step_1 running first!
    result = process_step_2(shared_df)
```