agentic-team-templates 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +280 -0
- package/bin/cli.js +5 -0
- package/package.json +47 -0
- package/src/index.js +521 -0
- package/templates/_shared/code-quality.md +162 -0
- package/templates/_shared/communication.md +114 -0
- package/templates/_shared/core-principles.md +62 -0
- package/templates/_shared/git-workflow.md +165 -0
- package/templates/_shared/security-fundamentals.md +173 -0
- package/templates/blockchain/.cursorrules/defi-patterns.md +520 -0
- package/templates/blockchain/.cursorrules/gas-optimization.md +339 -0
- package/templates/blockchain/.cursorrules/overview.md +130 -0
- package/templates/blockchain/.cursorrules/security.md +318 -0
- package/templates/blockchain/.cursorrules/smart-contracts.md +364 -0
- package/templates/blockchain/.cursorrules/testing.md +415 -0
- package/templates/blockchain/.cursorrules/web3-integration.md +538 -0
- package/templates/blockchain/CLAUDE.md +389 -0
- package/templates/cli-tools/.cursorrules/architecture.md +412 -0
- package/templates/cli-tools/.cursorrules/arguments.md +406 -0
- package/templates/cli-tools/.cursorrules/distribution.md +546 -0
- package/templates/cli-tools/.cursorrules/error-handling.md +455 -0
- package/templates/cli-tools/.cursorrules/overview.md +136 -0
- package/templates/cli-tools/.cursorrules/testing.md +537 -0
- package/templates/cli-tools/.cursorrules/user-experience.md +545 -0
- package/templates/cli-tools/CLAUDE.md +356 -0
- package/templates/data-engineering/.cursorrules/data-modeling.md +367 -0
- package/templates/data-engineering/.cursorrules/data-quality.md +455 -0
- package/templates/data-engineering/.cursorrules/overview.md +85 -0
- package/templates/data-engineering/.cursorrules/performance.md +339 -0
- package/templates/data-engineering/.cursorrules/pipeline-design.md +280 -0
- package/templates/data-engineering/.cursorrules/security.md +460 -0
- package/templates/data-engineering/.cursorrules/testing.md +452 -0
- package/templates/data-engineering/CLAUDE.md +974 -0
- package/templates/devops-sre/.cursorrules/capacity-planning.md +653 -0
- package/templates/devops-sre/.cursorrules/change-management.md +584 -0
- package/templates/devops-sre/.cursorrules/chaos-engineering.md +651 -0
- package/templates/devops-sre/.cursorrules/disaster-recovery.md +641 -0
- package/templates/devops-sre/.cursorrules/incident-management.md +565 -0
- package/templates/devops-sre/.cursorrules/observability.md +714 -0
- package/templates/devops-sre/.cursorrules/overview.md +230 -0
- package/templates/devops-sre/.cursorrules/postmortems.md +588 -0
- package/templates/devops-sre/.cursorrules/runbooks.md +760 -0
- package/templates/devops-sre/.cursorrules/slo-sli.md +617 -0
- package/templates/devops-sre/.cursorrules/toil-reduction.md +567 -0
- package/templates/devops-sre/CLAUDE.md +1007 -0
- package/templates/documentation/.cursorrules/adr.md +277 -0
- package/templates/documentation/.cursorrules/api-documentation.md +411 -0
- package/templates/documentation/.cursorrules/code-comments.md +253 -0
- package/templates/documentation/.cursorrules/maintenance.md +260 -0
- package/templates/documentation/.cursorrules/overview.md +82 -0
- package/templates/documentation/.cursorrules/readme-standards.md +306 -0
- package/templates/documentation/CLAUDE.md +120 -0
- package/templates/fullstack/.cursorrules/api-contracts.md +331 -0
- package/templates/fullstack/.cursorrules/architecture.md +298 -0
- package/templates/fullstack/.cursorrules/overview.md +109 -0
- package/templates/fullstack/.cursorrules/shared-types.md +348 -0
- package/templates/fullstack/.cursorrules/testing.md +386 -0
- package/templates/fullstack/CLAUDE.md +349 -0
- package/templates/ml-ai/.cursorrules/data-engineering.md +483 -0
- package/templates/ml-ai/.cursorrules/deployment.md +601 -0
- package/templates/ml-ai/.cursorrules/model-development.md +538 -0
- package/templates/ml-ai/.cursorrules/monitoring.md +658 -0
- package/templates/ml-ai/.cursorrules/overview.md +131 -0
- package/templates/ml-ai/.cursorrules/security.md +637 -0
- package/templates/ml-ai/.cursorrules/testing.md +678 -0
- package/templates/ml-ai/CLAUDE.md +1136 -0
- package/templates/mobile/.cursorrules/navigation.md +246 -0
- package/templates/mobile/.cursorrules/offline-first.md +302 -0
- package/templates/mobile/.cursorrules/overview.md +71 -0
- package/templates/mobile/.cursorrules/performance.md +345 -0
- package/templates/mobile/.cursorrules/testing.md +339 -0
- package/templates/mobile/CLAUDE.md +233 -0
- package/templates/platform-engineering/.cursorrules/ci-cd.md +778 -0
- package/templates/platform-engineering/.cursorrules/developer-experience.md +632 -0
- package/templates/platform-engineering/.cursorrules/infrastructure-as-code.md +600 -0
- package/templates/platform-engineering/.cursorrules/kubernetes.md +710 -0
- package/templates/platform-engineering/.cursorrules/observability.md +747 -0
- package/templates/platform-engineering/.cursorrules/overview.md +215 -0
- package/templates/platform-engineering/.cursorrules/security.md +855 -0
- package/templates/platform-engineering/.cursorrules/testing.md +878 -0
- package/templates/platform-engineering/CLAUDE.md +850 -0
- package/templates/utility-agent/.cursorrules/action-control.md +284 -0
- package/templates/utility-agent/.cursorrules/context-management.md +186 -0
- package/templates/utility-agent/.cursorrules/hallucination-prevention.md +253 -0
- package/templates/utility-agent/.cursorrules/overview.md +78 -0
- package/templates/utility-agent/.cursorrules/token-optimization.md +369 -0
- package/templates/utility-agent/CLAUDE.md +513 -0
- package/templates/web-backend/.cursorrules/api-design.md +255 -0
- package/templates/web-backend/.cursorrules/authentication.md +309 -0
- package/templates/web-backend/.cursorrules/database-patterns.md +298 -0
- package/templates/web-backend/.cursorrules/error-handling.md +366 -0
- package/templates/web-backend/.cursorrules/overview.md +69 -0
- package/templates/web-backend/.cursorrules/security.md +358 -0
- package/templates/web-backend/.cursorrules/testing.md +395 -0
- package/templates/web-backend/CLAUDE.md +366 -0
- package/templates/web-frontend/.cursorrules/accessibility.md +296 -0
- package/templates/web-frontend/.cursorrules/component-patterns.md +204 -0
- package/templates/web-frontend/.cursorrules/overview.md +72 -0
- package/templates/web-frontend/.cursorrules/performance.md +325 -0
- package/templates/web-frontend/.cursorrules/state-management.md +227 -0
- package/templates/web-frontend/.cursorrules/styling.md +271 -0
- package/templates/web-frontend/.cursorrules/testing.md +311 -0
- package/templates/web-frontend/CLAUDE.md +399 -0
package/templates/ml-ai/.cursorrules/data-engineering.md

@@ -0,0 +1,483 @@

# Data Engineering for ML

Guidelines for data pipelines, validation, feature engineering, and ensuring data quality throughout the ML lifecycle.

## Data Validation

### Schema Validation

Define explicit schemas for all data:

```python
import pandas as pd
import pandera as pa
from pandera.typing import Series, DataFrame

class TrainingDataSchema(pa.DataFrameModel):
    """Schema for training data validation."""

    user_id: Series[str] = pa.Field(nullable=False)
    timestamp: Series[pa.DateTime] = pa.Field(nullable=False)
    feature_numeric: Series[float] = pa.Field(ge=0, le=1)
    feature_categorical: Series[str] = pa.Field(isin=["A", "B", "C"])
    label: Series[int] = pa.Field(isin=[0, 1])

    class Config:
        strict = True  # Reject extra columns
        coerce = True  # Auto-convert types

@pa.check_types
def load_training_data(path: str) -> DataFrame[TrainingDataSchema]:
    """Load and validate training data."""
    df = pd.read_parquet(path)
    return df  # Automatically validated against TrainingDataSchema
```
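
Validation failures surface at the call site; a minimal handling sketch (the path is illustrative, not part of the template):

```python
# Hypothetical usage sketch: pandera raises SchemaError when a check fails.
import pandera as pa

try:
    df = load_training_data("data/train.parquet")  # illustrative path
except pa.errors.SchemaError as err:
    # failure_cases is a DataFrame describing the offending rows/checks
    print(err.failure_cases)
    raise
```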

### Data Quality Checks

Use Great Expectations for comprehensive quality checks:

```python
from great_expectations.core import ExpectationSuite, ExpectationConfiguration

def create_quality_suite() -> ExpectationSuite:
    suite = ExpectationSuite("training_data_quality")

    # Completeness
    suite.add_expectation(
        ExpectationConfiguration(
            expectation_type="expect_column_values_to_not_be_null",
            kwargs={"column": "user_id"},
        )
    )

    # Uniqueness
    suite.add_expectation(
        ExpectationConfiguration(
            expectation_type="expect_column_values_to_be_unique",
            kwargs={"column": "transaction_id"},
        )
    )

    # Freshness
    suite.add_expectation(
        ExpectationConfiguration(
            expectation_type="expect_column_max_to_be_between",
            kwargs={
                "column": "timestamp",
                "min_value": "{{ yesterday }}",  # resolved by the pipeline's templating
                "parse_strings_as_datetimes": True,
            },
        )
    )

    # Distribution stability
    suite.add_expectation(
        ExpectationConfiguration(
            expectation_type="expect_column_mean_to_be_between",
            kwargs={"column": "amount", "min_value": 90, "max_value": 110},
        )
    )

    # Referential integrity (VALID_COUNTRIES is defined elsewhere)
    suite.add_expectation(
        ExpectationConfiguration(
            expectation_type="expect_column_values_to_be_in_set",
            kwargs={"column": "country_code", "value_set": VALID_COUNTRIES},
        )
    )

    return suite
```

### Validation Pipeline Integration

```python
import great_expectations as ge
import pandas as pd
from prefect import flow, task

@task
def validate_data(df: pd.DataFrame, suite_name: str) -> "ValidationResult":
    """Validate data against expectations."""
    context = ge.get_context()
    suite = context.get_expectation_suite(suite_name)

    result = context.run_validation_operator(
        "action_list_operator",
        assets_to_validate=[df],
        expectation_suite=suite,
    )

    if not result.success:
        failed_expectations = [
            exp for exp in result.results if not exp.success
        ]
        raise DataQualityError(f"Validation failed: {failed_expectations}")

    return result

@flow
def data_ingestion_pipeline(source: str):
    # extract_data / transform_data / load_data are project-specific tasks
    raw_data = extract_data(source)
    validate_data(raw_data, "raw_data_quality")

    processed_data = transform_data(raw_data)
    validate_data(processed_data, "processed_data_quality")

    load_data(processed_data)
```

## Feature Engineering

### Feature Store Integration

```python
from datetime import timedelta

import pandas as pd
from feast import FeatureStore, Entity, FeatureView, Field
from feast.types import Float32, Int64

# Define entities
user = Entity(name="user", join_keys=["user_id"], description="User entity")

# Define feature view (user_behavior_source is declared elsewhere in the feature repo)
user_behavior_features = FeatureView(
    name="user_behavior_features",
    entities=[user],
    schema=[
        Field(name="session_count_7d", dtype=Int64),
        Field(name="avg_session_duration_7d", dtype=Float32),
        Field(name="purchase_count_30d", dtype=Int64),
        Field(name="total_spend_30d", dtype=Float32),
    ],
    online=True,  # Enable online serving
    source=user_behavior_source,
    ttl=timedelta(days=1),
    tags={"team": "ml", "feature_group": "user_behavior"},
)

# Retrieve features for training
def get_training_features(entity_df: pd.DataFrame) -> pd.DataFrame:
    """Get historical features for training."""
    store = FeatureStore(repo_path="feature_repo/")

    training_df = store.get_historical_features(
        entity_df=entity_df,
        features=[
            "user_behavior_features:session_count_7d",
            "user_behavior_features:avg_session_duration_7d",
            "user_behavior_features:purchase_count_30d",
            "user_behavior_features:total_spend_30d",
        ],
    ).to_df()

    return training_df

# Retrieve features for online inference
def get_online_features(user_ids: list[str]) -> dict:
    """Get features for real-time inference."""
    store = FeatureStore(repo_path="feature_repo/")

    entity_rows = [{"user_id": uid} for uid in user_ids]

    features = store.get_online_features(
        features=[
            "user_behavior_features:session_count_7d",
            "user_behavior_features:avg_session_duration_7d",
        ],
        entity_rows=entity_rows,
    ).to_dict()

    return features
```

### Feature Transformations

Ensure identical transforms for training and serving:

```python
import joblib
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder

class FeatureTransformer:
    """Serializable feature transformer for training/serving parity."""

    def __init__(self, config: "FeatureConfig"):
        # FeatureConfig is a project config object listing numeric_cols / categorical_cols
        self.config = config
        self.numeric_transformer = StandardScaler()
        self.categorical_encoders: dict[str, LabelEncoder] = {}
        self._fitted = False

    def fit(self, df: pd.DataFrame) -> "FeatureTransformer":
        """Fit transformer on training data."""
        # Fit numeric scaler
        if self.config.numeric_cols:
            self.numeric_transformer.fit(df[self.config.numeric_cols])

        # Fit categorical encoders; reserve the __UNKNOWN__ sentinel so
        # unseen serving-time values can be encoded instead of raising
        for col in self.config.categorical_cols:
            encoder = LabelEncoder()
            values = df[col].fillna("__MISSING__")
            encoder.fit(pd.concat([values, pd.Series(["__UNKNOWN__"])]))
            self.categorical_encoders[col] = encoder

        self._fitted = True
        return self

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Transform features using fitted parameters."""
        if not self._fitted:
            raise ValueError("Transformer not fitted")

        result = df.copy()

        # Transform numeric
        if self.config.numeric_cols:
            result[self.config.numeric_cols] = self.numeric_transformer.transform(
                result[self.config.numeric_cols]
            )

        # Transform categorical
        for col, encoder in self.categorical_encoders.items():
            # Map unseen categories to the reserved sentinel
            result[col] = result[col].fillna("__MISSING__")
            result[col] = result[col].apply(
                lambda x: x if x in encoder.classes_ else "__UNKNOWN__"
            )
            result[col] = encoder.transform(result[col])

        return result

    def fit_transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Fit and transform in one step."""
        return self.fit(df).transform(df)

    def save(self, path: str) -> None:
        """Serialize transformer."""
        joblib.dump({
            "config": self.config,
            "numeric_transformer": self.numeric_transformer,
            "categorical_encoders": self.categorical_encoders,
        }, path)

    @classmethod
    def load(cls, path: str) -> "FeatureTransformer":
        """Load serialized transformer."""
        data = joblib.load(path)
        transformer = cls(data["config"])
        transformer.numeric_transformer = data["numeric_transformer"]
        transformer.categorical_encoders = data["categorical_encoders"]
        transformer._fitted = True
        return transformer
```
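
A minimal usage sketch of the parity workflow (paths and `config` contents are illustrative): fit once on training data, serialize the artifact, and load that same artifact in the serving process instead of re-fitting.

```python
# Hypothetical sketch: one fitted artifact shared by training and serving.
import pandas as pd

train_df = pd.read_parquet("data/features/train.parquet")  # illustrative path

transformer = FeatureTransformer(config)  # config lists numeric_cols / categorical_cols
X_train = transformer.fit_transform(train_df)
transformer.save("artifacts/feature_transformer.joblib")

# Serving process: load the identical transformer; never re-fit on serving data.
serving_transformer = FeatureTransformer.load("artifacts/feature_transformer.joblib")
X_online = serving_transformer.transform(request_df)  # request_df: incoming feature rows
```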

## Data Versioning

### DVC for Data Version Control

```yaml
# dvc.yaml
stages:
  prepare_data:
    cmd: python src/data/prepare.py
    deps:
      - src/data/prepare.py
      - data/raw/
    outs:
      - data/processed/
    params:
      - prepare.split_ratio
      - prepare.random_seed

  extract_features:
    cmd: python src/features/extract.py
    deps:
      - src/features/extract.py
      - data/processed/
    outs:
      - data/features/
    params:
      - features.window_size
      - features.aggregations
```

```bash
# Track data versions
dvc add data/raw/dataset_v1.parquet
git add data/raw/dataset_v1.parquet.dvc
git commit -m "data: add dataset v1"

# Switch between versions
git checkout v1.0
dvc checkout
```
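
Versioned data can also be read programmatically without switching the working tree; a sketch using `dvc.api` (assuming the `v1.0` tag from the commands above exists):

```python
# Hypothetical sketch: stream a specific data version straight from the DVC remote.
import dvc.api
import pandas as pd

with dvc.api.open(
    "data/raw/dataset_v1.parquet",
    rev="v1.0",  # any Git tag, branch, or commit
    mode="rb",   # binary mode for parquet
) as f:
    df = pd.read_parquet(f)
```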

### Data Lineage Tracking

```python
import hashlib
from dataclasses import dataclass
from datetime import datetime
from typing import Optional

import pandas as pd

@dataclass
class DataLineage:
    """Track data provenance."""
    source: str
    version: str
    created_at: datetime
    row_count: int
    column_count: int
    checksum: str
    schema_version: str
    transformations: list[str]
    parent_lineage: Optional["DataLineage"] = None

    @classmethod
    def from_dataframe(
        cls,
        df: pd.DataFrame,
        source: str,
        version: str,
        transformations: Optional[list[str]] = None,
        parent: Optional["DataLineage"] = None,
    ) -> "DataLineage":
        """Create lineage record from DataFrame."""
        checksum = hashlib.sha256(
            pd.util.hash_pandas_object(df).values
        ).hexdigest()

        return cls(
            source=source,
            version=version,
            created_at=datetime.utcnow(),
            row_count=len(df),
            column_count=len(df.columns),
            checksum=checksum,
            schema_version=get_schema_version(df),  # project helper
            transformations=transformations or [],
            parent_lineage=parent,
        )

    def to_dict(self) -> dict:
        """Serialize for logging."""
        return {
            "source": self.source,
            "version": self.version,
            "created_at": self.created_at.isoformat(),
            "row_count": self.row_count,
            "column_count": self.column_count,
            "checksum": self.checksum,
            "schema_version": self.schema_version,
            "transformations": self.transformations,
            "parent_checksum": self.parent_lineage.checksum if self.parent_lineage else None,
        }
```
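
A brief usage sketch, chaining a child record to its parent so full provenance is recoverable from logs (the DataFrames and stage names are illustrative):

```python
# Hypothetical sketch: record lineage at each pipeline stage.
import logging

logger = logging.getLogger(__name__)

raw_lineage = DataLineage.from_dataframe(
    raw_df, source="s3://raw/events", version="2024-01-01"
)
processed_lineage = DataLineage.from_dataframe(
    processed_df,
    source="pipeline:prepare_data",
    version="2024-01-01",
    transformations=["dedupe", "normalize_amounts"],
    parent=raw_lineage,  # links back via parent_checksum
)
logger.info("data lineage: %s", processed_lineage.to_dict())
```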

## Training/Serving Skew Prevention

### Common Causes

| Skew Type | Cause | Prevention |
|-----------|-------|------------|
| Data Processing | Different preprocessing code | Serialize transformers |
| Feature Computation | Time-dependent features computed differently | Use feature store timestamps |
| Data Distribution | Training on old data, serving on new | Monitor drift continuously |
| Missing Values | Different imputation strategies | Document and version imputation |
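
The "monitor drift continuously" prevention can be made concrete with a per-feature two-sample test; a minimal sketch using `scipy` (threshold and column names are illustrative):

```python
# Hypothetical drift check: compare serving data against a training reference.
import pandas as pd
from scipy.stats import ks_2samp

def check_drift(
    reference: pd.DataFrame,
    current: pd.DataFrame,
    columns: list[str],
    p_threshold: float = 0.01,
) -> dict[str, bool]:
    """Flag columns whose distributions differ (Kolmogorov-Smirnov test)."""
    drifted = {}
    for col in columns:
        _, p_value = ks_2samp(reference[col].dropna(), current[col].dropna())
        drifted[col] = p_value < p_threshold  # small p-value => likely drift
    return drifted

# e.g. check_drift(train_df, serving_df, ["amount", "session_count_7d"])
```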

### Prevention Strategies

```python
import joblib
import pandas as pd

# 1. Single source of truth for transforms
class FeaturePipeline:
    """Unified pipeline for training and serving."""

    def __init__(self, config_path: str):
        self.config = load_config(config_path)  # project helper
        self.transformer = None

    def fit(self, df: pd.DataFrame) -> None:
        """Fit on training data."""
        self.transformer = FeatureTransformer(self.config)
        self.transformer.fit(df)

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Transform for both training and serving."""
        return self.transformer.transform(df)

    def save(self, path: str) -> None:
        """Save entire pipeline."""
        joblib.dump({
            "config": self.config,
            "transformer": self.transformer,
        }, path)

    @classmethod
    def load(cls, path: str) -> "FeaturePipeline":
        """Load a saved pipeline (mirrors save)."""
        data = joblib.load(path)
        pipeline = cls.__new__(cls)  # bypass __init__: config comes from the artifact
        pipeline.config = data["config"]
        pipeline.transformer = data["transformer"]
        return pipeline

# 2. Test for skew
def test_training_serving_parity():
    """Verify training and serving produce identical features."""
    training_pipeline = FeaturePipeline.load("training_pipeline.pkl")
    serving_pipeline = FeaturePipeline.load("serving_pipeline.pkl")

    test_data = load_test_data()

    training_features = training_pipeline.transform(test_data)
    serving_features = serving_pipeline.transform(test_data)

    pd.testing.assert_frame_equal(training_features, serving_features)
```

## Data Pipeline Patterns

### Idempotent Pipelines

```python
@task
def process_partition(date: str, force: bool = False) -> str:
    """Process a single date partition idempotently."""
    # file_exists / get_checksum / rename / load_raw_data / transform
    # are storage and transform helpers defined elsewhere
    output_path = f"s3://processed/{date}/data.parquet"

    # Skip work if the partition was already produced (unless forced)
    if not force and file_exists(output_path):
        checksum = get_checksum(output_path)
        logger.info(f"Partition {date} already exists: {checksum}")
        return output_path

    # Process
    raw_data = load_raw_data(date)
    processed = transform(raw_data)

    # Write atomically (write to temp, then rename)
    temp_path = f"{output_path}.tmp"
    processed.to_parquet(temp_path)
    rename(temp_path, output_path)

    return output_path
```

### Backfill Strategy

```python
import pandas as pd
from more_itertools import chunked  # or any equivalent batching helper
from prefect import flow

@flow
def backfill_features(start_date: str, end_date: str, parallelism: int = 4):
    """Backfill features for a date range."""
    dates = pd.date_range(start_date, end_date, freq="D")

    # Process in parallel batches
    for batch in chunked(dates, parallelism):
        futures = [
            process_partition.submit(date.strftime("%Y-%m-%d"))
            for date in batch
        ]

        # Wait for batch to complete
        results = [f.result() for f in futures]

        # Validate results
        for date, result in zip(batch, results):
            validate_partition(result)
```

## Best Practices

### Do

- Define explicit schemas for all data
- Validate data at pipeline boundaries
- Version datasets alongside code
- Serialize feature transformers
- Test for training/serving skew
- Monitor data distributions continuously

### Don't

- Trust external data without validation
- Hardcode feature engineering parameters
- Use different preprocessing in training vs serving
- Ignore data freshness requirements
- Skip data quality checks in production
- Assume data distributions are stable