aigroup-workflow 2.2.1 → 2.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/commands/fix-build.md +10 -5
- package/.claude/commands/init-project.md +13 -8
- package/.claude/commands/plan.md +15 -8
- package/.claude/commands/review.md +12 -6
- package/.claude/commands/tdd.md +11 -5
- package/.claude/commands/workflow-start.md +20 -11
- package/.claude/settings.json +28 -0
- package/.codex/agents/architect.toml +207 -0
- package/.codex/agents/build-error-resolver.toml +110 -0
- package/.codex/agents/code-reviewer.toml +233 -0
- package/.codex/agents/doc-updater.toml +103 -0
- package/.codex/agents/e2e-runner.toml +103 -0
- package/.codex/agents/get-current-datetime.toml +23 -0
- package/.codex/agents/init-architect.toml +181 -0
- package/.codex/agents/planner.toml +208 -0
- package/.codex/agents/refactor-cleaner.toml +81 -0
- package/.codex/agents/rust-reviewer.toml +90 -0
- package/.codex/agents/security-reviewer.toml +104 -0
- package/.codex/agents/tdd-guide.toml +87 -0
- package/AGENTS.md +2 -2
- package/CLAUDE.md +23 -1
- package/LICENSE +20 -20
- package/README.md +333 -333
- package/agents/a11y-architect.md +141 -141
- package/agents/architect.md +211 -211
- package/agents/build-error-resolver.md +114 -114
- package/agents/chief-of-staff.md +151 -151
- package/agents/code-architect.md +71 -71
- package/agents/code-explorer.md +69 -69
- package/agents/code-reviewer.md +237 -237
- package/agents/code-simplifier.md +47 -47
- package/agents/comment-analyzer.md +45 -45
- package/agents/conversation-analyzer.md +52 -52
- package/agents/cpp-build-resolver.md +90 -90
- package/agents/cpp-reviewer.md +72 -72
- package/agents/csharp-reviewer.md +101 -101
- package/agents/dart-build-resolver.md +201 -201
- package/agents/database-reviewer.md +91 -91
- package/agents/doc-updater.md +107 -107
- package/agents/docs-lookup.md +68 -68
- package/agents/e2e-runner.md +107 -107
- package/agents/flutter-reviewer.md +243 -243
- package/agents/gan-evaluator.md +209 -209
- package/agents/gan-generator.md +131 -131
- package/agents/gan-planner.md +99 -99
- package/agents/get-current-datetime.md +26 -26
- package/agents/go-build-resolver.md +94 -94
- package/agents/go-reviewer.md +76 -76
- package/agents/harness-optimizer.md +35 -35
- package/agents/healthcare-reviewer.md +83 -83
- package/agents/java-build-resolver.md +153 -153
- package/agents/java-reviewer.md +92 -92
- package/agents/kotlin-build-resolver.md +118 -118
- package/agents/kotlin-reviewer.md +159 -159
- package/agents/loop-operator.md +36 -36
- package/agents/opensource-forker.md +198 -198
- package/agents/opensource-packager.md +249 -249
- package/agents/opensource-sanitizer.md +188 -188
- package/agents/performance-optimizer.md +446 -446
- package/agents/planner.md +212 -212
- package/agents/pr-test-analyzer.md +45 -45
- package/agents/python-reviewer.md +98 -98
- package/agents/pytorch-build-resolver.md +120 -120
- package/agents/refactor-cleaner.md +85 -85
- package/agents/rust-build-resolver.md +148 -148
- package/agents/rust-reviewer.md +94 -94
- package/agents/security-reviewer.md +108 -108
- package/agents/seo-specialist.md +59 -59
- package/agents/silent-failure-hunter.md +50 -50
- package/agents/tdd-guide.md +91 -91
- package/agents/type-design-analyzer.md +41 -41
- package/agents/typescript-reviewer.md +112 -112
- package/cli/commands/update.mjs +1 -1
- package/cli/utils/scaffold.mjs +53 -0
- package/docs/rules/agents.md +166 -50
- package/docs/rules/cpp/coding-style.md +44 -44
- package/docs/rules/cpp/hooks.md +39 -39
- package/docs/rules/cpp/patterns.md +51 -51
- package/docs/rules/cpp/security.md +51 -51
- package/docs/rules/cpp/testing.md +44 -44
- package/docs/rules/csharp/coding-style.md +72 -72
- package/docs/rules/csharp/hooks.md +25 -25
- package/docs/rules/csharp/patterns.md +50 -50
- package/docs/rules/csharp/security.md +58 -58
- package/docs/rules/csharp/testing.md +46 -46
- package/docs/rules/dart/coding-style.md +159 -159
- package/docs/rules/dart/hooks.md +66 -66
- package/docs/rules/dart/patterns.md +261 -261
- package/docs/rules/dart/security.md +135 -135
- package/docs/rules/dart/testing.md +215 -215
- package/docs/rules/golang/coding-style.md +32 -32
- package/docs/rules/golang/hooks.md +17 -17
- package/docs/rules/golang/patterns.md +45 -45
- package/docs/rules/golang/security.md +34 -34
- package/docs/rules/golang/testing.md +31 -31
- package/docs/rules/java/coding-style.md +114 -114
- package/docs/rules/java/hooks.md +18 -18
- package/docs/rules/java/patterns.md +146 -146
- package/docs/rules/java/security.md +100 -100
- package/docs/rules/java/testing.md +131 -131
- package/docs/rules/kotlin/coding-style.md +86 -86
- package/docs/rules/kotlin/hooks.md +17 -17
- package/docs/rules/kotlin/patterns.md +146 -146
- package/docs/rules/kotlin/security.md +82 -82
- package/docs/rules/kotlin/testing.md +128 -128
- package/docs/rules/perl/coding-style.md +46 -46
- package/docs/rules/perl/hooks.md +22 -22
- package/docs/rules/perl/patterns.md +76 -76
- package/docs/rules/perl/security.md +69 -69
- package/docs/rules/perl/testing.md +54 -54
- package/docs/rules/php/coding-style.md +40 -40
- package/docs/rules/php/hooks.md +24 -24
- package/docs/rules/php/patterns.md +33 -33
- package/docs/rules/php/security.md +37 -37
- package/docs/rules/php/testing.md +39 -39
- package/docs/rules/python/coding-style.md +42 -42
- package/docs/rules/python/hooks.md +19 -19
- package/docs/rules/python/patterns.md +39 -39
- package/docs/rules/python/security.md +30 -30
- package/docs/rules/python/testing.md +38 -38
- package/docs/rules/rust/coding-style.md +151 -151
- package/docs/rules/rust/hooks.md +16 -16
- package/docs/rules/rust/patterns.md +168 -168
- package/docs/rules/rust/security.md +141 -141
- package/docs/rules/rust/testing.md +154 -154
- package/docs/rules/swift/coding-style.md +47 -47
- package/docs/rules/swift/hooks.md +20 -20
- package/docs/rules/swift/patterns.md +66 -66
- package/docs/rules/swift/security.md +33 -33
- package/docs/rules/swift/testing.md +45 -45
- package/docs/rules/typescript/coding-style.md +199 -199
- package/docs/rules/typescript/hooks.md +22 -22
- package/docs/rules/typescript/patterns.md +52 -52
- package/docs/rules/typescript/security.md +28 -28
- package/docs/rules/typescript/testing.md +18 -18
- package/docs/rules/web/coding-style.md +96 -96
- package/docs/rules/web/design-quality.md +62 -62
- package/docs/rules/web/hooks.md +120 -120
- package/docs/rules/web/patterns.md +79 -79
- package/docs/rules/web/performance.md +64 -64
- package/docs/rules/web/security.md +57 -57
- package/docs/rules/web/testing.md +55 -55
- package/docs/templates/README.md +36 -36
- package/docs/templates/ai-project-final.md +124 -124
- package/docs/templates/ai-project.md +105 -105
- package/docs/templates/api.md +157 -157
- package/docs/templates/bug.md +62 -62
- package/docs/templates/code-review.md +87 -87
- package/docs/templates/generic.md +116 -116
- package/docs/templates/implementation-plan.md +1 -1
- package/docs/templates/meeting.md +68 -68
- package/docs/templates/prd.md +98 -98
- package/docs/templates/ui.md +134 -134
- package/docs/workflow-pipeline.md +5 -5
- package/package.json +40 -39
- package/skills/SUPERPOWERS-LICENSE +21 -21
- package/skills/ai-ml/fine-tuning-expert/SKILL.md +162 -162
- package/skills/ai-ml/fine-tuning-expert/references/dataset-preparation.md +540 -540
- package/skills/ai-ml/fine-tuning-expert/references/deployment-optimization.md +673 -673
- package/skills/ai-ml/fine-tuning-expert/references/evaluation-metrics.md +597 -597
- package/skills/ai-ml/fine-tuning-expert/references/hyperparameter-tuning.md +565 -565
- package/skills/ai-ml/fine-tuning-expert/references/lora-peft.md +347 -347
- package/skills/ai-ml/ml-pipeline/SKILL.md +159 -159
- package/skills/ai-ml/ml-pipeline/references/experiment-tracking.md +833 -833
- package/skills/ai-ml/ml-pipeline/references/feature-engineering.md +631 -631
- package/skills/ai-ml/ml-pipeline/references/model-validation.md +978 -978
- package/skills/ai-ml/ml-pipeline/references/pipeline-orchestration.md +907 -907
- package/skills/ai-ml/ml-pipeline/references/training-pipelines.md +782 -782
- package/skills/ai-ml/rag-architect/SKILL.md +194 -194
- package/skills/ai-ml/rag-architect/references/chunking-strategies.md +878 -878
- package/skills/ai-ml/rag-architect/references/embedding-models.md +561 -561
- package/skills/ai-ml/rag-architect/references/rag-evaluation.md +833 -833
- package/skills/ai-ml/rag-architect/references/retrieval-optimization.md +795 -795
- package/skills/ai-ml/rag-architect/references/vector-databases.md +589 -589
- package/skills/ai-ml/spark-engineer/SKILL.md +148 -148
- package/skills/ai-ml/spark-engineer/references/partitioning-caching.md +543 -543
- package/skills/ai-ml/spark-engineer/references/performance-tuning.md +544 -544
- package/skills/ai-ml/spark-engineer/references/rdd-operations.md +599 -599
- package/skills/ai-ml/spark-engineer/references/spark-sql-dataframes.md +474 -474
- package/skills/ai-ml/spark-engineer/references/streaming-patterns.md +786 -786
- package/skills/backend/api-designer/SKILL.md +217 -217
- package/skills/backend/api-designer/references/error-handling.md +541 -541
- package/skills/backend/api-designer/references/openapi.md +824 -824
- package/skills/backend/api-designer/references/pagination.md +494 -494
- package/skills/backend/api-designer/references/rest-patterns.md +335 -335
- package/skills/backend/api-designer/references/versioning.md +391 -391
- package/skills/backend/architecture-designer/SKILL.md +117 -117
- package/skills/backend/architecture-designer/references/adr-template.md +116 -116
- package/skills/backend/architecture-designer/references/architecture-patterns.md +111 -111
- package/skills/backend/architecture-designer/references/database-selection.md +102 -102
- package/skills/backend/architecture-designer/references/nfr-checklist.md +112 -112
- package/skills/backend/architecture-designer/references/system-design.md +100 -100
- package/skills/backend/code-documenter/SKILL.md +147 -147
- package/skills/backend/code-documenter/references/api-docs-fastapi-django.md +166 -166
- package/skills/backend/code-documenter/references/api-docs-nestjs-express.md +220 -220
- package/skills/backend/code-documenter/references/coverage-reports.md +125 -125
- package/skills/backend/code-documenter/references/documentation-systems.md +333 -333
- package/skills/backend/code-documenter/references/interactive-api-docs.md +531 -531
- package/skills/backend/code-documenter/references/python-docstrings.md +121 -121
- package/skills/backend/code-documenter/references/typescript-jsdoc.md +145 -145
- package/skills/backend/code-documenter/references/user-guides-tutorials.md +530 -530
- package/skills/backend/debugging-wizard/SKILL.md +105 -105
- package/skills/backend/debugging-wizard/references/common-patterns.md +132 -132
- package/skills/backend/debugging-wizard/references/debugging-tools.md +140 -140
- package/skills/backend/debugging-wizard/references/quick-fixes.md +177 -177
- package/skills/backend/debugging-wizard/references/strategies.md +142 -142
- package/skills/backend/debugging-wizard/references/systematic-debugging.md +367 -367
- package/skills/backend/feature-forge/SKILL.md +98 -98
- package/skills/backend/feature-forge/references/acceptance-criteria.md +104 -104
- package/skills/backend/feature-forge/references/ears-syntax.md +99 -99
- package/skills/backend/feature-forge/references/interview-questions.md +150 -150
- package/skills/backend/feature-forge/references/pre-discovery-subagents.md +54 -54
- package/skills/backend/feature-forge/references/specification-template.md +103 -103
- package/skills/backend/fullstack-guardian/SKILL.md +105 -105
- package/skills/backend/fullstack-guardian/references/api-design-standards.md +307 -307
- package/skills/backend/fullstack-guardian/references/architecture-decisions.md +350 -350
- package/skills/backend/fullstack-guardian/references/backend-patterns.md +237 -237
- package/skills/backend/fullstack-guardian/references/common-patterns.md +134 -134
- package/skills/backend/fullstack-guardian/references/deliverables-checklist.md +354 -354
- package/skills/backend/fullstack-guardian/references/design-template.md +91 -91
- package/skills/backend/fullstack-guardian/references/error-handling.md +135 -135
- package/skills/backend/fullstack-guardian/references/frontend-patterns.md +340 -340
- package/skills/backend/fullstack-guardian/references/integration-patterns.md +333 -333
- package/skills/backend/fullstack-guardian/references/security-checklist.md +106 -106
- package/skills/backend/graphql-architect/SKILL.md +146 -146
- package/skills/backend/graphql-architect/references/federation.md +418 -418
- package/skills/backend/graphql-architect/references/migration-from-rest.md +1141 -1141
- package/skills/backend/graphql-architect/references/resolvers.md +425 -425
- package/skills/backend/graphql-architect/references/schema-design.md +393 -393
- package/skills/backend/graphql-architect/references/security.md +569 -569
- package/skills/backend/graphql-architect/references/subscriptions.md +510 -510
- package/skills/backend/legacy-modernizer/SKILL.md +137 -137
- package/skills/backend/legacy-modernizer/references/legacy-testing.md +381 -381
- package/skills/backend/legacy-modernizer/references/migration-strategies.md +423 -423
- package/skills/backend/legacy-modernizer/references/refactoring-patterns.md +395 -395
- package/skills/backend/legacy-modernizer/references/strangler-fig-pattern.md +281 -281
- package/skills/backend/legacy-modernizer/references/system-assessment.md +487 -487
- package/skills/backend/microservices-architect/SKILL.md +164 -164
- package/skills/backend/microservices-architect/references/communication.md +499 -499
- package/skills/backend/microservices-architect/references/data.md +721 -721
- package/skills/backend/microservices-architect/references/decomposition.md +344 -344
- package/skills/backend/microservices-architect/references/observability.md +805 -805
- package/skills/backend/microservices-architect/references/patterns.md +603 -603
- package/skills/database/database-optimizer/SKILL.md +147 -147
- package/skills/database/database-optimizer/references/index-strategies.md +331 -331
- package/skills/database/database-optimizer/references/monitoring-analysis.md +501 -501
- package/skills/database/database-optimizer/references/mysql-tuning.md +452 -452
- package/skills/database/database-optimizer/references/postgresql-tuning.md +413 -413
- package/skills/database/database-optimizer/references/query-optimization.md +251 -251
- package/skills/database/postgres-pro/SKILL.md +152 -152
- package/skills/database/postgres-pro/references/extensions.md +404 -404
- package/skills/database/postgres-pro/references/jsonb.md +321 -321
- package/skills/database/postgres-pro/references/maintenance.md +481 -481
- package/skills/database/postgres-pro/references/performance.md +265 -265
- package/skills/database/postgres-pro/references/replication.md +446 -446
- package/skills/database/sql-pro/SKILL.md +129 -129
- package/skills/database/sql-pro/references/database-design.md +402 -402
- package/skills/database/sql-pro/references/dialect-differences.md +419 -419
- package/skills/database/sql-pro/references/optimization.md +384 -384
- package/skills/database/sql-pro/references/query-patterns.md +285 -285
- package/skills/database/sql-pro/references/window-functions.md +328 -328
- package/skills/dotnet/csharp-developer/SKILL.md +125 -125
- package/skills/dotnet/csharp-developer/references/aspnet-core.md +394 -394
- package/skills/dotnet/csharp-developer/references/blazor.md +553 -553
- package/skills/dotnet/csharp-developer/references/entity-framework.md +409 -409
- package/skills/dotnet/csharp-developer/references/modern-csharp.md +248 -248
- package/skills/dotnet/csharp-developer/references/performance.md +498 -498
- package/skills/dotnet/dotnet-core-expert/SKILL.md +138 -138
- package/skills/dotnet/dotnet-core-expert/references/authentication.md +546 -546
- package/skills/dotnet/dotnet-core-expert/references/clean-architecture.md +455 -455
- package/skills/dotnet/dotnet-core-expert/references/cloud-native.md +548 -548
- package/skills/dotnet/dotnet-core-expert/references/entity-framework.md +440 -440
- package/skills/dotnet/dotnet-core-expert/references/minimal-apis.md +319 -319
- package/skills/frontend/angular-architect/SKILL.md +152 -152
- package/skills/frontend/angular-architect/references/components.md +297 -297
- package/skills/frontend/angular-architect/references/ngrx.md +401 -401
- package/skills/frontend/angular-architect/references/routing.md +361 -361
- package/skills/frontend/angular-architect/references/rxjs.md +319 -319
- package/skills/frontend/angular-architect/references/testing.md +405 -405
- package/skills/frontend/design-commands/design.md +91 -91
- package/skills/frontend/design-commands/handoff.md +97 -97
- package/skills/frontend/design-commands/prototype.md +120 -120
- package/skills/frontend/design-commands/spec.md +160 -160
- package/skills/frontend/design-commands/style.md +78 -78
- package/skills/frontend/flutter-expert/SKILL.md +138 -138
- package/skills/frontend/flutter-expert/references/bloc-state.md +259 -259
- package/skills/frontend/flutter-expert/references/gorouter-navigation.md +119 -119
- package/skills/frontend/flutter-expert/references/performance.md +99 -99
- package/skills/frontend/flutter-expert/references/project-structure.md +118 -118
- package/skills/frontend/flutter-expert/references/riverpod-state.md +130 -130
- package/skills/frontend/flutter-expert/references/widget-patterns.md +123 -123
- package/skills/frontend/nextjs-developer/SKILL.md +143 -143
- package/skills/frontend/nextjs-developer/references/app-router.md +311 -311
- package/skills/frontend/nextjs-developer/references/data-fetching.md +482 -482
- package/skills/frontend/nextjs-developer/references/deployment.md +545 -545
- package/skills/frontend/nextjs-developer/references/server-actions.md +462 -462
- package/skills/frontend/nextjs-developer/references/server-components.md +384 -384
- package/skills/frontend/react-expert/SKILL.md +149 -149
- package/skills/frontend/react-expert/references/hooks-patterns.md +162 -162
- package/skills/frontend/react-expert/references/migration-class-to-modern.md +1119 -1119
- package/skills/frontend/react-expert/references/performance.md +168 -168
- package/skills/frontend/react-expert/references/react-19-features.md +174 -174
- package/skills/frontend/react-expert/references/server-components.md +143 -143
- package/skills/frontend/react-expert/references/state-management.md +171 -171
- package/skills/frontend/react-expert/references/testing-react.md +174 -174
- package/skills/frontend/react-native-expert/SKILL.md +185 -185
- package/skills/frontend/react-native-expert/references/expo-router.md +187 -187
- package/skills/frontend/react-native-expert/references/list-optimization.md +204 -204
- package/skills/frontend/react-native-expert/references/platform-handling.md +188 -188
- package/skills/frontend/react-native-expert/references/project-structure.md +171 -171
- package/skills/frontend/react-native-expert/references/storage-hooks.md +173 -173
- package/skills/frontend/senior-frontend/SKILL.md +477 -477
- package/skills/frontend/senior-frontend/references/frontend_best_practices.md +806 -806
- package/skills/frontend/senior-frontend/references/nextjs_optimization_guide.md +724 -724
- package/skills/frontend/senior-frontend/references/react_patterns.md +746 -746
- package/skills/frontend/senior-frontend/scripts/bundle_analyzer.py +407 -407
- package/skills/frontend/senior-frontend/scripts/component_generator.py +329 -329
- package/skills/frontend/senior-frontend/scripts/frontend_scaffolder.py +1005 -1005
- package/skills/frontend/ui-ux-pro-max/SKILL.md +386 -386
- package/skills/frontend/ui-ux-pro-max/data/charts.csv +26 -26
- package/skills/frontend/ui-ux-pro-max/data/colors.csv +97 -97
- package/skills/frontend/ui-ux-pro-max/data/icons.csv +101 -101
- package/skills/frontend/ui-ux-pro-max/data/landing.csv +31 -31
- package/skills/frontend/ui-ux-pro-max/data/products.csv +96 -96
- package/skills/frontend/ui-ux-pro-max/data/react-performance.csv +45 -45
- package/skills/frontend/ui-ux-pro-max/data/stacks/astro.csv +54 -54
- package/skills/frontend/ui-ux-pro-max/data/stacks/flutter.csv +53 -53
- package/skills/frontend/ui-ux-pro-max/data/stacks/html-tailwind.csv +56 -56
- package/skills/frontend/ui-ux-pro-max/data/stacks/jetpack-compose.csv +53 -53
- package/skills/frontend/ui-ux-pro-max/data/stacks/nextjs.csv +53 -53
- package/skills/frontend/ui-ux-pro-max/data/stacks/nuxt-ui.csv +51 -51
- package/skills/frontend/ui-ux-pro-max/data/stacks/nuxtjs.csv +59 -59
- package/skills/frontend/ui-ux-pro-max/data/stacks/react-native.csv +52 -52
- package/skills/frontend/ui-ux-pro-max/data/stacks/react.csv +54 -54
- package/skills/frontend/ui-ux-pro-max/data/stacks/shadcn.csv +61 -61
- package/skills/frontend/ui-ux-pro-max/data/stacks/svelte.csv +54 -54
- package/skills/frontend/ui-ux-pro-max/data/stacks/swiftui.csv +51 -51
- package/skills/frontend/ui-ux-pro-max/data/stacks/vue.csv +50 -50
- package/skills/frontend/ui-ux-pro-max/data/styles.csv +68 -68
- package/skills/frontend/ui-ux-pro-max/data/typography.csv +57 -57
- package/skills/frontend/ui-ux-pro-max/data/ui-reasoning.csv +101 -101
- package/skills/frontend/ui-ux-pro-max/data/ux-guidelines.csv +99 -99
- package/skills/frontend/ui-ux-pro-max/data/web-interface.csv +31 -31
- package/skills/frontend/ui-ux-pro-max/scripts/core.py +253 -253
- package/skills/frontend/ui-ux-pro-max/scripts/design_system.py +1067 -1067
- package/skills/frontend/ui-ux-pro-max/scripts/search.py +114 -114
- package/skills/frontend/vue-expert/SKILL.md +98 -98
- package/skills/frontend/vue-expert/references/build-tooling.md +480 -480
- package/skills/frontend/vue-expert/references/components.md +448 -448
- package/skills/frontend/vue-expert/references/composition-api.md +299 -299
- package/skills/frontend/vue-expert/references/mobile-hybrid.md +636 -636
- package/skills/frontend/vue-expert/references/nuxt.md +669 -669
- package/skills/frontend/vue-expert/references/state-management.md +449 -449
- package/skills/frontend/vue-expert/references/typescript.md +584 -584
- package/skills/frontend/vue-expert-js/SKILL.md +167 -167
- package/skills/frontend/vue-expert-js/references/component-architecture.md +219 -219
- package/skills/frontend/vue-expert-js/references/composables-patterns.md +183 -183
- package/skills/frontend/vue-expert-js/references/jsdoc-typing.md +535 -535
- package/skills/frontend/vue-expert-js/references/state-management.md +249 -249
- package/skills/frontend/vue-expert-js/references/testing-patterns.md +237 -237
- package/skills/go-rust-cpp/cpp-pro/SKILL.md +115 -115
- package/skills/go-rust-cpp/cpp-pro/references/build-tooling.md +440 -440
- package/skills/go-rust-cpp/cpp-pro/references/concurrency.md +437 -437
- package/skills/go-rust-cpp/cpp-pro/references/memory-performance.md +397 -397
- package/skills/go-rust-cpp/cpp-pro/references/modern-cpp.md +304 -304
- package/skills/go-rust-cpp/cpp-pro/references/templates.md +357 -357
- package/skills/go-rust-cpp/golang-pro/SKILL.md +122 -122
- package/skills/go-rust-cpp/golang-pro/references/concurrency.md +329 -329
- package/skills/go-rust-cpp/golang-pro/references/generics.md +442 -442
- package/skills/go-rust-cpp/golang-pro/references/interfaces.md +432 -432
- package/skills/go-rust-cpp/golang-pro/references/project-structure.md +477 -477
- package/skills/go-rust-cpp/golang-pro/references/testing.md +451 -451
- package/skills/go-rust-cpp/rust-engineer/SKILL.md +167 -167
- package/skills/go-rust-cpp/rust-engineer/references/async.md +458 -458
- package/skills/go-rust-cpp/rust-engineer/references/error-handling.md +334 -334
- package/skills/go-rust-cpp/rust-engineer/references/ownership.md +278 -278
- package/skills/go-rust-cpp/rust-engineer/references/testing.md +470 -470
- package/skills/go-rust-cpp/rust-engineer/references/traits.md +413 -413
- package/skills/infra/cli-developer/SKILL.md +113 -113
- package/skills/infra/cli-developer/references/design-patterns.md +221 -221
- package/skills/infra/cli-developer/references/go-cli.md +540 -540
- package/skills/infra/cli-developer/references/node-cli.md +383 -383
- package/skills/infra/cli-developer/references/python-cli.md +422 -422
- package/skills/infra/cli-developer/references/ux-patterns.md +448 -448
- package/skills/infra/cloud-architect/SKILL.md +216 -216
- package/skills/infra/cloud-architect/references/aws.md +394 -394
- package/skills/infra/cloud-architect/references/azure.md +562 -562
- package/skills/infra/cloud-architect/references/cost.md +582 -582
- package/skills/infra/cloud-architect/references/gcp.md +633 -633
- package/skills/infra/cloud-architect/references/multi-cloud.md +483 -483
- package/skills/infra/devops-engineer/SKILL.md +144 -144
- package/skills/infra/devops-engineer/references/deployment-strategies.md +241 -241
- package/skills/infra/devops-engineer/references/docker-patterns.md +113 -113
- package/skills/infra/devops-engineer/references/github-actions.md +139 -139
- package/skills/infra/devops-engineer/references/incident-response.md +331 -331
- package/skills/infra/devops-engineer/references/kubernetes.md +154 -154
- package/skills/infra/devops-engineer/references/platform-engineering.md +417 -417
- package/skills/infra/devops-engineer/references/release-automation.md +527 -527
- package/skills/infra/devops-engineer/references/terraform-iac.md +141 -141
- package/skills/infra/kubernetes-specialist/SKILL.md +241 -241
- package/skills/infra/kubernetes-specialist/references/configuration.md +452 -452
- package/skills/infra/kubernetes-specialist/references/cost-optimization.md +458 -458
- package/skills/infra/kubernetes-specialist/references/custom-operators.md +563 -563
- package/skills/infra/kubernetes-specialist/references/gitops.md +530 -530
- package/skills/infra/kubernetes-specialist/references/helm-charts.md +912 -912
- package/skills/infra/kubernetes-specialist/references/multi-cluster.md +507 -507
- package/skills/infra/kubernetes-specialist/references/networking.md +447 -447
- package/skills/infra/kubernetes-specialist/references/service-mesh.md +459 -459
- package/skills/infra/kubernetes-specialist/references/storage.md +535 -535
- package/skills/infra/kubernetes-specialist/references/troubleshooting.md +414 -414
- package/skills/infra/kubernetes-specialist/references/workloads.md +377 -377
- package/skills/infra/mcp-developer/SKILL.md +143 -143
- package/skills/infra/mcp-developer/references/protocol.md +244 -244
- package/skills/infra/mcp-developer/references/python-sdk.md +367 -367
- package/skills/infra/mcp-developer/references/resources.md +554 -554
- package/skills/infra/mcp-developer/references/tools.md +480 -480
- package/skills/infra/mcp-developer/references/typescript-sdk.md +350 -350
- package/skills/infra/monitoring-expert/SKILL.md +176 -176
- package/skills/infra/monitoring-expert/references/alerting-rules.md +141 -141
- package/skills/infra/monitoring-expert/references/application-profiling.md +331 -331
- package/skills/infra/monitoring-expert/references/capacity-planning.md +344 -344
- package/skills/infra/monitoring-expert/references/dashboards.md +126 -126
- package/skills/infra/monitoring-expert/references/opentelemetry.md +123 -123
- package/skills/infra/monitoring-expert/references/performance-testing.md +269 -269
- package/skills/infra/monitoring-expert/references/prometheus-metrics.md +136 -136
- package/skills/infra/monitoring-expert/references/structured-logging.md +142 -142
- package/skills/infra/sre-engineer/SKILL.md +181 -181
- package/skills/infra/sre-engineer/references/automation-toil.md +492 -492
- package/skills/infra/sre-engineer/references/error-budget-policy.md +334 -334
- package/skills/infra/sre-engineer/references/incident-chaos.md +576 -576
- package/skills/infra/sre-engineer/references/monitoring-alerting.md +424 -424
- package/skills/infra/sre-engineer/references/slo-sli-management.md +238 -238
- package/skills/infra/terraform-engineer/SKILL.md +143 -143
- package/skills/infra/terraform-engineer/references/best-practices.md +583 -583
- package/skills/infra/terraform-engineer/references/module-patterns.md +297 -297
- package/skills/infra/terraform-engineer/references/providers.md +452 -452
- package/skills/infra/terraform-engineer/references/state-management.md +371 -371
- package/skills/infra/terraform-engineer/references/testing.md +486 -486
- package/skills/infra/websocket-engineer/SKILL.md +168 -168
- package/skills/infra/websocket-engineer/references/alternatives.md +391 -391
- package/skills/infra/websocket-engineer/references/patterns.md +400 -400
- package/skills/infra/websocket-engineer/references/protocol.md +195 -195
- package/skills/infra/websocket-engineer/references/scaling.md +333 -333
- package/skills/infra/websocket-engineer/references/security.md +474 -474
- package/skills/java/java-architect/SKILL.md +132 -132
- package/skills/java/java-architect/references/jpa-optimization.md +393 -393
- package/skills/java/java-architect/references/reactive-webflux.md +356 -356
- package/skills/java/java-architect/references/spring-boot-setup.md +269 -269
- package/skills/java/java-architect/references/spring-security.md +445 -445
- package/skills/java/java-architect/references/testing-patterns.md +500 -500
- package/skills/java/kotlin-specialist/SKILL.md +147 -147
- package/skills/java/kotlin-specialist/references/android-compose.md +419 -419
- package/skills/java/kotlin-specialist/references/coroutines-flow.md +276 -276
- package/skills/java/kotlin-specialist/references/dsl-idioms.md +421 -421
- package/skills/java/kotlin-specialist/references/ktor-server.md +426 -426
- package/skills/java/kotlin-specialist/references/multiplatform-kmp.md +380 -380
- package/skills/java/spring-boot-engineer/SKILL.md +195 -195
- package/skills/java/spring-boot-engineer/references/cloud.md +498 -498
- package/skills/java/spring-boot-engineer/references/data.md +381 -381
- package/skills/java/spring-boot-engineer/references/security.md +459 -459
- package/skills/java/spring-boot-engineer/references/testing.md +545 -545
- package/skills/java/spring-boot-engineer/references/web.md +295 -295
- package/skills/javascript/javascript-pro/SKILL.md +132 -132
- package/skills/javascript/javascript-pro/references/async-patterns.md +334 -334
- package/skills/javascript/javascript-pro/references/browser-apis.md +398 -398
- package/skills/javascript/javascript-pro/references/modern-syntax.md +272 -272
- package/skills/javascript/javascript-pro/references/modules.md +357 -357
- package/skills/javascript/javascript-pro/references/node-essentials.md +471 -471
- package/skills/javascript/nestjs-expert/SKILL.md +206 -206
- package/skills/javascript/nestjs-expert/references/authentication.md +166 -166
- package/skills/javascript/nestjs-expert/references/controllers-routing.md +111 -111
- package/skills/javascript/nestjs-expert/references/dtos-validation.md +153 -153
- package/skills/javascript/nestjs-expert/references/migration-from-express.md +1237 -1237
- package/skills/javascript/nestjs-expert/references/services-di.md +140 -140
- package/skills/javascript/nestjs-expert/references/testing-patterns.md +186 -186
- package/skills/javascript/typescript-pro/SKILL.md +145 -145
- package/skills/javascript/typescript-pro/references/advanced-types.md +259 -259
- package/skills/javascript/typescript-pro/references/configuration.md +445 -445
- package/skills/javascript/typescript-pro/references/patterns.md +484 -484
- package/skills/javascript/typescript-pro/references/type-guards.md +352 -352
- package/skills/javascript/typescript-pro/references/utility-types.md +329 -329
- package/skills/php/laravel-specialist/SKILL.md +262 -262
- package/skills/php/laravel-specialist/references/eloquent.md +351 -351
- package/skills/php/laravel-specialist/references/livewire.md +512 -512
- package/skills/php/laravel-specialist/references/queues.md +423 -423
- package/skills/php/laravel-specialist/references/routing.md +362 -362
- package/skills/php/laravel-specialist/references/testing.md +522 -522
- package/skills/php/php-pro/SKILL.md +206 -206
- package/skills/php/php-pro/references/async-patterns.md +412 -412
- package/skills/php/php-pro/references/laravel-patterns.md +377 -377
- package/skills/php/php-pro/references/modern-php-features.md +323 -323
- package/skills/php/php-pro/references/symfony-patterns.md +466 -466
- package/skills/php/php-pro/references/testing-quality.md +466 -466
- package/skills/product/competitive-analysis/SKILL.md +257 -257
- package/skills/product/meeting-notes/SKILL.md +266 -266
- package/skills/product/prd-template/SKILL.md +150 -150
- package/skills/product/stakeholder-update/SKILL.md +225 -225
- package/skills/product/user-research-synthesis/SKILL.md +235 -235
- package/skills/python/django-expert/SKILL.md +162 -162
- package/skills/python/django-expert/references/authentication.md +145 -145
- package/skills/python/django-expert/references/drf-serializers.md +148 -148
- package/skills/python/django-expert/references/models-orm.md +151 -151
- package/skills/python/django-expert/references/testing-django.md +204 -204
- package/skills/python/django-expert/references/viewsets-views.md +153 -153
- package/skills/python/fastapi-expert/SKILL.md +185 -185
- package/skills/python/fastapi-expert/references/async-sqlalchemy.md +146 -146
- package/skills/python/fastapi-expert/references/authentication.md +159 -159
- package/skills/python/fastapi-expert/references/endpoints-routing.md +142 -142
- package/skills/python/fastapi-expert/references/migration-from-django.md +996 -996
- package/skills/python/fastapi-expert/references/pydantic-v2.md +135 -135
- package/skills/python/fastapi-expert/references/testing-async.md +159 -159
- package/skills/python/pandas-pro/SKILL.md +178 -178
- package/skills/python/pandas-pro/references/aggregation-groupby.md +545 -545
- package/skills/python/pandas-pro/references/data-cleaning.md +500 -500
- package/skills/python/pandas-pro/references/dataframe-operations.md +420 -420
- package/skills/python/pandas-pro/references/merging-joining.md +596 -596
- package/skills/python/pandas-pro/references/performance-optimization.md +597 -597
- package/skills/python/python-pro/SKILL.md +177 -177
- package/skills/python/python-pro/references/async-patterns.md +356 -356
- package/skills/python/python-pro/references/packaging.md +460 -460
- package/skills/python/python-pro/references/standard-library.md +378 -378
- package/skills/python/python-pro/references/testing.md +404 -404
- package/skills/python/python-pro/references/type-system.md +290 -290
- package/skills/quality/chaos-engineer/SKILL.md +182 -182
- package/skills/quality/chaos-engineer/references/chaos-tools.md +511 -511
- package/skills/quality/chaos-engineer/references/experiment-design.md +229 -229
- package/skills/quality/chaos-engineer/references/game-days.md +434 -434
- package/skills/quality/chaos-engineer/references/infrastructure-chaos.md +348 -348
- package/skills/quality/chaos-engineer/references/kubernetes-chaos.md +432 -432
- package/skills/quality/code-reviewer/SKILL.md +119 -119
- package/skills/quality/code-reviewer/references/common-issues.md +142 -142
- package/skills/quality/code-reviewer/references/feedback-examples.md +144 -144
- package/skills/quality/code-reviewer/references/receiving-feedback.md +238 -238
- package/skills/quality/code-reviewer/references/report-template.md +109 -109
- package/skills/quality/code-reviewer/references/review-checklist.md +88 -88
- package/skills/quality/code-reviewer/references/spec-compliance-review.md +258 -258
- package/skills/quality/playwright-expert/SKILL.md +169 -169
- package/skills/quality/playwright-expert/references/api-mocking.md +140 -140
- package/skills/quality/playwright-expert/references/configuration.md +155 -155
- package/skills/quality/playwright-expert/references/debugging-flaky.md +150 -150
- package/skills/quality/playwright-expert/references/page-object-model.md +152 -152
- package/skills/quality/playwright-expert/references/selectors-locators.md +119 -119
- package/skills/quality/secure-code-guardian/SKILL.md +191 -191
- package/skills/quality/secure-code-guardian/references/authentication.md +136 -136
- package/skills/quality/secure-code-guardian/references/input-validation.md +146 -146
- package/skills/quality/secure-code-guardian/references/owasp-prevention.md +135 -135
- package/skills/quality/secure-code-guardian/references/security-headers.md +133 -133
- package/skills/quality/secure-code-guardian/references/xss-csrf.md +157 -157
- package/skills/quality/security-reviewer/SKILL.md +103 -103
- package/skills/quality/security-reviewer/references/infrastructure-security.md +268 -268
- package/skills/quality/security-reviewer/references/penetration-testing.md +268 -268
- package/skills/quality/security-reviewer/references/report-template.md +170 -170
- package/skills/quality/security-reviewer/references/sast-tools.md +117 -117
- package/skills/quality/security-reviewer/references/secret-scanning.md +125 -125
- package/skills/quality/security-reviewer/references/vulnerability-patterns.md +152 -152
- package/skills/quality/senior-qa/README.md +196 -196
- package/skills/quality/senior-qa/SKILL.md +399 -399
- package/skills/quality/senior-qa/references/qa_best_practices.md +964 -964
- package/skills/quality/senior-qa/references/test_automation_patterns.md +1009 -1009
- package/skills/quality/senior-qa/references/testing_strategies.md +649 -649
- package/skills/quality/senior-qa/scripts/coverage_analyzer.py +836 -836
- package/skills/quality/senior-qa/scripts/e2e_test_scaffolder.py +820 -820
- package/skills/quality/senior-qa/scripts/test_suite_generator.py +605 -605
- package/skills/quality/tdd-guide/HOW_TO_USE.md +313 -313
- package/skills/quality/tdd-guide/README.md +680 -680
- package/skills/quality/tdd-guide/SKILL.md +122 -122
- package/skills/quality/tdd-guide/assets/expected_output.json +77 -77
- package/skills/quality/tdd-guide/assets/sample_input_python.json +39 -39
- package/skills/quality/tdd-guide/assets/sample_input_typescript.json +36 -36
- package/skills/quality/tdd-guide/references/ci-integration.md +195 -195
- package/skills/quality/tdd-guide/references/framework-guide.md +206 -206
- package/skills/quality/tdd-guide/references/tdd-best-practices.md +128 -128
- package/skills/quality/tdd-guide/scripts/coverage_analyzer.py +434 -434
- package/skills/quality/tdd-guide/scripts/fixture_generator.py +440 -440
- package/skills/quality/tdd-guide/scripts/format_detector.py +384 -384
- package/skills/quality/tdd-guide/scripts/framework_adapter.py +428 -428
- package/skills/quality/tdd-guide/scripts/metrics_calculator.py +456 -456
- package/skills/quality/tdd-guide/scripts/output_formatter.py +354 -354
- package/skills/quality/tdd-guide/scripts/tdd_workflow.py +474 -474
- package/skills/quality/tdd-guide/scripts/test_generator.py +438 -438
- package/skills/quality/test-master/SKILL.md +94 -94
- package/skills/quality/test-master/references/automation-frameworks.md +294 -294
- package/skills/quality/test-master/references/e2e-testing.md +128 -128
- package/skills/quality/test-master/references/integration-testing.md +120 -120
- package/skills/quality/test-master/references/performance-testing.md +118 -118
- package/skills/quality/test-master/references/qa-methodology.md +247 -247
- package/skills/quality/test-master/references/security-testing.md +127 -127
- package/skills/quality/test-master/references/tdd-iron-laws.md +174 -174
- package/skills/quality/test-master/references/test-reports.md +104 -104
- package/skills/quality/test-master/references/testing-anti-patterns.md +231 -231
- package/skills/quality/test-master/references/unit-testing.md +113 -113
- package/skills/ruby/rails-expert/SKILL.md +154 -154
- package/skills/ruby/rails-expert/references/active-record.md +244 -244
- package/skills/ruby/rails-expert/references/api-development.md +401 -401
- package/skills/ruby/rails-expert/references/background-jobs.md +272 -272
- package/skills/ruby/rails-expert/references/hotwire-turbo.md +228 -228
- package/skills/ruby/rails-expert/references/rspec-testing.md +367 -367
- package/skills/swift/swift-expert/SKILL.md +163 -163
- package/skills/swift/swift-expert/references/async-concurrency.md +360 -360
- package/skills/swift/swift-expert/references/memory-performance.md +377 -377
- package/skills/swift/swift-expert/references/protocol-oriented.md +354 -354
- package/skills/swift/swift-expert/references/swiftui-patterns.md +291 -291
- package/skills/swift/swift-expert/references/testing-patterns.md +399 -399
- package/skills/workflow/brainstorming/SKILL.md +164 -164
- package/skills/workflow/brainstorming/scripts/frame-template.html +214 -214
- package/skills/workflow/brainstorming/scripts/helper.js +88 -88
- package/skills/workflow/brainstorming/scripts/server.cjs +354 -354
- package/skills/workflow/brainstorming/scripts/start-server.sh +148 -148
- package/skills/workflow/brainstorming/scripts/stop-server.sh +56 -56
- package/skills/workflow/brainstorming/spec-document-reviewer-prompt.md +49 -49
- package/skills/workflow/brainstorming/visual-companion.md +287 -287
- package/skills/workflow/documentation/SKILL.md +45 -45
- package/skills/workflow/entropy-management/SKILL.md +115 -115
- package/skills/workflow/executing-plans/SKILL.md +70 -70
- package/skills/workflow/finishing-a-development-branch/SKILL.md +200 -200
- package/skills/workflow/receiving-code-review/SKILL.md +213 -213
- package/skills/workflow/requesting-code-review/SKILL.md +105 -105
- package/skills/workflow/requesting-code-review/code-reviewer.md +146 -146
- package/skills/workflow/requirement-engineering/SKILL.md +111 -111
- package/skills/workflow/systematic-debugging/CREATION-LOG.md +119 -119
- package/skills/workflow/systematic-debugging/SKILL.md +296 -296
- package/skills/workflow/systematic-debugging/condition-based-waiting-example.ts +158 -158
- package/skills/workflow/systematic-debugging/condition-based-waiting.md +115 -115
- package/skills/workflow/systematic-debugging/defense-in-depth.md +122 -122
- package/skills/workflow/systematic-debugging/find-polluter.sh +63 -63
- package/skills/workflow/systematic-debugging/root-cause-tracing.md +169 -169
- package/skills/workflow/systematic-debugging/test-academic.md +14 -14
- package/skills/workflow/systematic-debugging/test-pressure-1.md +58 -58
- package/skills/workflow/systematic-debugging/test-pressure-2.md +68 -68
- package/skills/workflow/systematic-debugging/test-pressure-3.md +69 -69
- package/skills/workflow/using-git-worktrees/SKILL.md +218 -218
- package/skills/workflow/verification-before-completion/SKILL.md +139 -139
- package/skills/workflow/writing-plans/SKILL.md +151 -151
- package/skills/workflow/writing-plans/plan-document-reviewer-prompt.md +49 -49
- package/skills/workflow/writing-skills/SKILL.md +655 -655
- package/skills/workflow/writing-skills/anthropic-best-practices.md +1150 -1150
- package/skills/workflow/writing-skills/examples/CLAUDE_MD_TESTING.md +189 -189
- package/skills/workflow/writing-skills/persuasion-principles.md +187 -187
- package/skills/workflow/writing-skills/render-graphs.js +168 -168
- package/skills/workflow/writing-skills/testing-skills-with-subagents.md +384 -384
|
@@ -1,631 +1,631 @@
|
|
|
1
|
-
# Feature Engineering
|
|
2
|
-
|
|
3
|
-
---
|
|
4
|
-
|
|
5
|
-
## Overview
|
|
6
|
-
|
|
7
|
-
Feature engineering transforms raw data into features that improve model performance. Production systems require reproducible transformations, feature versioning, and online/offline consistency through feature stores.
|
|
8
|
-
|
|
9
|
-
## When to Use This Reference
|
|
10
|
-
|
|
11
|
-
- Building feature transformation pipelines
|
|
12
|
-
- Implementing feature stores (Feast, Tecton, custom)
|
|
13
|
-
- Creating data validation workflows
|
|
14
|
-
- Designing feature schemas and registries
|
|
15
|
-
- Handling feature drift and monitoring
|
|
16
|
-
|
|
17
|
-
## When NOT to Use
|
|
18
|
-
|
|
19
|
-
- Simple ad-hoc feature creation (use pandas directly)
|
|
20
|
-
- One-time exploratory analysis
|
|
21
|
-
- Prototyping with small datasets
|
|
22
|
-
|
|
23
|
-
---
|
|
24
|
-
|
|
25
|
-
## Feature Transformation Pipelines
|
|
26
|
-
|
|
27
|
-
### Scikit-learn Pipeline Pattern
|
|
28
|
-
|
|
29
|
-
```python
|
|
30
|
-
from sklearn.pipeline import Pipeline
|
|
31
|
-
from sklearn.compose import ColumnTransformer
|
|
32
|
-
from sklearn.preprocessing import StandardScaler, OneHotEncoder
|
|
33
|
-
from sklearn.impute import SimpleImputer
|
|
34
|
-
import joblib
|
|
35
|
-
|
|
36
|
-
def create_feature_pipeline(
    numeric_features: list[str],
    categorical_features: list[str],
) -> ColumnTransformer:
    """Build the column-wise preprocessing step for tabular data.

    Numeric columns are median-imputed and then standardized. Categorical
    columns have missing values replaced by the literal ``'missing'`` and are
    one-hot encoded into a dense array; categories not seen during fit are
    ignored. Columns named in neither list are dropped.

    Args:
        numeric_features: Columns routed through the numeric sub-pipeline.
        categorical_features: Columns routed through the categorical
            sub-pipeline.

    Returns:
        An unfitted ``ColumnTransformer``.
    """
    numeric_steps = [
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
    categorical_steps = [
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
    ]

    return ColumnTransformer(
        transformers=[
            ('num', Pipeline(steps=numeric_steps), numeric_features),
            ('cat', Pipeline(steps=categorical_steps), categorical_features),
        ],
        remainder='drop',
        # Keep the original column names instead of prefixing them with
        # the sub-pipeline name ("num__", "cat__").
        verbose_feature_names_out=False,
    )
|
|
62
|
-
|
|
63
|
-
# Usage with versioning
|
|
64
|
-
def save_pipeline(pipeline: ColumnTransformer, version: str, path: str) -> str:
    """Persist a fitted pipeline plus a JSON metadata sidecar.

    Two files are written into ``path``:
    ``feature_pipeline_v{version}.joblib`` (the pipeline itself) and
    ``feature_pipeline_v{version}_metadata.json`` (version, creation time,
    input feature names and number of output features).

    Args:
        pipeline: A *fitted* ``ColumnTransformer``. Its ``feature_names_in_``
            attribute and ``get_feature_names_out()`` are read, so it must
            have been fitted on input with named columns.
        version: Version label embedded in both file names and the metadata.
        path: Target directory; created if it does not exist yet.

    Returns:
        The path of the joblib artifact.
    """
    import json
    import os
    from datetime import datetime, timezone

    os.makedirs(path, exist_ok=True)

    artifact_path = f"{path}/feature_pipeline_v{version}.joblib"
    metadata_path = f"{path}/feature_pipeline_v{version}_metadata.json"

    joblib.dump(pipeline, artifact_path)

    metadata = {
        "version": version,
        # Timezone-aware UTC timestamp; datetime.utcnow() is deprecated and
        # returns a naive value that is easy to misread as local time.
        "created_at": datetime.now(timezone.utc).isoformat(),
        "feature_names_in": list(pipeline.feature_names_in_),
        # ColumnTransformer has no ``n_features_out_`` attribute; derive the
        # output width from the generated feature names instead.
        "n_features_out": len(pipeline.get_feature_names_out()),
    }

    with open(metadata_path, 'w', encoding='utf-8') as f:
        json.dump(metadata, f, indent=2)

    return artifact_path
|
|
86
|
-
```
|
|
87
|
-
|
|
88
|
-
### Custom Transformer Pattern
|
|
89
|
-
|
|
90
|
-
```python
|
|
91
|
-
from sklearn.base import BaseEstimator, TransformerMixin
|
|
92
|
-
import numpy as np
|
|
93
|
-
import pandas as pd
|
|
94
|
-
|
|
95
|
-
class DateFeatureExtractor(BaseEstimator, TransformerMixin):
    """Replace a datetime column with calendar features derived from it.

    Every requested feature becomes a new column named
    ``{date_column}_{feature}``. The source column is dropped and all other
    input columns are passed through unchanged.

    Args:
        date_column: Name of the column parsed with ``pd.to_datetime``.
        features: Feature names to derive. Supported names are the keys of
            ``_EXTRACTORS``; unsupported names are skipped. Defaults to
            ``['year', 'month', 'day', 'dayofweek', 'hour']``.
    """

    # One extractor per supported feature, so only the requested features are
    # actually computed in ``transform``.
    _EXTRACTORS = {
        'year': lambda dt: dt.dt.year,
        'month': lambda dt: dt.dt.month,
        'day': lambda dt: dt.dt.day,
        'dayofweek': lambda dt: dt.dt.dayofweek,
        'hour': lambda dt: dt.dt.hour,
        # dayofweek 5 and 6 are Saturday and Sunday.
        'is_weekend': lambda dt: dt.dt.dayofweek.isin([5, 6]).astype(int),
        'quarter': lambda dt: dt.dt.quarter,
    }

    def __init__(self, date_column: str, features: list[str] = None):
        self.date_column = date_column
        self.features = features or ['year', 'month', 'day', 'dayofweek', 'hour']

    def _active_features(self) -> list[str]:
        """Return the requested features that have an extractor, in order."""
        return [name for name in self.features if name in self._EXTRACTORS]

    def fit(self, X: pd.DataFrame, y=None):
        """Do nothing and return ``self``; the transformer learns no state."""
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``X`` with the date column expanded into features."""
        X = X.copy()
        dt = pd.to_datetime(X[self.date_column])

        for feature in self._active_features():
            X[f"{self.date_column}_{feature}"] = self._EXTRACTORS[feature](dt)

        return X.drop(columns=[self.date_column])

    def get_feature_names_out(self, input_features=None):
        """Return output column names consistent with ``transform``.

        Without ``input_features`` only the derived column names are
        returned. With ``input_features`` the pass-through columns (all
        inputs except the date column) come first, followed by the derived
        columns, which mirrors the column order ``transform`` produces.
        """
        derived = [f"{self.date_column}_{f}" for f in self._active_features()]
        if input_features is None:
            return derived

        passthrough = [col for col in input_features if col != self.date_column]
        # A derived name that already exists as an input column is overwritten
        # in place by transform, so it must not be listed twice.
        return passthrough + [name for name in derived if name not in passthrough]
|
|
127
|
-
|
|
128
|
-
class TargetEncoder(BaseEstimator, TransformerMixin):
    """Smoothed target (mean) encoding for high-cardinality categoricals.

    For each category the encoded value is::

        (count * category_mean + smoothing * global_mean) / (count + smoothing)

    so rare categories are pulled toward the global target mean.

    Args:
        columns: Categorical columns to encode.
        smoothing: Weight of the global mean in the formula above.

    Attributes set by ``fit``:
        encodings_: ``{column: {category: encoded value}}``.
        global_mean_: Mean of ``y``; also the fallback for categories that
            were not seen during fit.
    """

    def __init__(self, columns: list[str], smoothing: float = 1.0):
        # Only hyper-parameters live here. Fitted state (trailing underscore)
        # is created in ``fit`` so an unfitted instance does not look fitted.
        self.columns = columns
        self.smoothing = smoothing

    def fit(self, X: pd.DataFrame, y: pd.Series):
        """Learn per-category encodings from ``X`` and the target ``y``.

        ``y`` is grouped by ``X[col]``, so the two are matched by index label.
        """
        global_mean = y.mean()
        encodings: dict = {}

        for col in self.columns:
            stats = y.groupby(X[col]).agg(['mean', 'count'])
            smooth = (stats['count'] * stats['mean'] + self.smoothing * global_mean) / (
                stats['count'] + self.smoothing
            )
            encodings[col] = smooth.to_dict()

        # Assign only after everything succeeded, replacing any state from an
        # earlier fit instead of merging into it.
        self.encodings_ = encodings
        self.global_mean_ = global_mean
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Replace each configured column with ``{col}_encoded``.

        Raises:
            sklearn.exceptions.NotFittedError: If called before ``fit``.
        """
        from sklearn.utils.validation import check_is_fitted

        check_is_fitted(self, ['encodings_', 'global_mean_'])

        X = X.copy()
        for col in self.columns:
            # Unseen categories map to NaN and fall back to the global mean.
            X[f"{col}_encoded"] = X[col].map(self.encodings_[col]).fillna(self.global_mean_)
        return X.drop(columns=self.columns)
|
|
154
|
-
```
|
|
155
|
-
|
|
156
|
-
---
|
|
157
|
-
|
|
158
|
-
## Feature Store with Feast
|
|
159
|
-
|
|
160
|
-
### Feature Store Setup
|
|
161
|
-
|
|
162
|
-
```python
|
|
163
|
-
# feature_store.yaml
|
|
164
|
-
"""
|
|
165
|
-
project: ml_project
|
|
166
|
-
registry: data/registry.db
|
|
167
|
-
provider: local
|
|
168
|
-
online_store:
|
|
169
|
-
type: sqlite
|
|
170
|
-
path: data/online_store.db
|
|
171
|
-
offline_store:
|
|
172
|
-
type: file
|
|
173
|
-
entity_key_serialization_version: 2
|
|
174
|
-
"""
|
|
175
|
-
|
|
176
|
-
# features/user_features.py
|
|
177
|
-
from datetime import timedelta
|
|
178
|
-
from feast import Entity, Feature, FeatureView, FileSource, Field
|
|
179
|
-
from feast.types import Float32, Int64, String
|
|
180
|
-
|
|
181
|
-
# Define entity
|
|
182
|
-
# Entity: the key that feature rows are joined on.
user = Entity(name="user_id", description="User identifier", join_keys=["user_id"])

# Parquet file backing the feature view, with its event and creation
# timestamp columns.
user_stats_source = FileSource(
    path="data/user_stats.parquet",
    timestamp_field="event_timestamp",
    created_timestamp_column="created_timestamp",
)

# Feature view: per-user purchase statistics with a one-day TTL, enabled for
# the online store.
user_stats_fv = FeatureView(
    name="user_stats",
    entities=[user],
    ttl=timedelta(days=1),
    schema=[
        Field(name="total_purchases", dtype=Int64),
        Field(name="avg_purchase_value", dtype=Float32),
        Field(name="days_since_last_purchase", dtype=Int64),
        Field(name="user_segment", dtype=String),
    ],
    source=user_stats_source,
    online=True,
    tags={"team": "ml", "owner": "data-science"},
)
|
|
210
|
-
```
|
|
211
|
-
|
|
212
|
-
### Feature Retrieval Pattern
|
|
213
|
-
|
|
214
|
-
```python
|
|
215
|
-
from feast import FeatureStore
|
|
216
|
-
import pandas as pd
|
|
217
|
-
from datetime import datetime
|
|
218
|
-
|
|
219
|
-
class FeatureService:
    """Thin wrapper around a Feast ``FeatureStore``.

    Exposes the three operations used by this guide: historical retrieval for
    training, online retrieval for inference, and materialization.
    """

    def __init__(self, repo_path: str = "."):
        # repo_path is handed straight to FeatureStore.
        self.store = FeatureStore(repo_path=repo_path)

    def get_training_features(
        self,
        entity_df: pd.DataFrame,
        feature_refs: list[str],
    ) -> pd.DataFrame:
        """Fetch historical features for ``entity_df`` as a DataFrame."""
        retrieval_job = self.store.get_historical_features(
            entity_df=entity_df,
            features=feature_refs,
        )
        return retrieval_job.to_df()

    def get_online_features(
        self,
        entity_rows: list[dict],
        feature_refs: list[str],
    ) -> dict:
        """Fetch features for ``entity_rows`` from the online store as a dict."""
        return self.store.get_online_features(
            entity_rows=entity_rows,
            features=feature_refs,
        ).to_dict()

    def materialize_features(
        self,
        start_date: datetime,
        end_date: datetime,
    ) -> None:
        """Materialize features for ``[start_date, end_date]`` via the store."""
        self.store.materialize(start_date=start_date, end_date=end_date)
|
|
255
|
-
|
|
256
|
-
# Usage
|
|
257
|
-
feature_service = FeatureService()

# Training: request historical features for three users as of one timestamp.
entity_df = pd.DataFrame({
    "user_id": [1, 2, 3],
    "event_timestamp": [datetime(2024, 1, 15)] * 3,
})
training_features = feature_service.get_training_features(
    entity_df=entity_df,
    feature_refs=[
        "user_stats:total_purchases",
        "user_stats:avg_purchase_value",
        "user_stats:days_since_last_purchase",
    ],
)

# Inference: read the current values for a single user from the online store.
online_features = feature_service.get_online_features(
    entity_rows=[{"user_id": 1}],
    feature_refs=["user_stats:total_purchases", "user_stats:avg_purchase_value"],
)
|
|
279
|
-
```
|
|
280
|
-
|
|
281
|
-
---
|
|
282
|
-
|
|
283
|
-
## Data Validation with Great Expectations
|
|
284
|
-
|
|
285
|
-
### Expectation Suite Definition
|
|
286
|
-
|
|
287
|
-
```python
|
|
288
|
-
import great_expectations as gx
|
|
289
|
-
from great_expectations.core import ExpectationSuite
|
|
290
|
-
from great_expectations.checkpoint import Checkpoint
|
|
291
|
-
|
|
292
|
-
def create_feature_expectations(context: gx.DataContext) -> ExpectationSuite:
    """Create the ``feature_validation_suite`` and fill it with expectations.

    The suite checks column existence, null rates, value ranges, uniqueness
    and the mean of ``purchase_amount``.

    NOTE(review): ``context.add_expectation_suite`` and the
    ``gx.expectations.Expect...`` classes look like they come from different
    Great Expectations API generations - confirm against the pinned version.
    """
    suite = context.add_expectation_suite("feature_validation_suite")

    expectations = [
        # Column existence
        gx.expectations.ExpectColumnToExist(column="user_id"),
        gx.expectations.ExpectColumnToExist(column="purchase_amount"),
        # Null checks
        gx.expectations.ExpectColumnValuesToNotBeNull(column="user_id"),
        gx.expectations.ExpectColumnValuesToNotBeNull(
            column="purchase_amount",
            mostly=0.95,  # Allow 5% nulls
        ),
        # Value ranges
        gx.expectations.ExpectColumnValuesToBeBetween(
            column="purchase_amount",
            min_value=0,
            max_value=10000,
        ),
        # Uniqueness
        gx.expectations.ExpectColumnValuesToBeUnique(column="transaction_id"),
        # Distribution checks
        gx.expectations.ExpectColumnMeanToBeBetween(
            column="purchase_amount",
            min_value=50,
            max_value=500,
        ),
    ]

    for expectation in expectations:
        suite.add_expectation(expectation)

    return suite
|
|
340
|
-
|
|
341
|
-
def validate_features(
    df: pd.DataFrame,
    context: gx.DataContext,
    suite_name: str,
) -> dict:
    """Validate ``df`` against an expectation suite and summarize the outcome.

    Args:
        df: In-memory frame to validate.
        context: Great Expectations data context holding the suite.
        suite_name: Name of the expectation suite to run.

    Returns:
        A dict with ``success`` (overall flag), ``statistics`` (from the first
        run's validation result) and ``results`` (the full checkpoint result
        as a JSON-compatible dict).
    """
    # add_or_update keeps the function re-runnable: plain add_pandas raises
    # once a datasource called "runtime_source" is already registered.
    datasource = context.sources.add_or_update_pandas("runtime_source")
    data_asset = datasource.add_dataframe_asset("runtime_asset")
    batch_request = data_asset.build_batch_request(dataframe=df)

    checkpoint = context.add_or_update_checkpoint(
        name="feature_checkpoint",
        validations=[
            {
                "batch_request": batch_request,
                "expectation_suite_name": suite_name,
            }
        ],
    )

    result = checkpoint.run()

    # Only one validation is configured, so the first run result is the one
    # that belongs to ``df``.
    first_run = next(iter(result.run_results.values()))

    return {
        "success": result.success,
        "statistics": first_run["validation_result"].statistics,
        "results": result.to_json_dict(),
    }
|
|
369
|
-
```
|
|
370
|
-
|
|
371
|
-
### Data Drift Detection
|
|
372
|
-
|
|
373
|
-
```python
|
|
374
|
-
from scipy import stats
|
|
375
|
-
import numpy as np
|
|
376
|
-
from dataclasses import dataclass
|
|
377
|
-
|
|
378
|
-
@dataclass
class DriftResult:
    """Outcome of one drift test on one feature."""

    feature: str  # column the test was run on
    drift_detected: bool
    statistic: float  # KS statistic or PSI value, depending on ``method``
    p_value: float  # NaN when the method yields no p-value (PSI)
    method: str  # 'ks_test' or 'psi'


class FeatureDriftDetector:
    """Detect distribution drift between a reference frame and current data.

    Call ``fit`` once on reference data, then ``detect_drift`` (two-sample
    Kolmogorov-Smirnov test) or ``detect_drift_psi`` (Population Stability
    Index) on new data.
    """

    # Floor applied to bin proportions so the PSI logarithm never sees zero.
    _PSI_EPSILON = 0.0001

    def __init__(self, significance_level: float = 0.05):
        self.significance_level = significance_level
        self.reference_stats: dict = {}

    def fit(self, reference_df: pd.DataFrame, features: list[str]) -> None:
        """Store mean, std and the non-null values of each reference feature."""
        for feature in features:
            column = reference_df[feature]
            self.reference_stats[feature] = {
                'mean': column.mean(),
                'std': column.std(),
                'values': column.dropna().values,
            }

    def detect_drift(
        self,
        current_df: pd.DataFrame,
        features: list[str],
    ) -> list[DriftResult]:
        """Run a two-sample KS test per feature.

        Features that were not passed to ``fit`` are skipped. If either the
        reference or the current sample has no non-null values the test
        cannot be computed; the feature is then reported with NaN statistic
        and p-value and ``drift_detected=False`` instead of raising.

        Returns:
            One ``DriftResult`` per tested feature, in input order. Drift is
            flagged when the p-value is below ``significance_level``.
        """
        results = []

        for feature in features:
            if feature not in self.reference_stats:
                continue

            reference_values = self.reference_stats[feature]['values']
            current_values = current_df[feature].dropna().values

            if len(reference_values) == 0 or len(current_values) == 0:
                # ks_2samp rejects empty samples.
                statistic, p_value = float('nan'), float('nan')
            else:
                statistic, p_value = stats.ks_2samp(reference_values, current_values)

            # Cast to builtin types so results serialize and compare cleanly.
            results.append(DriftResult(
                feature=feature,
                drift_detected=bool(p_value < self.significance_level),
                statistic=float(statistic),
                p_value=float(p_value),
                method='ks_test',
            ))

        return results

    def detect_drift_psi(
        self,
        current_df: pd.DataFrame,
        feature: str,
        bins: int = 10,
        threshold: float = 0.2,
    ) -> DriftResult:
        """Detect drift for one feature using the Population Stability Index.

        Bin edges are the reference percentiles, with the outer edges opened
        to +/- infinity so out-of-range current values are still counted.

        Args:
            current_df: Frame holding the current values of ``feature``.
            feature: Feature name; must have been passed to ``fit``.
            bins: Number of percentile bins requested.
            threshold: PSI above this value is reported as drift.

        Returns:
            A ``DriftResult`` with ``p_value`` NaN. If either sample has no
            non-null values the statistic is NaN and no drift is reported.

        Raises:
            KeyError: If ``feature`` was not fitted or is absent from
                ``current_df``.
        """
        reference = self.reference_stats[feature]['values']
        current = current_df[feature].dropna().values

        if len(reference) == 0 or len(current) == 0:
            # Proportions are undefined for an empty sample.
            return DriftResult(
                feature=feature,
                drift_detected=False,
                statistic=float('nan'),
                p_value=float('nan'),
                method='psi',
            )

        # Create bins from the reference distribution. Repeated percentiles
        # (discrete or heavily tied data) are collapsed so no zero-width bins
        # are produced.
        percentiles = np.percentile(reference, np.linspace(0, 100, bins + 1))
        inner_edges = np.unique(percentiles[1:-1])
        bin_edges = np.concatenate(([-np.inf], inner_edges, [np.inf]))

        ref_counts = np.histogram(reference, bins=bin_edges)[0] / len(reference)
        cur_counts = np.histogram(current, bins=bin_edges)[0] / len(current)

        # Avoid log(0)
        ref_counts = np.clip(ref_counts, self._PSI_EPSILON, None)
        cur_counts = np.clip(cur_counts, self._PSI_EPSILON, None)

        psi = float(np.sum((cur_counts - ref_counts) * np.log(cur_counts / ref_counts)))

        return DriftResult(
            feature=feature,
            drift_detected=psi > threshold,
            statistic=psi,
            p_value=float('nan'),
            method='psi',
        )
|
|
460
|
-
```
|
|
461
|
-
|
|
462
|
-
---
|
|
463
|
-
|
|
464
|
-
## Feature Pipeline Integration
|
|
465
|
-
|
|
466
|
-
### Complete Feature Pipeline
|
|
467
|
-
|
|
468
|
-
```python
|
|
469
|
-
from typing import Protocol
|
|
470
|
-
from abc import abstractmethod
|
|
471
|
-
import logging
|
|
472
|
-
|
|
473
|
-
logger = logging.getLogger(__name__)


class FeatureTransformer(Protocol):
    """Structural interface every pipeline step must provide."""

    @abstractmethod
    def fit(self, X: pd.DataFrame, y: pd.Series = None) -> "FeatureTransformer": ...

    @abstractmethod
    def transform(self, X: pd.DataFrame) -> pd.DataFrame: ...


class FeaturePipeline:
    """Sequential feature pipeline with optional drift validation.

    Transformers are applied in list order, each one receiving the output of
    the previous one. If a validator is supplied it is fitted on the numeric
    columns of the final training output and consulted again in ``transform``.

    Args:
        transformers: ``(name, transformer)`` pairs; the name is only used in
            log messages.
        validator: Optional drift detector exposing ``fit(df, features)`` and
            ``detect_drift(df, features)``.
        feature_store: Optional feature service; stored on the instance but
            not used by any method of this class and not persisted by ``save``.
    """

    # Annotations naming classes defined elsewhere are quoted so this class
    # can be defined without those classes being importable.
    def __init__(
        self,
        transformers: "list[tuple[str, FeatureTransformer]]",
        validator: "FeatureDriftDetector" = None,
        feature_store: "FeatureService" = None,
    ):
        self.transformers = transformers
        self.validator = validator
        self.feature_store = feature_store
        self.is_fitted = False

    def fit(self, X: pd.DataFrame, y: pd.Series = None) -> "FeaturePipeline":
        """Fit every transformer in order, then the validator if present.

        ``X`` itself is not modified; work happens on a copy.
        """
        X_current = X.copy()

        for name, transformer in self.transformers:
            logger.info("Fitting transformer: %s", name)
            transformer.fit(X_current, y)
            # Later steps are fitted on the output of the earlier ones.
            X_current = transformer.transform(X_current)

        if self.validator:
            numeric_cols = X_current.select_dtypes(include=[np.number]).columns.tolist()
            self.validator.fit(X_current, numeric_cols)

        self.is_fitted = True
        return self

    def transform(
        self,
        X: pd.DataFrame,
        validate: bool = True,
    ) -> "tuple[pd.DataFrame, list[DriftResult]]":
        """Apply all transformers and optionally check the output for drift.

        Args:
            X: Input frame; not modified.
            validate: When true and a validator is configured, run drift
                detection on the numeric output columns and log a warning
                listing any drifted features.

        Returns:
            ``(transformed_frame, drift_results)``; the list is empty when
            validation is skipped.

        Raises:
            ValueError: If the pipeline has not been fitted.
        """
        if not self.is_fitted:
            raise ValueError("Pipeline must be fitted before transform")

        X_current = X.copy()

        for name, transformer in self.transformers:
            logger.info("Applying transformer: %s", name)
            X_current = transformer.transform(X_current)

        drift_results = []
        if validate and self.validator:
            numeric_cols = X_current.select_dtypes(include=[np.number]).columns.tolist()
            drift_results = self.validator.detect_drift(X_current, numeric_cols)

            drifted = [r.feature for r in drift_results if r.drift_detected]
            if drifted:
                logger.warning("Drift detected in features: %s", drifted)

        return X_current, drift_results

    def save(self, path: str) -> None:
        """Pickle transformers, validator and fitted flag into ``path``.

        The file is written as ``{path}/feature_pipeline.pkl``.
        """
        import pickle

        state = {
            'transformers': self.transformers,
            'validator': self.validator,
            'is_fitted': self.is_fitted,
        }
        with open(f"{path}/feature_pipeline.pkl", 'wb') as f:
            pickle.dump(state, f)

    @classmethod
    def load(cls, path: str) -> "FeaturePipeline":
        """Rebuild a pipeline from ``{path}/feature_pipeline.pkl``.

        The feature store is not part of the saved state, so the returned
        pipeline has ``feature_store=None``.
        """
        import pickle

        # SECURITY: pickle.load can execute arbitrary code. Only load
        # artifacts from locations you control and trust.
        with open(f"{path}/feature_pipeline.pkl", 'rb') as f:
            data = pickle.load(f)

        pipeline = cls(
            transformers=data['transformers'],
            validator=data['validator'],
        )
        pipeline.is_fitted = data['is_fitted']
        return pipeline
|
|
565
|
-
```
|
|
566
|
-
|
|
567
|
-
---
|
|
568
|
-
|
|
569
|
-
## Best Practices
|
|
570
|
-
|
|
571
|
-
### Feature Naming Conventions
|
|
572
|
-
|
|
573
|
-
```python
|
|
574
|
-
# Good: descriptive, includes transformation info
|
|
575
|
-
"user_total_purchases_30d"
|
|
576
|
-
"product_price_log_scaled"
|
|
577
|
-
"category_onehot_electronics"
|
|
578
|
-
|
|
579
|
-
# Bad: ambiguous, no context
|
|
580
|
-
"feature_1"
|
|
581
|
-
"x_transformed"
|
|
582
|
-
"col"
|
|
583
|
-
```
|
|
584
|
-
|
|
585
|
-
### Feature Documentation
|
|
586
|
-
|
|
587
|
-
```python
|
|
588
|
-
from dataclasses import dataclass
|
|
589
|
-
from typing import Optional
|
|
590
|
-
|
|
591
|
-
@dataclass
class FeatureMetadata:
    """Registry record describing a single feature."""
    name: str
    description: str
    dtype: str
    source_table: str
    transformation: str
    owner: str
    created_at: str
    tags: list[str]
    dependencies: list[str]
    freshness_sla: Optional[str] = None

    def to_dict(self) -> dict:
        """Return the record as a plain dict keyed by field name, in field order.

        Uses ``dataclasses.asdict`` so the mapping always matches the declared
        fields, and so list fields are copied: mutating the returned dict does
        not modify this instance.
        """
        from dataclasses import asdict

        return asdict(self)
|
|
618
|
-
```
|
|
619
|
-
|
|
620
|
-
---
|
|
621
|
-
|
|
622
|
-
## Related References
|
|
623
|
-
|
|
624
|
-
- `training-pipelines.md` - Using features in training workflows
|
|
625
|
-
- `experiment-tracking.md` - Logging feature importance and metadata
|
|
626
|
-
- `model-validation.md` - Validating model performance on feature sets
|
|
627
|
-
|
|
628
|
-
## Cross-Reference Skills
|
|
629
|
-
|
|
630
|
-
- **Pandas Pro** - DataFrame operations for feature engineering
|
|
631
|
-
- **Data Engineer** - Data pipeline integration for feature computation
|
|
1
|
+
# Feature Engineering
|
|
2
|
+
|
|
3
|
+
---
|
|
4
|
+
|
|
5
|
+
## Overview
|
|
6
|
+
|
|
7
|
+
Feature engineering transforms raw data into features that improve model performance. Production systems require reproducible transformations, feature versioning, and online/offline consistency through feature stores.
|
|
8
|
+
|
|
9
|
+
## When to Use This Reference
|
|
10
|
+
|
|
11
|
+
- Building feature transformation pipelines
|
|
12
|
+
- Implementing feature stores (Feast, Tecton, custom)
|
|
13
|
+
- Creating data validation workflows
|
|
14
|
+
- Designing feature schemas and registries
|
|
15
|
+
- Handling feature drift and monitoring
|
|
16
|
+
|
|
17
|
+
## When NOT to Use
|
|
18
|
+
|
|
19
|
+
- Simple ad-hoc feature creation (use pandas directly)
|
|
20
|
+
- One-time exploratory analysis
|
|
21
|
+
- Prototyping with small datasets
|
|
22
|
+
|
|
23
|
+
---
|
|
24
|
+
|
|
25
|
+
## Feature Transformation Pipelines
|
|
26
|
+
|
|
27
|
+
### Scikit-learn Pipeline Pattern
|
|
28
|
+
|
|
29
|
+
```python
|
|
30
|
+
from sklearn.pipeline import Pipeline
|
|
31
|
+
from sklearn.compose import ColumnTransformer
|
|
32
|
+
from sklearn.preprocessing import StandardScaler, OneHotEncoder
|
|
33
|
+
from sklearn.impute import SimpleImputer
|
|
34
|
+
import joblib
|
|
35
|
+
|
|
36
|
+
def create_feature_pipeline(
    numeric_features: list[str],
    categorical_features: list[str],
) -> ColumnTransformer:
    """Build an unfitted preprocessor for numeric and categorical columns.

    Numeric columns are median-imputed and then standard-scaled. Categorical
    columns get the constant ``'missing'`` for nulls and are one-hot encoded
    with unknown categories ignored. Columns in neither list are dropped.

    Args:
        numeric_features: Names of the numeric input columns.
        categorical_features: Names of the categorical input columns.

    Returns:
        The configured ``ColumnTransformer``.
    """
    impute_then_scale = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ])
    impute_then_encode = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
    ])

    column_steps = [
        ('num', impute_then_scale, numeric_features),
        ('cat', impute_then_encode, categorical_features),
    ]
    return ColumnTransformer(
        transformers=column_steps,
        remainder='drop',
        verbose_feature_names_out=False,
    )
|
|
62
|
+
|
|
63
|
+
# Usage with versioning
|
|
64
|
+
def save_pipeline(pipeline: ColumnTransformer, version: str, path: str) -> str:
    """Save a fitted pipeline together with a JSON metadata sidecar.

    Writes ``feature_pipeline_v{version}.joblib`` and
    ``feature_pipeline_v{version}_metadata.json`` into ``path``.

    Args:
        pipeline: Fitted transformer. It must expose ``feature_names_in_``,
            which scikit-learn sets only when fitted on input with string
            column names.
        version: Version label used in both file names and in the metadata.
        path: Directory to write into.

    Returns:
        Path of the joblib artifact.
    """
    import json
    from datetime import datetime, timezone

    artifact_path = f"{path}/feature_pipeline_v{version}.joblib"
    metadata_path = f"{path}/feature_pipeline_v{version}_metadata.json"

    # Build the metadata first: if the pipeline is unfitted this raises before
    # any artifact is written, so no artifact is left without its sidecar.
    metadata = {
        "version": version,
        # Timezone-aware UTC timestamp; datetime.utcnow() is deprecated.
        "created_at": datetime.now(timezone.utc).isoformat(),
        "feature_names_in": list(pipeline.feature_names_in_),
        # ColumnTransformer has no n_features_out_ attribute, so count the
        # output feature names instead.
        "n_features_out": len(pipeline.get_feature_names_out()),
    }

    joblib.dump(pipeline, artifact_path)

    with open(metadata_path, 'w', encoding='utf-8') as f:
        json.dump(metadata, f, indent=2)

    return artifact_path
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
### Custom Transformer Pattern
|
|
89
|
+
|
|
90
|
+
```python
|
|
91
|
+
from sklearn.base import BaseEstimator, TransformerMixin
|
|
92
|
+
import numpy as np
|
|
93
|
+
import pandas as pd
|
|
94
|
+
|
|
95
|
+
class DateFeatureExtractor(BaseEstimator, TransformerMixin):
    """Replace a datetime column with calendar features derived from it.

    Each requested feature becomes a column named ``{date_column}_{feature}``
    and the source column is dropped. Requested names that are not supported
    are ignored, both when transforming and when reporting output names.
    """

    # Supported feature name -> function deriving it from a datetime Series.
    _EXTRACTORS = {
        'year': lambda dt: dt.dt.year,
        'month': lambda dt: dt.dt.month,
        'day': lambda dt: dt.dt.day,
        'dayofweek': lambda dt: dt.dt.dayofweek,
        'hour': lambda dt: dt.dt.hour,
        'is_weekend': lambda dt: dt.dt.dayofweek.isin([5, 6]).astype(int),
        'quarter': lambda dt: dt.dt.quarter,
    }

    def __init__(self, date_column: str, features: list[str] = None):
        self.date_column = date_column
        self.features = features or ['year', 'month', 'day', 'dayofweek', 'hour']

    def _supported_features(self) -> list[str]:
        """Return the requested features that can be extracted, in order, without duplicates."""
        return [f for f in dict.fromkeys(self.features) if f in self._EXTRACTORS]

    def fit(self, X: pd.DataFrame, y=None):
        """Do nothing and return ``self``; extraction needs no learned state."""
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``X`` with the date column replaced by its features."""
        X = X.copy()
        dt = pd.to_datetime(X[self.date_column])

        for feature in self._supported_features():
            X[f"{self.date_column}_{feature}"] = self._EXTRACTORS[feature](dt)

        return X.drop(columns=[self.date_column])

    def get_feature_names_out(self, input_features=None):
        """Return output column names consistent with ``transform``.

        Args:
            input_features: Optional input column names. When given, the
                result is those columns minus the date column, followed by
                the derived feature names. When omitted, only the derived
                feature names are returned.
        """
        derived = [f"{self.date_column}_{f}" for f in self._supported_features()]
        if input_features is None:
            return derived

        kept = [col for col in input_features if col != self.date_column]
        # A derived name that collides with an existing column overwrites it
        # in place during transform, so it must not be listed twice.
        return kept + [name for name in derived if name not in kept]
|
|
127
|
+
|
|
128
|
+
class TargetEncoder(BaseEstimator, TransformerMixin):
    """Target encoding for high-cardinality categorical features.

    Every category is replaced by a smoothed mean of the target:
    ``(count * category_mean + smoothing * global_mean) / (count + smoothing)``.
    Categories not seen during ``fit`` are encoded as the global mean.
    """

    def __init__(self, columns: list[str], smoothing: float = 1.0):
        self.columns = columns
        self.smoothing = smoothing
        self.encodings_: dict = {}
        self.global_mean_: float = None  # set by fit()

    def fit(self, X: pd.DataFrame, y: pd.Series):
        """Learn the per-category encodings for ``self.columns``.

        Args:
            X: Frame containing the columns to encode.
            y: Target values, grouped by each column of ``X``.

        Returns:
            ``self``.
        """
        global_mean = y.mean()

        # Build into a fresh dict and publish it only at the end. A refit then
        # never keeps encodings for columns that are no longer configured, and
        # a failure part-way through leaves the previous state untouched.
        encodings = {}
        for col in self.columns:
            stats = y.groupby(X[col]).agg(['mean', 'count'])
            smooth = (stats['count'] * stats['mean'] + self.smoothing * global_mean) / (
                stats['count'] + self.smoothing
            )
            encodings[col] = smooth.to_dict()

        self.global_mean_ = global_mean
        self.encodings_ = encodings
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``X`` with each configured column replaced by ``{col}_encoded``."""
        X = X.copy()
        for col in self.columns:
            # Unseen categories map to NaN and fall back to the global mean.
            X[f"{col}_encoded"] = X[col].map(self.encodings_[col]).fillna(self.global_mean_)
        return X.drop(columns=self.columns)
|
|
154
|
+
```
|
|
155
|
+
|
|
156
|
+
---
|
|
157
|
+
|
|
158
|
+
## Feature Store with Feast
|
|
159
|
+
|
|
160
|
+
### Feature Store Setup
|
|
161
|
+
|
|
162
|
+
```python
|
|
163
|
+
# feature_store.yaml
|
|
164
|
+
"""
|
|
165
|
+
project: ml_project
|
|
166
|
+
registry: data/registry.db
|
|
167
|
+
provider: local
|
|
168
|
+
online_store:
|
|
169
|
+
type: sqlite
|
|
170
|
+
path: data/online_store.db
|
|
171
|
+
offline_store:
|
|
172
|
+
type: file
|
|
173
|
+
entity_key_serialization_version: 2
|
|
174
|
+
"""
|
|
175
|
+
|
|
176
|
+
# features/user_features.py
|
|
177
|
+
from datetime import timedelta
|
|
178
|
+
from feast import Entity, Feature, FeatureView, FileSource, Field
|
|
179
|
+
from feast.types import Float32, Int64, String
|
|
180
|
+
|
|
181
|
+
# Define entity
|
|
182
|
+
user = Entity(
|
|
183
|
+
name="user_id",
|
|
184
|
+
description="User identifier",
|
|
185
|
+
join_keys=["user_id"],
|
|
186
|
+
)
|
|
187
|
+
|
|
188
|
+
# Define data source
|
|
189
|
+
user_stats_source = FileSource(
|
|
190
|
+
path="data/user_stats.parquet",
|
|
191
|
+
timestamp_field="event_timestamp",
|
|
192
|
+
created_timestamp_column="created_timestamp",
|
|
193
|
+
)
|
|
194
|
+
|
|
195
|
+
# Define feature view
|
|
196
|
+
user_stats_fv = FeatureView(
|
|
197
|
+
name="user_stats",
|
|
198
|
+
entities=[user],
|
|
199
|
+
ttl=timedelta(days=1),
|
|
200
|
+
schema=[
|
|
201
|
+
Field(name="total_purchases", dtype=Int64),
|
|
202
|
+
Field(name="avg_purchase_value", dtype=Float32),
|
|
203
|
+
Field(name="days_since_last_purchase", dtype=Int64),
|
|
204
|
+
Field(name="user_segment", dtype=String),
|
|
205
|
+
],
|
|
206
|
+
source=user_stats_source,
|
|
207
|
+
online=True,
|
|
208
|
+
tags={"team": "ml", "owner": "data-science"},
|
|
209
|
+
)
|
|
210
|
+
```
|
|
211
|
+
|
|
212
|
+
### Feature Retrieval Pattern
|
|
213
|
+
|
|
214
|
+
```python
|
|
215
|
+
from feast import FeatureStore
|
|
216
|
+
import pandas as pd
|
|
217
|
+
from datetime import datetime
|
|
218
|
+
|
|
219
|
+
class FeatureService:
    """Thin wrapper around a Feast ``FeatureStore`` for training and serving."""

    def __init__(self, repo_path: str = "."):
        self.store = FeatureStore(repo_path=repo_path)

    def get_training_features(
        self,
        entity_df: pd.DataFrame,
        feature_refs: list[str],
    ) -> pd.DataFrame:
        """Fetch historical features for ``entity_df`` and return them as a DataFrame."""
        retrieval_job = self.store.get_historical_features(
            entity_df=entity_df,
            features=feature_refs,
        )
        return retrieval_job.to_df()

    def get_online_features(
        self,
        entity_rows: list[dict],
        feature_refs: list[str],
    ) -> dict:
        """Fetch online features for ``entity_rows`` and return them as a dict."""
        return self.store.get_online_features(
            entity_rows=entity_rows,
            features=feature_refs,
        ).to_dict()

    def materialize_features(
        self,
        start_date: datetime,
        end_date: datetime,
    ) -> None:
        """Ask the store to materialize features between the two timestamps."""
        self.store.materialize(start_date=start_date, end_date=end_date)
|
|
255
|
+
|
|
256
|
+
# Usage
|
|
257
|
+
feature_service = FeatureService()
|
|
258
|
+
|
|
259
|
+
# Training: historical features
|
|
260
|
+
entity_df = pd.DataFrame({
|
|
261
|
+
"user_id": [1, 2, 3],
|
|
262
|
+
"event_timestamp": [datetime(2024, 1, 15)] * 3,
|
|
263
|
+
})
|
|
264
|
+
|
|
265
|
+
training_features = feature_service.get_training_features(
|
|
266
|
+
entity_df=entity_df,
|
|
267
|
+
feature_refs=[
|
|
268
|
+
"user_stats:total_purchases",
|
|
269
|
+
"user_stats:avg_purchase_value",
|
|
270
|
+
"user_stats:days_since_last_purchase",
|
|
271
|
+
],
|
|
272
|
+
)
|
|
273
|
+
|
|
274
|
+
# Inference: online features
|
|
275
|
+
online_features = feature_service.get_online_features(
|
|
276
|
+
entity_rows=[{"user_id": 1}],
|
|
277
|
+
feature_refs=["user_stats:total_purchases", "user_stats:avg_purchase_value"],
|
|
278
|
+
)
|
|
279
|
+
```
|
|
280
|
+
|
|
281
|
+
---
|
|
282
|
+
|
|
283
|
+
## Data Validation with Great Expectations
|
|
284
|
+
|
|
285
|
+
### Expectation Suite Definition
|
|
286
|
+
|
|
287
|
+
```python
|
|
288
|
+
import great_expectations as gx
|
|
289
|
+
from great_expectations.core import ExpectationSuite
|
|
290
|
+
from great_expectations.checkpoint import Checkpoint
|
|
291
|
+
|
|
292
|
+
def create_feature_expectations(context: gx.DataContext) -> ExpectationSuite:
    """Create ``feature_validation_suite`` and register its expectations.

    Expectations are added in the order listed below: column existence, null
    checks, value range, uniqueness, then the distribution (mean) check.

    Args:
        context: Data context used to create the suite.

    Returns:
        The populated expectation suite.
    """
    suite = context.add_expectation_suite("feature_validation_suite")

    # (expectation class name, keyword arguments). Each class is looked up and
    # instantiated inside the loop, immediately before it is added.
    specs = [
        # Column existence
        ("ExpectColumnToExist", {"column": "user_id"}),
        ("ExpectColumnToExist", {"column": "purchase_amount"}),
        # Null checks
        ("ExpectColumnValuesToNotBeNull", {"column": "user_id"}),
        ("ExpectColumnValuesToNotBeNull", {
            "column": "purchase_amount",
            "mostly": 0.95,  # Allow 5% nulls
        }),
        # Value ranges
        ("ExpectColumnValuesToBeBetween", {
            "column": "purchase_amount",
            "min_value": 0,
            "max_value": 10000,
        }),
        # Uniqueness
        ("ExpectColumnValuesToBeUnique", {"column": "transaction_id"}),
        # Distribution checks
        ("ExpectColumnMeanToBeBetween", {
            "column": "purchase_amount",
            "min_value": 50,
            "max_value": 500,
        }),
    ]

    for class_name, kwargs in specs:
        expectation_cls = getattr(gx.expectations, class_name)
        suite.add_expectation(expectation_cls(**kwargs))

    return suite
|
|
340
|
+
|
|
341
|
+
def validate_features(
    df: pd.DataFrame,
    context: gx.DataContext,
    suite_name: str,
) -> dict:
    """Validate ``df`` against the named suite through a checkpoint.

    Args:
        df: Frame to validate.
        context: Data context that owns the suite and the checkpoint.
        suite_name: Name of the expectation suite to apply.

    Returns:
        Dict with ``success``, the ``statistics`` of the first run result, and
        the full checkpoint result under ``results``.
    """
    # Register the frame as an in-memory asset and build a batch request for it.
    pandas_source = context.sources.add_pandas("runtime_source")
    frame_asset = pandas_source.add_dataframe_asset("runtime_asset")
    batch_request = frame_asset.build_batch_request(dataframe=df)

    validation = {
        "batch_request": batch_request,
        "expectation_suite_name": suite_name,
    }
    checkpoint = context.add_or_update_checkpoint(
        name="feature_checkpoint",
        validations=[validation],
    )

    result = checkpoint.run()

    success = result.success
    # Statistics come from the first entry of run_results.
    run_keys = list(result.run_results.keys())
    first_run = result.run_results[run_keys[0]]
    statistics = first_run.get("validation_result").statistics

    return {
        "success": success,
        "statistics": statistics,
        "results": result.to_json_dict(),
    }
|
|
369
|
+
```
|
|
370
|
+
|
|
371
|
+
### Data Drift Detection
|
|
372
|
+
|
|
373
|
+
```python
|
|
374
|
+
from scipy import stats
|
|
375
|
+
import numpy as np
|
|
376
|
+
from dataclasses import dataclass
|
|
377
|
+
|
|
378
|
+
@dataclass
class DriftResult:
    """Outcome of one drift check on one feature."""
    feature: str          # name of the column that was checked
    drift_detected: bool  # True when the check's decision rule fired
    statistic: float      # KS statistic or PSI value; NaN when not computable
    p_value: float        # KS p-value; NaN for PSI or when not computable
    method: str           # 'ks_test' or 'psi'


class FeatureDriftDetector:
    """Detect distribution drift in features against a stored reference sample."""

    def __init__(self, significance_level: float = 0.05):
        self.significance_level = significance_level
        self.reference_stats: dict = {}

    def fit(self, reference_df: pd.DataFrame, features: list[str]) -> None:
        """Store mean, std and the non-null values of each reference feature.

        Entries are added to (or overwritten in) ``reference_stats``; features
        stored by an earlier call and not listed here are kept.
        """
        for feature in features:
            column = reference_df[feature]
            self.reference_stats[feature] = {
                'mean': column.mean(),
                'std': column.std(),
                'values': column.dropna().values,
            }

    def detect_drift(
        self,
        current_df: pd.DataFrame,
        features: list[str],
    ) -> list[DriftResult]:
        """Run a two-sample KS test per feature against the reference values.

        Features without stored reference statistics, or whose reference
        sample has no non-null values, are skipped. A feature whose current
        values are all null cannot be tested; it is reported as drifted with
        NaN statistic and p-value instead of raising.

        Returns:
            One ``DriftResult`` per evaluated feature, in input order.
        """
        results = []

        for feature in features:
            if feature not in self.reference_stats:
                continue

            reference_values = self.reference_stats[feature]['values']
            if len(reference_values) == 0:
                # No baseline to compare against; treat like an unfitted feature.
                continue

            current_values = current_df[feature].dropna().values
            if len(current_values) == 0:
                # ks_2samp rejects empty samples; a fully-null feature is
                # surfaced as drift rather than crashing the whole check.
                results.append(DriftResult(
                    feature=feature,
                    drift_detected=True,
                    statistic=np.nan,
                    p_value=np.nan,
                    method='ks_test',
                ))
                continue

            statistic, p_value = stats.ks_2samp(reference_values, current_values)

            # Convert numpy scalars so the result holds plain bool/float values.
            results.append(DriftResult(
                feature=feature,
                drift_detected=bool(p_value < self.significance_level),
                statistic=float(statistic),
                p_value=float(p_value),
                method='ks_test',
            ))

        return results

    def detect_drift_psi(
        self,
        current_df: pd.DataFrame,
        feature: str,
        bins: int = 10,
        threshold: float = 0.2,
    ) -> DriftResult:
        """Detect drift using the Population Stability Index.

        Args:
            current_df: Frame holding the current values of ``feature``.
            feature: Feature name; must have been passed to ``fit``.
            bins: Number of percentile bins derived from the reference sample.
            threshold: PSI value above which drift is reported.

        Returns:
            A ``DriftResult`` with ``method='psi'`` and NaN ``p_value``. When
            the current sample has no non-null values the PSI is undefined;
            the result then reports drift with a NaN statistic.

        Raises:
            KeyError: If ``feature`` has no stored reference statistics.
            ValueError: If the stored reference sample is empty.
        """
        reference = self.reference_stats[feature]['values']
        if len(reference) == 0:
            raise ValueError(
                f"No non-null reference values stored for feature {feature!r}"
            )

        current = current_df[feature].dropna().values
        if len(current) == 0:
            # Avoid dividing by zero below.
            return DriftResult(
                feature=feature,
                drift_detected=True,
                statistic=np.nan,
                p_value=np.nan,
                method='psi',
            )

        # Create bins from the reference distribution; open-ended outer bins
        # catch current values outside the reference range.
        bin_edges = np.percentile(reference, np.linspace(0, 100, bins + 1))
        bin_edges[0] = -np.inf
        bin_edges[-1] = np.inf

        ref_counts = np.histogram(reference, bins=bin_edges)[0] / len(reference)
        cur_counts = np.histogram(current, bins=bin_edges)[0] / len(current)

        # Avoid log(0)
        ref_counts = np.clip(ref_counts, 0.0001, None)
        cur_counts = np.clip(cur_counts, 0.0001, None)

        psi = float(np.sum((cur_counts - ref_counts) * np.log(cur_counts / ref_counts)))

        return DriftResult(
            feature=feature,
            drift_detected=psi > threshold,
            statistic=psi,
            p_value=np.nan,
            method='psi',
        )
|
|
460
|
+
```
|
|
461
|
+
|
|
462
|
+
---
|
|
463
|
+
|
|
464
|
+
## Feature Pipeline Integration
|
|
465
|
+
|
|
466
|
+
### Complete Feature Pipeline
|
|
467
|
+
|
|
468
|
+
```python
|
|
469
|
+
from typing import Protocol
|
|
470
|
+
from abc import abstractmethod
|
|
471
|
+
import logging
|
|
472
|
+
|
|
473
|
+
logger = logging.getLogger(__name__)
|
|
474
|
+
|
|
475
|
+
class FeatureTransformer(Protocol):
    """Structural interface for pipeline steps: anything with ``fit`` and ``transform``."""

    @abstractmethod
    def fit(self, X: pd.DataFrame, y: pd.Series = None) -> "FeatureTransformer":
        """Learn state from ``X`` (and optionally ``y``) and return the transformer."""

    @abstractmethod
    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Return the transformed version of ``X``."""
|
|
483
|
+
|
|
484
|
+
class FeaturePipeline:
    """Production feature pipeline with validation and monitoring.

    Named transformers are applied in order. When a validator is supplied, it
    is fitted on the numeric columns of the transformed training frame and
    later used to check transformed frames for drift.
    """

    def __init__(
        self,
        transformers: list[tuple[str, FeatureTransformer]],
        validator: "FeatureDriftDetector | None" = None,
        feature_store: "FeatureService | None" = None,
    ):
        self.transformers = transformers
        self.validator = validator
        self.feature_store = feature_store
        self.is_fitted = False

    @staticmethod
    def _numeric_columns(frame: pd.DataFrame) -> list[str]:
        """Return the names of the numeric columns, the ones drift checks cover."""
        return frame.select_dtypes(include=[np.number]).columns.tolist()

    def fit(self, X: pd.DataFrame, y: pd.Series = None) -> "FeaturePipeline":
        """Fit each transformer on the output of the previous one.

        Args:
            X: Training frame; a copy is used, so the caller's object is not
                modified here.
            y: Optional target passed to every transformer's ``fit``.

        Returns:
            ``self``.
        """
        X_current = X.copy()

        for name, transformer in self.transformers:
            logger.info("Fitting transformer: %s", name)
            transformer.fit(X_current, y)
            X_current = transformer.transform(X_current)

        if self.validator is not None:
            # The reference distribution is taken after all transformations.
            self.validator.fit(X_current, self._numeric_columns(X_current))

        self.is_fitted = True
        return self

    def transform(
        self,
        X: pd.DataFrame,
        validate: bool = True,
    ) -> tuple[pd.DataFrame, list[DriftResult]]:
        """Transform features with optional drift validation.

        Args:
            X: Input frame; a copy is transformed.
            validate: Run drift detection when a validator is configured.

        Returns:
            ``(transformed_frame, drift_results)``; the list is empty when
            validation is skipped.

        Raises:
            ValueError: If the pipeline has not been fitted.
        """
        if not self.is_fitted:
            raise ValueError("Pipeline must be fitted before transform")

        X_current = X.copy()

        for name, transformer in self.transformers:
            logger.info("Applying transformer: %s", name)
            X_current = transformer.transform(X_current)

        drift_results = []
        if validate and self.validator is not None:
            drift_results = self.validator.detect_drift(
                X_current, self._numeric_columns(X_current)
            )

            drifted = [r.feature for r in drift_results if r.drift_detected]
            if drifted:
                logger.warning("Drift detected in features: %s", drifted)

        return X_current, drift_results

    def save(self, path: str) -> None:
        """Pickle the pipeline state to ``<path>/feature_pipeline.pkl``.

        Only ``transformers``, ``validator`` and ``is_fitted`` are written;
        ``feature_store`` is not part of the saved state.

        Args:
            path: Target directory. It is created (with parents) when missing.
        """
        import pickle
        from pathlib import Path

        target_dir = Path(path)
        # Create the directory up front so a fresh artifact location does not
        # fail with FileNotFoundError.
        target_dir.mkdir(parents=True, exist_ok=True)

        state = {
            'transformers': self.transformers,
            'validator': self.validator,
            'is_fitted': self.is_fitted,
        }
        with open(target_dir / "feature_pipeline.pkl", 'wb') as f:
            pickle.dump(state, f)

    @classmethod
    def load(cls, path: str) -> "FeaturePipeline":
        """Rebuild a pipeline from ``<path>/feature_pipeline.pkl``.

        The restored pipeline has ``feature_store`` set to ``None``, because
        ``save`` does not persist it.
        """
        import pickle
        from pathlib import Path

        # SECURITY: pickle.load can execute arbitrary code embedded in the
        # file. Only load artifacts from a location you control and trust.
        with open(Path(path) / "feature_pipeline.pkl", 'rb') as f:
            data = pickle.load(f)

        pipeline = cls(
            transformers=data['transformers'],
            validator=data['validator'],
        )
        pipeline.is_fitted = data['is_fitted']
        return pipeline
|
|
565
|
+
```
|
|
566
|
+
|
|
567
|
+
---
|
|
568
|
+
|
|
569
|
+
## Best Practices
|
|
570
|
+
|
|
571
|
+
### Feature Naming Conventions
|
|
572
|
+
|
|
573
|
+
```python
|
|
574
|
+
# Good: descriptive, includes transformation info
|
|
575
|
+
"user_total_purchases_30d"
|
|
576
|
+
"product_price_log_scaled"
|
|
577
|
+
"category_onehot_electronics"
|
|
578
|
+
|
|
579
|
+
# Bad: ambiguous, no context
|
|
580
|
+
"feature_1"
|
|
581
|
+
"x_transformed"
|
|
582
|
+
"col"
|
|
583
|
+
```
|
|
584
|
+
|
|
585
|
+
### Feature Documentation
|
|
586
|
+
|
|
587
|
+
```python
|
|
588
|
+
from dataclasses import dataclass
|
|
589
|
+
from typing import Optional
|
|
590
|
+
|
|
591
|
+
@dataclass
class FeatureMetadata:
    """Registry record describing a single feature."""
    name: str
    description: str
    dtype: str
    source_table: str
    transformation: str
    owner: str
    created_at: str
    tags: list[str]
    dependencies: list[str]
    freshness_sla: Optional[str] = None

    def to_dict(self) -> dict:
        """Return the record as a plain dict keyed by field name, in field order.

        Uses ``dataclasses.asdict`` so the mapping always matches the declared
        fields, and so list fields are copied: mutating the returned dict does
        not modify this instance.
        """
        from dataclasses import asdict

        return asdict(self)
|
|
618
|
+
```
|
|
619
|
+
|
|
620
|
+
---
|
|
621
|
+
|
|
622
|
+
## Related References
|
|
623
|
+
|
|
624
|
+
- `training-pipelines.md` - Using features in training workflows
|
|
625
|
+
- `experiment-tracking.md` - Logging feature importance and metadata
|
|
626
|
+
- `model-validation.md` - Validating model performance on feature sets
|
|
627
|
+
|
|
628
|
+
## Cross-Reference Skills
|
|
629
|
+
|
|
630
|
+
- **Pandas Pro** - DataFrame operations for feature engineering
|
|
631
|
+
- **Data Engineer** - Data pipeline integration for feature computation
|