aigroup-workflow 2.2.0 → 2.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/commands/fix-build.md +10 -5
- package/.claude/commands/init-project.md +13 -8
- package/.claude/commands/plan.md +15 -8
- package/.claude/commands/review.md +12 -6
- package/.claude/commands/tdd.md +11 -5
- package/.claude/commands/workflow-start.md +20 -11
- package/.claude/settings.json +28 -0
- package/.codex/agents/architect.toml +207 -0
- package/.codex/agents/build-error-resolver.toml +110 -0
- package/.codex/agents/code-reviewer.toml +233 -0
- package/.codex/agents/doc-updater.toml +103 -0
- package/.codex/agents/e2e-runner.toml +103 -0
- package/.codex/agents/get-current-datetime.toml +23 -0
- package/.codex/agents/init-architect.toml +181 -0
- package/.codex/agents/planner.toml +208 -0
- package/.codex/agents/refactor-cleaner.toml +81 -0
- package/.codex/agents/rust-reviewer.toml +90 -0
- package/.codex/agents/security-reviewer.toml +104 -0
- package/.codex/agents/tdd-guide.toml +87 -0
- package/AGENTS.md +2 -2
- package/CLAUDE.md +23 -1
- package/LICENSE +20 -20
- package/README.md +333 -333
- package/agents/a11y-architect.md +141 -141
- package/agents/architect.md +211 -211
- package/agents/build-error-resolver.md +114 -114
- package/agents/chief-of-staff.md +151 -151
- package/agents/code-architect.md +71 -71
- package/agents/code-explorer.md +69 -69
- package/agents/code-reviewer.md +237 -237
- package/agents/code-simplifier.md +47 -47
- package/agents/comment-analyzer.md +45 -45
- package/agents/conversation-analyzer.md +52 -52
- package/agents/cpp-build-resolver.md +90 -90
- package/agents/cpp-reviewer.md +72 -72
- package/agents/csharp-reviewer.md +101 -101
- package/agents/dart-build-resolver.md +201 -201
- package/agents/database-reviewer.md +91 -91
- package/agents/doc-updater.md +107 -107
- package/agents/docs-lookup.md +68 -68
- package/agents/e2e-runner.md +107 -107
- package/agents/flutter-reviewer.md +243 -243
- package/agents/gan-evaluator.md +209 -209
- package/agents/gan-generator.md +131 -131
- package/agents/gan-planner.md +99 -99
- package/agents/get-current-datetime.md +26 -26
- package/agents/go-build-resolver.md +94 -94
- package/agents/go-reviewer.md +76 -76
- package/agents/harness-optimizer.md +35 -35
- package/agents/healthcare-reviewer.md +83 -83
- package/agents/java-build-resolver.md +153 -153
- package/agents/java-reviewer.md +92 -92
- package/agents/kotlin-build-resolver.md +118 -118
- package/agents/kotlin-reviewer.md +159 -159
- package/agents/loop-operator.md +36 -36
- package/agents/opensource-forker.md +198 -198
- package/agents/opensource-packager.md +249 -249
- package/agents/opensource-sanitizer.md +188 -188
- package/agents/performance-optimizer.md +446 -446
- package/agents/planner.md +212 -212
- package/agents/pr-test-analyzer.md +45 -45
- package/agents/python-reviewer.md +98 -98
- package/agents/pytorch-build-resolver.md +120 -120
- package/agents/refactor-cleaner.md +85 -85
- package/agents/rust-build-resolver.md +148 -148
- package/agents/rust-reviewer.md +94 -94
- package/agents/security-reviewer.md +108 -108
- package/agents/seo-specialist.md +59 -59
- package/agents/silent-failure-hunter.md +50 -50
- package/agents/tdd-guide.md +91 -91
- package/agents/type-design-analyzer.md +41 -41
- package/agents/typescript-reviewer.md +112 -112
- package/cli/commands/update.mjs +1 -1
- package/cli/utils/scaffold.mjs +53 -0
- package/docs/rules/agents.md +166 -50
- package/docs/rules/cpp/coding-style.md +44 -44
- package/docs/rules/cpp/hooks.md +39 -39
- package/docs/rules/cpp/patterns.md +51 -51
- package/docs/rules/cpp/security.md +51 -51
- package/docs/rules/cpp/testing.md +44 -44
- package/docs/rules/csharp/coding-style.md +72 -72
- package/docs/rules/csharp/hooks.md +25 -25
- package/docs/rules/csharp/patterns.md +50 -50
- package/docs/rules/csharp/security.md +58 -58
- package/docs/rules/csharp/testing.md +46 -46
- package/docs/rules/dart/coding-style.md +159 -159
- package/docs/rules/dart/hooks.md +66 -66
- package/docs/rules/dart/patterns.md +261 -261
- package/docs/rules/dart/security.md +135 -135
- package/docs/rules/dart/testing.md +215 -215
- package/docs/rules/golang/coding-style.md +32 -32
- package/docs/rules/golang/hooks.md +17 -17
- package/docs/rules/golang/patterns.md +45 -45
- package/docs/rules/golang/security.md +34 -34
- package/docs/rules/golang/testing.md +31 -31
- package/docs/rules/java/coding-style.md +114 -114
- package/docs/rules/java/hooks.md +18 -18
- package/docs/rules/java/patterns.md +146 -146
- package/docs/rules/java/security.md +100 -100
- package/docs/rules/java/testing.md +131 -131
- package/docs/rules/kotlin/coding-style.md +86 -86
- package/docs/rules/kotlin/hooks.md +17 -17
- package/docs/rules/kotlin/patterns.md +146 -146
- package/docs/rules/kotlin/security.md +82 -82
- package/docs/rules/kotlin/testing.md +128 -128
- package/docs/rules/perl/coding-style.md +46 -46
- package/docs/rules/perl/hooks.md +22 -22
- package/docs/rules/perl/patterns.md +76 -76
- package/docs/rules/perl/security.md +69 -69
- package/docs/rules/perl/testing.md +54 -54
- package/docs/rules/php/coding-style.md +40 -40
- package/docs/rules/php/hooks.md +24 -24
- package/docs/rules/php/patterns.md +33 -33
- package/docs/rules/php/security.md +37 -37
- package/docs/rules/php/testing.md +39 -39
- package/docs/rules/python/coding-style.md +42 -42
- package/docs/rules/python/hooks.md +19 -19
- package/docs/rules/python/patterns.md +39 -39
- package/docs/rules/python/security.md +30 -30
- package/docs/rules/python/testing.md +38 -38
- package/docs/rules/rust/coding-style.md +151 -151
- package/docs/rules/rust/hooks.md +16 -16
- package/docs/rules/rust/patterns.md +168 -168
- package/docs/rules/rust/security.md +141 -141
- package/docs/rules/rust/testing.md +154 -154
- package/docs/rules/swift/coding-style.md +47 -47
- package/docs/rules/swift/hooks.md +20 -20
- package/docs/rules/swift/patterns.md +66 -66
- package/docs/rules/swift/security.md +33 -33
- package/docs/rules/swift/testing.md +45 -45
- package/docs/rules/typescript/coding-style.md +199 -199
- package/docs/rules/typescript/hooks.md +22 -22
- package/docs/rules/typescript/patterns.md +52 -52
- package/docs/rules/typescript/security.md +28 -28
- package/docs/rules/typescript/testing.md +18 -18
- package/docs/rules/web/coding-style.md +96 -96
- package/docs/rules/web/design-quality.md +62 -62
- package/docs/rules/web/hooks.md +120 -120
- package/docs/rules/web/patterns.md +79 -79
- package/docs/rules/web/performance.md +64 -64
- package/docs/rules/web/security.md +57 -57
- package/docs/rules/web/testing.md +55 -55
- package/docs/templates/README.md +36 -36
- package/docs/templates/ai-project-final.md +124 -124
- package/docs/templates/ai-project.md +105 -105
- package/docs/templates/api.md +157 -157
- package/docs/templates/bug.md +62 -62
- package/docs/templates/code-review.md +87 -87
- package/docs/templates/generic.md +116 -116
- package/docs/templates/implementation-plan.md +1 -1
- package/docs/templates/meeting.md +68 -68
- package/docs/templates/prd.md +98 -98
- package/docs/templates/ui.md +134 -134
- package/docs/workflow-pipeline.md +11 -10
- package/package.json +40 -39
- package/scripts/hooks/checks/orchestration-artifacts.cjs +28 -23
- package/scripts/hooks/checks/workflow-state.cjs +4 -5
- package/scripts/orchestration/lib/orchestrator.cjs +344 -117
- package/scripts/orchestration/lib/validate.cjs +145 -0
- package/scripts/orchestration/session.cjs +88 -44
- package/skills/SUPERPOWERS-LICENSE +21 -21
- package/skills/ai-ml/fine-tuning-expert/SKILL.md +162 -162
- package/skills/ai-ml/fine-tuning-expert/references/dataset-preparation.md +540 -540
- package/skills/ai-ml/fine-tuning-expert/references/deployment-optimization.md +673 -673
- package/skills/ai-ml/fine-tuning-expert/references/evaluation-metrics.md +597 -597
- package/skills/ai-ml/fine-tuning-expert/references/hyperparameter-tuning.md +565 -565
- package/skills/ai-ml/fine-tuning-expert/references/lora-peft.md +347 -347
- package/skills/ai-ml/ml-pipeline/SKILL.md +159 -159
- package/skills/ai-ml/ml-pipeline/references/experiment-tracking.md +833 -833
- package/skills/ai-ml/ml-pipeline/references/feature-engineering.md +631 -631
- package/skills/ai-ml/ml-pipeline/references/model-validation.md +978 -978
- package/skills/ai-ml/ml-pipeline/references/pipeline-orchestration.md +907 -907
- package/skills/ai-ml/ml-pipeline/references/training-pipelines.md +782 -782
- package/skills/ai-ml/rag-architect/SKILL.md +194 -194
- package/skills/ai-ml/rag-architect/references/chunking-strategies.md +878 -878
- package/skills/ai-ml/rag-architect/references/embedding-models.md +561 -561
- package/skills/ai-ml/rag-architect/references/rag-evaluation.md +833 -833
- package/skills/ai-ml/rag-architect/references/retrieval-optimization.md +795 -795
- package/skills/ai-ml/rag-architect/references/vector-databases.md +589 -589
- package/skills/ai-ml/spark-engineer/SKILL.md +148 -148
- package/skills/ai-ml/spark-engineer/references/partitioning-caching.md +543 -543
- package/skills/ai-ml/spark-engineer/references/performance-tuning.md +544 -544
- package/skills/ai-ml/spark-engineer/references/rdd-operations.md +599 -599
- package/skills/ai-ml/spark-engineer/references/spark-sql-dataframes.md +474 -474
- package/skills/ai-ml/spark-engineer/references/streaming-patterns.md +786 -786
- package/skills/backend/api-designer/SKILL.md +217 -217
- package/skills/backend/api-designer/references/error-handling.md +541 -541
- package/skills/backend/api-designer/references/openapi.md +824 -824
- package/skills/backend/api-designer/references/pagination.md +494 -494
- package/skills/backend/api-designer/references/rest-patterns.md +335 -335
- package/skills/backend/api-designer/references/versioning.md +391 -391
- package/skills/backend/architecture-designer/SKILL.md +117 -117
- package/skills/backend/architecture-designer/references/adr-template.md +116 -116
- package/skills/backend/architecture-designer/references/architecture-patterns.md +111 -111
- package/skills/backend/architecture-designer/references/database-selection.md +102 -102
- package/skills/backend/architecture-designer/references/nfr-checklist.md +112 -112
- package/skills/backend/architecture-designer/references/system-design.md +100 -100
- package/skills/backend/code-documenter/SKILL.md +147 -147
- package/skills/backend/code-documenter/references/api-docs-fastapi-django.md +166 -166
- package/skills/backend/code-documenter/references/api-docs-nestjs-express.md +220 -220
- package/skills/backend/code-documenter/references/coverage-reports.md +125 -125
- package/skills/backend/code-documenter/references/documentation-systems.md +333 -333
- package/skills/backend/code-documenter/references/interactive-api-docs.md +531 -531
- package/skills/backend/code-documenter/references/python-docstrings.md +121 -121
- package/skills/backend/code-documenter/references/typescript-jsdoc.md +145 -145
- package/skills/backend/code-documenter/references/user-guides-tutorials.md +530 -530
- package/skills/backend/debugging-wizard/SKILL.md +105 -105
- package/skills/backend/debugging-wizard/references/common-patterns.md +132 -132
- package/skills/backend/debugging-wizard/references/debugging-tools.md +140 -140
- package/skills/backend/debugging-wizard/references/quick-fixes.md +177 -177
- package/skills/backend/debugging-wizard/references/strategies.md +142 -142
- package/skills/backend/debugging-wizard/references/systematic-debugging.md +367 -367
- package/skills/backend/feature-forge/SKILL.md +98 -98
- package/skills/backend/feature-forge/references/acceptance-criteria.md +104 -104
- package/skills/backend/feature-forge/references/ears-syntax.md +99 -99
- package/skills/backend/feature-forge/references/interview-questions.md +150 -150
- package/skills/backend/feature-forge/references/pre-discovery-subagents.md +54 -54
- package/skills/backend/feature-forge/references/specification-template.md +103 -103
- package/skills/backend/fullstack-guardian/SKILL.md +105 -105
- package/skills/backend/fullstack-guardian/references/api-design-standards.md +307 -307
- package/skills/backend/fullstack-guardian/references/architecture-decisions.md +350 -350
- package/skills/backend/fullstack-guardian/references/backend-patterns.md +237 -237
- package/skills/backend/fullstack-guardian/references/common-patterns.md +134 -134
- package/skills/backend/fullstack-guardian/references/deliverables-checklist.md +354 -354
- package/skills/backend/fullstack-guardian/references/design-template.md +91 -91
- package/skills/backend/fullstack-guardian/references/error-handling.md +135 -135
- package/skills/backend/fullstack-guardian/references/frontend-patterns.md +340 -340
- package/skills/backend/fullstack-guardian/references/integration-patterns.md +333 -333
- package/skills/backend/fullstack-guardian/references/security-checklist.md +106 -106
- package/skills/backend/graphql-architect/SKILL.md +146 -146
- package/skills/backend/graphql-architect/references/federation.md +418 -418
- package/skills/backend/graphql-architect/references/migration-from-rest.md +1141 -1141
- package/skills/backend/graphql-architect/references/resolvers.md +425 -425
- package/skills/backend/graphql-architect/references/schema-design.md +393 -393
- package/skills/backend/graphql-architect/references/security.md +569 -569
- package/skills/backend/graphql-architect/references/subscriptions.md +510 -510
- package/skills/backend/legacy-modernizer/SKILL.md +137 -137
- package/skills/backend/legacy-modernizer/references/legacy-testing.md +381 -381
- package/skills/backend/legacy-modernizer/references/migration-strategies.md +423 -423
- package/skills/backend/legacy-modernizer/references/refactoring-patterns.md +395 -395
- package/skills/backend/legacy-modernizer/references/strangler-fig-pattern.md +281 -281
- package/skills/backend/legacy-modernizer/references/system-assessment.md +487 -487
- package/skills/backend/microservices-architect/SKILL.md +164 -164
- package/skills/backend/microservices-architect/references/communication.md +499 -499
- package/skills/backend/microservices-architect/references/data.md +721 -721
- package/skills/backend/microservices-architect/references/decomposition.md +344 -344
- package/skills/backend/microservices-architect/references/observability.md +805 -805
- package/skills/backend/microservices-architect/references/patterns.md +603 -603
- package/skills/database/database-optimizer/SKILL.md +147 -147
- package/skills/database/database-optimizer/references/index-strategies.md +331 -331
- package/skills/database/database-optimizer/references/monitoring-analysis.md +501 -501
- package/skills/database/database-optimizer/references/mysql-tuning.md +452 -452
- package/skills/database/database-optimizer/references/postgresql-tuning.md +413 -413
- package/skills/database/database-optimizer/references/query-optimization.md +251 -251
- package/skills/database/postgres-pro/SKILL.md +152 -152
- package/skills/database/postgres-pro/references/extensions.md +404 -404
- package/skills/database/postgres-pro/references/jsonb.md +321 -321
- package/skills/database/postgres-pro/references/maintenance.md +481 -481
- package/skills/database/postgres-pro/references/performance.md +265 -265
- package/skills/database/postgres-pro/references/replication.md +446 -446
- package/skills/database/sql-pro/SKILL.md +129 -129
- package/skills/database/sql-pro/references/database-design.md +402 -402
- package/skills/database/sql-pro/references/dialect-differences.md +419 -419
- package/skills/database/sql-pro/references/optimization.md +384 -384
- package/skills/database/sql-pro/references/query-patterns.md +285 -285
- package/skills/database/sql-pro/references/window-functions.md +328 -328
- package/skills/dotnet/csharp-developer/SKILL.md +125 -125
- package/skills/dotnet/csharp-developer/references/aspnet-core.md +394 -394
- package/skills/dotnet/csharp-developer/references/blazor.md +553 -553
- package/skills/dotnet/csharp-developer/references/entity-framework.md +409 -409
- package/skills/dotnet/csharp-developer/references/modern-csharp.md +248 -248
- package/skills/dotnet/csharp-developer/references/performance.md +498 -498
- package/skills/dotnet/dotnet-core-expert/SKILL.md +138 -138
- package/skills/dotnet/dotnet-core-expert/references/authentication.md +546 -546
- package/skills/dotnet/dotnet-core-expert/references/clean-architecture.md +455 -455
- package/skills/dotnet/dotnet-core-expert/references/cloud-native.md +548 -548
- package/skills/dotnet/dotnet-core-expert/references/entity-framework.md +440 -440
- package/skills/dotnet/dotnet-core-expert/references/minimal-apis.md +319 -319
- package/skills/frontend/angular-architect/SKILL.md +152 -152
- package/skills/frontend/angular-architect/references/components.md +297 -297
- package/skills/frontend/angular-architect/references/ngrx.md +401 -401
- package/skills/frontend/angular-architect/references/routing.md +361 -361
- package/skills/frontend/angular-architect/references/rxjs.md +319 -319
- package/skills/frontend/angular-architect/references/testing.md +405 -405
- package/skills/frontend/design-commands/design.md +91 -91
- package/skills/frontend/design-commands/handoff.md +97 -97
- package/skills/frontend/design-commands/prototype.md +120 -120
- package/skills/frontend/design-commands/spec.md +160 -160
- package/skills/frontend/design-commands/style.md +78 -78
- package/skills/frontend/flutter-expert/SKILL.md +138 -138
- package/skills/frontend/flutter-expert/references/bloc-state.md +259 -259
- package/skills/frontend/flutter-expert/references/gorouter-navigation.md +119 -119
- package/skills/frontend/flutter-expert/references/performance.md +99 -99
- package/skills/frontend/flutter-expert/references/project-structure.md +118 -118
- package/skills/frontend/flutter-expert/references/riverpod-state.md +130 -130
- package/skills/frontend/flutter-expert/references/widget-patterns.md +123 -123
- package/skills/frontend/nextjs-developer/SKILL.md +143 -143
- package/skills/frontend/nextjs-developer/references/app-router.md +311 -311
- package/skills/frontend/nextjs-developer/references/data-fetching.md +482 -482
- package/skills/frontend/nextjs-developer/references/deployment.md +545 -545
- package/skills/frontend/nextjs-developer/references/server-actions.md +462 -462
- package/skills/frontend/nextjs-developer/references/server-components.md +384 -384
- package/skills/frontend/react-expert/SKILL.md +149 -149
- package/skills/frontend/react-expert/references/hooks-patterns.md +162 -162
- package/skills/frontend/react-expert/references/migration-class-to-modern.md +1119 -1119
- package/skills/frontend/react-expert/references/performance.md +168 -168
- package/skills/frontend/react-expert/references/react-19-features.md +174 -174
- package/skills/frontend/react-expert/references/server-components.md +143 -143
- package/skills/frontend/react-expert/references/state-management.md +171 -171
- package/skills/frontend/react-expert/references/testing-react.md +174 -174
- package/skills/frontend/react-native-expert/SKILL.md +185 -185
- package/skills/frontend/react-native-expert/references/expo-router.md +187 -187
- package/skills/frontend/react-native-expert/references/list-optimization.md +204 -204
- package/skills/frontend/react-native-expert/references/platform-handling.md +188 -188
- package/skills/frontend/react-native-expert/references/project-structure.md +171 -171
- package/skills/frontend/react-native-expert/references/storage-hooks.md +173 -173
- package/skills/frontend/senior-frontend/SKILL.md +477 -477
- package/skills/frontend/senior-frontend/references/frontend_best_practices.md +806 -806
- package/skills/frontend/senior-frontend/references/nextjs_optimization_guide.md +724 -724
- package/skills/frontend/senior-frontend/references/react_patterns.md +746 -746
- package/skills/frontend/senior-frontend/scripts/bundle_analyzer.py +407 -407
- package/skills/frontend/senior-frontend/scripts/component_generator.py +329 -329
- package/skills/frontend/senior-frontend/scripts/frontend_scaffolder.py +1005 -1005
- package/skills/frontend/ui-ux-pro-max/SKILL.md +386 -386
- package/skills/frontend/ui-ux-pro-max/data/charts.csv +26 -26
- package/skills/frontend/ui-ux-pro-max/data/colors.csv +97 -97
- package/skills/frontend/ui-ux-pro-max/data/icons.csv +101 -101
- package/skills/frontend/ui-ux-pro-max/data/landing.csv +31 -31
- package/skills/frontend/ui-ux-pro-max/data/products.csv +96 -96
- package/skills/frontend/ui-ux-pro-max/data/react-performance.csv +45 -45
- package/skills/frontend/ui-ux-pro-max/data/stacks/astro.csv +54 -54
- package/skills/frontend/ui-ux-pro-max/data/stacks/flutter.csv +53 -53
- package/skills/frontend/ui-ux-pro-max/data/stacks/html-tailwind.csv +56 -56
- package/skills/frontend/ui-ux-pro-max/data/stacks/jetpack-compose.csv +53 -53
- package/skills/frontend/ui-ux-pro-max/data/stacks/nextjs.csv +53 -53
- package/skills/frontend/ui-ux-pro-max/data/stacks/nuxt-ui.csv +51 -51
- package/skills/frontend/ui-ux-pro-max/data/stacks/nuxtjs.csv +59 -59
- package/skills/frontend/ui-ux-pro-max/data/stacks/react-native.csv +52 -52
- package/skills/frontend/ui-ux-pro-max/data/stacks/react.csv +54 -54
- package/skills/frontend/ui-ux-pro-max/data/stacks/shadcn.csv +61 -61
- package/skills/frontend/ui-ux-pro-max/data/stacks/svelte.csv +54 -54
- package/skills/frontend/ui-ux-pro-max/data/stacks/swiftui.csv +51 -51
- package/skills/frontend/ui-ux-pro-max/data/stacks/vue.csv +50 -50
- package/skills/frontend/ui-ux-pro-max/data/styles.csv +68 -68
- package/skills/frontend/ui-ux-pro-max/data/typography.csv +57 -57
- package/skills/frontend/ui-ux-pro-max/data/ui-reasoning.csv +101 -101
- package/skills/frontend/ui-ux-pro-max/data/ux-guidelines.csv +99 -99
- package/skills/frontend/ui-ux-pro-max/data/web-interface.csv +31 -31
- package/skills/frontend/ui-ux-pro-max/scripts/core.py +253 -253
- package/skills/frontend/ui-ux-pro-max/scripts/design_system.py +1067 -1067
- package/skills/frontend/ui-ux-pro-max/scripts/search.py +114 -114
- package/skills/frontend/vue-expert/SKILL.md +98 -98
- package/skills/frontend/vue-expert/references/build-tooling.md +480 -480
- package/skills/frontend/vue-expert/references/components.md +448 -448
- package/skills/frontend/vue-expert/references/composition-api.md +299 -299
- package/skills/frontend/vue-expert/references/mobile-hybrid.md +636 -636
- package/skills/frontend/vue-expert/references/nuxt.md +669 -669
- package/skills/frontend/vue-expert/references/state-management.md +449 -449
- package/skills/frontend/vue-expert/references/typescript.md +584 -584
- package/skills/frontend/vue-expert-js/SKILL.md +167 -167
- package/skills/frontend/vue-expert-js/references/component-architecture.md +219 -219
- package/skills/frontend/vue-expert-js/references/composables-patterns.md +183 -183
- package/skills/frontend/vue-expert-js/references/jsdoc-typing.md +535 -535
- package/skills/frontend/vue-expert-js/references/state-management.md +249 -249
- package/skills/frontend/vue-expert-js/references/testing-patterns.md +237 -237
- package/skills/go-rust-cpp/cpp-pro/SKILL.md +115 -115
- package/skills/go-rust-cpp/cpp-pro/references/build-tooling.md +440 -440
- package/skills/go-rust-cpp/cpp-pro/references/concurrency.md +437 -437
- package/skills/go-rust-cpp/cpp-pro/references/memory-performance.md +397 -397
- package/skills/go-rust-cpp/cpp-pro/references/modern-cpp.md +304 -304
- package/skills/go-rust-cpp/cpp-pro/references/templates.md +357 -357
- package/skills/go-rust-cpp/golang-pro/SKILL.md +122 -122
- package/skills/go-rust-cpp/golang-pro/references/concurrency.md +329 -329
- package/skills/go-rust-cpp/golang-pro/references/generics.md +442 -442
- package/skills/go-rust-cpp/golang-pro/references/interfaces.md +432 -432
- package/skills/go-rust-cpp/golang-pro/references/project-structure.md +477 -477
- package/skills/go-rust-cpp/golang-pro/references/testing.md +451 -451
- package/skills/go-rust-cpp/rust-engineer/SKILL.md +167 -167
- package/skills/go-rust-cpp/rust-engineer/references/async.md +458 -458
- package/skills/go-rust-cpp/rust-engineer/references/error-handling.md +334 -334
- package/skills/go-rust-cpp/rust-engineer/references/ownership.md +278 -278
- package/skills/go-rust-cpp/rust-engineer/references/testing.md +470 -470
- package/skills/go-rust-cpp/rust-engineer/references/traits.md +413 -413
- package/skills/infra/cli-developer/SKILL.md +113 -113
- package/skills/infra/cli-developer/references/design-patterns.md +221 -221
- package/skills/infra/cli-developer/references/go-cli.md +540 -540
- package/skills/infra/cli-developer/references/node-cli.md +383 -383
- package/skills/infra/cli-developer/references/python-cli.md +422 -422
- package/skills/infra/cli-developer/references/ux-patterns.md +448 -448
- package/skills/infra/cloud-architect/SKILL.md +216 -216
- package/skills/infra/cloud-architect/references/aws.md +394 -394
- package/skills/infra/cloud-architect/references/azure.md +562 -562
- package/skills/infra/cloud-architect/references/cost.md +582 -582
- package/skills/infra/cloud-architect/references/gcp.md +633 -633
- package/skills/infra/cloud-architect/references/multi-cloud.md +483 -483
- package/skills/infra/devops-engineer/SKILL.md +144 -144
- package/skills/infra/devops-engineer/references/deployment-strategies.md +241 -241
- package/skills/infra/devops-engineer/references/docker-patterns.md +113 -113
- package/skills/infra/devops-engineer/references/github-actions.md +139 -139
- package/skills/infra/devops-engineer/references/incident-response.md +331 -331
- package/skills/infra/devops-engineer/references/kubernetes.md +154 -154
- package/skills/infra/devops-engineer/references/platform-engineering.md +417 -417
- package/skills/infra/devops-engineer/references/release-automation.md +527 -527
- package/skills/infra/devops-engineer/references/terraform-iac.md +141 -141
- package/skills/infra/kubernetes-specialist/SKILL.md +241 -241
- package/skills/infra/kubernetes-specialist/references/configuration.md +452 -452
- package/skills/infra/kubernetes-specialist/references/cost-optimization.md +458 -458
- package/skills/infra/kubernetes-specialist/references/custom-operators.md +563 -563
- package/skills/infra/kubernetes-specialist/references/gitops.md +530 -530
- package/skills/infra/kubernetes-specialist/references/helm-charts.md +912 -912
- package/skills/infra/kubernetes-specialist/references/multi-cluster.md +507 -507
- package/skills/infra/kubernetes-specialist/references/networking.md +447 -447
- package/skills/infra/kubernetes-specialist/references/service-mesh.md +459 -459
- package/skills/infra/kubernetes-specialist/references/storage.md +535 -535
- package/skills/infra/kubernetes-specialist/references/troubleshooting.md +414 -414
- package/skills/infra/kubernetes-specialist/references/workloads.md +377 -377
- package/skills/infra/mcp-developer/SKILL.md +143 -143
- package/skills/infra/mcp-developer/references/protocol.md +244 -244
- package/skills/infra/mcp-developer/references/python-sdk.md +367 -367
- package/skills/infra/mcp-developer/references/resources.md +554 -554
- package/skills/infra/mcp-developer/references/tools.md +480 -480
- package/skills/infra/mcp-developer/references/typescript-sdk.md +350 -350
- package/skills/infra/monitoring-expert/SKILL.md +176 -176
- package/skills/infra/monitoring-expert/references/alerting-rules.md +141 -141
- package/skills/infra/monitoring-expert/references/application-profiling.md +331 -331
- package/skills/infra/monitoring-expert/references/capacity-planning.md +344 -344
- package/skills/infra/monitoring-expert/references/dashboards.md +126 -126
- package/skills/infra/monitoring-expert/references/opentelemetry.md +123 -123
- package/skills/infra/monitoring-expert/references/performance-testing.md +269 -269
- package/skills/infra/monitoring-expert/references/prometheus-metrics.md +136 -136
- package/skills/infra/monitoring-expert/references/structured-logging.md +142 -142
- package/skills/infra/sre-engineer/SKILL.md +181 -181
- package/skills/infra/sre-engineer/references/automation-toil.md +492 -492
- package/skills/infra/sre-engineer/references/error-budget-policy.md +334 -334
- package/skills/infra/sre-engineer/references/incident-chaos.md +576 -576
- package/skills/infra/sre-engineer/references/monitoring-alerting.md +424 -424
- package/skills/infra/sre-engineer/references/slo-sli-management.md +238 -238
- package/skills/infra/terraform-engineer/SKILL.md +143 -143
- package/skills/infra/terraform-engineer/references/best-practices.md +583 -583
- package/skills/infra/terraform-engineer/references/module-patterns.md +297 -297
- package/skills/infra/terraform-engineer/references/providers.md +452 -452
- package/skills/infra/terraform-engineer/references/state-management.md +371 -371
- package/skills/infra/terraform-engineer/references/testing.md +486 -486
- package/skills/infra/websocket-engineer/SKILL.md +168 -168
- package/skills/infra/websocket-engineer/references/alternatives.md +391 -391
- package/skills/infra/websocket-engineer/references/patterns.md +400 -400
- package/skills/infra/websocket-engineer/references/protocol.md +195 -195
- package/skills/infra/websocket-engineer/references/scaling.md +333 -333
- package/skills/infra/websocket-engineer/references/security.md +474 -474
- package/skills/java/java-architect/SKILL.md +132 -132
- package/skills/java/java-architect/references/jpa-optimization.md +393 -393
- package/skills/java/java-architect/references/reactive-webflux.md +356 -356
- package/skills/java/java-architect/references/spring-boot-setup.md +269 -269
- package/skills/java/java-architect/references/spring-security.md +445 -445
- package/skills/java/java-architect/references/testing-patterns.md +500 -500
- package/skills/java/kotlin-specialist/SKILL.md +147 -147
- package/skills/java/kotlin-specialist/references/android-compose.md +419 -419
- package/skills/java/kotlin-specialist/references/coroutines-flow.md +276 -276
- package/skills/java/kotlin-specialist/references/dsl-idioms.md +421 -421
- package/skills/java/kotlin-specialist/references/ktor-server.md +426 -426
- package/skills/java/kotlin-specialist/references/multiplatform-kmp.md +380 -380
- package/skills/java/spring-boot-engineer/SKILL.md +195 -195
- package/skills/java/spring-boot-engineer/references/cloud.md +498 -498
- package/skills/java/spring-boot-engineer/references/data.md +381 -381
- package/skills/java/spring-boot-engineer/references/security.md +459 -459
- package/skills/java/spring-boot-engineer/references/testing.md +545 -545
- package/skills/java/spring-boot-engineer/references/web.md +295 -295
- package/skills/javascript/javascript-pro/SKILL.md +132 -132
- package/skills/javascript/javascript-pro/references/async-patterns.md +334 -334
- package/skills/javascript/javascript-pro/references/browser-apis.md +398 -398
- package/skills/javascript/javascript-pro/references/modern-syntax.md +272 -272
- package/skills/javascript/javascript-pro/references/modules.md +357 -357
- package/skills/javascript/javascript-pro/references/node-essentials.md +471 -471
- package/skills/javascript/nestjs-expert/SKILL.md +206 -206
- package/skills/javascript/nestjs-expert/references/authentication.md +166 -166
- package/skills/javascript/nestjs-expert/references/controllers-routing.md +111 -111
- package/skills/javascript/nestjs-expert/references/dtos-validation.md +153 -153
- package/skills/javascript/nestjs-expert/references/migration-from-express.md +1237 -1237
- package/skills/javascript/nestjs-expert/references/services-di.md +140 -140
- package/skills/javascript/nestjs-expert/references/testing-patterns.md +186 -186
- package/skills/javascript/typescript-pro/SKILL.md +145 -145
- package/skills/javascript/typescript-pro/references/advanced-types.md +259 -259
- package/skills/javascript/typescript-pro/references/configuration.md +445 -445
- package/skills/javascript/typescript-pro/references/patterns.md +484 -484
- package/skills/javascript/typescript-pro/references/type-guards.md +352 -352
- package/skills/javascript/typescript-pro/references/utility-types.md +329 -329
- package/skills/php/laravel-specialist/SKILL.md +262 -262
- package/skills/php/laravel-specialist/references/eloquent.md +351 -351
- package/skills/php/laravel-specialist/references/livewire.md +512 -512
- package/skills/php/laravel-specialist/references/queues.md +423 -423
- package/skills/php/laravel-specialist/references/routing.md +362 -362
- package/skills/php/laravel-specialist/references/testing.md +522 -522
- package/skills/php/php-pro/SKILL.md +206 -206
- package/skills/php/php-pro/references/async-patterns.md +412 -412
- package/skills/php/php-pro/references/laravel-patterns.md +377 -377
- package/skills/php/php-pro/references/modern-php-features.md +323 -323
- package/skills/php/php-pro/references/symfony-patterns.md +466 -466
- package/skills/php/php-pro/references/testing-quality.md +466 -466
- package/skills/product/competitive-analysis/SKILL.md +257 -257
- package/skills/product/meeting-notes/SKILL.md +266 -266
- package/skills/product/prd-template/SKILL.md +150 -150
- package/skills/product/stakeholder-update/SKILL.md +225 -225
- package/skills/product/user-research-synthesis/SKILL.md +235 -235
- package/skills/python/django-expert/SKILL.md +162 -162
- package/skills/python/django-expert/references/authentication.md +145 -145
- package/skills/python/django-expert/references/drf-serializers.md +148 -148
- package/skills/python/django-expert/references/models-orm.md +151 -151
- package/skills/python/django-expert/references/testing-django.md +204 -204
- package/skills/python/django-expert/references/viewsets-views.md +153 -153
- package/skills/python/fastapi-expert/SKILL.md +185 -185
- package/skills/python/fastapi-expert/references/async-sqlalchemy.md +146 -146
- package/skills/python/fastapi-expert/references/authentication.md +159 -159
- package/skills/python/fastapi-expert/references/endpoints-routing.md +142 -142
- package/skills/python/fastapi-expert/references/migration-from-django.md +996 -996
- package/skills/python/fastapi-expert/references/pydantic-v2.md +135 -135
- package/skills/python/fastapi-expert/references/testing-async.md +159 -159
- package/skills/python/pandas-pro/SKILL.md +178 -178
- package/skills/python/pandas-pro/references/aggregation-groupby.md +545 -545
- package/skills/python/pandas-pro/references/data-cleaning.md +500 -500
- package/skills/python/pandas-pro/references/dataframe-operations.md +420 -420
- package/skills/python/pandas-pro/references/merging-joining.md +596 -596
- package/skills/python/pandas-pro/references/performance-optimization.md +597 -597
- package/skills/python/python-pro/SKILL.md +177 -177
- package/skills/python/python-pro/references/async-patterns.md +356 -356
- package/skills/python/python-pro/references/packaging.md +460 -460
- package/skills/python/python-pro/references/standard-library.md +378 -378
- package/skills/python/python-pro/references/testing.md +404 -404
- package/skills/python/python-pro/references/type-system.md +290 -290
- package/skills/quality/chaos-engineer/SKILL.md +182 -182
- package/skills/quality/chaos-engineer/references/chaos-tools.md +511 -511
- package/skills/quality/chaos-engineer/references/experiment-design.md +229 -229
- package/skills/quality/chaos-engineer/references/game-days.md +434 -434
- package/skills/quality/chaos-engineer/references/infrastructure-chaos.md +348 -348
- package/skills/quality/chaos-engineer/references/kubernetes-chaos.md +432 -432
- package/skills/quality/code-reviewer/SKILL.md +119 -119
- package/skills/quality/code-reviewer/references/common-issues.md +142 -142
- package/skills/quality/code-reviewer/references/feedback-examples.md +144 -144
- package/skills/quality/code-reviewer/references/receiving-feedback.md +238 -238
- package/skills/quality/code-reviewer/references/report-template.md +109 -109
- package/skills/quality/code-reviewer/references/review-checklist.md +88 -88
- package/skills/quality/code-reviewer/references/spec-compliance-review.md +258 -258
- package/skills/quality/playwright-expert/SKILL.md +169 -169
- package/skills/quality/playwright-expert/references/api-mocking.md +140 -140
- package/skills/quality/playwright-expert/references/configuration.md +155 -155
- package/skills/quality/playwright-expert/references/debugging-flaky.md +150 -150
- package/skills/quality/playwright-expert/references/page-object-model.md +152 -152
- package/skills/quality/playwright-expert/references/selectors-locators.md +119 -119
- package/skills/quality/secure-code-guardian/SKILL.md +191 -191
- package/skills/quality/secure-code-guardian/references/authentication.md +136 -136
- package/skills/quality/secure-code-guardian/references/input-validation.md +146 -146
- package/skills/quality/secure-code-guardian/references/owasp-prevention.md +135 -135
- package/skills/quality/secure-code-guardian/references/security-headers.md +133 -133
- package/skills/quality/secure-code-guardian/references/xss-csrf.md +157 -157
- package/skills/quality/security-reviewer/SKILL.md +103 -103
- package/skills/quality/security-reviewer/references/infrastructure-security.md +268 -268
- package/skills/quality/security-reviewer/references/penetration-testing.md +268 -268
- package/skills/quality/security-reviewer/references/report-template.md +170 -170
- package/skills/quality/security-reviewer/references/sast-tools.md +117 -117
- package/skills/quality/security-reviewer/references/secret-scanning.md +125 -125
- package/skills/quality/security-reviewer/references/vulnerability-patterns.md +152 -152
- package/skills/quality/senior-qa/README.md +196 -196
- package/skills/quality/senior-qa/SKILL.md +399 -399
- package/skills/quality/senior-qa/references/qa_best_practices.md +964 -964
- package/skills/quality/senior-qa/references/test_automation_patterns.md +1009 -1009
- package/skills/quality/senior-qa/references/testing_strategies.md +649 -649
- package/skills/quality/senior-qa/scripts/coverage_analyzer.py +836 -836
- package/skills/quality/senior-qa/scripts/e2e_test_scaffolder.py +820 -820
- package/skills/quality/senior-qa/scripts/test_suite_generator.py +605 -605
- package/skills/quality/tdd-guide/HOW_TO_USE.md +313 -313
- package/skills/quality/tdd-guide/README.md +680 -680
- package/skills/quality/tdd-guide/SKILL.md +122 -122
- package/skills/quality/tdd-guide/assets/expected_output.json +77 -77
- package/skills/quality/tdd-guide/assets/sample_input_python.json +39 -39
- package/skills/quality/tdd-guide/assets/sample_input_typescript.json +36 -36
- package/skills/quality/tdd-guide/references/ci-integration.md +195 -195
- package/skills/quality/tdd-guide/references/framework-guide.md +206 -206
- package/skills/quality/tdd-guide/references/tdd-best-practices.md +128 -128
- package/skills/quality/tdd-guide/scripts/coverage_analyzer.py +434 -434
- package/skills/quality/tdd-guide/scripts/fixture_generator.py +440 -440
- package/skills/quality/tdd-guide/scripts/format_detector.py +384 -384
- package/skills/quality/tdd-guide/scripts/framework_adapter.py +428 -428
- package/skills/quality/tdd-guide/scripts/metrics_calculator.py +456 -456
- package/skills/quality/tdd-guide/scripts/output_formatter.py +354 -354
- package/skills/quality/tdd-guide/scripts/tdd_workflow.py +474 -474
- package/skills/quality/tdd-guide/scripts/test_generator.py +438 -438
- package/skills/quality/test-master/SKILL.md +94 -94
- package/skills/quality/test-master/references/automation-frameworks.md +294 -294
- package/skills/quality/test-master/references/e2e-testing.md +128 -128
- package/skills/quality/test-master/references/integration-testing.md +120 -120
- package/skills/quality/test-master/references/performance-testing.md +118 -118
- package/skills/quality/test-master/references/qa-methodology.md +247 -247
- package/skills/quality/test-master/references/security-testing.md +127 -127
- package/skills/quality/test-master/references/tdd-iron-laws.md +174 -174
- package/skills/quality/test-master/references/test-reports.md +104 -104
- package/skills/quality/test-master/references/testing-anti-patterns.md +231 -231
- package/skills/quality/test-master/references/unit-testing.md +113 -113
- package/skills/ruby/rails-expert/SKILL.md +154 -154
- package/skills/ruby/rails-expert/references/active-record.md +244 -244
- package/skills/ruby/rails-expert/references/api-development.md +401 -401
- package/skills/ruby/rails-expert/references/background-jobs.md +272 -272
- package/skills/ruby/rails-expert/references/hotwire-turbo.md +228 -228
- package/skills/ruby/rails-expert/references/rspec-testing.md +367 -367
- package/skills/swift/swift-expert/SKILL.md +163 -163
- package/skills/swift/swift-expert/references/async-concurrency.md +360 -360
- package/skills/swift/swift-expert/references/memory-performance.md +377 -377
- package/skills/swift/swift-expert/references/protocol-oriented.md +354 -354
- package/skills/swift/swift-expert/references/swiftui-patterns.md +291 -291
- package/skills/swift/swift-expert/references/testing-patterns.md +399 -399
- package/skills/workflow/brainstorming/SKILL.md +164 -164
- package/skills/workflow/brainstorming/scripts/frame-template.html +214 -214
- package/skills/workflow/brainstorming/scripts/helper.js +88 -88
- package/skills/workflow/brainstorming/scripts/server.cjs +354 -354
- package/skills/workflow/brainstorming/scripts/start-server.sh +148 -148
- package/skills/workflow/brainstorming/scripts/stop-server.sh +56 -56
- package/skills/workflow/brainstorming/spec-document-reviewer-prompt.md +49 -49
- package/skills/workflow/brainstorming/visual-companion.md +287 -287
- package/skills/workflow/documentation/SKILL.md +45 -45
- package/skills/workflow/entropy-management/SKILL.md +115 -115
- package/skills/workflow/executing-plans/SKILL.md +70 -70
- package/skills/workflow/finishing-a-development-branch/SKILL.md +200 -200
- package/skills/workflow/receiving-code-review/SKILL.md +213 -213
- package/skills/workflow/requesting-code-review/SKILL.md +105 -105
- package/skills/workflow/requesting-code-review/code-reviewer.md +146 -146
- package/skills/workflow/requirement-engineering/SKILL.md +111 -111
- package/skills/workflow/systematic-debugging/CREATION-LOG.md +119 -119
- package/skills/workflow/systematic-debugging/SKILL.md +296 -296
- package/skills/workflow/systematic-debugging/condition-based-waiting-example.ts +158 -158
- package/skills/workflow/systematic-debugging/condition-based-waiting.md +115 -115
- package/skills/workflow/systematic-debugging/defense-in-depth.md +122 -122
- package/skills/workflow/systematic-debugging/find-polluter.sh +63 -63
- package/skills/workflow/systematic-debugging/root-cause-tracing.md +169 -169
- package/skills/workflow/systematic-debugging/test-academic.md +14 -14
- package/skills/workflow/systematic-debugging/test-pressure-1.md +58 -58
- package/skills/workflow/systematic-debugging/test-pressure-2.md +68 -68
- package/skills/workflow/systematic-debugging/test-pressure-3.md +69 -69
- package/skills/workflow/using-git-worktrees/SKILL.md +218 -218
- package/skills/workflow/verification-before-completion/SKILL.md +139 -139
- package/skills/workflow/writing-plans/SKILL.md +151 -151
- package/skills/workflow/writing-plans/plan-document-reviewer-prompt.md +49 -49
- package/skills/workflow/writing-skills/SKILL.md +655 -655
- package/skills/workflow/writing-skills/anthropic-best-practices.md +1150 -1150
- package/skills/workflow/writing-skills/examples/CLAUDE_MD_TESTING.md +189 -189
- package/skills/workflow/writing-skills/persuasion-principles.md +187 -187
- package/skills/workflow/writing-skills/render-graphs.js +168 -168
- package/skills/workflow/writing-skills/testing-skills-with-subagents.md +384 -384
|
@@ -1,907 +1,907 @@
|
|
|
1
|
-
# Pipeline Orchestration
|
|
2
|
-
|
|
3
|
-
---
|
|
4
|
-
|
|
5
|
-
## Overview
|
|
6
|
-
|
|
7
|
-
Pipeline orchestration automates the end-to-end ML workflow from data ingestion through model deployment. Orchestrators manage dependencies, handle failures, enable scheduling, and provide observability across complex multi-step pipelines.
|
|
8
|
-
|
|
9
|
-
## When to Use This Reference
|
|
10
|
-
|
|
11
|
-
- Building Kubeflow Pipelines for ML workflows
|
|
12
|
-
- Creating Airflow DAGs for data and ML pipelines
|
|
13
|
-
- Implementing Prefect flows for modern orchestration
|
|
14
|
-
- Designing pipeline DAGs and component dependencies
|
|
15
|
-
- Setting up scheduled retraining workflows
|
|
16
|
-
|
|
17
|
-
## When NOT to Use
|
|
18
|
-
|
|
19
|
-
- Simple linear scripts without dependencies
|
|
20
|
-
- One-off data processing tasks
|
|
21
|
-
- Interactive development and experimentation
|
|
22
|
-
|
|
23
|
-
---
|
|
24
|
-
|
|
25
|
-
## Kubeflow Pipelines
|
|
26
|
-
|
|
27
|
-
### Pipeline Definition (KFP v2)
|
|
28
|
-
|
|
29
|
-
```python
|
|
30
|
-
from kfp import dsl
|
|
31
|
-
from kfp.dsl import Input, Output, Artifact, Dataset, Model, Metrics
|
|
32
|
-
from kfp import compiler
|
|
33
|
-
from typing import NamedTuple
|
|
34
|
-
|
|
35
|
-
@dsl.component(
|
|
36
|
-
base_image="python:3.11-slim",
|
|
37
|
-
packages_to_install=["pandas", "scikit-learn"],
|
|
38
|
-
)
|
|
39
|
-
def load_data(
|
|
40
|
-
data_path: str,
|
|
41
|
-
output_dataset: Output[Dataset],
|
|
42
|
-
) -> None:
|
|
43
|
-
"""Load and validate raw data."""
|
|
44
|
-
import pandas as pd
|
|
45
|
-
|
|
46
|
-
df = pd.read_parquet(data_path)
|
|
47
|
-
|
|
48
|
-
# Basic validation
|
|
49
|
-
assert len(df) > 0, "Dataset is empty"
|
|
50
|
-
assert "target" in df.columns, "Missing target column"
|
|
51
|
-
|
|
52
|
-
df.to_parquet(output_dataset.path)
|
|
53
|
-
output_dataset.metadata["num_rows"] = len(df)
|
|
54
|
-
output_dataset.metadata["num_features"] = len(df.columns) - 1
|
|
55
|
-
|
|
56
|
-
@dsl.component(
|
|
57
|
-
base_image="python:3.11-slim",
|
|
58
|
-
packages_to_install=["pandas", "scikit-learn"],
|
|
59
|
-
)
|
|
60
|
-
def preprocess_data(
|
|
61
|
-
input_dataset: Input[Dataset],
|
|
62
|
-
train_dataset: Output[Dataset],
|
|
63
|
-
test_dataset: Output[Dataset],
|
|
64
|
-
test_size: float = 0.2,
|
|
65
|
-
random_state: int = 42,
|
|
66
|
-
) -> None:
|
|
67
|
-
"""Preprocess and split data."""
|
|
68
|
-
import pandas as pd
|
|
69
|
-
from sklearn.model_selection import train_test_split
|
|
70
|
-
from sklearn.preprocessing import StandardScaler
|
|
71
|
-
|
|
72
|
-
df = pd.read_parquet(input_dataset.path)
|
|
73
|
-
|
|
74
|
-
X = df.drop("target", axis=1)
|
|
75
|
-
y = df["target"]
|
|
76
|
-
|
|
77
|
-
X_train, X_test, y_train, y_test = train_test_split(
|
|
78
|
-
X, y, test_size=test_size, random_state=random_state
|
|
79
|
-
)
|
|
80
|
-
|
|
81
|
-
scaler = StandardScaler()
|
|
82
|
-
X_train_scaled = scaler.fit_transform(X_train)
|
|
83
|
-
X_test_scaled = scaler.transform(X_test)
|
|
84
|
-
|
|
85
|
-
train_df = pd.DataFrame(X_train_scaled, columns=X.columns)
|
|
86
|
-
train_df["target"] = y_train.values
|
|
87
|
-
train_df.to_parquet(train_dataset.path)
|
|
88
|
-
|
|
89
|
-
test_df = pd.DataFrame(X_test_scaled, columns=X.columns)
|
|
90
|
-
test_df["target"] = y_test.values
|
|
91
|
-
test_df.to_parquet(test_dataset.path)
|
|
92
|
-
|
|
93
|
-
@dsl.component(
|
|
94
|
-
base_image="python:3.11-slim",
|
|
95
|
-
packages_to_install=["pandas", "scikit-learn", "joblib"],
|
|
96
|
-
)
|
|
97
|
-
def train_model(
|
|
98
|
-
train_dataset: Input[Dataset],
|
|
99
|
-
model_artifact: Output[Model],
|
|
100
|
-
n_estimators: int = 100,
|
|
101
|
-
max_depth: int = 10,
|
|
102
|
-
) -> None:
|
|
103
|
-
"""Train RandomForest model."""
|
|
104
|
-
import pandas as pd
|
|
105
|
-
from sklearn.ensemble import RandomForestClassifier
|
|
106
|
-
import joblib
|
|
107
|
-
|
|
108
|
-
df = pd.read_parquet(train_dataset.path)
|
|
109
|
-
X = df.drop("target", axis=1)
|
|
110
|
-
y = df["target"]
|
|
111
|
-
|
|
112
|
-
model = RandomForestClassifier(
|
|
113
|
-
n_estimators=n_estimators,
|
|
114
|
-
max_depth=max_depth,
|
|
115
|
-
random_state=42,
|
|
116
|
-
)
|
|
117
|
-
model.fit(X, y)
|
|
118
|
-
|
|
119
|
-
joblib.dump(model, model_artifact.path)
|
|
120
|
-
model_artifact.metadata["n_estimators"] = n_estimators
|
|
121
|
-
model_artifact.metadata["max_depth"] = max_depth
|
|
122
|
-
|
|
123
|
-
@dsl.component(
|
|
124
|
-
base_image="python:3.11-slim",
|
|
125
|
-
packages_to_install=["pandas", "scikit-learn", "joblib"],
|
|
126
|
-
)
|
|
127
|
-
def evaluate_model(
|
|
128
|
-
model_artifact: Input[Model],
|
|
129
|
-
test_dataset: Input[Dataset],
|
|
130
|
-
metrics: Output[Metrics],
|
|
131
|
-
threshold: float = 0.8,
|
|
132
|
-
) -> NamedTuple("Outputs", [("passed", bool), ("accuracy", float)]):
|
|
133
|
-
"""Evaluate model and check threshold."""
|
|
134
|
-
import pandas as pd
|
|
135
|
-
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
|
|
136
|
-
import joblib
|
|
137
|
-
from collections import namedtuple
|
|
138
|
-
|
|
139
|
-
model = joblib.load(model_artifact.path)
|
|
140
|
-
df = pd.read_parquet(test_dataset.path)
|
|
141
|
-
X = df.drop("target", axis=1)
|
|
142
|
-
y = df["target"]
|
|
143
|
-
|
|
144
|
-
predictions = model.predict(X)
|
|
145
|
-
|
|
146
|
-
accuracy = accuracy_score(y, predictions)
|
|
147
|
-
precision = precision_score(y, predictions, average="weighted")
|
|
148
|
-
recall = recall_score(y, predictions, average="weighted")
|
|
149
|
-
f1 = f1_score(y, predictions, average="weighted")
|
|
150
|
-
|
|
151
|
-
metrics.log_metric("accuracy", accuracy)
|
|
152
|
-
metrics.log_metric("precision", precision)
|
|
153
|
-
metrics.log_metric("recall", recall)
|
|
154
|
-
metrics.log_metric("f1_score", f1)
|
|
155
|
-
|
|
156
|
-
passed = accuracy >= threshold
|
|
157
|
-
|
|
158
|
-
Outputs = namedtuple("Outputs", ["passed", "accuracy"])
|
|
159
|
-
return Outputs(passed, accuracy)
|
|
160
|
-
|
|
161
|
-
@dsl.component(
|
|
162
|
-
base_image="python:3.11-slim",
|
|
163
|
-
packages_to_install=["google-cloud-storage"],
|
|
164
|
-
)
|
|
165
|
-
def deploy_model(
|
|
166
|
-
model_artifact: Input[Model],
|
|
167
|
-
model_name: str,
|
|
168
|
-
endpoint: str,
|
|
169
|
-
) -> str:
|
|
170
|
-
"""Deploy model to serving endpoint."""
|
|
171
|
-
from google.cloud import storage
|
|
172
|
-
import shutil
|
|
173
|
-
|
|
174
|
-
# Copy model to GCS
|
|
175
|
-
bucket_name = endpoint.split("/")[2]
|
|
176
|
-
model_path = f"models/{model_name}/model.joblib"
|
|
177
|
-
|
|
178
|
-
client = storage.Client()
|
|
179
|
-
bucket = client.bucket(bucket_name)
|
|
180
|
-
blob = bucket.blob(model_path)
|
|
181
|
-
blob.upload_from_filename(model_artifact.path)
|
|
182
|
-
|
|
183
|
-
return f"gs://{bucket_name}/{model_path}"
|
|
184
|
-
|
|
185
|
-
@dsl.pipeline(
|
|
186
|
-
name="ml-training-pipeline",
|
|
187
|
-
description="End-to-end ML training pipeline",
|
|
188
|
-
)
|
|
189
|
-
def ml_pipeline(
|
|
190
|
-
data_path: str,
|
|
191
|
-
n_estimators: int = 100,
|
|
192
|
-
max_depth: int = 10,
|
|
193
|
-
accuracy_threshold: float = 0.8,
|
|
194
|
-
model_name: str = "classifier",
|
|
195
|
-
endpoint: str = "gs://ml-models/serving",
|
|
196
|
-
) -> None:
|
|
197
|
-
"""Complete ML training pipeline."""
|
|
198
|
-
|
|
199
|
-
load_task = load_data(data_path=data_path)
|
|
200
|
-
|
|
201
|
-
preprocess_task = preprocess_data(
|
|
202
|
-
input_dataset=load_task.outputs["output_dataset"],
|
|
203
|
-
)
|
|
204
|
-
|
|
205
|
-
train_task = train_model(
|
|
206
|
-
train_dataset=preprocess_task.outputs["train_dataset"],
|
|
207
|
-
n_estimators=n_estimators,
|
|
208
|
-
max_depth=max_depth,
|
|
209
|
-
)
|
|
210
|
-
|
|
211
|
-
evaluate_task = evaluate_model(
|
|
212
|
-
model_artifact=train_task.outputs["model_artifact"],
|
|
213
|
-
test_dataset=preprocess_task.outputs["test_dataset"],
|
|
214
|
-
threshold=accuracy_threshold,
|
|
215
|
-
)
|
|
216
|
-
|
|
217
|
-
with dsl.If(evaluate_task.outputs["passed"] == True):
|
|
218
|
-
deploy_model(
|
|
219
|
-
model_artifact=train_task.outputs["model_artifact"],
|
|
220
|
-
model_name=model_name,
|
|
221
|
-
endpoint=endpoint,
|
|
222
|
-
)
|
|
223
|
-
|
|
224
|
-
# Compile pipeline
|
|
225
|
-
if __name__ == "__main__":
|
|
226
|
-
compiler.Compiler().compile(
|
|
227
|
-
ml_pipeline,
|
|
228
|
-
"ml_pipeline.yaml",
|
|
229
|
-
)
|
|
230
|
-
```
|
|
231
|
-
|
|
232
|
-
### Running Kubeflow Pipelines
|
|
233
|
-
|
|
234
|
-
```python
|
|
235
|
-
from kfp.client import Client
|
|
236
|
-
|
|
237
|
-
def run_pipeline(
|
|
238
|
-
pipeline_file: str,
|
|
239
|
-
experiment_name: str,
|
|
240
|
-
run_name: str,
|
|
241
|
-
parameters: dict,
|
|
242
|
-
) -> str:
|
|
243
|
-
"""Submit pipeline run to Kubeflow."""
|
|
244
|
-
client = Client(host="https://kubeflow.example.com/pipeline")
|
|
245
|
-
|
|
246
|
-
# Create or get experiment
|
|
247
|
-
experiment = client.create_experiment(name=experiment_name)
|
|
248
|
-
|
|
249
|
-
# Submit run
|
|
250
|
-
run = client.create_run_from_pipeline_package(
|
|
251
|
-
pipeline_file=pipeline_file,
|
|
252
|
-
experiment_id=experiment.experiment_id,
|
|
253
|
-
run_name=run_name,
|
|
254
|
-
arguments=parameters,
|
|
255
|
-
)
|
|
256
|
-
|
|
257
|
-
return run.run_id
|
|
258
|
-
|
|
259
|
-
def schedule_pipeline(
|
|
260
|
-
pipeline_file: str,
|
|
261
|
-
experiment_name: str,
|
|
262
|
-
schedule_name: str,
|
|
263
|
-
cron_expression: str,
|
|
264
|
-
parameters: dict,
|
|
265
|
-
) -> str:
|
|
266
|
-
"""Create recurring pipeline run."""
|
|
267
|
-
client = Client(host="https://kubeflow.example.com/pipeline")
|
|
268
|
-
|
|
269
|
-
experiment = client.create_experiment(name=experiment_name)
|
|
270
|
-
|
|
271
|
-
# Create recurring run
|
|
272
|
-
job = client.create_recurring_run(
|
|
273
|
-
experiment_id=experiment.experiment_id,
|
|
274
|
-
job_name=schedule_name,
|
|
275
|
-
pipeline_package_path=pipeline_file,
|
|
276
|
-
cron_expression=cron_expression,
|
|
277
|
-
enabled=True,
|
|
278
|
-
parameters=parameters,
|
|
279
|
-
)
|
|
280
|
-
|
|
281
|
-
return job.id
|
|
282
|
-
```
|
|
283
|
-
|
|
284
|
-
---
|
|
285
|
-
|
|
286
|
-
## Apache Airflow
|
|
287
|
-
|
|
288
|
-
### ML Pipeline DAG
|
|
289
|
-
|
|
290
|
-
```python
|
|
291
|
-
from airflow import DAG
|
|
292
|
-
from airflow.operators.python import PythonOperator, BranchPythonOperator
|
|
293
|
-
from airflow.operators.empty import EmptyOperator
|
|
294
|
-
from airflow.providers.amazon.aws.operators.s3 import S3CreateObjectOperator
|
|
295
|
-
from airflow.utils.trigger_rule import TriggerRule
|
|
296
|
-
from datetime import datetime, timedelta
|
|
297
|
-
import json
|
|
298
|
-
|
|
299
|
-
default_args = {
|
|
300
|
-
"owner": "ml-team",
|
|
301
|
-
"depends_on_past": False,
|
|
302
|
-
"email_on_failure": True,
|
|
303
|
-
"email_on_retry": False,
|
|
304
|
-
"retries": 2,
|
|
305
|
-
"retry_delay": timedelta(minutes=5),
|
|
306
|
-
"execution_timeout": timedelta(hours=2),
|
|
307
|
-
}
|
|
308
|
-
|
|
309
|
-
def load_data(**context):
|
|
310
|
-
"""Load data from source."""
|
|
311
|
-
import pandas as pd
|
|
312
|
-
|
|
313
|
-
data_path = context["params"]["data_path"]
|
|
314
|
-
df = pd.read_parquet(data_path)
|
|
315
|
-
|
|
316
|
-
# Push to XCom for downstream tasks
|
|
317
|
-
output_path = f"/tmp/data_{context['run_id']}.parquet"
|
|
318
|
-
df.to_parquet(output_path)
|
|
319
|
-
|
|
320
|
-
context["ti"].xcom_push(key="data_path", value=output_path)
|
|
321
|
-
context["ti"].xcom_push(key="num_rows", value=len(df))
|
|
322
|
-
|
|
323
|
-
return output_path
|
|
324
|
-
|
|
325
|
-
def preprocess_data(**context):
|
|
326
|
-
"""Preprocess and split data."""
|
|
327
|
-
import pandas as pd
|
|
328
|
-
from sklearn.model_selection import train_test_split
|
|
329
|
-
from sklearn.preprocessing import StandardScaler
|
|
330
|
-
|
|
331
|
-
input_path = context["ti"].xcom_pull(key="data_path", task_ids="load_data")
|
|
332
|
-
df = pd.read_parquet(input_path)
|
|
333
|
-
|
|
334
|
-
X = df.drop("target", axis=1)
|
|
335
|
-
y = df["target"]
|
|
336
|
-
|
|
337
|
-
X_train, X_test, y_train, y_test = train_test_split(
|
|
338
|
-
X, y, test_size=0.2, random_state=42
|
|
339
|
-
)
|
|
340
|
-
|
|
341
|
-
scaler = StandardScaler()
|
|
342
|
-
X_train_scaled = scaler.fit_transform(X_train)
|
|
343
|
-
X_test_scaled = scaler.transform(X_test)
|
|
344
|
-
|
|
345
|
-
# Save processed data
|
|
346
|
-
train_path = f"/tmp/train_{context['run_id']}.parquet"
|
|
347
|
-
test_path = f"/tmp/test_{context['run_id']}.parquet"
|
|
348
|
-
|
|
349
|
-
train_df = pd.DataFrame(X_train_scaled, columns=X.columns)
|
|
350
|
-
train_df["target"] = y_train.values
|
|
351
|
-
train_df.to_parquet(train_path)
|
|
352
|
-
|
|
353
|
-
test_df = pd.DataFrame(X_test_scaled, columns=X.columns)
|
|
354
|
-
test_df["target"] = y_test.values
|
|
355
|
-
test_df.to_parquet(test_path)
|
|
356
|
-
|
|
357
|
-
context["ti"].xcom_push(key="train_path", value=train_path)
|
|
358
|
-
context["ti"].xcom_push(key="test_path", value=test_path)
|
|
359
|
-
|
|
360
|
-
def train_model(**context):
|
|
361
|
-
"""Train ML model."""
|
|
362
|
-
import pandas as pd
|
|
363
|
-
from sklearn.ensemble import RandomForestClassifier
|
|
364
|
-
import joblib
|
|
365
|
-
|
|
366
|
-
train_path = context["ti"].xcom_pull(key="train_path", task_ids="preprocess_data")
|
|
367
|
-
df = pd.read_parquet(train_path)
|
|
368
|
-
|
|
369
|
-
X = df.drop("target", axis=1)
|
|
370
|
-
y = df["target"]
|
|
371
|
-
|
|
372
|
-
params = context["params"]
|
|
373
|
-
model = RandomForestClassifier(
|
|
374
|
-
n_estimators=params.get("n_estimators", 100),
|
|
375
|
-
max_depth=params.get("max_depth", 10),
|
|
376
|
-
random_state=42,
|
|
377
|
-
)
|
|
378
|
-
model.fit(X, y)
|
|
379
|
-
|
|
380
|
-
model_path = f"/tmp/model_{context['run_id']}.joblib"
|
|
381
|
-
joblib.dump(model, model_path)
|
|
382
|
-
|
|
383
|
-
context["ti"].xcom_push(key="model_path", value=model_path)
|
|
384
|
-
|
|
385
|
-
def evaluate_model(**context):
|
|
386
|
-
"""Evaluate model and return metrics."""
|
|
387
|
-
import pandas as pd
|
|
388
|
-
from sklearn.metrics import accuracy_score, precision_score, recall_score
|
|
389
|
-
import joblib
|
|
390
|
-
|
|
391
|
-
model_path = context["ti"].xcom_pull(key="model_path", task_ids="train_model")
|
|
392
|
-
test_path = context["ti"].xcom_pull(key="test_path", task_ids="preprocess_data")
|
|
393
|
-
|
|
394
|
-
model = joblib.load(model_path)
|
|
395
|
-
df = pd.read_parquet(test_path)
|
|
396
|
-
|
|
397
|
-
X = df.drop("target", axis=1)
|
|
398
|
-
y = df["target"]
|
|
399
|
-
|
|
400
|
-
predictions = model.predict(X)
|
|
401
|
-
|
|
402
|
-
metrics = {
|
|
403
|
-
"accuracy": accuracy_score(y, predictions),
|
|
404
|
-
"precision": precision_score(y, predictions, average="weighted"),
|
|
405
|
-
"recall": recall_score(y, predictions, average="weighted"),
|
|
406
|
-
}
|
|
407
|
-
|
|
408
|
-
context["ti"].xcom_push(key="metrics", value=metrics)
|
|
409
|
-
|
|
410
|
-
return metrics
|
|
411
|
-
|
|
412
|
-
def check_metrics_threshold(**context):
|
|
413
|
-
"""Branch based on model performance."""
|
|
414
|
-
metrics = context["ti"].xcom_pull(key="metrics", task_ids="evaluate_model")
|
|
415
|
-
threshold = context["params"].get("accuracy_threshold", 0.8)
|
|
416
|
-
|
|
417
|
-
if metrics["accuracy"] >= threshold:
|
|
418
|
-
return "deploy_model"
|
|
419
|
-
return "skip_deployment"
|
|
420
|
-
|
|
421
|
-
def deploy_model(**context):
|
|
422
|
-
"""Deploy model to production."""
|
|
423
|
-
import shutil
|
|
424
|
-
|
|
425
|
-
model_path = context["ti"].xcom_pull(key="model_path", task_ids="train_model")
|
|
426
|
-
metrics = context["ti"].xcom_pull(key="metrics", task_ids="evaluate_model")
|
|
427
|
-
|
|
428
|
-
# In production, this would upload to model registry/serving
|
|
429
|
-
deploy_path = f"/models/production/model_{context['run_id']}.joblib"
|
|
430
|
-
shutil.copy(model_path, deploy_path)
|
|
431
|
-
|
|
432
|
-
return {
|
|
433
|
-
"model_path": deploy_path,
|
|
434
|
-
"metrics": metrics,
|
|
435
|
-
"deployed_at": datetime.utcnow().isoformat(),
|
|
436
|
-
}
|
|
437
|
-
|
|
438
|
-
with DAG(
|
|
439
|
-
dag_id="ml_training_pipeline",
|
|
440
|
-
default_args=default_args,
|
|
441
|
-
description="End-to-end ML training pipeline",
|
|
442
|
-
schedule_interval="0 2 * * *", # Daily at 2 AM
|
|
443
|
-
start_date=datetime(2024, 1, 1),
|
|
444
|
-
catchup=False,
|
|
445
|
-
tags=["ml", "training", "production"],
|
|
446
|
-
params={
|
|
447
|
-
"data_path": "s3://data-bucket/training_data.parquet",
|
|
448
|
-
"n_estimators": 100,
|
|
449
|
-
"max_depth": 10,
|
|
450
|
-
"accuracy_threshold": 0.8,
|
|
451
|
-
},
|
|
452
|
-
) as dag:
|
|
453
|
-
|
|
454
|
-
start = EmptyOperator(task_id="start")
|
|
455
|
-
|
|
456
|
-
load = PythonOperator(
|
|
457
|
-
task_id="load_data",
|
|
458
|
-
python_callable=load_data,
|
|
459
|
-
)
|
|
460
|
-
|
|
461
|
-
preprocess = PythonOperator(
|
|
462
|
-
task_id="preprocess_data",
|
|
463
|
-
python_callable=preprocess_data,
|
|
464
|
-
)
|
|
465
|
-
|
|
466
|
-
train = PythonOperator(
|
|
467
|
-
task_id="train_model",
|
|
468
|
-
python_callable=train_model,
|
|
469
|
-
)
|
|
470
|
-
|
|
471
|
-
evaluate = PythonOperator(
|
|
472
|
-
task_id="evaluate_model",
|
|
473
|
-
python_callable=evaluate_model,
|
|
474
|
-
)
|
|
475
|
-
|
|
476
|
-
check_threshold = BranchPythonOperator(
|
|
477
|
-
task_id="check_metrics_threshold",
|
|
478
|
-
python_callable=check_metrics_threshold,
|
|
479
|
-
)
|
|
480
|
-
|
|
481
|
-
deploy = PythonOperator(
|
|
482
|
-
task_id="deploy_model",
|
|
483
|
-
python_callable=deploy_model,
|
|
484
|
-
)
|
|
485
|
-
|
|
486
|
-
skip = EmptyOperator(task_id="skip_deployment")
|
|
487
|
-
|
|
488
|
-
end = EmptyOperator(
|
|
489
|
-
task_id="end",
|
|
490
|
-
trigger_rule=TriggerRule.NONE_FAILED_MIN_ONE_SUCCESS,
|
|
491
|
-
)
|
|
492
|
-
|
|
493
|
-
start >> load >> preprocess >> train >> evaluate >> check_threshold
|
|
494
|
-
check_threshold >> [deploy, skip] >> end
|
|
495
|
-
```
|
|
496
|
-
|
|
497
|
-
---
|
|
498
|
-
|
|
499
|
-
## Prefect
|
|
500
|
-
|
|
501
|
-
### Modern Flow-Based Pipeline
|
|
502
|
-
|
|
503
|
-
```python
|
|
504
|
-
from prefect import flow, task, get_run_logger
|
|
505
|
-
from prefect.artifacts import create_markdown_artifact
|
|
506
|
-
from prefect.tasks import task_input_hash
|
|
507
|
-
from datetime import timedelta
|
|
508
|
-
import pandas as pd
|
|
509
|
-
|
|
510
|
-
@task(
|
|
511
|
-
retries=3,
|
|
512
|
-
retry_delay_seconds=60,
|
|
513
|
-
cache_key_fn=task_input_hash,
|
|
514
|
-
cache_expiration=timedelta(hours=1),
|
|
515
|
-
)
|
|
516
|
-
def load_data(data_path: str) -> pd.DataFrame:
|
|
517
|
-
"""Load data with caching."""
|
|
518
|
-
logger = get_run_logger()
|
|
519
|
-
logger.info(f"Loading data from {data_path}")
|
|
520
|
-
|
|
521
|
-
df = pd.read_parquet(data_path)
|
|
522
|
-
logger.info(f"Loaded {len(df)} rows")
|
|
523
|
-
|
|
524
|
-
return df
|
|
525
|
-
|
|
526
|
-
@task(retries=2)
|
|
527
|
-
def preprocess_data(
|
|
528
|
-
df: pd.DataFrame,
|
|
529
|
-
test_size: float = 0.2,
|
|
530
|
-
) -> tuple[pd.DataFrame, pd.DataFrame]:
|
|
531
|
-
"""Preprocess and split data."""
|
|
532
|
-
from sklearn.model_selection import train_test_split
|
|
533
|
-
from sklearn.preprocessing import StandardScaler
|
|
534
|
-
|
|
535
|
-
logger = get_run_logger()
|
|
536
|
-
|
|
537
|
-
X = df.drop("target", axis=1)
|
|
538
|
-
y = df["target"]
|
|
539
|
-
|
|
540
|
-
X_train, X_test, y_train, y_test = train_test_split(
|
|
541
|
-
X, y, test_size=test_size, random_state=42
|
|
542
|
-
)
|
|
543
|
-
|
|
544
|
-
scaler = StandardScaler()
|
|
545
|
-
X_train_scaled = scaler.fit_transform(X_train)
|
|
546
|
-
X_test_scaled = scaler.transform(X_test)
|
|
547
|
-
|
|
548
|
-
train_df = pd.DataFrame(X_train_scaled, columns=X.columns)
|
|
549
|
-
train_df["target"] = y_train.values
|
|
550
|
-
|
|
551
|
-
test_df = pd.DataFrame(X_test_scaled, columns=X.columns)
|
|
552
|
-
test_df["target"] = y_test.values
|
|
553
|
-
|
|
554
|
-
logger.info(f"Train: {len(train_df)}, Test: {len(test_df)}")
|
|
555
|
-
|
|
556
|
-
return train_df, test_df
|
|
557
|
-
|
|
558
|
-
@task
|
|
559
|
-
def train_model(
|
|
560
|
-
train_df: pd.DataFrame,
|
|
561
|
-
n_estimators: int = 100,
|
|
562
|
-
max_depth: int = 10,
|
|
563
|
-
):
|
|
564
|
-
"""Train RandomForest model."""
|
|
565
|
-
from sklearn.ensemble import RandomForestClassifier
|
|
566
|
-
|
|
567
|
-
logger = get_run_logger()
|
|
568
|
-
|
|
569
|
-
X = train_df.drop("target", axis=1)
|
|
570
|
-
y = train_df["target"]
|
|
571
|
-
|
|
572
|
-
model = RandomForestClassifier(
|
|
573
|
-
n_estimators=n_estimators,
|
|
574
|
-
max_depth=max_depth,
|
|
575
|
-
random_state=42,
|
|
576
|
-
n_jobs=-1,
|
|
577
|
-
)
|
|
578
|
-
|
|
579
|
-
logger.info("Training model...")
|
|
580
|
-
model.fit(X, y)
|
|
581
|
-
logger.info("Training complete")
|
|
582
|
-
|
|
583
|
-
return model
|
|
584
|
-
|
|
585
|
-
@task
|
|
586
|
-
def evaluate_model(model, test_df: pd.DataFrame) -> dict:
|
|
587
|
-
"""Evaluate model and create artifact."""
|
|
588
|
-
from sklearn.metrics import (
|
|
589
|
-
accuracy_score, precision_score, recall_score,
|
|
590
|
-
f1_score, classification_report
|
|
591
|
-
)
|
|
592
|
-
|
|
593
|
-
logger = get_run_logger()
|
|
594
|
-
|
|
595
|
-
X = test_df.drop("target", axis=1)
|
|
596
|
-
y = test_df["target"]
|
|
597
|
-
|
|
598
|
-
predictions = model.predict(X)
|
|
599
|
-
|
|
600
|
-
metrics = {
|
|
601
|
-
"accuracy": accuracy_score(y, predictions),
|
|
602
|
-
"precision": precision_score(y, predictions, average="weighted"),
|
|
603
|
-
"recall": recall_score(y, predictions, average="weighted"),
|
|
604
|
-
"f1_score": f1_score(y, predictions, average="weighted"),
|
|
605
|
-
}
|
|
606
|
-
|
|
607
|
-
logger.info(f"Metrics: {metrics}")
|
|
608
|
-
|
|
609
|
-
# Create markdown artifact for Prefect UI
|
|
610
|
-
report = classification_report(y, predictions)
|
|
611
|
-
markdown = f"""
|
|
612
|
-
# Model Evaluation Report
|
|
613
|
-
|
|
614
|
-
## Metrics
|
|
615
|
-
| Metric | Value |
|
|
616
|
-
|--------|-------|
|
|
617
|
-
| Accuracy | {metrics['accuracy']:.4f} |
|
|
618
|
-
| Precision | {metrics['precision']:.4f} |
|
|
619
|
-
| Recall | {metrics['recall']:.4f} |
|
|
620
|
-
| F1 Score | {metrics['f1_score']:.4f} |
|
|
621
|
-
|
|
622
|
-
## Classification Report
|
|
623
|
-
```
|
|
624
|
-
{report}
|
|
625
|
-
```
|
|
626
|
-
"""
|
|
627
|
-
create_markdown_artifact(
|
|
628
|
-
key="model-evaluation",
|
|
629
|
-
markdown=markdown,
|
|
630
|
-
description="Model evaluation metrics",
|
|
631
|
-
)
|
|
632
|
-
|
|
633
|
-
return metrics
|
|
634
|
-
|
|
635
|
-
@task
|
|
636
|
-
def deploy_model(model, metrics: dict, threshold: float) -> bool:
|
|
637
|
-
"""Deploy model if metrics pass threshold."""
|
|
638
|
-
import joblib
|
|
639
|
-
from datetime import datetime
|
|
640
|
-
|
|
641
|
-
logger = get_run_logger()
|
|
642
|
-
|
|
643
|
-
if metrics["accuracy"] < threshold:
|
|
644
|
-
logger.warning(
|
|
645
|
-
f"Model accuracy {metrics['accuracy']:.4f} below threshold {threshold}"
|
|
646
|
-
)
|
|
647
|
-
return False
|
|
648
|
-
|
|
649
|
-
# Save model
|
|
650
|
-
model_path = f"/models/model_{datetime.utcnow().strftime('%Y%m%d_%H%M%S')}.joblib"
|
|
651
|
-
joblib.dump(model, model_path)
|
|
652
|
-
logger.info(f"Model deployed to {model_path}")
|
|
653
|
-
|
|
654
|
-
return True
|
|
655
|
-
|
|
656
|
-
@flow(
|
|
657
|
-
name="ml-training-pipeline",
|
|
658
|
-
description="End-to-end ML training pipeline",
|
|
659
|
-
retries=1,
|
|
660
|
-
retry_delay_seconds=300,
|
|
661
|
-
)
|
|
662
|
-
def ml_training_flow(
|
|
663
|
-
data_path: str,
|
|
664
|
-
n_estimators: int = 100,
|
|
665
|
-
max_depth: int = 10,
|
|
666
|
-
accuracy_threshold: float = 0.8,
|
|
667
|
-
) -> dict:
|
|
668
|
-
"""Main ML training flow."""
|
|
669
|
-
logger = get_run_logger()
|
|
670
|
-
logger.info("Starting ML training pipeline")
|
|
671
|
-
|
|
672
|
-
# Load and preprocess
|
|
673
|
-
df = load_data(data_path)
|
|
674
|
-
train_df, test_df = preprocess_data(df)
|
|
675
|
-
|
|
676
|
-
# Train and evaluate
|
|
677
|
-
model = train_model(train_df, n_estimators, max_depth)
|
|
678
|
-
metrics = evaluate_model(model, test_df)
|
|
679
|
-
|
|
680
|
-
# Deploy if threshold met
|
|
681
|
-
deployed = deploy_model(model, metrics, accuracy_threshold)
|
|
682
|
-
|
|
683
|
-
return {
|
|
684
|
-
"metrics": metrics,
|
|
685
|
-
"deployed": deployed,
|
|
686
|
-
}
|
|
687
|
-
|
|
688
|
-
# Deployment configuration
|
|
689
|
-
if __name__ == "__main__":
|
|
690
|
-
from prefect.deployments import Deployment
|
|
691
|
-
from prefect.server.schemas.schedules import CronSchedule
|
|
692
|
-
|
|
693
|
-
deployment = Deployment.build_from_flow(
|
|
694
|
-
flow=ml_training_flow,
|
|
695
|
-
name="daily-training",
|
|
696
|
-
schedule=CronSchedule(cron="0 2 * * *"),
|
|
697
|
-
parameters={
|
|
698
|
-
"data_path": "s3://data/training.parquet",
|
|
699
|
-
"n_estimators": 100,
|
|
700
|
-
"max_depth": 10,
|
|
701
|
-
"accuracy_threshold": 0.8,
|
|
702
|
-
},
|
|
703
|
-
tags=["ml", "production"],
|
|
704
|
-
work_queue_name="ml-queue",
|
|
705
|
-
)
|
|
706
|
-
|
|
707
|
-
deployment.apply()
|
|
708
|
-
```
|
|
709
|
-
|
|
710
|
-
---
|
|
711
|
-
|
|
712
|
-
## DAG Design Patterns
|
|
713
|
-
|
|
714
|
-
### Parallel Processing Pattern
|
|
715
|
-
|
|
716
|
-
```python
|
|
717
|
-
from prefect import flow, task, unmapped
|
|
718
|
-
from typing import List
|
|
719
|
-
|
|
720
|
-
@task
|
|
721
|
-
def process_partition(partition_id: int, data_path: str) -> dict:
|
|
722
|
-
"""Process single data partition."""
|
|
723
|
-
# Process partition
|
|
724
|
-
return {"partition_id": partition_id, "records_processed": 1000}
|
|
725
|
-
|
|
726
|
-
@task
|
|
727
|
-
def aggregate_results(results: List[dict]) -> dict:
|
|
728
|
-
"""Aggregate parallel processing results."""
|
|
729
|
-
total_records = sum(r["records_processed"] for r in results)
|
|
730
|
-
return {"total_records": total_records}
|
|
731
|
-
|
|
732
|
-
@flow
|
|
733
|
-
def parallel_processing_flow(data_path: str, num_partitions: int = 4):
|
|
734
|
-
"""Process data in parallel partitions."""
|
|
735
|
-
|
|
736
|
-
# Map over partitions
|
|
737
|
-
partition_results = process_partition.map(
|
|
738
|
-
partition_id=range(num_partitions),
|
|
739
|
-
data_path=unmapped(data_path),
|
|
740
|
-
)
|
|
741
|
-
|
|
742
|
-
# Aggregate results
|
|
743
|
-
final_result = aggregate_results(partition_results)
|
|
744
|
-
|
|
745
|
-
return final_result
|
|
746
|
-
```
|
|
747
|
-
|
|
748
|
-
### Conditional Branching Pattern
|
|
749
|
-
|
|
750
|
-
```python
|
|
751
|
-
from prefect import flow, task
|
|
752
|
-
|
|
753
|
-
@task
|
|
754
|
-
def check_data_quality(df) -> bool:
|
|
755
|
-
"""Check if data meets quality standards."""
|
|
756
|
-
null_ratio = df.isnull().sum().sum() / df.size
|
|
757
|
-
return null_ratio < 0.1
|
|
758
|
-
|
|
759
|
-
@task
|
|
760
|
-
def handle_poor_quality(df):
|
|
761
|
-
"""Handle data that fails quality checks."""
|
|
762
|
-
# Impute, clean, or alert
|
|
763
|
-
pass
|
|
764
|
-
|
|
765
|
-
@task
|
|
766
|
-
def process_good_quality(df):
|
|
767
|
-
"""Process data that passes quality checks."""
|
|
768
|
-
pass
|
|
769
|
-
|
|
770
|
-
@flow
|
|
771
|
-
def conditional_flow(data_path: str):
|
|
772
|
-
"""Flow with conditional branching."""
|
|
773
|
-
df = load_data(data_path)
|
|
774
|
-
quality_ok = check_data_quality(df)
|
|
775
|
-
|
|
776
|
-
if quality_ok:
|
|
777
|
-
result = process_good_quality(df)
|
|
778
|
-
else:
|
|
779
|
-
result = handle_poor_quality(df)
|
|
780
|
-
|
|
781
|
-
return result
|
|
782
|
-
```
|
|
783
|
-
|
|
784
|
-
### Error Handling Pattern
|
|
785
|
-
|
|
786
|
-
```python
|
|
787
|
-
from prefect import flow, task
|
|
788
|
-
from prefect.states import Failed
|
|
789
|
-
|
|
790
|
-
@task
|
|
791
|
-
def risky_operation():
|
|
792
|
-
"""Operation that might fail."""
|
|
793
|
-
import random
|
|
794
|
-
if random.random() < 0.3:
|
|
795
|
-
raise ValueError("Random failure")
|
|
796
|
-
return "success"
|
|
797
|
-
|
|
798
|
-
@task
|
|
799
|
-
def fallback_operation():
|
|
800
|
-
"""Fallback when primary fails."""
|
|
801
|
-
return "fallback_result"
|
|
802
|
-
|
|
803
|
-
@task
|
|
804
|
-
def send_alert(error: Exception):
|
|
805
|
-
"""Send alert on failure."""
|
|
806
|
-
# Send to Slack, PagerDuty, etc.
|
|
807
|
-
pass
|
|
808
|
-
|
|
809
|
-
@flow
|
|
810
|
-
def resilient_flow():
|
|
811
|
-
"""Flow with error handling."""
|
|
812
|
-
try:
|
|
813
|
-
result = risky_operation()
|
|
814
|
-
except Exception as e:
|
|
815
|
-
send_alert(e)
|
|
816
|
-
result = fallback_operation()
|
|
817
|
-
|
|
818
|
-
return result
|
|
819
|
-
```
|
|
820
|
-
|
|
821
|
-
---
|
|
822
|
-
|
|
823
|
-
## Best Practices
|
|
824
|
-
|
|
825
|
-
### Pipeline Configuration
|
|
826
|
-
|
|
827
|
-
```yaml
|
|
828
|
-
# pipeline_config.yaml
|
|
829
|
-
pipeline:
|
|
830
|
-
name: ml-training
|
|
831
|
-
version: "1.0.0"
|
|
832
|
-
description: "Production ML training pipeline"
|
|
833
|
-
|
|
834
|
-
stages:
|
|
835
|
-
- name: load_data
|
|
836
|
-
timeout: 300
|
|
837
|
-
retries: 3
|
|
838
|
-
|
|
839
|
-
- name: preprocess
|
|
840
|
-
timeout: 600
|
|
841
|
-
retries: 2
|
|
842
|
-
depends_on: [load_data]
|
|
843
|
-
|
|
844
|
-
- name: train
|
|
845
|
-
timeout: 3600
|
|
846
|
-
retries: 1
|
|
847
|
-
depends_on: [preprocess]
|
|
848
|
-
resources:
|
|
849
|
-
cpu: 4
|
|
850
|
-
memory: 16Gi
|
|
851
|
-
gpu: 1
|
|
852
|
-
|
|
853
|
-
- name: evaluate
|
|
854
|
-
timeout: 300
|
|
855
|
-
depends_on: [train]
|
|
856
|
-
|
|
857
|
-
- name: deploy
|
|
858
|
-
timeout: 300
|
|
859
|
-
depends_on: [evaluate]
|
|
860
|
-
condition: "evaluate.metrics.accuracy >= 0.8"
|
|
861
|
-
|
|
862
|
-
schedule:
|
|
863
|
-
cron: "0 2 * * *"
|
|
864
|
-
timezone: "UTC"
|
|
865
|
-
|
|
866
|
-
notifications:
|
|
867
|
-
on_failure:
|
|
868
|
-
- slack: "#ml-alerts"
|
|
869
|
-
- email: ml-team@company.com
|
|
870
|
-
on_success:
|
|
871
|
-
- slack: "#ml-notifications"
|
|
872
|
-
```
|
|
873
|
-
|
|
874
|
-
### Idempotency Guidelines
|
|
875
|
-
|
|
876
|
-
```python
|
|
877
|
-
# Good: Idempotent operations
|
|
878
|
-
def process_data(run_id: str, data_path: str):
|
|
879
|
-
"""Idempotent data processing."""
|
|
880
|
-
output_path = f"s3://processed/{run_id}/data.parquet"
|
|
881
|
-
|
|
882
|
-
# Check if already processed
|
|
883
|
-
if file_exists(output_path):
|
|
884
|
-
return output_path
|
|
885
|
-
|
|
886
|
-
# Process and save
|
|
887
|
-
df = pd.read_parquet(data_path)
|
|
888
|
-
processed = transform(df)
|
|
889
|
-
processed.to_parquet(output_path)
|
|
890
|
-
|
|
891
|
-
return output_path
|
|
892
|
-
```
|
|
893
|
-
|
|
894
|
-
---
|
|
895
|
-
|
|
896
|
-
## Related References
|
|
897
|
-
|
|
898
|
-
- `training-pipelines.md` - Training components for pipelines
|
|
899
|
-
- `experiment-tracking.md` - Logging pipeline runs
|
|
900
|
-
- `feature-engineering.md` - Feature pipeline components
|
|
901
|
-
- `model-validation.md` - Validation stages in pipelines
|
|
902
|
-
|
|
903
|
-
## Cross-Reference Skills
|
|
904
|
-
|
|
905
|
-
- **DevOps Engineer** - CI/CD for pipeline deployment
|
|
906
|
-
- **Kubernetes Specialist** - Running pipelines on K8s
|
|
907
|
-
- **Cloud Architect** - Cloud infrastructure for orchestration
|
|
1
|
+
# Pipeline Orchestration
|
|
2
|
+
|
|
3
|
+
---
|
|
4
|
+
|
|
5
|
+
## Overview
|
|
6
|
+
|
|
7
|
+
Pipeline orchestration automates the end-to-end ML workflow from data ingestion through model deployment. Orchestrators manage dependencies, handle failures, enable scheduling, and provide observability across complex multi-step pipelines.
|
|
8
|
+
|
|
9
|
+
## When to Use This Reference
|
|
10
|
+
|
|
11
|
+
- Building Kubeflow Pipelines for ML workflows
|
|
12
|
+
- Creating Airflow DAGs for data and ML pipelines
|
|
13
|
+
- Implementing Prefect flows for modern orchestration
|
|
14
|
+
- Designing pipeline DAGs and component dependencies
|
|
15
|
+
- Setting up scheduled retraining workflows
|
|
16
|
+
|
|
17
|
+
## When NOT to Use
|
|
18
|
+
|
|
19
|
+
- Simple linear scripts without dependencies
|
|
20
|
+
- One-off data processing tasks
|
|
21
|
+
- Interactive development and experimentation
|
|
22
|
+
|
|
23
|
+
---
|
|
24
|
+
|
|
25
|
+
## Kubeflow Pipelines
|
|
26
|
+
|
|
27
|
+
### Pipeline Definition (KFP v2)
|
|
28
|
+
|
|
29
|
+
```python
|
|
30
|
+
from kfp import dsl
|
|
31
|
+
from kfp.dsl import Input, Output, Artifact, Dataset, Model, Metrics
|
|
32
|
+
from kfp import compiler
|
|
33
|
+
from typing import NamedTuple
|
|
34
|
+
|
|
35
|
+
@dsl.component(
    base_image="python:3.11-slim",
    # pandas ships without a parquet engine; pyarrow must be installed
    # explicitly or read_parquet/to_parquet fail at runtime.
    packages_to_install=["pandas", "pyarrow", "scikit-learn"],
)
def load_data(
    data_path: str,
    output_dataset: Output[Dataset],
) -> None:
    """Load raw data, validate it, and publish it as a Dataset artifact.

    Args:
        data_path: Location of the source parquet file.
        output_dataset: Output artifact. The validated frame is written to
            its path, and row/feature counts are recorded in its metadata.

    Raises:
        ValueError: If the dataset has no rows or lacks a "target" column.
    """
    import pandas as pd

    df = pd.read_parquet(data_path)

    # Basic validation. Explicit raises rather than ``assert``: assertions
    # are stripped under ``python -O`` and would let bad data flow downstream.
    if len(df) == 0:
        raise ValueError("Dataset is empty")
    if "target" not in df.columns:
        raise ValueError("Missing target column")

    df.to_parquet(output_dataset.path)
    output_dataset.metadata["num_rows"] = len(df)
    # Every column except "target" is counted as a feature.
    output_dataset.metadata["num_features"] = len(df.columns) - 1
|
|
55
|
+
|
|
56
|
+
@dsl.component(
    base_image="python:3.11-slim",
    # pyarrow provides the parquet engine pandas needs for read/to_parquet.
    packages_to_install=["pandas", "pyarrow", "scikit-learn"],
)
def preprocess_data(
    input_dataset: Input[Dataset],
    train_dataset: Output[Dataset],
    test_dataset: Output[Dataset],
    test_size: float = 0.2,
    random_state: int = 42,
) -> None:
    """Split the dataset into train/test sets and standardize the features.

    Args:
        input_dataset: Parquet dataset containing a "target" column.
        train_dataset: Output artifact receiving the scaled training split.
        test_dataset: Output artifact receiving the scaled test split.
        test_size: Fraction of rows held out for testing.
        random_state: Seed passed to ``train_test_split``.
    """
    import pandas as pd
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler

    df = pd.read_parquet(input_dataset.path)

    X = df.drop("target", axis=1)
    y = df["target"]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Fit the scaler on the training split only, then reuse it for the test
    # split so no test-set statistics leak into training.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Scaling returns plain arrays; rebuild frames and re-attach the target
    # by position (``.values``) because the new frames have a fresh index.
    train_df = pd.DataFrame(X_train_scaled, columns=X.columns)
    train_df["target"] = y_train.values
    train_df.to_parquet(train_dataset.path)

    test_df = pd.DataFrame(X_test_scaled, columns=X.columns)
    test_df["target"] = y_test.values
    test_df.to_parquet(test_dataset.path)
|
|
92
|
+
|
|
93
|
+
@dsl.component(
    base_image="python:3.11-slim",
    # pyarrow provides the parquet engine pandas needs for read_parquet.
    packages_to_install=["pandas", "pyarrow", "scikit-learn", "joblib"],
)
def train_model(
    train_dataset: Input[Dataset],
    model_artifact: Output[Model],
    n_estimators: int = 100,
    max_depth: int = 10,
) -> None:
    """Train a RandomForest classifier and store it as a Model artifact.

    Args:
        train_dataset: Parquet training split containing a "target" column.
        model_artifact: Output artifact; the fitted model is dumped to its
            path with joblib and the hyperparameters are stored as metadata.
        n_estimators: Number of trees in the forest.
        max_depth: Maximum depth of each tree.
    """
    import pandas as pd
    from sklearn.ensemble import RandomForestClassifier
    import joblib

    df = pd.read_parquet(train_dataset.path)
    X = df.drop("target", axis=1)
    y = df["target"]

    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        random_state=42,
    )
    model.fit(X, y)

    joblib.dump(model, model_artifact.path)
    # Record the hyperparameters next to the artifact for traceability.
    model_artifact.metadata["n_estimators"] = n_estimators
    model_artifact.metadata["max_depth"] = max_depth
|
|
122
|
+
|
|
123
|
+
@dsl.component(
    base_image="python:3.11-slim",
    # pyarrow provides the parquet engine pandas needs for read_parquet.
    packages_to_install=["pandas", "pyarrow", "scikit-learn", "joblib"],
)
def evaluate_model(
    model_artifact: Input[Model],
    test_dataset: Input[Dataset],
    metrics: Output[Metrics],
    threshold: float = 0.8,
) -> NamedTuple("Outputs", [("passed", bool), ("accuracy", float)]):
    """Evaluate the model on the test split and check the accuracy threshold.

    Args:
        model_artifact: Model artifact written with joblib.
        test_dataset: Parquet test split containing a "target" column.
        metrics: Output artifact receiving accuracy, precision, recall and F1.
        threshold: Minimum accuracy for the model to count as passed.

    Returns:
        A named tuple ``(passed, accuracy)``; ``passed`` is True when
        ``accuracy >= threshold``.
    """
    import pandas as pd
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
    import joblib
    from collections import namedtuple

    model = joblib.load(model_artifact.path)
    df = pd.read_parquet(test_dataset.path)
    X = df.drop("target", axis=1)
    y = df["target"]

    predictions = model.predict(X)

    # Cast to built-in floats: sklearn may return numpy scalars, and the
    # declared output types are plain ``float``/``bool``.
    accuracy = float(accuracy_score(y, predictions))
    precision = float(precision_score(y, predictions, average="weighted"))
    recall = float(recall_score(y, predictions, average="weighted"))
    f1 = float(f1_score(y, predictions, average="weighted"))

    metrics.log_metric("accuracy", accuracy)
    metrics.log_metric("precision", precision)
    metrics.log_metric("recall", recall)
    metrics.log_metric("f1_score", f1)

    passed = bool(accuracy >= threshold)

    Outputs = namedtuple("Outputs", ["passed", "accuracy"])
    return Outputs(passed, accuracy)
|
|
160
|
+
|
|
161
|
+
@dsl.component(
    base_image="python:3.11-slim",
    packages_to_install=["google-cloud-storage"],
)
def deploy_model(
    model_artifact: Input[Model],
    model_name: str,
    endpoint: str,
) -> str:
    """Upload the model artifact to the GCS bucket named by ``endpoint``.

    Args:
        model_artifact: Model artifact to upload.
        model_name: Name used in the object path ``models/<name>/model.joblib``.
        endpoint: ``gs://<bucket>/...`` URI. Only the bucket part is used;
            any path after the bucket is ignored.

    Returns:
        The ``gs://`` URI of the uploaded model object.

    Raises:
        ValueError: If ``endpoint`` is not a ``gs://`` URI with a bucket name.
    """
    from urllib.parse import urlparse

    from google.cloud import storage

    # Parse the URI properly instead of indexing into split("/"), which
    # raises an opaque IndexError on short input and silently accepts
    # non-GCS schemes.
    parsed = urlparse(endpoint)
    if parsed.scheme != "gs" or not parsed.netloc:
        raise ValueError(
            f"endpoint must be a gs://<bucket>/... URI, got {endpoint!r}"
        )
    bucket_name = parsed.netloc
    model_path = f"models/{model_name}/model.joblib"

    # Copy model to GCS
    client = storage.Client()
    bucket = client.bucket(bucket_name)
    blob = bucket.blob(model_path)
    blob.upload_from_filename(model_artifact.path)

    return f"gs://{bucket_name}/{model_path}"
|
|
184
|
+
|
|
185
|
+
@dsl.pipeline(
    name="ml-training-pipeline",
    description="End-to-end ML training pipeline",
)
def ml_pipeline(
    data_path: str,
    n_estimators: int = 100,
    max_depth: int = 10,
    accuracy_threshold: float = 0.8,
    model_name: str = "classifier",
    endpoint: str = "gs://ml-models/serving",
) -> None:
    """Wire load -> preprocess -> train -> evaluate -> conditional deploy."""

    # Ingest the raw data.
    loaded = load_data(data_path=data_path)

    # Split and scale; yields separate train and test artifacts.
    prepared = preprocess_data(
        input_dataset=loaded.outputs["output_dataset"],
    )
    train_split = prepared.outputs["train_dataset"]
    test_split = prepared.outputs["test_dataset"]

    # Fit the model on the training split.
    trained = train_model(
        train_dataset=train_split,
        n_estimators=n_estimators,
        max_depth=max_depth,
    )
    fitted_model = trained.outputs["model_artifact"]

    # Score against the held-out split and compare with the threshold.
    evaluated = evaluate_model(
        model_artifact=fitted_model,
        test_dataset=test_split,
        threshold=accuracy_threshold,
    )

    # Deployment runs only when evaluation reports that the threshold was met.
    with dsl.If(evaluated.outputs["passed"] == True):
        deploy_model(
            model_artifact=fitted_model,
            model_name=model_name,
            endpoint=endpoint,
        )
|
|
223
|
+
|
|
224
|
+
# Compile pipeline
|
|
225
|
+
if __name__ == "__main__":
    # Compile the pipeline function into a YAML package that can be submitted.
    pipeline_compiler = compiler.Compiler()
    pipeline_compiler.compile(
        pipeline_func=ml_pipeline,
        package_path="ml_pipeline.yaml",
    )
|
|
230
|
+
```
|
|
231
|
+
|
|
232
|
+
### Running Kubeflow Pipelines
|
|
233
|
+
|
|
234
|
+
```python
|
|
235
|
+
from kfp.client import Client
|
|
236
|
+
|
|
237
|
+
def run_pipeline(
    pipeline_file: str,
    experiment_name: str,
    run_name: str,
    parameters: dict,
    host: str = "https://kubeflow.example.com/pipeline",
) -> str:
    """Submit a one-off pipeline run to Kubeflow.

    Args:
        pipeline_file: Path to the compiled pipeline package.
        experiment_name: Experiment the run is filed under.
        run_name: Display name for the run.
        parameters: Pipeline arguments, keyed by parameter name.
        host: Kubeflow Pipelines API endpoint. Defaults to the example host
            so existing callers keep working.

    Returns:
        The ID of the submitted run.
    """
    client = Client(host=host)

    # Create or get experiment
    experiment = client.create_experiment(name=experiment_name)

    # Submit run
    run = client.create_run_from_pipeline_package(
        pipeline_file=pipeline_file,
        experiment_id=experiment.experiment_id,
        run_name=run_name,
        arguments=parameters,
    )

    return run.run_id
|
|
258
|
+
|
|
259
|
+
def schedule_pipeline(
    pipeline_file: str,
    experiment_name: str,
    schedule_name: str,
    cron_expression: str,
    parameters: dict,
    host: str = "https://kubeflow.example.com/pipeline",
) -> str:
    """Create a recurring (cron-scheduled) pipeline run.

    Args:
        pipeline_file: Path to the compiled pipeline package.
        experiment_name: Experiment the recurring run is filed under.
        schedule_name: Name of the recurring run.
        cron_expression: Cron schedule for the run.
        parameters: Pipeline arguments, keyed by parameter name.
        host: Kubeflow Pipelines API endpoint. Defaults to the example host
            so existing callers keep working.

    Returns:
        The ID of the created recurring run.
    """
    client = Client(host=host)

    experiment = client.create_experiment(name=experiment_name)

    # Create recurring run. The client keyword for pipeline arguments is
    # ``params``; passing ``parameters=`` raises TypeError.
    job = client.create_recurring_run(
        experiment_id=experiment.experiment_id,
        job_name=schedule_name,
        pipeline_package_path=pipeline_file,
        cron_expression=cron_expression,
        enabled=True,
        params=parameters,
    )

    # The v2 client returns a recurring-run object identified by
    # ``recurring_run_id``, matching the ``experiment_id``/``run_id`` style
    # used elsewhere in this example.
    return job.recurring_run_id
|
|
282
|
+
```
|
|
283
|
+
|
|
284
|
+
---
|
|
285
|
+
|
|
286
|
+
## Apache Airflow
|
|
287
|
+
|
|
288
|
+
### ML Pipeline DAG
|
|
289
|
+
|
|
290
|
+
```python
|
|
291
|
+
from airflow import DAG
|
|
292
|
+
from airflow.operators.python import PythonOperator, BranchPythonOperator
|
|
293
|
+
from airflow.operators.empty import EmptyOperator
|
|
294
|
+
from airflow.providers.amazon.aws.operators.s3 import S3CreateObjectOperator
|
|
295
|
+
from airflow.utils.trigger_rule import TriggerRule
|
|
296
|
+
from datetime import datetime, timedelta
|
|
297
|
+
import json
|
|
298
|
+
|
|
299
|
+
# Defaults passed to the DAG as ``default_args``: ownership, notification
# flags, retry policy and a per-task execution timeout.
default_args = dict(
    owner="ml-team",
    depends_on_past=False,
    email_on_failure=True,
    email_on_retry=False,
    retries=2,
    retry_delay=timedelta(minutes=5),
    execution_timeout=timedelta(hours=2),
)
|
|
308
|
+
|
|
309
|
+
def load_data(**context):
    """Read the source parquet file and stage a per-run copy under /tmp.

    The staged path is published to XCom under "data_path" together with
    the row count under "num_rows", and is also returned.
    """
    import pandas as pd

    source_path = context["params"]["data_path"]
    frame = pd.read_parquet(source_path)

    # Stage the data under a run-specific name and hand the location to
    # downstream tasks through XCom.
    output_path = f"/tmp/data_{context['run_id']}.parquet"
    frame.to_parquet(output_path)

    task_instance = context["ti"]
    task_instance.xcom_push(key="data_path", value=output_path)
    task_instance.xcom_push(key="num_rows", value=len(frame))

    return output_path
|
|
324
|
+
|
|
325
|
+
def preprocess_data(**context):
    """Split the staged dataset into train/test sets and standardize features.

    Reads the path pushed by ``load_data``, writes the scaled splits to
    run-specific parquet files under /tmp, and publishes their locations to
    XCom under "train_path" and "test_path".

    Optional DAG params (defaults match the previous hard-coded values):
        test_size: Fraction of rows held out for testing (default 0.2).
        random_state: Seed for the split (default 42).
    """
    import pandas as pd
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler

    input_path = context["ti"].xcom_pull(key="data_path", task_ids="load_data")
    df = pd.read_parquet(input_path)

    X = df.drop("target", axis=1)
    y = df["target"]

    # Read the split settings from params, as train_model does for its
    # hyperparameters, so a run can override them without a code change.
    params = context.get("params") or {}
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=params.get("test_size", 0.2),
        random_state=params.get("random_state", 42),
    )

    # Fit the scaler on the training split only; reuse it for the test split.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Save processed data
    train_path = f"/tmp/train_{context['run_id']}.parquet"
    test_path = f"/tmp/test_{context['run_id']}.parquet"

    # Scaling returns plain arrays; rebuild frames and re-attach the target
    # by position because the new frames have a fresh index.
    train_df = pd.DataFrame(X_train_scaled, columns=X.columns)
    train_df["target"] = y_train.values
    train_df.to_parquet(train_path)

    test_df = pd.DataFrame(X_test_scaled, columns=X.columns)
    test_df["target"] = y_test.values
    test_df.to_parquet(test_path)

    context["ti"].xcom_push(key="train_path", value=train_path)
    context["ti"].xcom_push(key="test_path", value=test_path)
|
|
359
|
+
|
|
360
|
+
def train_model(**context):
    """Fit a RandomForest on the training split and persist it with joblib.

    The training path comes from the "train_path" XCom of
    ``preprocess_data``; the saved model location is published to XCom
    under "model_path".
    """
    import pandas as pd
    from sklearn.ensemble import RandomForestClassifier
    import joblib

    train_path = context["ti"].xcom_pull(key="train_path", task_ids="preprocess_data")
    training_frame = pd.read_parquet(train_path)

    features = training_frame.drop("target", axis=1)
    labels = training_frame["target"]

    # Hyperparameters come from the DAG params, with fallbacks.
    run_params = context["params"]
    n_estimators = run_params.get("n_estimators", 100)
    max_depth = run_params.get("max_depth", 10)

    classifier = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        random_state=42,
    )
    classifier.fit(features, labels)

    # Persist under a run-specific name and publish the location.
    model_path = f"/tmp/model_{context['run_id']}.joblib"
    joblib.dump(classifier, model_path)

    context["ti"].xcom_push(key="model_path", value=model_path)
|
|
384
|
+
|
|
385
|
+
def evaluate_model(**context):
    """Evaluate the trained model on the test split.

    Loads the model and test data from the paths published by
    ``train_model`` and ``preprocess_data``, computes weighted metrics,
    pushes them to XCom under "metrics", and returns them.

    Returns:
        Dict with "accuracy", "precision", "recall" and "f1_score".
    """
    import pandas as pd
    from sklearn.metrics import (
        accuracy_score,
        f1_score,
        precision_score,
        recall_score,
    )
    import joblib

    model_path = context["ti"].xcom_pull(key="model_path", task_ids="train_model")
    test_path = context["ti"].xcom_pull(key="test_path", task_ids="preprocess_data")

    model = joblib.load(model_path)
    df = pd.read_parquet(test_path)

    X = df.drop("target", axis=1)
    y = df["target"]

    predictions = model.predict(X)

    # Built-in floats keep the XCom payload independent of numpy scalar
    # types. f1_score is added to match the Kubeflow and Prefect examples.
    metrics = {
        "accuracy": float(accuracy_score(y, predictions)),
        "precision": float(precision_score(y, predictions, average="weighted")),
        "recall": float(recall_score(y, predictions, average="weighted")),
        "f1_score": float(f1_score(y, predictions, average="weighted")),
    }

    context["ti"].xcom_push(key="metrics", value=metrics)

    return metrics
|
|
411
|
+
|
|
412
|
+
def check_metrics_threshold(**context):
    """Choose the downstream branch based on model accuracy.

    Pulls the "metrics" XCom from ``evaluate_model`` and compares its
    accuracy with the ``accuracy_threshold`` param (default 0.8).

    Returns:
        "deploy_model" when accuracy >= threshold, else "skip_deployment".

    Raises:
        ValueError: If no metrics, or no "accuracy" entry, are available.
    """
    metrics = context["ti"].xcom_pull(key="metrics", task_ids="evaluate_model")
    # Fail with a clear message instead of an opaque TypeError/KeyError when
    # the upstream task published nothing usable.
    if not metrics or "accuracy" not in metrics:
        raise ValueError(
            "No accuracy metric found in XCom from task 'evaluate_model'"
        )

    threshold = context["params"].get("accuracy_threshold", 0.8)

    if metrics["accuracy"] >= threshold:
        return "deploy_model"
    return "skip_deployment"
|
|
420
|
+
|
|
421
|
+
def deploy_model(**context):
    """Copy the trained model into the production model directory.

    Returns:
        Dict with the deployed "model_path", the evaluation "metrics", and a
        timezone-aware UTC "deployed_at" ISO-8601 timestamp.
    """
    import os
    import shutil
    from datetime import timezone

    model_path = context["ti"].xcom_pull(key="model_path", task_ids="train_model")
    metrics = context["ti"].xcom_pull(key="metrics", task_ids="evaluate_model")

    # In production, this would upload to model registry/serving
    deploy_path = f"/models/production/model_{context['run_id']}.joblib"
    # shutil.copy does not create parent directories; make sure the target
    # directory exists so the first deployment does not fail.
    os.makedirs(os.path.dirname(deploy_path), exist_ok=True)
    shutil.copy(model_path, deploy_path)

    return {
        "model_path": deploy_path,
        "metrics": metrics,
        # datetime.utcnow() is deprecated since Python 3.12 and returns a
        # naive value; use an aware UTC timestamp instead.
        "deployed_at": datetime.now(timezone.utc).isoformat(),
    }
|
|
437
|
+
|
|
438
|
+
with DAG(
    dag_id="ml_training_pipeline",
    default_args=default_args,
    description="End-to-end ML training pipeline",
    # ``schedule`` replaces ``schedule_interval``, which is deprecated since
    # Airflow 2.4 and removed in Airflow 3.
    schedule="0 2 * * *",  # Daily at 2 AM
    start_date=datetime(2024, 1, 1),
    catchup=False,
    tags=["ml", "training", "production"],
    params={
        "data_path": "s3://data-bucket/training_data.parquet",
        "n_estimators": 100,
        "max_depth": 10,
        "accuracy_threshold": 0.8,
    },
) as dag:

    start = EmptyOperator(task_id="start")

    load = PythonOperator(
        task_id="load_data",
        python_callable=load_data,
    )

    preprocess = PythonOperator(
        task_id="preprocess_data",
        python_callable=preprocess_data,
    )

    train = PythonOperator(
        task_id="train_model",
        python_callable=train_model,
    )

    evaluate = PythonOperator(
        task_id="evaluate_model",
        python_callable=evaluate_model,
    )

    # The callable returns the task_id of the branch to follow.
    check_threshold = BranchPythonOperator(
        task_id="check_metrics_threshold",
        python_callable=check_metrics_threshold,
    )

    deploy = PythonOperator(
        task_id="deploy_model",
        python_callable=deploy_model,
    )

    skip = EmptyOperator(task_id="skip_deployment")

    # Exactly one branch runs and the other is skipped, so the join must
    # tolerate a skipped upstream instead of the default all_success rule.
    end = EmptyOperator(
        task_id="end",
        trigger_rule=TriggerRule.NONE_FAILED_MIN_ONE_SUCCESS,
    )

    start >> load >> preprocess >> train >> evaluate >> check_threshold
    check_threshold >> [deploy, skip] >> end
|
|
495
|
+
```
|
|
496
|
+
|
|
497
|
+
---
|
|
498
|
+
|
|
499
|
+
## Prefect
|
|
500
|
+
|
|
501
|
+
### Modern Flow-Based Pipeline
|
|
502
|
+
|
|
503
|
+
```python
|
|
504
|
+
from prefect import flow, task, get_run_logger
|
|
505
|
+
from prefect.artifacts import create_markdown_artifact
|
|
506
|
+
from prefect.tasks import task_input_hash
|
|
507
|
+
from datetime import timedelta
|
|
508
|
+
import pandas as pd
|
|
509
|
+
|
|
510
|
+
@task(
    retries=3,
    retry_delay_seconds=60,
    cache_key_fn=task_input_hash,
    cache_expiration=timedelta(hours=1),
)
def load_data(data_path: str) -> pd.DataFrame:
    """Read a parquet file into a DataFrame.

    The task is configured with an input-hash cache key and a one-hour
    cache expiration, plus three retries one minute apart.
    """
    log = get_run_logger()

    log.info(f"Loading data from {data_path}")
    frame = pd.read_parquet(data_path)
    log.info(f"Loaded {len(frame)} rows")

    return frame
|
|
525
|
+
|
|
526
|
+
@task(retries=2)
def preprocess_data(
    df: pd.DataFrame,
    test_size: float = 0.2,
    random_state: int = 42,
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Split the data into train/test sets and standardize the features.

    Args:
        df: Input frame containing a "target" column.
        test_size: Fraction of rows held out for testing.
        random_state: Seed for the split. The default keeps the previously
            hard-coded value and mirrors the Kubeflow component's parameter.

    Returns:
        ``(train_df, test_df)``, each with scaled feature columns plus
        "target".
    """
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler

    logger = get_run_logger()

    X = df.drop("target", axis=1)
    y = df["target"]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Fit the scaler on the training split only; reuse it for the test split.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Scaling returns plain arrays; rebuild frames and re-attach the target
    # by position because the new frames have a fresh index.
    train_df = pd.DataFrame(X_train_scaled, columns=X.columns)
    train_df["target"] = y_train.values

    test_df = pd.DataFrame(X_test_scaled, columns=X.columns)
    test_df["target"] = y_test.values

    logger.info(f"Train: {len(train_df)}, Test: {len(test_df)}")

    return train_df, test_df
|
|
557
|
+
|
|
558
|
+
@task
def train_model(
    train_df: pd.DataFrame,
    n_estimators: int = 100,
    max_depth: int = 10,
):
    """Fit a ``RandomForestClassifier`` on ``train_df`` and return it.

    Every column except ``target`` is used as a feature; ``target`` supplies
    the labels. The forest is built with ``random_state=42`` and
    ``n_jobs=-1``.
    """
    from sklearn.ensemble import RandomForestClassifier

    run_log = get_run_logger()

    features = train_df.drop("target", axis=1)
    labels = train_df["target"]

    forest_kwargs = {
        "n_estimators": n_estimators,
        "max_depth": max_depth,
        "random_state": 42,
        "n_jobs": -1,
    }
    classifier = RandomForestClassifier(**forest_kwargs)

    run_log.info("Training model...")
    classifier.fit(features, labels)
    run_log.info("Training complete")

    return classifier
|
|
584
|
+
|
|
585
|
+
@task
def evaluate_model(model, test_df: pd.DataFrame) -> dict:
    """Score ``model`` on ``test_df`` and publish a markdown report artifact.

    Args:
        model: Fitted estimator exposing ``predict``.
        test_df: Evaluation data; the ``target`` column holds the true labels
            and every other column is passed to ``model.predict``.

    Returns:
        Dict with ``accuracy``, ``precision``, ``recall`` and ``f1_score``;
        the last three use weighted averaging.
    """
    from sklearn.metrics import (
        accuracy_score, precision_score, recall_score,
        f1_score, classification_report
    )

    logger = get_run_logger()

    X = test_df.drop("target", axis=1)
    y = test_df["target"]

    predictions = model.predict(X)

    metrics = {
        "accuracy": accuracy_score(y, predictions),
        "precision": precision_score(y, predictions, average="weighted"),
        "recall": recall_score(y, predictions, average="weighted"),
        "f1_score": f1_score(y, predictions, average="weighted"),
    }

    logger.info(f"Metrics: {metrics}")

    # Create markdown artifact for Prefect UI.
    report = classification_report(y, predictions)

    # The fence is built at runtime instead of being typed literally: a bare
    # triple-backtick line in this sample would close the surrounding
    # documentation code fence. Joining explicit lines also keeps the artifact
    # text free of any source-code indentation.
    fence = "`" * 3
    markdown = "\n".join(
        [
            "",
            "# Model Evaluation Report",
            "",
            "## Metrics",
            "| Metric | Value |",
            "|--------|-------|",
            f"| Accuracy | {metrics['accuracy']:.4f} |",
            f"| Precision | {metrics['precision']:.4f} |",
            f"| Recall | {metrics['recall']:.4f} |",
            f"| F1 Score | {metrics['f1_score']:.4f} |",
            "",
            "## Classification Report",
            fence,
            report,
            fence,
            "",
        ]
    )
    create_markdown_artifact(
        key="model-evaluation",
        markdown=markdown,
        description="Model evaluation metrics",
    )

    return metrics
|
|
634
|
+
|
|
635
|
+
@task
def deploy_model(model, metrics: dict, threshold: float) -> bool:
    """Persist ``model`` when its accuracy reaches ``threshold``.

    Args:
        model: Fitted estimator to serialize with joblib.
        metrics: Evaluation results; only the ``accuracy`` entry is read.
        threshold: Minimum accuracy required for deployment.

    Returns:
        ``True`` if the model was written to disk, ``False`` if its accuracy
        was below ``threshold``.
    """
    import joblib
    from datetime import datetime, timezone
    from pathlib import Path

    logger = get_run_logger()

    accuracy = metrics["accuracy"]
    if accuracy < threshold:
        logger.warning(
            f"Model accuracy {accuracy:.4f} below threshold {threshold}"
        )
        return False

    # datetime.utcnow() is deprecated since Python 3.12; a timezone-aware UTC
    # "now" produces the same formatted timestamp.
    stamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
    model_path = f"/models/model_{stamp}.joblib"

    # Create the target directory if needed so joblib.dump does not fail on a
    # host where it has not been provisioned yet.
    Path(model_path).parent.mkdir(parents=True, exist_ok=True)
    joblib.dump(model, model_path)
    logger.info(f"Model deployed to {model_path}")

    return True
|
|
655
|
+
|
|
656
|
+
@flow(
    name="ml-training-pipeline",
    description="End-to-end ML training pipeline",
    retries=1,
    retry_delay_seconds=300,
)
def ml_training_flow(
    data_path: str,
    n_estimators: int = 100,
    max_depth: int = 10,
    accuracy_threshold: float = 0.8,
) -> dict:
    """Run load, preprocess, train, evaluate and deploy in sequence.

    Returns a dict holding the evaluation ``metrics`` and the ``deployed``
    value returned by ``deploy_model``.
    """
    get_run_logger().info("Starting ML training pipeline")

    # Data preparation
    raw = load_data(data_path)
    train_df, test_df = preprocess_data(raw)

    # Model fitting and scoring
    fitted = train_model(train_df, n_estimators, max_depth)
    scores = evaluate_model(fitted, test_df)

    # The threshold is handed to deploy_model, which decides whether to deploy
    was_deployed = deploy_model(fitted, scores, accuracy_threshold)

    return {"metrics": scores, "deployed": was_deployed}
|
|
687
|
+
|
|
688
|
+
# Register a cron-scheduled deployment when the module is run as a script.
# NOTE(review): newer Prefect releases deprecate Deployment.build_from_flow in
# favour of flow.deploy()/flow.serve() -- confirm against the installed version.
if __name__ == "__main__":
    from prefect.deployments import Deployment
    from prefect.server.schemas.schedules import CronSchedule

    cron_schedule = CronSchedule(cron="0 2 * * *")
    flow_parameters = {
        "data_path": "s3://data/training.parquet",
        "n_estimators": 100,
        "max_depth": 10,
        "accuracy_threshold": 0.8,
    }

    Deployment.build_from_flow(
        flow=ml_training_flow,
        name="daily-training",
        schedule=cron_schedule,
        parameters=flow_parameters,
        tags=["ml", "production"],
        work_queue_name="ml-queue",
    ).apply()
|
|
708
|
+
```
|
|
709
|
+
|
|
710
|
+
---
|
|
711
|
+
|
|
712
|
+
## DAG Design Patterns
|
|
713
|
+
|
|
714
|
+
### Parallel Processing Pattern
|
|
715
|
+
|
|
716
|
+
```python
|
|
717
|
+
from prefect import flow, task, unmapped
|
|
718
|
+
from typing import List
|
|
719
|
+
|
|
720
|
+
@task
def process_partition(partition_id: int, data_path: str) -> dict:
    """Handle one partition and report how many records it covered.

    Placeholder implementation: ``data_path`` is not read and the record
    count is a fixed 1000.
    """
    # Partition-specific work goes here
    summary = dict(partition_id=partition_id, records_processed=1000)
    return summary
|
|
725
|
+
|
|
726
|
+
@task
def aggregate_results(results: List[dict]) -> dict:
    """Add up ``records_processed`` across all partition results."""
    running_total = 0
    for partition_summary in results:
        running_total += partition_summary["records_processed"]
    return {"total_records": running_total}
|
|
731
|
+
|
|
732
|
+
@flow
def parallel_processing_flow(data_path: str, num_partitions: int = 4):
    """Fan ``process_partition`` out over the partitions, then combine.

    ``data_path`` is wrapped in ``unmapped`` so the same value is handed to
    every mapped run instead of being iterated over.
    """
    # One mapped task run per partition id
    per_partition = process_partition.map(
        partition_id=range(num_partitions),
        data_path=unmapped(data_path),
    )

    # Collapse the per-partition summaries into a single total
    return aggregate_results(per_partition)
|
|
746
|
+
```
|
|
747
|
+
|
|
748
|
+
### Conditional Branching Pattern
|
|
749
|
+
|
|
750
|
+
```python
|
|
751
|
+
from prefect import flow, task
|
|
752
|
+
|
|
753
|
+
@task
def check_data_quality(df) -> bool:
    """Report whether ``df`` is clean enough to process.

    Args:
        df: Table to inspect; presumably a pandas DataFrame, since ``size``
            and ``isnull()`` are used -- confirm against callers.

    Returns:
        ``True`` when fewer than 10% of all cells are null. A frame with no
        cells returns ``False``.
    """
    total_cells = df.size
    if total_cells == 0:
        # 0 / 0 would yield NaN (with a runtime warning), which compares as
        # False anyway; make that outcome explicit and warning-free.
        return False

    null_ratio = df.isnull().sum().sum() / total_cells
    # bool() gives callers a builtin bool, matching the annotation, rather
    # than a numpy boolean.
    return bool(null_ratio < 0.1)
|
|
758
|
+
|
|
759
|
+
@task
def handle_poor_quality(df):
    """Deal with a dataset that did not pass the quality check.

    Placeholder: a real implementation would impute, clean, or alert.
    """
    return None
|
|
764
|
+
|
|
765
|
+
@task
def process_good_quality(df):
    """Handle a dataset that passed the quality check (placeholder)."""
    return None
|
|
769
|
+
|
|
770
|
+
@flow
def conditional_flow(data_path: str):
    """Route the loaded dataset according to its quality check.

    Data that passes goes to ``process_good_quality``; anything else goes to
    ``handle_poor_quality``. The chosen task's result is returned.
    """
    dataset = load_data(data_path)

    if check_data_quality(dataset):
        return process_good_quality(dataset)
    return handle_poor_quality(dataset)
|
|
782
|
+
```
|
|
783
|
+
|
|
784
|
+
### Error Handling Pattern
|
|
785
|
+
|
|
786
|
+
```python
|
|
787
|
+
from prefect import flow, task
|
|
788
|
+
from prefect.states import Failed
|
|
789
|
+
|
|
790
|
+
@task
def risky_operation():
    """Return ``"success"``, or raise ``ValueError`` when a random draw is below 0.3."""
    import random

    draw = random.random()
    if draw >= 0.3:
        return "success"
    raise ValueError("Random failure")
|
|
797
|
+
|
|
798
|
+
@task
def fallback_operation():
    """Supply the substitute result used when the primary task fails."""
    substitute = "fallback_result"
    return substitute
|
|
802
|
+
|
|
803
|
+
@task
def send_alert(error: Exception):
    """Notify about a failure (placeholder).

    A real implementation would forward ``error`` to Slack, PagerDuty, etc.
    """
    return None
|
|
808
|
+
|
|
809
|
+
@flow
def resilient_flow():
    """Run the risky task, alerting and falling back on any exception."""
    try:
        return risky_operation()
    except Exception as exc:
        # Report the failure first, then substitute the fallback result.
        send_alert(exc)
        return fallback_operation()
|
|
819
|
+
```
|
|
820
|
+
|
|
821
|
+
---
|
|
822
|
+
|
|
823
|
+
## Best Practices
|
|
824
|
+
|
|
825
|
+
### Pipeline Configuration
|
|
826
|
+
|
|
827
|
+
```yaml
|
|
828
|
+
# pipeline_config.yaml
|
|
829
|
+
pipeline:
|
|
830
|
+
name: ml-training
|
|
831
|
+
version: "1.0.0"
|
|
832
|
+
description: "Production ML training pipeline"
|
|
833
|
+
|
|
834
|
+
stages:
|
|
835
|
+
- name: load_data
|
|
836
|
+
timeout: 300
|
|
837
|
+
retries: 3
|
|
838
|
+
|
|
839
|
+
- name: preprocess
|
|
840
|
+
timeout: 600
|
|
841
|
+
retries: 2
|
|
842
|
+
depends_on: [load_data]
|
|
843
|
+
|
|
844
|
+
- name: train
|
|
845
|
+
timeout: 3600
|
|
846
|
+
retries: 1
|
|
847
|
+
depends_on: [preprocess]
|
|
848
|
+
resources:
|
|
849
|
+
cpu: 4
|
|
850
|
+
memory: 16Gi
|
|
851
|
+
gpu: 1
|
|
852
|
+
|
|
853
|
+
- name: evaluate
|
|
854
|
+
timeout: 300
|
|
855
|
+
depends_on: [train]
|
|
856
|
+
|
|
857
|
+
- name: deploy
|
|
858
|
+
timeout: 300
|
|
859
|
+
depends_on: [evaluate]
|
|
860
|
+
condition: "evaluate.metrics.accuracy >= 0.8"
|
|
861
|
+
|
|
862
|
+
schedule:
|
|
863
|
+
cron: "0 2 * * *"
|
|
864
|
+
timezone: "UTC"
|
|
865
|
+
|
|
866
|
+
notifications:
|
|
867
|
+
on_failure:
|
|
868
|
+
- slack: "#ml-alerts"
|
|
869
|
+
- email: ml-team@company.com
|
|
870
|
+
on_success:
|
|
871
|
+
- slack: "#ml-notifications"
|
|
872
|
+
```
|
|
873
|
+
|
|
874
|
+
### Idempotency Guidelines
|
|
875
|
+
|
|
876
|
+
```python
|
|
877
|
+
# Good: Idempotent operations
|
|
878
|
+
def process_data(run_id: str, data_path: str):
    """Transform ``data_path`` into a per-run parquet file, at most once.

    The output location is derived from ``run_id`` alone, so calling again
    with the same ``run_id`` returns the existing path without reprocessing.
    """
    output_path = f"s3://processed/{run_id}/data.parquet"

    if not file_exists(output_path):
        # Nothing stored for this run_id yet: transform the input and save it.
        transform(pd.read_parquet(data_path)).to_parquet(output_path)

    return output_path
|
|
892
|
+
```
|
|
893
|
+
|
|
894
|
+
---
|
|
895
|
+
|
|
896
|
+
## Related References
|
|
897
|
+
|
|
898
|
+
- `training-pipelines.md` - Training components for pipelines
|
|
899
|
+
- `experiment-tracking.md` - Logging pipeline runs
|
|
900
|
+
- `feature-engineering.md` - Feature pipeline components
|
|
901
|
+
- `model-validation.md` - Validation stages in pipelines
|
|
902
|
+
|
|
903
|
+
## Cross-Reference Skills
|
|
904
|
+
|
|
905
|
+
- **DevOps Engineer** - CI/CD for pipeline deployment
|
|
906
|
+
- **Kubernetes Specialist** - Running pipelines on K8s
|
|
907
|
+
- **Cloud Architect** - Cloud infrastructure for orchestration
|