aigroup-workflow 2.2.0 → 2.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/commands/fix-build.md +10 -5
- package/.claude/commands/init-project.md +13 -8
- package/.claude/commands/plan.md +15 -8
- package/.claude/commands/review.md +12 -6
- package/.claude/commands/tdd.md +11 -5
- package/.claude/commands/workflow-start.md +20 -11
- package/.claude/settings.json +28 -0
- package/.codex/agents/architect.toml +207 -0
- package/.codex/agents/build-error-resolver.toml +110 -0
- package/.codex/agents/code-reviewer.toml +233 -0
- package/.codex/agents/doc-updater.toml +103 -0
- package/.codex/agents/e2e-runner.toml +103 -0
- package/.codex/agents/get-current-datetime.toml +23 -0
- package/.codex/agents/init-architect.toml +181 -0
- package/.codex/agents/planner.toml +208 -0
- package/.codex/agents/refactor-cleaner.toml +81 -0
- package/.codex/agents/rust-reviewer.toml +90 -0
- package/.codex/agents/security-reviewer.toml +104 -0
- package/.codex/agents/tdd-guide.toml +87 -0
- package/AGENTS.md +2 -2
- package/CLAUDE.md +23 -1
- package/LICENSE +20 -20
- package/README.md +333 -333
- package/agents/a11y-architect.md +141 -141
- package/agents/architect.md +211 -211
- package/agents/build-error-resolver.md +114 -114
- package/agents/chief-of-staff.md +151 -151
- package/agents/code-architect.md +71 -71
- package/agents/code-explorer.md +69 -69
- package/agents/code-reviewer.md +237 -237
- package/agents/code-simplifier.md +47 -47
- package/agents/comment-analyzer.md +45 -45
- package/agents/conversation-analyzer.md +52 -52
- package/agents/cpp-build-resolver.md +90 -90
- package/agents/cpp-reviewer.md +72 -72
- package/agents/csharp-reviewer.md +101 -101
- package/agents/dart-build-resolver.md +201 -201
- package/agents/database-reviewer.md +91 -91
- package/agents/doc-updater.md +107 -107
- package/agents/docs-lookup.md +68 -68
- package/agents/e2e-runner.md +107 -107
- package/agents/flutter-reviewer.md +243 -243
- package/agents/gan-evaluator.md +209 -209
- package/agents/gan-generator.md +131 -131
- package/agents/gan-planner.md +99 -99
- package/agents/get-current-datetime.md +26 -26
- package/agents/go-build-resolver.md +94 -94
- package/agents/go-reviewer.md +76 -76
- package/agents/harness-optimizer.md +35 -35
- package/agents/healthcare-reviewer.md +83 -83
- package/agents/java-build-resolver.md +153 -153
- package/agents/java-reviewer.md +92 -92
- package/agents/kotlin-build-resolver.md +118 -118
- package/agents/kotlin-reviewer.md +159 -159
- package/agents/loop-operator.md +36 -36
- package/agents/opensource-forker.md +198 -198
- package/agents/opensource-packager.md +249 -249
- package/agents/opensource-sanitizer.md +188 -188
- package/agents/performance-optimizer.md +446 -446
- package/agents/planner.md +212 -212
- package/agents/pr-test-analyzer.md +45 -45
- package/agents/python-reviewer.md +98 -98
- package/agents/pytorch-build-resolver.md +120 -120
- package/agents/refactor-cleaner.md +85 -85
- package/agents/rust-build-resolver.md +148 -148
- package/agents/rust-reviewer.md +94 -94
- package/agents/security-reviewer.md +108 -108
- package/agents/seo-specialist.md +59 -59
- package/agents/silent-failure-hunter.md +50 -50
- package/agents/tdd-guide.md +91 -91
- package/agents/type-design-analyzer.md +41 -41
- package/agents/typescript-reviewer.md +112 -112
- package/cli/commands/update.mjs +1 -1
- package/cli/utils/scaffold.mjs +53 -0
- package/docs/rules/agents.md +166 -50
- package/docs/rules/cpp/coding-style.md +44 -44
- package/docs/rules/cpp/hooks.md +39 -39
- package/docs/rules/cpp/patterns.md +51 -51
- package/docs/rules/cpp/security.md +51 -51
- package/docs/rules/cpp/testing.md +44 -44
- package/docs/rules/csharp/coding-style.md +72 -72
- package/docs/rules/csharp/hooks.md +25 -25
- package/docs/rules/csharp/patterns.md +50 -50
- package/docs/rules/csharp/security.md +58 -58
- package/docs/rules/csharp/testing.md +46 -46
- package/docs/rules/dart/coding-style.md +159 -159
- package/docs/rules/dart/hooks.md +66 -66
- package/docs/rules/dart/patterns.md +261 -261
- package/docs/rules/dart/security.md +135 -135
- package/docs/rules/dart/testing.md +215 -215
- package/docs/rules/golang/coding-style.md +32 -32
- package/docs/rules/golang/hooks.md +17 -17
- package/docs/rules/golang/patterns.md +45 -45
- package/docs/rules/golang/security.md +34 -34
- package/docs/rules/golang/testing.md +31 -31
- package/docs/rules/java/coding-style.md +114 -114
- package/docs/rules/java/hooks.md +18 -18
- package/docs/rules/java/patterns.md +146 -146
- package/docs/rules/java/security.md +100 -100
- package/docs/rules/java/testing.md +131 -131
- package/docs/rules/kotlin/coding-style.md +86 -86
- package/docs/rules/kotlin/hooks.md +17 -17
- package/docs/rules/kotlin/patterns.md +146 -146
- package/docs/rules/kotlin/security.md +82 -82
- package/docs/rules/kotlin/testing.md +128 -128
- package/docs/rules/perl/coding-style.md +46 -46
- package/docs/rules/perl/hooks.md +22 -22
- package/docs/rules/perl/patterns.md +76 -76
- package/docs/rules/perl/security.md +69 -69
- package/docs/rules/perl/testing.md +54 -54
- package/docs/rules/php/coding-style.md +40 -40
- package/docs/rules/php/hooks.md +24 -24
- package/docs/rules/php/patterns.md +33 -33
- package/docs/rules/php/security.md +37 -37
- package/docs/rules/php/testing.md +39 -39
- package/docs/rules/python/coding-style.md +42 -42
- package/docs/rules/python/hooks.md +19 -19
- package/docs/rules/python/patterns.md +39 -39
- package/docs/rules/python/security.md +30 -30
- package/docs/rules/python/testing.md +38 -38
- package/docs/rules/rust/coding-style.md +151 -151
- package/docs/rules/rust/hooks.md +16 -16
- package/docs/rules/rust/patterns.md +168 -168
- package/docs/rules/rust/security.md +141 -141
- package/docs/rules/rust/testing.md +154 -154
- package/docs/rules/swift/coding-style.md +47 -47
- package/docs/rules/swift/hooks.md +20 -20
- package/docs/rules/swift/patterns.md +66 -66
- package/docs/rules/swift/security.md +33 -33
- package/docs/rules/swift/testing.md +45 -45
- package/docs/rules/typescript/coding-style.md +199 -199
- package/docs/rules/typescript/hooks.md +22 -22
- package/docs/rules/typescript/patterns.md +52 -52
- package/docs/rules/typescript/security.md +28 -28
- package/docs/rules/typescript/testing.md +18 -18
- package/docs/rules/web/coding-style.md +96 -96
- package/docs/rules/web/design-quality.md +62 -62
- package/docs/rules/web/hooks.md +120 -120
- package/docs/rules/web/patterns.md +79 -79
- package/docs/rules/web/performance.md +64 -64
- package/docs/rules/web/security.md +57 -57
- package/docs/rules/web/testing.md +55 -55
- package/docs/templates/README.md +36 -36
- package/docs/templates/ai-project-final.md +124 -124
- package/docs/templates/ai-project.md +105 -105
- package/docs/templates/api.md +157 -157
- package/docs/templates/bug.md +62 -62
- package/docs/templates/code-review.md +87 -87
- package/docs/templates/generic.md +116 -116
- package/docs/templates/implementation-plan.md +1 -1
- package/docs/templates/meeting.md +68 -68
- package/docs/templates/prd.md +98 -98
- package/docs/templates/ui.md +134 -134
- package/docs/workflow-pipeline.md +11 -10
- package/package.json +40 -39
- package/scripts/hooks/checks/orchestration-artifacts.cjs +28 -23
- package/scripts/hooks/checks/workflow-state.cjs +4 -5
- package/scripts/orchestration/lib/orchestrator.cjs +344 -117
- package/scripts/orchestration/lib/validate.cjs +145 -0
- package/scripts/orchestration/session.cjs +88 -44
- package/skills/SUPERPOWERS-LICENSE +21 -21
- package/skills/ai-ml/fine-tuning-expert/SKILL.md +162 -162
- package/skills/ai-ml/fine-tuning-expert/references/dataset-preparation.md +540 -540
- package/skills/ai-ml/fine-tuning-expert/references/deployment-optimization.md +673 -673
- package/skills/ai-ml/fine-tuning-expert/references/evaluation-metrics.md +597 -597
- package/skills/ai-ml/fine-tuning-expert/references/hyperparameter-tuning.md +565 -565
- package/skills/ai-ml/fine-tuning-expert/references/lora-peft.md +347 -347
- package/skills/ai-ml/ml-pipeline/SKILL.md +159 -159
- package/skills/ai-ml/ml-pipeline/references/experiment-tracking.md +833 -833
- package/skills/ai-ml/ml-pipeline/references/feature-engineering.md +631 -631
- package/skills/ai-ml/ml-pipeline/references/model-validation.md +978 -978
- package/skills/ai-ml/ml-pipeline/references/pipeline-orchestration.md +907 -907
- package/skills/ai-ml/ml-pipeline/references/training-pipelines.md +782 -782
- package/skills/ai-ml/rag-architect/SKILL.md +194 -194
- package/skills/ai-ml/rag-architect/references/chunking-strategies.md +878 -878
- package/skills/ai-ml/rag-architect/references/embedding-models.md +561 -561
- package/skills/ai-ml/rag-architect/references/rag-evaluation.md +833 -833
- package/skills/ai-ml/rag-architect/references/retrieval-optimization.md +795 -795
- package/skills/ai-ml/rag-architect/references/vector-databases.md +589 -589
- package/skills/ai-ml/spark-engineer/SKILL.md +148 -148
- package/skills/ai-ml/spark-engineer/references/partitioning-caching.md +543 -543
- package/skills/ai-ml/spark-engineer/references/performance-tuning.md +544 -544
- package/skills/ai-ml/spark-engineer/references/rdd-operations.md +599 -599
- package/skills/ai-ml/spark-engineer/references/spark-sql-dataframes.md +474 -474
- package/skills/ai-ml/spark-engineer/references/streaming-patterns.md +786 -786
- package/skills/backend/api-designer/SKILL.md +217 -217
- package/skills/backend/api-designer/references/error-handling.md +541 -541
- package/skills/backend/api-designer/references/openapi.md +824 -824
- package/skills/backend/api-designer/references/pagination.md +494 -494
- package/skills/backend/api-designer/references/rest-patterns.md +335 -335
- package/skills/backend/api-designer/references/versioning.md +391 -391
- package/skills/backend/architecture-designer/SKILL.md +117 -117
- package/skills/backend/architecture-designer/references/adr-template.md +116 -116
- package/skills/backend/architecture-designer/references/architecture-patterns.md +111 -111
- package/skills/backend/architecture-designer/references/database-selection.md +102 -102
- package/skills/backend/architecture-designer/references/nfr-checklist.md +112 -112
- package/skills/backend/architecture-designer/references/system-design.md +100 -100
- package/skills/backend/code-documenter/SKILL.md +147 -147
- package/skills/backend/code-documenter/references/api-docs-fastapi-django.md +166 -166
- package/skills/backend/code-documenter/references/api-docs-nestjs-express.md +220 -220
- package/skills/backend/code-documenter/references/coverage-reports.md +125 -125
- package/skills/backend/code-documenter/references/documentation-systems.md +333 -333
- package/skills/backend/code-documenter/references/interactive-api-docs.md +531 -531
- package/skills/backend/code-documenter/references/python-docstrings.md +121 -121
- package/skills/backend/code-documenter/references/typescript-jsdoc.md +145 -145
- package/skills/backend/code-documenter/references/user-guides-tutorials.md +530 -530
- package/skills/backend/debugging-wizard/SKILL.md +105 -105
- package/skills/backend/debugging-wizard/references/common-patterns.md +132 -132
- package/skills/backend/debugging-wizard/references/debugging-tools.md +140 -140
- package/skills/backend/debugging-wizard/references/quick-fixes.md +177 -177
- package/skills/backend/debugging-wizard/references/strategies.md +142 -142
- package/skills/backend/debugging-wizard/references/systematic-debugging.md +367 -367
- package/skills/backend/feature-forge/SKILL.md +98 -98
- package/skills/backend/feature-forge/references/acceptance-criteria.md +104 -104
- package/skills/backend/feature-forge/references/ears-syntax.md +99 -99
- package/skills/backend/feature-forge/references/interview-questions.md +150 -150
- package/skills/backend/feature-forge/references/pre-discovery-subagents.md +54 -54
- package/skills/backend/feature-forge/references/specification-template.md +103 -103
- package/skills/backend/fullstack-guardian/SKILL.md +105 -105
- package/skills/backend/fullstack-guardian/references/api-design-standards.md +307 -307
- package/skills/backend/fullstack-guardian/references/architecture-decisions.md +350 -350
- package/skills/backend/fullstack-guardian/references/backend-patterns.md +237 -237
- package/skills/backend/fullstack-guardian/references/common-patterns.md +134 -134
- package/skills/backend/fullstack-guardian/references/deliverables-checklist.md +354 -354
- package/skills/backend/fullstack-guardian/references/design-template.md +91 -91
- package/skills/backend/fullstack-guardian/references/error-handling.md +135 -135
- package/skills/backend/fullstack-guardian/references/frontend-patterns.md +340 -340
- package/skills/backend/fullstack-guardian/references/integration-patterns.md +333 -333
- package/skills/backend/fullstack-guardian/references/security-checklist.md +106 -106
- package/skills/backend/graphql-architect/SKILL.md +146 -146
- package/skills/backend/graphql-architect/references/federation.md +418 -418
- package/skills/backend/graphql-architect/references/migration-from-rest.md +1141 -1141
- package/skills/backend/graphql-architect/references/resolvers.md +425 -425
- package/skills/backend/graphql-architect/references/schema-design.md +393 -393
- package/skills/backend/graphql-architect/references/security.md +569 -569
- package/skills/backend/graphql-architect/references/subscriptions.md +510 -510
- package/skills/backend/legacy-modernizer/SKILL.md +137 -137
- package/skills/backend/legacy-modernizer/references/legacy-testing.md +381 -381
- package/skills/backend/legacy-modernizer/references/migration-strategies.md +423 -423
- package/skills/backend/legacy-modernizer/references/refactoring-patterns.md +395 -395
- package/skills/backend/legacy-modernizer/references/strangler-fig-pattern.md +281 -281
- package/skills/backend/legacy-modernizer/references/system-assessment.md +487 -487
- package/skills/backend/microservices-architect/SKILL.md +164 -164
- package/skills/backend/microservices-architect/references/communication.md +499 -499
- package/skills/backend/microservices-architect/references/data.md +721 -721
- package/skills/backend/microservices-architect/references/decomposition.md +344 -344
- package/skills/backend/microservices-architect/references/observability.md +805 -805
- package/skills/backend/microservices-architect/references/patterns.md +603 -603
- package/skills/database/database-optimizer/SKILL.md +147 -147
- package/skills/database/database-optimizer/references/index-strategies.md +331 -331
- package/skills/database/database-optimizer/references/monitoring-analysis.md +501 -501
- package/skills/database/database-optimizer/references/mysql-tuning.md +452 -452
- package/skills/database/database-optimizer/references/postgresql-tuning.md +413 -413
- package/skills/database/database-optimizer/references/query-optimization.md +251 -251
- package/skills/database/postgres-pro/SKILL.md +152 -152
- package/skills/database/postgres-pro/references/extensions.md +404 -404
- package/skills/database/postgres-pro/references/jsonb.md +321 -321
- package/skills/database/postgres-pro/references/maintenance.md +481 -481
- package/skills/database/postgres-pro/references/performance.md +265 -265
- package/skills/database/postgres-pro/references/replication.md +446 -446
- package/skills/database/sql-pro/SKILL.md +129 -129
- package/skills/database/sql-pro/references/database-design.md +402 -402
- package/skills/database/sql-pro/references/dialect-differences.md +419 -419
- package/skills/database/sql-pro/references/optimization.md +384 -384
- package/skills/database/sql-pro/references/query-patterns.md +285 -285
- package/skills/database/sql-pro/references/window-functions.md +328 -328
- package/skills/dotnet/csharp-developer/SKILL.md +125 -125
- package/skills/dotnet/csharp-developer/references/aspnet-core.md +394 -394
- package/skills/dotnet/csharp-developer/references/blazor.md +553 -553
- package/skills/dotnet/csharp-developer/references/entity-framework.md +409 -409
- package/skills/dotnet/csharp-developer/references/modern-csharp.md +248 -248
- package/skills/dotnet/csharp-developer/references/performance.md +498 -498
- package/skills/dotnet/dotnet-core-expert/SKILL.md +138 -138
- package/skills/dotnet/dotnet-core-expert/references/authentication.md +546 -546
- package/skills/dotnet/dotnet-core-expert/references/clean-architecture.md +455 -455
- package/skills/dotnet/dotnet-core-expert/references/cloud-native.md +548 -548
- package/skills/dotnet/dotnet-core-expert/references/entity-framework.md +440 -440
- package/skills/dotnet/dotnet-core-expert/references/minimal-apis.md +319 -319
- package/skills/frontend/angular-architect/SKILL.md +152 -152
- package/skills/frontend/angular-architect/references/components.md +297 -297
- package/skills/frontend/angular-architect/references/ngrx.md +401 -401
- package/skills/frontend/angular-architect/references/routing.md +361 -361
- package/skills/frontend/angular-architect/references/rxjs.md +319 -319
- package/skills/frontend/angular-architect/references/testing.md +405 -405
- package/skills/frontend/design-commands/design.md +91 -91
- package/skills/frontend/design-commands/handoff.md +97 -97
- package/skills/frontend/design-commands/prototype.md +120 -120
- package/skills/frontend/design-commands/spec.md +160 -160
- package/skills/frontend/design-commands/style.md +78 -78
- package/skills/frontend/flutter-expert/SKILL.md +138 -138
- package/skills/frontend/flutter-expert/references/bloc-state.md +259 -259
- package/skills/frontend/flutter-expert/references/gorouter-navigation.md +119 -119
- package/skills/frontend/flutter-expert/references/performance.md +99 -99
- package/skills/frontend/flutter-expert/references/project-structure.md +118 -118
- package/skills/frontend/flutter-expert/references/riverpod-state.md +130 -130
- package/skills/frontend/flutter-expert/references/widget-patterns.md +123 -123
- package/skills/frontend/nextjs-developer/SKILL.md +143 -143
- package/skills/frontend/nextjs-developer/references/app-router.md +311 -311
- package/skills/frontend/nextjs-developer/references/data-fetching.md +482 -482
- package/skills/frontend/nextjs-developer/references/deployment.md +545 -545
- package/skills/frontend/nextjs-developer/references/server-actions.md +462 -462
- package/skills/frontend/nextjs-developer/references/server-components.md +384 -384
- package/skills/frontend/react-expert/SKILL.md +149 -149
- package/skills/frontend/react-expert/references/hooks-patterns.md +162 -162
- package/skills/frontend/react-expert/references/migration-class-to-modern.md +1119 -1119
- package/skills/frontend/react-expert/references/performance.md +168 -168
- package/skills/frontend/react-expert/references/react-19-features.md +174 -174
- package/skills/frontend/react-expert/references/server-components.md +143 -143
- package/skills/frontend/react-expert/references/state-management.md +171 -171
- package/skills/frontend/react-expert/references/testing-react.md +174 -174
- package/skills/frontend/react-native-expert/SKILL.md +185 -185
- package/skills/frontend/react-native-expert/references/expo-router.md +187 -187
- package/skills/frontend/react-native-expert/references/list-optimization.md +204 -204
- package/skills/frontend/react-native-expert/references/platform-handling.md +188 -188
- package/skills/frontend/react-native-expert/references/project-structure.md +171 -171
- package/skills/frontend/react-native-expert/references/storage-hooks.md +173 -173
- package/skills/frontend/senior-frontend/SKILL.md +477 -477
- package/skills/frontend/senior-frontend/references/frontend_best_practices.md +806 -806
- package/skills/frontend/senior-frontend/references/nextjs_optimization_guide.md +724 -724
- package/skills/frontend/senior-frontend/references/react_patterns.md +746 -746
- package/skills/frontend/senior-frontend/scripts/bundle_analyzer.py +407 -407
- package/skills/frontend/senior-frontend/scripts/component_generator.py +329 -329
- package/skills/frontend/senior-frontend/scripts/frontend_scaffolder.py +1005 -1005
- package/skills/frontend/ui-ux-pro-max/SKILL.md +386 -386
- package/skills/frontend/ui-ux-pro-max/data/charts.csv +26 -26
- package/skills/frontend/ui-ux-pro-max/data/colors.csv +97 -97
- package/skills/frontend/ui-ux-pro-max/data/icons.csv +101 -101
- package/skills/frontend/ui-ux-pro-max/data/landing.csv +31 -31
- package/skills/frontend/ui-ux-pro-max/data/products.csv +96 -96
- package/skills/frontend/ui-ux-pro-max/data/react-performance.csv +45 -45
- package/skills/frontend/ui-ux-pro-max/data/stacks/astro.csv +54 -54
- package/skills/frontend/ui-ux-pro-max/data/stacks/flutter.csv +53 -53
- package/skills/frontend/ui-ux-pro-max/data/stacks/html-tailwind.csv +56 -56
- package/skills/frontend/ui-ux-pro-max/data/stacks/jetpack-compose.csv +53 -53
- package/skills/frontend/ui-ux-pro-max/data/stacks/nextjs.csv +53 -53
- package/skills/frontend/ui-ux-pro-max/data/stacks/nuxt-ui.csv +51 -51
- package/skills/frontend/ui-ux-pro-max/data/stacks/nuxtjs.csv +59 -59
- package/skills/frontend/ui-ux-pro-max/data/stacks/react-native.csv +52 -52
- package/skills/frontend/ui-ux-pro-max/data/stacks/react.csv +54 -54
- package/skills/frontend/ui-ux-pro-max/data/stacks/shadcn.csv +61 -61
- package/skills/frontend/ui-ux-pro-max/data/stacks/svelte.csv +54 -54
- package/skills/frontend/ui-ux-pro-max/data/stacks/swiftui.csv +51 -51
- package/skills/frontend/ui-ux-pro-max/data/stacks/vue.csv +50 -50
- package/skills/frontend/ui-ux-pro-max/data/styles.csv +68 -68
- package/skills/frontend/ui-ux-pro-max/data/typography.csv +57 -57
- package/skills/frontend/ui-ux-pro-max/data/ui-reasoning.csv +101 -101
- package/skills/frontend/ui-ux-pro-max/data/ux-guidelines.csv +99 -99
- package/skills/frontend/ui-ux-pro-max/data/web-interface.csv +31 -31
- package/skills/frontend/ui-ux-pro-max/scripts/core.py +253 -253
- package/skills/frontend/ui-ux-pro-max/scripts/design_system.py +1067 -1067
- package/skills/frontend/ui-ux-pro-max/scripts/search.py +114 -114
- package/skills/frontend/vue-expert/SKILL.md +98 -98
- package/skills/frontend/vue-expert/references/build-tooling.md +480 -480
- package/skills/frontend/vue-expert/references/components.md +448 -448
- package/skills/frontend/vue-expert/references/composition-api.md +299 -299
- package/skills/frontend/vue-expert/references/mobile-hybrid.md +636 -636
- package/skills/frontend/vue-expert/references/nuxt.md +669 -669
- package/skills/frontend/vue-expert/references/state-management.md +449 -449
- package/skills/frontend/vue-expert/references/typescript.md +584 -584
- package/skills/frontend/vue-expert-js/SKILL.md +167 -167
- package/skills/frontend/vue-expert-js/references/component-architecture.md +219 -219
- package/skills/frontend/vue-expert-js/references/composables-patterns.md +183 -183
- package/skills/frontend/vue-expert-js/references/jsdoc-typing.md +535 -535
- package/skills/frontend/vue-expert-js/references/state-management.md +249 -249
- package/skills/frontend/vue-expert-js/references/testing-patterns.md +237 -237
- package/skills/go-rust-cpp/cpp-pro/SKILL.md +115 -115
- package/skills/go-rust-cpp/cpp-pro/references/build-tooling.md +440 -440
- package/skills/go-rust-cpp/cpp-pro/references/concurrency.md +437 -437
- package/skills/go-rust-cpp/cpp-pro/references/memory-performance.md +397 -397
- package/skills/go-rust-cpp/cpp-pro/references/modern-cpp.md +304 -304
- package/skills/go-rust-cpp/cpp-pro/references/templates.md +357 -357
- package/skills/go-rust-cpp/golang-pro/SKILL.md +122 -122
- package/skills/go-rust-cpp/golang-pro/references/concurrency.md +329 -329
- package/skills/go-rust-cpp/golang-pro/references/generics.md +442 -442
- package/skills/go-rust-cpp/golang-pro/references/interfaces.md +432 -432
- package/skills/go-rust-cpp/golang-pro/references/project-structure.md +477 -477
- package/skills/go-rust-cpp/golang-pro/references/testing.md +451 -451
- package/skills/go-rust-cpp/rust-engineer/SKILL.md +167 -167
- package/skills/go-rust-cpp/rust-engineer/references/async.md +458 -458
- package/skills/go-rust-cpp/rust-engineer/references/error-handling.md +334 -334
- package/skills/go-rust-cpp/rust-engineer/references/ownership.md +278 -278
- package/skills/go-rust-cpp/rust-engineer/references/testing.md +470 -470
- package/skills/go-rust-cpp/rust-engineer/references/traits.md +413 -413
- package/skills/infra/cli-developer/SKILL.md +113 -113
- package/skills/infra/cli-developer/references/design-patterns.md +221 -221
- package/skills/infra/cli-developer/references/go-cli.md +540 -540
- package/skills/infra/cli-developer/references/node-cli.md +383 -383
- package/skills/infra/cli-developer/references/python-cli.md +422 -422
- package/skills/infra/cli-developer/references/ux-patterns.md +448 -448
- package/skills/infra/cloud-architect/SKILL.md +216 -216
- package/skills/infra/cloud-architect/references/aws.md +394 -394
- package/skills/infra/cloud-architect/references/azure.md +562 -562
- package/skills/infra/cloud-architect/references/cost.md +582 -582
- package/skills/infra/cloud-architect/references/gcp.md +633 -633
- package/skills/infra/cloud-architect/references/multi-cloud.md +483 -483
- package/skills/infra/devops-engineer/SKILL.md +144 -144
- package/skills/infra/devops-engineer/references/deployment-strategies.md +241 -241
- package/skills/infra/devops-engineer/references/docker-patterns.md +113 -113
- package/skills/infra/devops-engineer/references/github-actions.md +139 -139
- package/skills/infra/devops-engineer/references/incident-response.md +331 -331
- package/skills/infra/devops-engineer/references/kubernetes.md +154 -154
- package/skills/infra/devops-engineer/references/platform-engineering.md +417 -417
- package/skills/infra/devops-engineer/references/release-automation.md +527 -527
- package/skills/infra/devops-engineer/references/terraform-iac.md +141 -141
- package/skills/infra/kubernetes-specialist/SKILL.md +241 -241
- package/skills/infra/kubernetes-specialist/references/configuration.md +452 -452
- package/skills/infra/kubernetes-specialist/references/cost-optimization.md +458 -458
- package/skills/infra/kubernetes-specialist/references/custom-operators.md +563 -563
- package/skills/infra/kubernetes-specialist/references/gitops.md +530 -530
- package/skills/infra/kubernetes-specialist/references/helm-charts.md +912 -912
- package/skills/infra/kubernetes-specialist/references/multi-cluster.md +507 -507
- package/skills/infra/kubernetes-specialist/references/networking.md +447 -447
- package/skills/infra/kubernetes-specialist/references/service-mesh.md +459 -459
- package/skills/infra/kubernetes-specialist/references/storage.md +535 -535
- package/skills/infra/kubernetes-specialist/references/troubleshooting.md +414 -414
- package/skills/infra/kubernetes-specialist/references/workloads.md +377 -377
- package/skills/infra/mcp-developer/SKILL.md +143 -143
- package/skills/infra/mcp-developer/references/protocol.md +244 -244
- package/skills/infra/mcp-developer/references/python-sdk.md +367 -367
- package/skills/infra/mcp-developer/references/resources.md +554 -554
- package/skills/infra/mcp-developer/references/tools.md +480 -480
- package/skills/infra/mcp-developer/references/typescript-sdk.md +350 -350
- package/skills/infra/monitoring-expert/SKILL.md +176 -176
- package/skills/infra/monitoring-expert/references/alerting-rules.md +141 -141
- package/skills/infra/monitoring-expert/references/application-profiling.md +331 -331
- package/skills/infra/monitoring-expert/references/capacity-planning.md +344 -344
- package/skills/infra/monitoring-expert/references/dashboards.md +126 -126
- package/skills/infra/monitoring-expert/references/opentelemetry.md +123 -123
- package/skills/infra/monitoring-expert/references/performance-testing.md +269 -269
- package/skills/infra/monitoring-expert/references/prometheus-metrics.md +136 -136
- package/skills/infra/monitoring-expert/references/structured-logging.md +142 -142
- package/skills/infra/sre-engineer/SKILL.md +181 -181
- package/skills/infra/sre-engineer/references/automation-toil.md +492 -492
- package/skills/infra/sre-engineer/references/error-budget-policy.md +334 -334
- package/skills/infra/sre-engineer/references/incident-chaos.md +576 -576
- package/skills/infra/sre-engineer/references/monitoring-alerting.md +424 -424
- package/skills/infra/sre-engineer/references/slo-sli-management.md +238 -238
- package/skills/infra/terraform-engineer/SKILL.md +143 -143
- package/skills/infra/terraform-engineer/references/best-practices.md +583 -583
- package/skills/infra/terraform-engineer/references/module-patterns.md +297 -297
- package/skills/infra/terraform-engineer/references/providers.md +452 -452
- package/skills/infra/terraform-engineer/references/state-management.md +371 -371
- package/skills/infra/terraform-engineer/references/testing.md +486 -486
- package/skills/infra/websocket-engineer/SKILL.md +168 -168
- package/skills/infra/websocket-engineer/references/alternatives.md +391 -391
- package/skills/infra/websocket-engineer/references/patterns.md +400 -400
- package/skills/infra/websocket-engineer/references/protocol.md +195 -195
- package/skills/infra/websocket-engineer/references/scaling.md +333 -333
- package/skills/infra/websocket-engineer/references/security.md +474 -474
- package/skills/java/java-architect/SKILL.md +132 -132
- package/skills/java/java-architect/references/jpa-optimization.md +393 -393
- package/skills/java/java-architect/references/reactive-webflux.md +356 -356
- package/skills/java/java-architect/references/spring-boot-setup.md +269 -269
- package/skills/java/java-architect/references/spring-security.md +445 -445
- package/skills/java/java-architect/references/testing-patterns.md +500 -500
- package/skills/java/kotlin-specialist/SKILL.md +147 -147
- package/skills/java/kotlin-specialist/references/android-compose.md +419 -419
- package/skills/java/kotlin-specialist/references/coroutines-flow.md +276 -276
- package/skills/java/kotlin-specialist/references/dsl-idioms.md +421 -421
- package/skills/java/kotlin-specialist/references/ktor-server.md +426 -426
- package/skills/java/kotlin-specialist/references/multiplatform-kmp.md +380 -380
- package/skills/java/spring-boot-engineer/SKILL.md +195 -195
- package/skills/java/spring-boot-engineer/references/cloud.md +498 -498
- package/skills/java/spring-boot-engineer/references/data.md +381 -381
- package/skills/java/spring-boot-engineer/references/security.md +459 -459
- package/skills/java/spring-boot-engineer/references/testing.md +545 -545
- package/skills/java/spring-boot-engineer/references/web.md +295 -295
- package/skills/javascript/javascript-pro/SKILL.md +132 -132
- package/skills/javascript/javascript-pro/references/async-patterns.md +334 -334
- package/skills/javascript/javascript-pro/references/browser-apis.md +398 -398
- package/skills/javascript/javascript-pro/references/modern-syntax.md +272 -272
- package/skills/javascript/javascript-pro/references/modules.md +357 -357
- package/skills/javascript/javascript-pro/references/node-essentials.md +471 -471
- package/skills/javascript/nestjs-expert/SKILL.md +206 -206
- package/skills/javascript/nestjs-expert/references/authentication.md +166 -166
- package/skills/javascript/nestjs-expert/references/controllers-routing.md +111 -111
- package/skills/javascript/nestjs-expert/references/dtos-validation.md +153 -153
- package/skills/javascript/nestjs-expert/references/migration-from-express.md +1237 -1237
- package/skills/javascript/nestjs-expert/references/services-di.md +140 -140
- package/skills/javascript/nestjs-expert/references/testing-patterns.md +186 -186
- package/skills/javascript/typescript-pro/SKILL.md +145 -145
- package/skills/javascript/typescript-pro/references/advanced-types.md +259 -259
- package/skills/javascript/typescript-pro/references/configuration.md +445 -445
- package/skills/javascript/typescript-pro/references/patterns.md +484 -484
- package/skills/javascript/typescript-pro/references/type-guards.md +352 -352
- package/skills/javascript/typescript-pro/references/utility-types.md +329 -329
- package/skills/php/laravel-specialist/SKILL.md +262 -262
- package/skills/php/laravel-specialist/references/eloquent.md +351 -351
- package/skills/php/laravel-specialist/references/livewire.md +512 -512
- package/skills/php/laravel-specialist/references/queues.md +423 -423
- package/skills/php/laravel-specialist/references/routing.md +362 -362
- package/skills/php/laravel-specialist/references/testing.md +522 -522
- package/skills/php/php-pro/SKILL.md +206 -206
- package/skills/php/php-pro/references/async-patterns.md +412 -412
- package/skills/php/php-pro/references/laravel-patterns.md +377 -377
- package/skills/php/php-pro/references/modern-php-features.md +323 -323
- package/skills/php/php-pro/references/symfony-patterns.md +466 -466
- package/skills/php/php-pro/references/testing-quality.md +466 -466
- package/skills/product/competitive-analysis/SKILL.md +257 -257
- package/skills/product/meeting-notes/SKILL.md +266 -266
- package/skills/product/prd-template/SKILL.md +150 -150
- package/skills/product/stakeholder-update/SKILL.md +225 -225
- package/skills/product/user-research-synthesis/SKILL.md +235 -235
- package/skills/python/django-expert/SKILL.md +162 -162
- package/skills/python/django-expert/references/authentication.md +145 -145
- package/skills/python/django-expert/references/drf-serializers.md +148 -148
- package/skills/python/django-expert/references/models-orm.md +151 -151
- package/skills/python/django-expert/references/testing-django.md +204 -204
- package/skills/python/django-expert/references/viewsets-views.md +153 -153
- package/skills/python/fastapi-expert/SKILL.md +185 -185
- package/skills/python/fastapi-expert/references/async-sqlalchemy.md +146 -146
- package/skills/python/fastapi-expert/references/authentication.md +159 -159
- package/skills/python/fastapi-expert/references/endpoints-routing.md +142 -142
- package/skills/python/fastapi-expert/references/migration-from-django.md +996 -996
- package/skills/python/fastapi-expert/references/pydantic-v2.md +135 -135
- package/skills/python/fastapi-expert/references/testing-async.md +159 -159
- package/skills/python/pandas-pro/SKILL.md +178 -178
- package/skills/python/pandas-pro/references/aggregation-groupby.md +545 -545
- package/skills/python/pandas-pro/references/data-cleaning.md +500 -500
- package/skills/python/pandas-pro/references/dataframe-operations.md +420 -420
- package/skills/python/pandas-pro/references/merging-joining.md +596 -596
- package/skills/python/pandas-pro/references/performance-optimization.md +597 -597
- package/skills/python/python-pro/SKILL.md +177 -177
- package/skills/python/python-pro/references/async-patterns.md +356 -356
- package/skills/python/python-pro/references/packaging.md +460 -460
- package/skills/python/python-pro/references/standard-library.md +378 -378
- package/skills/python/python-pro/references/testing.md +404 -404
- package/skills/python/python-pro/references/type-system.md +290 -290
- package/skills/quality/chaos-engineer/SKILL.md +182 -182
- package/skills/quality/chaos-engineer/references/chaos-tools.md +511 -511
- package/skills/quality/chaos-engineer/references/experiment-design.md +229 -229
- package/skills/quality/chaos-engineer/references/game-days.md +434 -434
- package/skills/quality/chaos-engineer/references/infrastructure-chaos.md +348 -348
- package/skills/quality/chaos-engineer/references/kubernetes-chaos.md +432 -432
- package/skills/quality/code-reviewer/SKILL.md +119 -119
- package/skills/quality/code-reviewer/references/common-issues.md +142 -142
- package/skills/quality/code-reviewer/references/feedback-examples.md +144 -144
- package/skills/quality/code-reviewer/references/receiving-feedback.md +238 -238
- package/skills/quality/code-reviewer/references/report-template.md +109 -109
- package/skills/quality/code-reviewer/references/review-checklist.md +88 -88
- package/skills/quality/code-reviewer/references/spec-compliance-review.md +258 -258
- package/skills/quality/playwright-expert/SKILL.md +169 -169
- package/skills/quality/playwright-expert/references/api-mocking.md +140 -140
- package/skills/quality/playwright-expert/references/configuration.md +155 -155
- package/skills/quality/playwright-expert/references/debugging-flaky.md +150 -150
- package/skills/quality/playwright-expert/references/page-object-model.md +152 -152
- package/skills/quality/playwright-expert/references/selectors-locators.md +119 -119
- package/skills/quality/secure-code-guardian/SKILL.md +191 -191
- package/skills/quality/secure-code-guardian/references/authentication.md +136 -136
- package/skills/quality/secure-code-guardian/references/input-validation.md +146 -146
- package/skills/quality/secure-code-guardian/references/owasp-prevention.md +135 -135
- package/skills/quality/secure-code-guardian/references/security-headers.md +133 -133
- package/skills/quality/secure-code-guardian/references/xss-csrf.md +157 -157
- package/skills/quality/security-reviewer/SKILL.md +103 -103
- package/skills/quality/security-reviewer/references/infrastructure-security.md +268 -268
- package/skills/quality/security-reviewer/references/penetration-testing.md +268 -268
- package/skills/quality/security-reviewer/references/report-template.md +170 -170
- package/skills/quality/security-reviewer/references/sast-tools.md +117 -117
- package/skills/quality/security-reviewer/references/secret-scanning.md +125 -125
- package/skills/quality/security-reviewer/references/vulnerability-patterns.md +152 -152
- package/skills/quality/senior-qa/README.md +196 -196
- package/skills/quality/senior-qa/SKILL.md +399 -399
- package/skills/quality/senior-qa/references/qa_best_practices.md +964 -964
- package/skills/quality/senior-qa/references/test_automation_patterns.md +1009 -1009
- package/skills/quality/senior-qa/references/testing_strategies.md +649 -649
- package/skills/quality/senior-qa/scripts/coverage_analyzer.py +836 -836
- package/skills/quality/senior-qa/scripts/e2e_test_scaffolder.py +820 -820
- package/skills/quality/senior-qa/scripts/test_suite_generator.py +605 -605
- package/skills/quality/tdd-guide/HOW_TO_USE.md +313 -313
- package/skills/quality/tdd-guide/README.md +680 -680
- package/skills/quality/tdd-guide/SKILL.md +122 -122
- package/skills/quality/tdd-guide/assets/expected_output.json +77 -77
- package/skills/quality/tdd-guide/assets/sample_input_python.json +39 -39
- package/skills/quality/tdd-guide/assets/sample_input_typescript.json +36 -36
- package/skills/quality/tdd-guide/references/ci-integration.md +195 -195
- package/skills/quality/tdd-guide/references/framework-guide.md +206 -206
- package/skills/quality/tdd-guide/references/tdd-best-practices.md +128 -128
- package/skills/quality/tdd-guide/scripts/coverage_analyzer.py +434 -434
- package/skills/quality/tdd-guide/scripts/fixture_generator.py +440 -440
- package/skills/quality/tdd-guide/scripts/format_detector.py +384 -384
- package/skills/quality/tdd-guide/scripts/framework_adapter.py +428 -428
- package/skills/quality/tdd-guide/scripts/metrics_calculator.py +456 -456
- package/skills/quality/tdd-guide/scripts/output_formatter.py +354 -354
- package/skills/quality/tdd-guide/scripts/tdd_workflow.py +474 -474
- package/skills/quality/tdd-guide/scripts/test_generator.py +438 -438
- package/skills/quality/test-master/SKILL.md +94 -94
- package/skills/quality/test-master/references/automation-frameworks.md +294 -294
- package/skills/quality/test-master/references/e2e-testing.md +128 -128
- package/skills/quality/test-master/references/integration-testing.md +120 -120
- package/skills/quality/test-master/references/performance-testing.md +118 -118
- package/skills/quality/test-master/references/qa-methodology.md +247 -247
- package/skills/quality/test-master/references/security-testing.md +127 -127
- package/skills/quality/test-master/references/tdd-iron-laws.md +174 -174
- package/skills/quality/test-master/references/test-reports.md +104 -104
- package/skills/quality/test-master/references/testing-anti-patterns.md +231 -231
- package/skills/quality/test-master/references/unit-testing.md +113 -113
- package/skills/ruby/rails-expert/SKILL.md +154 -154
- package/skills/ruby/rails-expert/references/active-record.md +244 -244
- package/skills/ruby/rails-expert/references/api-development.md +401 -401
- package/skills/ruby/rails-expert/references/background-jobs.md +272 -272
- package/skills/ruby/rails-expert/references/hotwire-turbo.md +228 -228
- package/skills/ruby/rails-expert/references/rspec-testing.md +367 -367
- package/skills/swift/swift-expert/SKILL.md +163 -163
- package/skills/swift/swift-expert/references/async-concurrency.md +360 -360
- package/skills/swift/swift-expert/references/memory-performance.md +377 -377
- package/skills/swift/swift-expert/references/protocol-oriented.md +354 -354
- package/skills/swift/swift-expert/references/swiftui-patterns.md +291 -291
- package/skills/swift/swift-expert/references/testing-patterns.md +399 -399
- package/skills/workflow/brainstorming/SKILL.md +164 -164
- package/skills/workflow/brainstorming/scripts/frame-template.html +214 -214
- package/skills/workflow/brainstorming/scripts/helper.js +88 -88
- package/skills/workflow/brainstorming/scripts/server.cjs +354 -354
- package/skills/workflow/brainstorming/scripts/start-server.sh +148 -148
- package/skills/workflow/brainstorming/scripts/stop-server.sh +56 -56
- package/skills/workflow/brainstorming/spec-document-reviewer-prompt.md +49 -49
- package/skills/workflow/brainstorming/visual-companion.md +287 -287
- package/skills/workflow/documentation/SKILL.md +45 -45
- package/skills/workflow/entropy-management/SKILL.md +115 -115
- package/skills/workflow/executing-plans/SKILL.md +70 -70
- package/skills/workflow/finishing-a-development-branch/SKILL.md +200 -200
- package/skills/workflow/receiving-code-review/SKILL.md +213 -213
- package/skills/workflow/requesting-code-review/SKILL.md +105 -105
- package/skills/workflow/requesting-code-review/code-reviewer.md +146 -146
- package/skills/workflow/requirement-engineering/SKILL.md +111 -111
- package/skills/workflow/systematic-debugging/CREATION-LOG.md +119 -119
- package/skills/workflow/systematic-debugging/SKILL.md +296 -296
- package/skills/workflow/systematic-debugging/condition-based-waiting-example.ts +158 -158
- package/skills/workflow/systematic-debugging/condition-based-waiting.md +115 -115
- package/skills/workflow/systematic-debugging/defense-in-depth.md +122 -122
- package/skills/workflow/systematic-debugging/find-polluter.sh +63 -63
- package/skills/workflow/systematic-debugging/root-cause-tracing.md +169 -169
- package/skills/workflow/systematic-debugging/test-academic.md +14 -14
- package/skills/workflow/systematic-debugging/test-pressure-1.md +58 -58
- package/skills/workflow/systematic-debugging/test-pressure-2.md +68 -68
- package/skills/workflow/systematic-debugging/test-pressure-3.md +69 -69
- package/skills/workflow/using-git-worktrees/SKILL.md +218 -218
- package/skills/workflow/verification-before-completion/SKILL.md +139 -139
- package/skills/workflow/writing-plans/SKILL.md +151 -151
- package/skills/workflow/writing-plans/plan-document-reviewer-prompt.md +49 -49
- package/skills/workflow/writing-skills/SKILL.md +655 -655
- package/skills/workflow/writing-skills/anthropic-best-practices.md +1150 -1150
- package/skills/workflow/writing-skills/examples/CLAUDE_MD_TESTING.md +189 -189
- package/skills/workflow/writing-skills/persuasion-principles.md +187 -187
- package/skills/workflow/writing-skills/render-graphs.js +168 -168
- package/skills/workflow/writing-skills/testing-skills-with-subagents.md +384 -384
|
@@ -1,833 +1,833 @@
|
|
|
1
|
-
# Experiment Tracking
|
|
2
|
-
|
|
3
|
-
---
|
|
4
|
-
|
|
5
|
-
## Overview
|
|
6
|
-
|
|
7
|
-
Experiment tracking enables reproducibility, comparison, and collaboration in ML development. It captures hyperparameters, metrics, artifacts, and model versions to ensure every experiment can be reproduced and compared.
|
|
8
|
-
|
|
9
|
-
## When to Use This Reference
|
|
10
|
-
|
|
11
|
-
- Setting up MLflow for experiment tracking
|
|
12
|
-
- Implementing Weights & Biases integration
|
|
13
|
-
- Creating model registries and versioning
|
|
14
|
-
- Comparing experiments and selecting models
|
|
15
|
-
- Building custom tracking solutions
|
|
16
|
-
|
|
17
|
-
## When NOT to Use
|
|
18
|
-
|
|
19
|
-
- Quick one-off experiments without reproducibility needs
|
|
20
|
-
- Simple scripts without hyperparameters
|
|
21
|
-
- Non-ML projects
|
|
22
|
-
|
|
23
|
-
---
|
|
24
|
-
|
|
25
|
-
## MLflow Integration
|
|
26
|
-
|
|
27
|
-
### Basic Experiment Tracking
|
|
28
|
-
|
|
29
|
-
```python
|
|
30
|
-
import mlflow
|
|
31
|
-
from mlflow.tracking import MlflowClient
|
|
32
|
-
from pathlib import Path
|
|
33
|
-
import json
|
|
34
|
-
|
|
35
|
-
class MLflowTracker:
|
|
36
|
-
"""MLflow experiment tracking wrapper."""
|
|
37
|
-
|
|
38
|
-
def __init__(
|
|
39
|
-
self,
|
|
40
|
-
experiment_name: str,
|
|
41
|
-
tracking_uri: str = "http://localhost:5000",
|
|
42
|
-
artifact_location: str = None,
|
|
43
|
-
):
|
|
44
|
-
mlflow.set_tracking_uri(tracking_uri)
|
|
45
|
-
|
|
46
|
-
# Create or get experiment
|
|
47
|
-
experiment = mlflow.get_experiment_by_name(experiment_name)
|
|
48
|
-
if experiment is None:
|
|
49
|
-
self.experiment_id = mlflow.create_experiment(
|
|
50
|
-
experiment_name,
|
|
51
|
-
artifact_location=artifact_location,
|
|
52
|
-
)
|
|
53
|
-
else:
|
|
54
|
-
self.experiment_id = experiment.experiment_id
|
|
55
|
-
|
|
56
|
-
mlflow.set_experiment(experiment_name)
|
|
57
|
-
self.client = MlflowClient()
|
|
58
|
-
self.run = None
|
|
59
|
-
|
|
60
|
-
def start_run(
|
|
61
|
-
self,
|
|
62
|
-
run_name: str = None,
|
|
63
|
-
tags: dict = None,
|
|
64
|
-
nested: bool = False,
|
|
65
|
-
) -> str:
|
|
66
|
-
"""Start a new MLflow run."""
|
|
67
|
-
self.run = mlflow.start_run(
|
|
68
|
-
run_name=run_name,
|
|
69
|
-
experiment_id=self.experiment_id,
|
|
70
|
-
nested=nested,
|
|
71
|
-
)
|
|
72
|
-
|
|
73
|
-
if tags:
|
|
74
|
-
mlflow.set_tags(tags)
|
|
75
|
-
|
|
76
|
-
return self.run.info.run_id
|
|
77
|
-
|
|
78
|
-
def end_run(self, status: str = "FINISHED") -> None:
|
|
79
|
-
"""End the current run."""
|
|
80
|
-
mlflow.end_run(status=status)
|
|
81
|
-
self.run = None
|
|
82
|
-
|
|
83
|
-
def log_params(self, params: dict) -> None:
|
|
84
|
-
"""Log hyperparameters."""
|
|
85
|
-
mlflow.log_params(params)
|
|
86
|
-
|
|
87
|
-
def log_metrics(self, metrics: dict, step: int = None) -> None:
|
|
88
|
-
"""Log metrics with optional step."""
|
|
89
|
-
for key, value in metrics.items():
|
|
90
|
-
mlflow.log_metric(key, value, step=step)
|
|
91
|
-
|
|
92
|
-
def log_artifact(self, local_path: str, artifact_path: str = None) -> None:
|
|
93
|
-
"""Log file or directory as artifact."""
|
|
94
|
-
mlflow.log_artifact(local_path, artifact_path)
|
|
95
|
-
|
|
96
|
-
def log_model(
|
|
97
|
-
self,
|
|
98
|
-
model,
|
|
99
|
-
artifact_path: str,
|
|
100
|
-
registered_model_name: str = None,
|
|
101
|
-
signature=None,
|
|
102
|
-
input_example=None,
|
|
103
|
-
) -> str:
|
|
104
|
-
"""Log model with optional registration."""
|
|
105
|
-
from mlflow.models import infer_signature
|
|
106
|
-
|
|
107
|
-
if signature is None and input_example is not None:
|
|
108
|
-
signature = infer_signature(input_example, model.predict(input_example))
|
|
109
|
-
|
|
110
|
-
model_info = mlflow.sklearn.log_model(
|
|
111
|
-
model,
|
|
112
|
-
artifact_path=artifact_path,
|
|
113
|
-
registered_model_name=registered_model_name,
|
|
114
|
-
signature=signature,
|
|
115
|
-
input_example=input_example,
|
|
116
|
-
)
|
|
117
|
-
|
|
118
|
-
return model_info.model_uri
|
|
119
|
-
|
|
120
|
-
# Usage example
|
|
121
|
-
def train_with_mlflow(
|
|
122
|
-
model,
|
|
123
|
-
X_train,
|
|
124
|
-
y_train,
|
|
125
|
-
X_val,
|
|
126
|
-
y_val,
|
|
127
|
-
params: dict,
|
|
128
|
-
):
|
|
129
|
-
"""Complete training run with MLflow tracking."""
|
|
130
|
-
tracker = MLflowTracker("my_experiment")
|
|
131
|
-
|
|
132
|
-
tracker.start_run(
|
|
133
|
-
run_name=f"run_{params['model_type']}",
|
|
134
|
-
tags={
|
|
135
|
-
"model_type": params["model_type"],
|
|
136
|
-
"dataset_version": "v1.0",
|
|
137
|
-
"author": "ml-team",
|
|
138
|
-
},
|
|
139
|
-
)
|
|
140
|
-
|
|
141
|
-
try:
|
|
142
|
-
# Log parameters
|
|
143
|
-
tracker.log_params(params)
|
|
144
|
-
|
|
145
|
-
# Train model
|
|
146
|
-
model.fit(X_train, y_train)
|
|
147
|
-
|
|
148
|
-
# Evaluate and log metrics
|
|
149
|
-
train_score = model.score(X_train, y_train)
|
|
150
|
-
val_score = model.score(X_val, y_val)
|
|
151
|
-
|
|
152
|
-
tracker.log_metrics({
|
|
153
|
-
"train_accuracy": train_score,
|
|
154
|
-
"val_accuracy": val_score,
|
|
155
|
-
})
|
|
156
|
-
|
|
157
|
-
# Log model
|
|
158
|
-
model_uri = tracker.log_model(
|
|
159
|
-
model,
|
|
160
|
-
artifact_path="model",
|
|
161
|
-
registered_model_name="my_model",
|
|
162
|
-
input_example=X_train[:5],
|
|
163
|
-
)
|
|
164
|
-
|
|
165
|
-
tracker.end_run()
|
|
166
|
-
return model_uri
|
|
167
|
-
|
|
168
|
-
except Exception as e:
|
|
169
|
-
tracker.end_run(status="FAILED")
|
|
170
|
-
raise
|
|
171
|
-
```
|
|
172
|
-
|
|
173
|
-
### PyTorch Model Logging
|
|
174
|
-
|
|
175
|
-
```python
|
|
176
|
-
import mlflow.pytorch
|
|
177
|
-
import torch
|
|
178
|
-
|
|
179
|
-
def log_pytorch_model(
|
|
180
|
-
model: torch.nn.Module,
|
|
181
|
-
artifact_path: str,
|
|
182
|
-
registered_model_name: str = None,
|
|
183
|
-
sample_input: torch.Tensor = None,
|
|
184
|
-
) -> str:
|
|
185
|
-
"""Log PyTorch model with signature inference."""
|
|
186
|
-
from mlflow.models import infer_signature
|
|
187
|
-
|
|
188
|
-
# Create signature from sample input
|
|
189
|
-
signature = None
|
|
190
|
-
if sample_input is not None:
|
|
191
|
-
model.eval()
|
|
192
|
-
with torch.no_grad():
|
|
193
|
-
sample_output = model(sample_input)
|
|
194
|
-
|
|
195
|
-
signature = infer_signature(
|
|
196
|
-
sample_input.numpy(),
|
|
197
|
-
sample_output.numpy(),
|
|
198
|
-
)
|
|
199
|
-
|
|
200
|
-
model_info = mlflow.pytorch.log_model(
|
|
201
|
-
model,
|
|
202
|
-
artifact_path=artifact_path,
|
|
203
|
-
registered_model_name=registered_model_name,
|
|
204
|
-
signature=signature,
|
|
205
|
-
)
|
|
206
|
-
|
|
207
|
-
return model_info.model_uri
|
|
208
|
-
|
|
209
|
-
def load_pytorch_model(model_uri: str, device: str = "cpu") -> torch.nn.Module:
|
|
210
|
-
"""Load PyTorch model from MLflow."""
|
|
211
|
-
model = mlflow.pytorch.load_model(model_uri, map_location=device)
|
|
212
|
-
return model
|
|
213
|
-
```
|
|
214
|
-
|
|
215
|
-
### Model Registry Operations
|
|
216
|
-
|
|
217
|
-
```python
|
|
218
|
-
from mlflow.tracking import MlflowClient
|
|
219
|
-
from mlflow.entities.model_registry import ModelVersion
|
|
220
|
-
|
|
221
|
-
class ModelRegistry:
|
|
222
|
-
"""MLflow Model Registry wrapper."""
|
|
223
|
-
|
|
224
|
-
def __init__(self, tracking_uri: str = "http://localhost:5000"):
|
|
225
|
-
mlflow.set_tracking_uri(tracking_uri)
|
|
226
|
-
self.client = MlflowClient()
|
|
227
|
-
|
|
228
|
-
def register_model(
|
|
229
|
-
self,
|
|
230
|
-
model_uri: str,
|
|
231
|
-
name: str,
|
|
232
|
-
tags: dict = None,
|
|
233
|
-
description: str = None,
|
|
234
|
-
) -> ModelVersion:
|
|
235
|
-
"""Register a new model version."""
|
|
236
|
-
result = mlflow.register_model(model_uri, name)
|
|
237
|
-
|
|
238
|
-
if tags:
|
|
239
|
-
for key, value in tags.items():
|
|
240
|
-
self.client.set_model_version_tag(name, result.version, key, value)
|
|
241
|
-
|
|
242
|
-
if description:
|
|
243
|
-
self.client.update_model_version(
|
|
244
|
-
name,
|
|
245
|
-
result.version,
|
|
246
|
-
description=description,
|
|
247
|
-
)
|
|
248
|
-
|
|
249
|
-
return result
|
|
250
|
-
|
|
251
|
-
def transition_model_stage(
|
|
252
|
-
self,
|
|
253
|
-
name: str,
|
|
254
|
-
version: str,
|
|
255
|
-
stage: str,
|
|
256
|
-
archive_existing: bool = True,
|
|
257
|
-
) -> ModelVersion:
|
|
258
|
-
"""Transition model to new stage (Staging, Production, Archived)."""
|
|
259
|
-
return self.client.transition_model_version_stage(
|
|
260
|
-
name=name,
|
|
261
|
-
version=version,
|
|
262
|
-
stage=stage,
|
|
263
|
-
archive_existing_versions=archive_existing,
|
|
264
|
-
)
|
|
265
|
-
|
|
266
|
-
def get_latest_version(
|
|
267
|
-
self,
|
|
268
|
-
name: str,
|
|
269
|
-
stages: list[str] = None,
|
|
270
|
-
) -> list[ModelVersion]:
|
|
271
|
-
"""Get latest model versions by stage."""
|
|
272
|
-
return self.client.get_latest_versions(name, stages=stages)
|
|
273
|
-
|
|
274
|
-
def load_production_model(self, name: str) -> any:
|
|
275
|
-
"""Load the production model."""
|
|
276
|
-
model_uri = f"models:/{name}/Production"
|
|
277
|
-
return mlflow.pyfunc.load_model(model_uri)
|
|
278
|
-
|
|
279
|
-
def compare_versions(
|
|
280
|
-
self,
|
|
281
|
-
name: str,
|
|
282
|
-
version_a: str,
|
|
283
|
-
version_b: str,
|
|
284
|
-
) -> dict:
|
|
285
|
-
"""Compare two model versions."""
|
|
286
|
-
v_a = self.client.get_model_version(name, version_a)
|
|
287
|
-
v_b = self.client.get_model_version(name, version_b)
|
|
288
|
-
|
|
289
|
-
run_a = self.client.get_run(v_a.run_id)
|
|
290
|
-
run_b = self.client.get_run(v_b.run_id)
|
|
291
|
-
|
|
292
|
-
return {
|
|
293
|
-
"version_a": {
|
|
294
|
-
"version": version_a,
|
|
295
|
-
"metrics": run_a.data.metrics,
|
|
296
|
-
"params": run_a.data.params,
|
|
297
|
-
},
|
|
298
|
-
"version_b": {
|
|
299
|
-
"version": version_b,
|
|
300
|
-
"metrics": run_b.data.metrics,
|
|
301
|
-
"params": run_b.data.params,
|
|
302
|
-
},
|
|
303
|
-
}
|
|
304
|
-
```
|
|
305
|
-
|
|
306
|
-
---
|
|
307
|
-
|
|
308
|
-
## Weights & Biases Integration
|
|
309
|
-
|
|
310
|
-
### Basic W&B Tracking
|
|
311
|
-
|
|
312
|
-
```python
|
|
313
|
-
import wandb
|
|
314
|
-
from pathlib import Path
|
|
315
|
-
|
|
316
|
-
class WandbTracker:
|
|
317
|
-
"""Weights & Biases experiment tracking wrapper."""
|
|
318
|
-
|
|
319
|
-
def __init__(
|
|
320
|
-
self,
|
|
321
|
-
project: str,
|
|
322
|
-
entity: str = None,
|
|
323
|
-
config: dict = None,
|
|
324
|
-
):
|
|
325
|
-
self.project = project
|
|
326
|
-
self.entity = entity
|
|
327
|
-
self.config = config
|
|
328
|
-
self.run = None
|
|
329
|
-
|
|
330
|
-
def start_run(
|
|
331
|
-
self,
|
|
332
|
-
name: str = None,
|
|
333
|
-
tags: list[str] = None,
|
|
334
|
-
group: str = None,
|
|
335
|
-
job_type: str = "train",
|
|
336
|
-
resume: str = None,
|
|
337
|
-
) -> wandb.Run:
|
|
338
|
-
"""Initialize W&B run."""
|
|
339
|
-
self.run = wandb.init(
|
|
340
|
-
project=self.project,
|
|
341
|
-
entity=self.entity,
|
|
342
|
-
name=name,
|
|
343
|
-
config=self.config,
|
|
344
|
-
tags=tags,
|
|
345
|
-
group=group,
|
|
346
|
-
job_type=job_type,
|
|
347
|
-
resume=resume,
|
|
348
|
-
)
|
|
349
|
-
return self.run
|
|
350
|
-
|
|
351
|
-
def log(self, data: dict, step: int = None, commit: bool = True) -> None:
|
|
352
|
-
"""Log metrics and data."""
|
|
353
|
-
wandb.log(data, step=step, commit=commit)
|
|
354
|
-
|
|
355
|
-
def log_artifact(
|
|
356
|
-
self,
|
|
357
|
-
name: str,
|
|
358
|
-
artifact_type: str,
|
|
359
|
-
path: str,
|
|
360
|
-
metadata: dict = None,
|
|
361
|
-
) -> wandb.Artifact:
|
|
362
|
-
"""Log artifact (model, dataset, etc.)."""
|
|
363
|
-
artifact = wandb.Artifact(
|
|
364
|
-
name=name,
|
|
365
|
-
type=artifact_type,
|
|
366
|
-
metadata=metadata,
|
|
367
|
-
)
|
|
368
|
-
|
|
369
|
-
if Path(path).is_dir():
|
|
370
|
-
artifact.add_dir(path)
|
|
371
|
-
else:
|
|
372
|
-
artifact.add_file(path)
|
|
373
|
-
|
|
374
|
-
self.run.log_artifact(artifact)
|
|
375
|
-
return artifact
|
|
376
|
-
|
|
377
|
-
def log_model(
|
|
378
|
-
self,
|
|
379
|
-
model_path: str,
|
|
380
|
-
name: str,
|
|
381
|
-
metadata: dict = None,
|
|
382
|
-
aliases: list[str] = None,
|
|
383
|
-
) -> wandb.Artifact:
|
|
384
|
-
"""Log model artifact with aliases."""
|
|
385
|
-
artifact = wandb.Artifact(
|
|
386
|
-
name=name,
|
|
387
|
-
type="model",
|
|
388
|
-
metadata=metadata,
|
|
389
|
-
)
|
|
390
|
-
|
|
391
|
-
if Path(model_path).is_dir():
|
|
392
|
-
artifact.add_dir(model_path)
|
|
393
|
-
else:
|
|
394
|
-
artifact.add_file(model_path)
|
|
395
|
-
|
|
396
|
-
self.run.log_artifact(artifact, aliases=aliases or ["latest"])
|
|
397
|
-
return artifact
|
|
398
|
-
|
|
399
|
-
def watch_model(
|
|
400
|
-
self,
|
|
401
|
-
model,
|
|
402
|
-
log: str = "all",
|
|
403
|
-
log_freq: int = 100,
|
|
404
|
-
) -> None:
|
|
405
|
-
"""Watch model for gradient and parameter logging."""
|
|
406
|
-
wandb.watch(model, log=log, log_freq=log_freq)
|
|
407
|
-
|
|
408
|
-
def finish(self, exit_code: int = 0) -> None:
|
|
409
|
-
"""Finish the run."""
|
|
410
|
-
wandb.finish(exit_code=exit_code)
|
|
411
|
-
|
|
412
|
-
# Usage with PyTorch
|
|
413
|
-
def train_with_wandb(
|
|
414
|
-
model: torch.nn.Module,
|
|
415
|
-
train_loader,
|
|
416
|
-
val_loader,
|
|
417
|
-
config: dict,
|
|
418
|
-
):
|
|
419
|
-
"""Training with W&B tracking."""
|
|
420
|
-
tracker = WandbTracker(
|
|
421
|
-
project="my-project",
|
|
422
|
-
config=config,
|
|
423
|
-
)
|
|
424
|
-
|
|
425
|
-
tracker.start_run(
|
|
426
|
-
name=f"experiment_{config['model_type']}",
|
|
427
|
-
tags=["baseline", config["model_type"]],
|
|
428
|
-
group="hyperparameter_search",
|
|
429
|
-
)
|
|
430
|
-
|
|
431
|
-
# Watch model gradients
|
|
432
|
-
tracker.watch_model(model)
|
|
433
|
-
|
|
434
|
-
for epoch in range(config["epochs"]):
|
|
435
|
-
model.train()
|
|
436
|
-
for batch_idx, (data, target) in enumerate(train_loader):
|
|
437
|
-
# Training step
|
|
438
|
-
loss = train_step(model, data, target)
|
|
439
|
-
|
|
440
|
-
tracker.log({
|
|
441
|
-
"train/loss": loss,
|
|
442
|
-
"train/epoch": epoch,
|
|
443
|
-
})
|
|
444
|
-
|
|
445
|
-
# Validation
|
|
446
|
-
val_metrics = evaluate(model, val_loader)
|
|
447
|
-
tracker.log({
|
|
448
|
-
"val/loss": val_metrics["loss"],
|
|
449
|
-
"val/accuracy": val_metrics["accuracy"],
|
|
450
|
-
"epoch": epoch,
|
|
451
|
-
})
|
|
452
|
-
|
|
453
|
-
# Save and log model
|
|
454
|
-
torch.save(model.state_dict(), "model.pt")
|
|
455
|
-
tracker.log_model(
|
|
456
|
-
"model.pt",
|
|
457
|
-
name="trained_model",
|
|
458
|
-
metadata={"accuracy": val_metrics["accuracy"]},
|
|
459
|
-
aliases=["latest", "best"],
|
|
460
|
-
)
|
|
461
|
-
|
|
462
|
-
tracker.finish()
|
|
463
|
-
```
|
|
464
|
-
|
|
465
|
-
### W&B Sweeps for Hyperparameter Tuning
|
|
466
|
-
|
|
467
|
-
```python
|
|
468
|
-
import wandb
|
|
469
|
-
|
|
470
|
-
sweep_config = {
|
|
471
|
-
"method": "bayes", # bayes, grid, random
|
|
472
|
-
"metric": {
|
|
473
|
-
"name": "val/loss",
|
|
474
|
-
"goal": "minimize",
|
|
475
|
-
},
|
|
476
|
-
"parameters": {
|
|
477
|
-
"learning_rate": {
|
|
478
|
-
"distribution": "log_uniform_values",
|
|
479
|
-
"min": 1e-5,
|
|
480
|
-
"max": 1e-2,
|
|
481
|
-
},
|
|
482
|
-
"batch_size": {
|
|
483
|
-
"values": [16, 32, 64, 128],
|
|
484
|
-
},
|
|
485
|
-
"hidden_size": {
|
|
486
|
-
"values": [128, 256, 512],
|
|
487
|
-
},
|
|
488
|
-
"dropout": {
|
|
489
|
-
"distribution": "uniform",
|
|
490
|
-
"min": 0.1,
|
|
491
|
-
"max": 0.5,
|
|
492
|
-
},
|
|
493
|
-
},
|
|
494
|
-
"early_terminate": {
|
|
495
|
-
"type": "hyperband",
|
|
496
|
-
"min_iter": 3,
|
|
497
|
-
},
|
|
498
|
-
}
|
|
499
|
-
|
|
500
|
-
def sweep_train():
|
|
501
|
-
"""Training function for sweep."""
|
|
502
|
-
with wandb.init() as run:
|
|
503
|
-
config = wandb.config
|
|
504
|
-
|
|
505
|
-
model = build_model(
|
|
506
|
-
hidden_size=config.hidden_size,
|
|
507
|
-
dropout=config.dropout,
|
|
508
|
-
)
|
|
509
|
-
|
|
510
|
-
optimizer = torch.optim.Adam(
|
|
511
|
-
model.parameters(),
|
|
512
|
-
lr=config.learning_rate,
|
|
513
|
-
)
|
|
514
|
-
|
|
515
|
-
train_loader = DataLoader(train_dataset, batch_size=config.batch_size)
|
|
516
|
-
|
|
517
|
-
for epoch in range(10):
|
|
518
|
-
loss = train_epoch(model, train_loader, optimizer)
|
|
519
|
-
val_loss = evaluate(model, val_loader)
|
|
520
|
-
|
|
521
|
-
wandb.log({
|
|
522
|
-
"train/loss": loss,
|
|
523
|
-
"val/loss": val_loss,
|
|
524
|
-
"epoch": epoch,
|
|
525
|
-
})
|
|
526
|
-
|
|
527
|
-
# Run sweep
|
|
528
|
-
sweep_id = wandb.sweep(sweep_config, project="my-project")
|
|
529
|
-
wandb.agent(sweep_id, function=sweep_train, count=50)
|
|
530
|
-
```
|
|
531
|
-
|
|
532
|
-
---
|
|
533
|
-
|
|
534
|
-
## Custom Experiment Tracking
|
|
535
|
-
|
|
536
|
-
### Lightweight Tracker
|
|
537
|
-
|
|
538
|
-
```python
|
|
539
|
-
import json
|
|
540
|
-
from datetime import datetime
|
|
541
|
-
from pathlib import Path
|
|
542
|
-
from dataclasses import dataclass, field, asdict
|
|
543
|
-
from typing import Optional
|
|
544
|
-
import hashlib
|
|
545
|
-
import uuid
|
|
546
|
-
|
|
547
|
-
@dataclass
|
|
548
|
-
class Experiment:
|
|
549
|
-
"""Experiment metadata and results."""
|
|
550
|
-
experiment_id: str
|
|
551
|
-
name: str
|
|
552
|
-
params: dict
|
|
553
|
-
metrics: dict = field(default_factory=dict)
|
|
554
|
-
artifacts: list = field(default_factory=list)
|
|
555
|
-
tags: dict = field(default_factory=dict)
|
|
556
|
-
start_time: str = field(default_factory=lambda: datetime.utcnow().isoformat())
|
|
557
|
-
end_time: Optional[str] = None
|
|
558
|
-
status: str = "running"
|
|
559
|
-
|
|
560
|
-
def to_dict(self) -> dict:
|
|
561
|
-
return asdict(self)
|
|
562
|
-
|
|
563
|
-
class SimpleTracker:
    """Lightweight file-based experiment tracker.

    Every experiment gets its own directory
    ``<experiments_dir>/<experiment_id>`` holding an ``experiment.json``
    snapshot and, once artifacts are logged, an ``artifacts/`` sub-directory
    with copies of the logged files or directories.
    """

    def __init__(self, experiments_dir: str = "./experiments"):
        """Create the root directory if needed; no experiment is active yet."""
        self.experiments_dir = Path(experiments_dir)
        self.experiments_dir.mkdir(parents=True, exist_ok=True)
        self.current_experiment: Optional[Experiment] = None

    @staticmethod
    def _utc_timestamp() -> str:
        """Return the current UTC time as a naive ISO-8601 string."""
        from datetime import timezone  # local: the file only imports ``datetime``

        # ``datetime.utcnow()`` is deprecated since Python 3.12. Dropping the
        # tzinfo keeps the string format identical to what utcnow() produced.
        return datetime.now(timezone.utc).replace(tzinfo=None).isoformat()

    def start_experiment(
        self,
        name: str,
        params: dict,
        tags: Optional[dict] = None,
    ) -> Experiment:
        """Start a new experiment, persist it and make it the active one.

        Args:
            name: Human-readable experiment name.
            params: Hyperparameters to store with the experiment.
            tags: Optional tags; ``None`` is stored as an empty dict.

        Returns:
            The newly created experiment.
        """
        # IDs are only 8 hex characters, so collisions are possible. Insist on
        # creating a brand-new directory instead of silently reusing (and then
        # overwriting) the record of an existing experiment.
        while True:
            experiment_id = str(uuid.uuid4())[:8]
            try:
                (self.experiments_dir / experiment_id).mkdir()
            except FileExistsError:
                continue
            break

        self.current_experiment = Experiment(
            experiment_id=experiment_id,
            name=name,
            params=params,
            tags=tags or {},
        )
        self._save_experiment()
        return self.current_experiment

    def log_metrics(self, metrics: dict, step: Optional[int] = None) -> None:
        """Append one timestamped data point per metric and persist the result.

        Raises:
            ValueError: If no experiment is active.
        """
        if self.current_experiment is None:
            raise ValueError("No active experiment")

        history = self.current_experiment.metrics
        for key, value in metrics.items():
            history.setdefault(key, []).append({
                "value": value,
                "step": step,
                "timestamp": self._utc_timestamp(),
            })

        self._save_experiment()

    def log_artifact(self, path: str, name: Optional[str] = None) -> str:
        """Copy a file or directory into the experiment's ``artifacts`` folder.

        Logging the same artifact name again refreshes the stored copy instead
        of failing, and the artifact is recorded only once.

        Args:
            path: File or directory to copy.
            name: Name of the copy; defaults to the last component of ``path``.

        Returns:
            The destination path as a string.

        Raises:
            ValueError: If no experiment is active.
        """
        if self.current_experiment is None:
            raise ValueError("No active experiment")

        import shutil

        source = Path(path)
        artifact_name = name or source.name
        exp_dir = self.experiments_dir / self.current_experiment.experiment_id
        dest = exp_dir / "artifacts" / artifact_name
        dest.parent.mkdir(parents=True, exist_ok=True)

        if source.is_dir():
            # dirs_exist_ok: a second log of the same directory must not raise
            # FileExistsError.
            shutil.copytree(source, dest, dirs_exist_ok=True)
        else:
            shutil.copy2(source, dest)

        recorded = str(dest)
        if recorded not in self.current_experiment.artifacts:
            self.current_experiment.artifacts.append(recorded)
        self._save_experiment()

        return recorded

    def end_experiment(self, status: str = "completed") -> None:
        """Stamp status and end time, persist, and clear the active experiment.

        Does nothing when no experiment is active.
        """
        if self.current_experiment is None:
            return

        self.current_experiment.status = status
        self.current_experiment.end_time = self._utc_timestamp()
        self._save_experiment()
        self.current_experiment = None

    def _save_experiment(self) -> None:
        """Write the active experiment to ``experiment.json`` (no-op if none)."""
        if self.current_experiment is None:
            return

        exp_dir = self.experiments_dir / self.current_experiment.experiment_id
        target = exp_dir / "experiment.json"
        scratch = exp_dir / "experiment.json.tmp"
        # Serialize to a scratch file first: if json.dump fails part-way (for
        # example on a non-serializable metric value) the previously saved
        # record stays intact instead of being truncated.
        with open(scratch, "w", encoding="utf-8") as f:
            json.dump(self.current_experiment.to_dict(), f, indent=2)
        scratch.replace(target)

    def load_experiment(self, experiment_id: str) -> Experiment:
        """Load the experiment stored under ``experiment_id``."""
        exp_file = self.experiments_dir / experiment_id / "experiment.json"
        with open(exp_file, encoding="utf-8") as f:
            data = json.load(f)
        return Experiment(**data)

    def list_experiments(self, tags: Optional[dict] = None) -> list[Experiment]:
        """List stored experiments, newest first.

        Args:
            tags: When given, only experiments whose tags contain every
                key/value pair are returned.
        """
        experiments = []

        for exp_dir in self.experiments_dir.iterdir():
            # Skip stray files and directories without a saved record.
            if not exp_dir.is_dir() or not (exp_dir / "experiment.json").exists():
                continue

            exp = self.load_experiment(exp_dir.name)
            if tags and any(exp.tags.get(k) != v for k, v in tags.items()):
                continue

            experiments.append(exp)

        return sorted(experiments, key=lambda x: x.start_time, reverse=True)

    def compare_experiments(self, experiment_ids: list[str]) -> dict:
        """Map each experiment id to its name, params and last value per metric."""
        comparison = {}

        for exp_id in experiment_ids:
            exp = self.load_experiment(exp_id)
            comparison[exp_id] = {
                "name": exp.name,
                "params": exp.params,
                "final_metrics": {
                    # The last logged point is the "final" value; an empty
                    # history yields None.
                    k: v[-1]["value"] if v else None
                    for k, v in exp.metrics.items()
                },
            }

        return comparison
|
|
699
|
-
```
|
|
700
|
-
|
|
701
|
-
---
|
|
702
|
-
|
|
703
|
-
## Experiment Comparison and Analysis
|
|
704
|
-
|
|
705
|
-
### Metrics Comparison
|
|
706
|
-
|
|
707
|
-
```python
|
|
708
|
-
import pandas as pd
|
|
709
|
-
import matplotlib.pyplot as plt
|
|
710
|
-
from mlflow.tracking import MlflowClient
|
|
711
|
-
|
|
712
|
-
def compare_runs(
    experiment_name: str,
    metric_keys: list[str],
    n_runs: int = 10,
) -> pd.DataFrame:
    """Collect the most recent runs of an MLflow experiment into a DataFrame.

    Each row holds the run id, name, status and start time, followed by the
    run's parameters and the requested metrics (``None`` when a run did not
    log a metric).

    Args:
        experiment_name: Name of the MLflow experiment to inspect.
        metric_keys: Metric names to include as columns.
        n_runs: Maximum number of runs to fetch, newest first.

    Raises:
        ValueError: If no experiment named ``experiment_name`` exists.
    """
    client = MlflowClient()
    experiment = client.get_experiment_by_name(experiment_name)
    # The lookup returns None for unknown names; fail with a clear message
    # instead of an AttributeError on ``experiment.experiment_id`` below.
    if experiment is None:
        raise ValueError(f"MLflow experiment {experiment_name!r} does not exist")

    runs = client.search_runs(
        experiment_ids=[experiment.experiment_id],
        order_by=["start_time DESC"],
        max_results=n_runs,
    )

    rows = []
    for run in runs:
        row = {
            "run_id": run.info.run_id,
            "run_name": run.info.run_name,
            "status": run.info.status,
            "start_time": run.info.start_time,
        }
        # Later updates win on a key clash: params over run info, metrics over params.
        row.update(run.data.params)
        row.update({k: run.data.metrics.get(k) for k in metric_keys})
        rows.append(row)

    return pd.DataFrame(rows)
|
|
740
|
-
|
|
741
|
-
def plot_metric_comparison(
    runs_df: pd.DataFrame,
    metric: str,
    group_by: str = None,
) -> plt.Figure:
    """Draw a bar chart of ``metric`` per run and return the figure.

    Args:
        runs_df: Frame with a ``run_name`` column and a ``metric`` column.
        metric: Column to plot on the y axis.
        group_by: Optional column; when set, each group is drawn as its own
            labelled bar series and a legend is added.
    """
    fig, ax = plt.subplots(figsize=(10, 6))

    if not group_by:
        ax.bar(runs_df["run_name"], runs_df[metric])
    else:
        for group_value, subset in runs_df.groupby(group_by):
            ax.bar(subset["run_name"], subset[metric], label=str(group_value))
        ax.legend(title=group_by)

    ax.set(xlabel="Run", ylabel=metric, title=f"Comparison of {metric}")

    # Slanted, right-aligned tick labels so run names do not overlap.
    plt.xticks(rotation=45, ha="right")
    plt.tight_layout()

    return fig
|
|
763
|
-
```
|
|
764
|
-
|
|
765
|
-
---
|
|
766
|
-
|
|
767
|
-
## Best Practices
|
|
768
|
-
|
|
769
|
-
### What to Track
|
|
770
|
-
|
|
771
|
-
```python
|
|
772
|
-
# Always track: hyperparameters, metrics and artifacts every run should record.
REQUIRED_PARAMS = [
    "learning_rate", "batch_size", "epochs",
    "model_architecture", "optimizer",
    "random_seed", "dataset_version",
]

REQUIRED_METRICS = ["train_loss", "val_loss", "train_accuracy", "val_accuracy"]

REQUIRED_ARTIFACTS = ["model_checkpoint", "training_config", "requirements.txt"]

# Recommended tags (the values are placeholders showing the expected content).
RECOMMENDED_TAGS = dict(
    author="username",
    environment="dev|staging|prod",
    model_type="classification|regression|etc",
    dataset="dataset_name",
    git_commit="commit_hash",
)
|
|
804
|
-
```
|
|
805
|
-
|
|
806
|
-
### Experiment Naming Conventions
|
|
807
|
-
|
|
808
|
-
```python
|
|
809
|
-
# Good naming patterns: each assignment shows an alternative scheme.
# The placeholders (model_type, dataset, ...) are supplied by the surrounding code.
run_name = f"{model_type}_{dataset}_{timestamp}"
run_name = f"exp_{experiment_number:03d}_{description}"
run_name = f"{feature_flag}_{ablation_type}_{seed}"

# Organize with groups and tags
tags = dict(
    project="recommendation_engine",
    sprint="sprint_42",
    hypothesis="larger_embedding_helps",
)
|
|
820
|
-
```
|
|
821
|
-
|
|
822
|
-
---
|
|
823
|
-
|
|
824
|
-
## Related References
|
|
825
|
-
|
|
826
|
-
- `training-pipelines.md` - Integrating tracking with training
|
|
827
|
-
- `model-validation.md` - Validating tracked models
|
|
828
|
-
- `pipeline-orchestration.md` - Tracking in automated pipelines
|
|
829
|
-
|
|
830
|
-
## Cross-Reference Skills
|
|
831
|
-
|
|
832
|
-
- **DevOps Engineer** - MLflow server deployment
|
|
833
|
-
- **Data Engineer** - Artifact storage integration
|
|
1
|
+
# Experiment Tracking
|
|
2
|
+
|
|
3
|
+
---
|
|
4
|
+
|
|
5
|
+
## Overview
|
|
6
|
+
|
|
7
|
+
Experiment tracking enables reproducibility, comparison, and collaboration in ML development. It captures hyperparameters, metrics, artifacts, and model versions to ensure every experiment can be reproduced and compared.
|
|
8
|
+
|
|
9
|
+
## When to Use This Reference
|
|
10
|
+
|
|
11
|
+
- Setting up MLflow for experiment tracking
|
|
12
|
+
- Implementing Weights & Biases integration
|
|
13
|
+
- Creating model registries and versioning
|
|
14
|
+
- Comparing experiments and selecting models
|
|
15
|
+
- Building custom tracking solutions
|
|
16
|
+
|
|
17
|
+
## When NOT to Use
|
|
18
|
+
|
|
19
|
+
- Quick one-off experiments without reproducibility needs
|
|
20
|
+
- Simple scripts without hyperparameters
|
|
21
|
+
- Non-ML projects
|
|
22
|
+
|
|
23
|
+
---
|
|
24
|
+
|
|
25
|
+
## MLflow Integration
|
|
26
|
+
|
|
27
|
+
### Basic Experiment Tracking
|
|
28
|
+
|
|
29
|
+
```python
|
|
30
|
+
import mlflow
|
|
31
|
+
from mlflow.tracking import MlflowClient
|
|
32
|
+
from pathlib import Path
|
|
33
|
+
import json
|
|
34
|
+
|
|
35
|
+
class MLflowTracker:
    """Thin wrapper around the MLflow fluent API for one experiment.

    Constructing the tracker points MLflow at ``tracking_uri``, creates the
    experiment when it does not exist yet and makes it the active experiment.
    """

    def __init__(
        self,
        experiment_name: str,
        tracking_uri: str = "http://localhost:5000",
        artifact_location: str = None,
    ):
        """Connect to the tracking server and select (or create) the experiment.

        Args:
            experiment_name: Experiment to log into.
            tracking_uri: Address of the MLflow tracking server.
            artifact_location: Artifact root used only when the experiment
                has to be created.
        """
        mlflow.set_tracking_uri(tracking_uri)

        # Create or get experiment
        experiment = mlflow.get_experiment_by_name(experiment_name)
        if experiment is None:
            self.experiment_id = mlflow.create_experiment(
                experiment_name,
                artifact_location=artifact_location,
            )
        else:
            self.experiment_id = experiment.experiment_id

        mlflow.set_experiment(experiment_name)
        self.client = MlflowClient()
        self.run = None

    def start_run(
        self,
        run_name: str = None,
        tags: dict = None,
        nested: bool = False,
    ) -> str:
        """Start a new MLflow run in this experiment and return its run id."""
        self.run = mlflow.start_run(
            run_name=run_name,
            experiment_id=self.experiment_id,
            nested=nested,
        )

        if tags:
            mlflow.set_tags(tags)

        return self.run.info.run_id

    def end_run(self, status: str = "FINISHED") -> None:
        """End the current run with the given status and forget it."""
        mlflow.end_run(status=status)
        self.run = None

    def log_params(self, params: dict) -> None:
        """Log hyperparameters."""
        mlflow.log_params(params)

    def log_metrics(self, metrics: dict, step: int = None) -> None:
        """Log a batch of metrics, optionally tagged with a step.

        An empty ``metrics`` mapping is a no-op.
        """
        if not metrics:
            return
        # One batched call instead of one tracking request per metric.
        mlflow.log_metrics(metrics, step=step)

    def log_artifact(self, local_path: str, artifact_path: str = None) -> None:
        """Log the file or directory at ``local_path`` as an artifact."""
        mlflow.log_artifact(local_path, artifact_path)

    def log_model(
        self,
        model,
        artifact_path: str,
        registered_model_name: str = None,
        signature=None,
        input_example=None,
    ) -> str:
        """Log ``model`` with the scikit-learn flavor and return its model URI.

        Args:
            model: Fitted estimator; ``model.predict`` is called when a
                signature has to be inferred.
            artifact_path: Run-relative path under which the model is stored.
            registered_model_name: When set, also register the model under
                this name.
            signature: Explicit model signature; inferred from
                ``input_example`` when omitted.
            input_example: Sample input stored with the model.
        """
        if signature is None and input_example is not None:
            # Imported lazily: only needed when the signature must be inferred.
            from mlflow.models import infer_signature

            signature = infer_signature(input_example, model.predict(input_example))

        model_info = mlflow.sklearn.log_model(
            model,
            artifact_path=artifact_path,
            registered_model_name=registered_model_name,
            signature=signature,
            input_example=input_example,
        )

        return model_info.model_uri
|
|
119
|
+
|
|
120
|
+
# Usage example
|
|
121
|
+
def train_with_mlflow(
    model,
    X_train,
    y_train,
    X_val,
    y_val,
    params: dict,
    experiment_name: str = "my_experiment",
    registered_model_name: str = "my_model",
    dataset_version: str = "v1.0",
):
    """Train ``model`` inside one tracked MLflow run and return the model URI.

    Args:
        model: Estimator exposing ``fit`` and ``score``.
        X_train, y_train: Training data.
        X_val, y_val: Validation data.
        params: Hyperparameters to log; must contain ``"model_type"``.
        experiment_name: MLflow experiment to log into.
        registered_model_name: Registry name for the logged model.
        dataset_version: Value of the ``dataset_version`` run tag.

    Raises:
        KeyError: If ``params`` has no ``"model_type"`` entry.
    """
    # Fail fast on a missing key, before anything is created on the server.
    model_type = params["model_type"]

    tracker = MLflowTracker(experiment_name)
    tracker.start_run(
        run_name=f"run_{model_type}",
        tags={
            "model_type": model_type,
            "dataset_version": dataset_version,
            "author": "ml-team",
        },
    )

    try:
        # Log parameters
        tracker.log_params(params)

        # Train model
        model.fit(X_train, y_train)

        # Evaluate and log metrics
        tracker.log_metrics({
            "train_accuracy": model.score(X_train, y_train),
            "val_accuracy": model.score(X_val, y_val),
        })

        # Log model
        model_uri = tracker.log_model(
            model,
            artifact_path="model",
            registered_model_name=registered_model_name,
            input_example=X_train[:5],
        )
    except BaseException:
        # BaseException also covers KeyboardInterrupt, so an interrupted
        # training does not leave the run open.
        tracker.end_run(status="FAILED")
        raise

    # Closed outside the try block: if ending the run itself fails, it is not
    # ended a second time as FAILED.
    tracker.end_run()
    return model_uri
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
### PyTorch Model Logging
|
|
174
|
+
|
|
175
|
+
```python
|
|
176
|
+
import mlflow.pytorch
|
|
177
|
+
import torch
|
|
178
|
+
|
|
179
|
+
def log_pytorch_model(
    model: torch.nn.Module,
    artifact_path: str,
    registered_model_name: str = None,
    sample_input: torch.Tensor = None,
) -> str:
    """Log a PyTorch model to MLflow and return its model URI.

    Args:
        model: Module to log.
        artifact_path: Run-relative path under which the model is stored.
        registered_model_name: When set, also register the model under this
            name.
        sample_input: Optional example input; when given, the model is run on
            it once (without gradients) to infer the model signature.

    Note:
        When ``sample_input`` is given the model is switched to eval mode and
        is left in that mode.
    """
    # Create signature from sample input
    signature = None
    if sample_input is not None:
        # Imported lazily: only needed when a signature is inferred.
        from mlflow.models import infer_signature

        model.eval()
        with torch.no_grad():
            sample_output = model(sample_input)

        # detach().cpu() first: Tensor.numpy() raises for CUDA tensors and for
        # tensors that require grad (sample_input may be either).
        signature = infer_signature(
            sample_input.detach().cpu().numpy(),
            sample_output.detach().cpu().numpy(),
        )

    model_info = mlflow.pytorch.log_model(
        model,
        artifact_path=artifact_path,
        registered_model_name=registered_model_name,
        signature=signature,
    )

    return model_info.model_uri
|
|
208
|
+
|
|
209
|
+
def load_pytorch_model(model_uri: str, device: str = "cpu") -> torch.nn.Module:
    """Fetch the PyTorch model stored at ``model_uri``.

    ``device`` is forwarded to the loader as ``map_location``.
    """
    return mlflow.pytorch.load_model(model_uri, map_location=device)
|
|
213
|
+
```
|
|
214
|
+
|
|
215
|
+
### Model Registry Operations
|
|
216
|
+
|
|
217
|
+
```python
|
|
218
|
+
from mlflow.tracking import MlflowClient
|
|
219
|
+
from mlflow.entities.model_registry import ModelVersion
|
|
220
|
+
|
|
221
|
+
class ModelRegistry:
    """Convenience wrapper around the MLflow Model Registry."""

    def __init__(self, tracking_uri: str = "http://localhost:5000"):
        """Point MLflow at ``tracking_uri`` and create a client for it."""
        mlflow.set_tracking_uri(tracking_uri)
        self.client = MlflowClient()

    def register_model(
        self,
        model_uri: str,
        name: str,
        tags: dict = None,
        description: str = None,
    ) -> ModelVersion:
        """Register ``model_uri`` as a new version of the model ``name``.

        Args:
            model_uri: URI of the logged model.
            name: Registered model name.
            tags: Optional tags set on the new version.
            description: Optional description stored on the new version.

        Returns:
            The model version returned by ``mlflow.register_model``.
        """
        result = mlflow.register_model(model_uri, name)

        for key, value in (tags or {}).items():
            self.client.set_model_version_tag(name, result.version, key, value)

        if description:
            self.client.update_model_version(
                name,
                result.version,
                description=description,
            )

        return result

    def transition_model_stage(
        self,
        name: str,
        version: str,
        stage: str,
        archive_existing: bool = True,
    ) -> ModelVersion:
        """Transition model to new stage (Staging, Production, Archived).

        NOTE(review): recent MLflow releases deprecate model stages in favor
        of aliases. Confirm against the MLflow version in use.
        """
        return self.client.transition_model_version_stage(
            name=name,
            version=version,
            stage=stage,
            archive_existing_versions=archive_existing,
        )

    def get_latest_version(
        self,
        name: str,
        stages: list[str] = None,
    ) -> list[ModelVersion]:
        """Get latest model versions by stage."""
        return self.client.get_latest_versions(name, stages=stages)

    def load_production_model(self, name: str) -> "mlflow.pyfunc.PyFuncModel":
        """Load the version of ``name`` currently in the Production stage."""
        # The previous annotation was the builtin function ``any``, which is
        # not a type; the string form needs no extra import.
        model_uri = f"models:/{name}/Production"
        return mlflow.pyfunc.load_model(model_uri)

    def _version_summary(self, name: str, version: str) -> dict:
        """Return ``version`` with the metrics and params of its source run."""
        model_version = self.client.get_model_version(name, version)
        # A version that was not created from a run has no run_id; report
        # empty metrics/params rather than failing inside get_run().
        if not model_version.run_id:
            return {"version": version, "metrics": {}, "params": {}}

        run = self.client.get_run(model_version.run_id)
        return {
            "version": version,
            "metrics": run.data.metrics,
            "params": run.data.params,
        }

    def compare_versions(
        self,
        name: str,
        version_a: str,
        version_b: str,
    ) -> dict:
        """Compare two model versions by the metrics/params of their runs.

        Returns:
            ``{"version_a": {...}, "version_b": {...}}`` where each entry has
            the keys ``version``, ``metrics`` and ``params``.
        """
        return {
            "version_a": self._version_summary(name, version_a),
            "version_b": self._version_summary(name, version_b),
        }
|
|
304
|
+
```
|
|
305
|
+
|
|
306
|
+
---
|
|
307
|
+
|
|
308
|
+
## Weights & Biases Integration
|
|
309
|
+
|
|
310
|
+
### Basic W&B Tracking
|
|
311
|
+
|
|
312
|
+
```python
|
|
313
|
+
import wandb
|
|
314
|
+
from pathlib import Path
|
|
315
|
+
|
|
316
|
+
class WandbTracker:
    """Weights & Biases experiment tracking wrapper."""

    def __init__(
        self,
        project: str,
        entity: str = None,
        config: dict = None,
    ):
        """Remember the run settings; no run is started until ``start_run``."""
        self.project = project
        self.entity = entity
        self.config = config
        self.run = None

    def start_run(
        self,
        name: str = None,
        tags: list[str] = None,
        group: str = None,
        job_type: str = "train",
        resume: str = None,
    ) -> wandb.Run:
        """Initialize a W&B run with the stored project, entity and config."""
        self.run = wandb.init(
            project=self.project,
            entity=self.entity,
            name=name,
            config=self.config,
            tags=tags,
            group=group,
            job_type=job_type,
            resume=resume,
        )
        return self.run

    def log(self, data: dict, step: int = None, commit: bool = True) -> None:
        """Log metrics and data."""
        wandb.log(data, step=step, commit=commit)

    def _require_run(self):
        """Return the active run, or raise if ``start_run`` was never called."""
        if self.run is None:
            raise RuntimeError("No active W&B run; call start_run() first")
        return self.run

    @staticmethod
    def _build_artifact(
        name: str,
        artifact_type: str,
        path: str,
        metadata: dict = None,
    ) -> "wandb.Artifact":
        """Create an artifact and attach the file or directory at ``path``."""
        artifact = wandb.Artifact(
            name=name,
            type=artifact_type,
            metadata=metadata,
        )
        if Path(path).is_dir():
            artifact.add_dir(path)
        else:
            artifact.add_file(path)
        return artifact

    def log_artifact(
        self,
        name: str,
        artifact_type: str,
        path: str,
        metadata: dict = None,
    ) -> wandb.Artifact:
        """Log artifact (model, dataset, etc.).

        Raises:
            RuntimeError: If no run has been started.
        """
        # Check first, so a missing run fails before any artifact is built.
        run = self._require_run()
        artifact = self._build_artifact(name, artifact_type, path, metadata)
        run.log_artifact(artifact)
        return artifact

    def log_model(
        self,
        model_path: str,
        name: str,
        metadata: dict = None,
        aliases: list[str] = None,
    ) -> wandb.Artifact:
        """Log a model artifact; ``aliases`` defaults to ``["latest"]``.

        Raises:
            RuntimeError: If no run has been started.
        """
        run = self._require_run()
        artifact = self._build_artifact(name, "model", model_path, metadata)
        run.log_artifact(artifact, aliases=aliases or ["latest"])
        return artifact

    def watch_model(
        self,
        model,
        log: str = "all",
        log_freq: int = 100,
    ) -> None:
        """Watch model for gradient and parameter logging."""
        wandb.watch(model, log=log, log_freq=log_freq)

    def finish(self, exit_code: int = 0) -> None:
        """Finish the run and drop the reference to it."""
        wandb.finish(exit_code=exit_code)
        # Without this, later artifact calls would target a finished run.
        self.run = None
|
|
411
|
+
|
|
412
|
+
# Usage with PyTorch
|
|
413
|
+
def train_with_wandb(
    model: torch.nn.Module,
    train_loader,
    val_loader,
    config: dict,
):
    """Train ``model`` with W&B tracking and log the final weights.

    Args:
        model: Module to train.
        train_loader: Iterable yielding ``(data, target)`` batches.
        val_loader: Validation data passed to ``evaluate``.
        config: Run configuration; ``"model_type"`` and ``"epochs"`` are read.

    The run is finished with exit code 1 when training raises, so it is not
    left open.
    """
    tracker = WandbTracker(
        project="my-project",
        config=config,
    )

    tracker.start_run(
        name=f"experiment_{config['model_type']}",
        tags=["baseline", config["model_type"]],
        group="hyperparameter_search",
    )

    try:
        # Watch model gradients
        tracker.watch_model(model)

        # Stays None when config["epochs"] is 0 and no validation ever ran.
        val_metrics = None
        for epoch in range(config["epochs"]):
            model.train()
            for batch_idx, (data, target) in enumerate(train_loader):
                # Training step
                loss = train_step(model, data, target)

                tracker.log({
                    "train/loss": loss,
                    "train/epoch": epoch,
                })

            # Validation
            val_metrics = evaluate(model, val_loader)
            tracker.log({
                "val/loss": val_metrics["loss"],
                "val/accuracy": val_metrics["accuracy"],
                "epoch": epoch,
            })

        # Save and log model
        torch.save(model.state_dict(), "model.pt")
        metadata = {}
        if val_metrics is not None:
            metadata["accuracy"] = val_metrics["accuracy"]
        tracker.log_model(
            "model.pt",
            name="trained_model",
            metadata=metadata,
            aliases=["latest", "best"],
        )
    except BaseException:
        # Also covers KeyboardInterrupt: mark the run as failed, then re-raise.
        tracker.finish(exit_code=1)
        raise

    tracker.finish()
|
|
463
|
+
```
|
|
464
|
+
|
|
465
|
+
### W&B Sweeps for Hyperparameter Tuning
|
|
466
|
+
|
|
467
|
+
```python
|
|
468
|
+
import wandb
|
|
469
|
+
|
|
470
|
+
# Sweep definition: search strategy, the metric to optimize, the search
# space and the early-termination policy.
sweep_config = {
    "method": "bayes",  # bayes, grid, random
    "metric": {"name": "val/loss", "goal": "minimize"},
    "parameters": {
        "learning_rate": {
            "distribution": "log_uniform_values",
            "min": 1e-5,
            "max": 1e-2,
        },
        "batch_size": {"values": [16, 32, 64, 128]},
        "hidden_size": {"values": [128, 256, 512]},
        "dropout": {"distribution": "uniform", "min": 0.1, "max": 0.5},
    },
    "early_terminate": {"type": "hyperband", "min_iter": 3},
}
|
|
499
|
+
|
|
500
|
+
def sweep_train():
    """Train one sweep trial using the hyperparameters chosen by W&B.

    Reads ``hidden_size``, ``dropout``, ``learning_rate`` and ``batch_size``
    from the run config. The number of epochs comes from an optional
    ``epochs`` entry and defaults to 10.

    NOTE(review): ``build_model``, ``train_epoch``, ``evaluate``,
    ``train_dataset`` and ``val_loader`` are not defined in this snippet and
    are presumably provided at module scope. Confirm before use.
    """
    with wandb.init() as run:
        # Read the config from the run handle rather than the wandb.config global.
        config = run.config

        model = build_model(
            hidden_size=config.hidden_size,
            dropout=config.dropout,
        )

        optimizer = torch.optim.Adam(
            model.parameters(),
            lr=config.learning_rate,
        )

        train_loader = DataLoader(train_dataset, batch_size=config.batch_size)

        # Sweeps that do not tune "epochs" keep the previous fixed count of 10.
        for epoch in range(config.get("epochs", 10)):
            loss = train_epoch(model, train_loader, optimizer)
            val_loss = evaluate(model, val_loader)

            wandb.log({
                "train/loss": loss,
                "val/loss": val_loss,
                "epoch": epoch,
            })
|
|
526
|
+
|
|
527
|
+
# Run sweep: register the configuration, then let this process run 50 trials.
sweep_id = wandb.sweep(
    sweep_config,
    project="my-project",
)
wandb.agent(
    sweep_id,
    function=sweep_train,
    count=50,
)
|
|
530
|
+
```
|
|
531
|
+
|
|
532
|
+
---
|
|
533
|
+
|
|
534
|
+
## Custom Experiment Tracking
|
|
535
|
+
|
|
536
|
+
### Lightweight Tracker
|
|
537
|
+
|
|
538
|
+
```python
|
|
539
|
+
import json
|
|
540
|
+
from datetime import datetime
|
|
541
|
+
from pathlib import Path
|
|
542
|
+
from dataclasses import dataclass, field, asdict
|
|
543
|
+
from typing import Optional
|
|
544
|
+
import hashlib
|
|
545
|
+
import uuid
|
|
546
|
+
|
|
547
|
+
def _utc_now_iso() -> str:
    """Return the current UTC time as an ISO-8601 string."""
    return datetime.utcnow().isoformat()


@dataclass
class Experiment:
    """One tracked experiment: identity, inputs, results and lifecycle."""

    # Identity and inputs supplied by the caller.
    experiment_id: str
    name: str
    params: dict
    # Filled in while the experiment runs.
    metrics: dict = field(default_factory=dict)
    artifacts: list = field(default_factory=list)
    tags: dict = field(default_factory=dict)
    # Lifecycle: start_time is stamped at construction unless given explicitly.
    start_time: str = field(default_factory=_utc_now_iso)
    end_time: Optional[str] = None
    status: str = "running"

    def to_dict(self) -> dict:
        """Return all fields as a plain dict (nested containers are copied)."""
        return asdict(self)
|
|
562
|
+
|
|
563
|
+
class SimpleTracker:
    """Lightweight file-based experiment tracker.

    Every experiment gets its own directory
    ``<experiments_dir>/<experiment_id>`` holding an ``experiment.json``
    snapshot and, once artifacts are logged, an ``artifacts/`` sub-directory
    with copies of the logged files or directories.
    """

    def __init__(self, experiments_dir: str = "./experiments"):
        """Create the root directory if needed; no experiment is active yet."""
        self.experiments_dir = Path(experiments_dir)
        self.experiments_dir.mkdir(parents=True, exist_ok=True)
        self.current_experiment: Optional[Experiment] = None

    @staticmethod
    def _utc_timestamp() -> str:
        """Return the current UTC time as a naive ISO-8601 string."""
        from datetime import timezone  # local: the file only imports ``datetime``

        # ``datetime.utcnow()`` is deprecated since Python 3.12. Dropping the
        # tzinfo keeps the string format identical to what utcnow() produced.
        return datetime.now(timezone.utc).replace(tzinfo=None).isoformat()

    def start_experiment(
        self,
        name: str,
        params: dict,
        tags: Optional[dict] = None,
    ) -> Experiment:
        """Start a new experiment, persist it and make it the active one.

        Args:
            name: Human-readable experiment name.
            params: Hyperparameters to store with the experiment.
            tags: Optional tags; ``None`` is stored as an empty dict.

        Returns:
            The newly created experiment.
        """
        # IDs are only 8 hex characters, so collisions are possible. Insist on
        # creating a brand-new directory instead of silently reusing (and then
        # overwriting) the record of an existing experiment.
        while True:
            experiment_id = str(uuid.uuid4())[:8]
            try:
                (self.experiments_dir / experiment_id).mkdir()
            except FileExistsError:
                continue
            break

        self.current_experiment = Experiment(
            experiment_id=experiment_id,
            name=name,
            params=params,
            tags=tags or {},
        )
        self._save_experiment()
        return self.current_experiment

    def log_metrics(self, metrics: dict, step: Optional[int] = None) -> None:
        """Append one timestamped data point per metric and persist the result.

        Raises:
            ValueError: If no experiment is active.
        """
        if self.current_experiment is None:
            raise ValueError("No active experiment")

        history = self.current_experiment.metrics
        for key, value in metrics.items():
            history.setdefault(key, []).append({
                "value": value,
                "step": step,
                "timestamp": self._utc_timestamp(),
            })

        self._save_experiment()

    def log_artifact(self, path: str, name: Optional[str] = None) -> str:
        """Copy a file or directory into the experiment's ``artifacts`` folder.

        Logging the same artifact name again refreshes the stored copy instead
        of failing, and the artifact is recorded only once.

        Args:
            path: File or directory to copy.
            name: Name of the copy; defaults to the last component of ``path``.

        Returns:
            The destination path as a string.

        Raises:
            ValueError: If no experiment is active.
        """
        if self.current_experiment is None:
            raise ValueError("No active experiment")

        import shutil

        source = Path(path)
        artifact_name = name or source.name
        exp_dir = self.experiments_dir / self.current_experiment.experiment_id
        dest = exp_dir / "artifacts" / artifact_name
        dest.parent.mkdir(parents=True, exist_ok=True)

        if source.is_dir():
            # dirs_exist_ok: a second log of the same directory must not raise
            # FileExistsError.
            shutil.copytree(source, dest, dirs_exist_ok=True)
        else:
            shutil.copy2(source, dest)

        recorded = str(dest)
        if recorded not in self.current_experiment.artifacts:
            self.current_experiment.artifacts.append(recorded)
        self._save_experiment()

        return recorded

    def end_experiment(self, status: str = "completed") -> None:
        """Stamp status and end time, persist, and clear the active experiment.

        Does nothing when no experiment is active.
        """
        if self.current_experiment is None:
            return

        self.current_experiment.status = status
        self.current_experiment.end_time = self._utc_timestamp()
        self._save_experiment()
        self.current_experiment = None

    def _save_experiment(self) -> None:
        """Write the active experiment to ``experiment.json`` (no-op if none)."""
        if self.current_experiment is None:
            return

        exp_dir = self.experiments_dir / self.current_experiment.experiment_id
        target = exp_dir / "experiment.json"
        scratch = exp_dir / "experiment.json.tmp"
        # Serialize to a scratch file first: if json.dump fails part-way (for
        # example on a non-serializable metric value) the previously saved
        # record stays intact instead of being truncated.
        with open(scratch, "w", encoding="utf-8") as f:
            json.dump(self.current_experiment.to_dict(), f, indent=2)
        scratch.replace(target)

    def load_experiment(self, experiment_id: str) -> Experiment:
        """Load the experiment stored under ``experiment_id``."""
        exp_file = self.experiments_dir / experiment_id / "experiment.json"
        with open(exp_file, encoding="utf-8") as f:
            data = json.load(f)
        return Experiment(**data)

    def list_experiments(self, tags: Optional[dict] = None) -> list[Experiment]:
        """List stored experiments, newest first.

        Args:
            tags: When given, only experiments whose tags contain every
                key/value pair are returned.
        """
        experiments = []

        for exp_dir in self.experiments_dir.iterdir():
            # Skip stray files and directories without a saved record.
            if not exp_dir.is_dir() or not (exp_dir / "experiment.json").exists():
                continue

            exp = self.load_experiment(exp_dir.name)
            if tags and any(exp.tags.get(k) != v for k, v in tags.items()):
                continue

            experiments.append(exp)

        return sorted(experiments, key=lambda x: x.start_time, reverse=True)

    def compare_experiments(self, experiment_ids: list[str]) -> dict:
        """Map each experiment id to its name, params and last value per metric."""
        comparison = {}

        for exp_id in experiment_ids:
            exp = self.load_experiment(exp_id)
            comparison[exp_id] = {
                "name": exp.name,
                "params": exp.params,
                "final_metrics": {
                    # The last logged point is the "final" value; an empty
                    # history yields None.
                    k: v[-1]["value"] if v else None
                    for k, v in exp.metrics.items()
                },
            }

        return comparison
|
|
699
|
+
```
|
|
700
|
+
|
|
701
|
+
---
|
|
702
|
+
|
|
703
|
+
## Experiment Comparison and Analysis
|
|
704
|
+
|
|
705
|
+
### Metrics Comparison
|
|
706
|
+
|
|
707
|
+
```python
|
|
708
|
+
import pandas as pd
|
|
709
|
+
import matplotlib.pyplot as plt
|
|
710
|
+
from mlflow.tracking import MlflowClient
|
|
711
|
+
|
|
712
|
+
def compare_runs(
    experiment_name: str,
    metric_keys: list[str],
    n_runs: int = 10,
) -> pd.DataFrame:
    """Collect the most recent runs of an MLflow experiment into a DataFrame.

    Each row holds the run id, name, status and start time, followed by the
    run's parameters and the requested metrics (``None`` when a run did not
    log a metric).

    Args:
        experiment_name: Name of the MLflow experiment to inspect.
        metric_keys: Metric names to include as columns.
        n_runs: Maximum number of runs to fetch, newest first.

    Raises:
        ValueError: If no experiment named ``experiment_name`` exists.
    """
    client = MlflowClient()
    experiment = client.get_experiment_by_name(experiment_name)
    # The lookup returns None for unknown names; fail with a clear message
    # instead of an AttributeError on ``experiment.experiment_id`` below.
    if experiment is None:
        raise ValueError(f"MLflow experiment {experiment_name!r} does not exist")

    runs = client.search_runs(
        experiment_ids=[experiment.experiment_id],
        order_by=["start_time DESC"],
        max_results=n_runs,
    )

    rows = []
    for run in runs:
        row = {
            "run_id": run.info.run_id,
            "run_name": run.info.run_name,
            "status": run.info.status,
            "start_time": run.info.start_time,
        }
        # Later updates win on a key clash: params over run info, metrics over params.
        row.update(run.data.params)
        row.update({k: run.data.metrics.get(k) for k in metric_keys})
        rows.append(row)

    return pd.DataFrame(rows)
|
|
740
|
+
|
|
741
|
+
def plot_metric_comparison(
    runs_df: pd.DataFrame,
    metric: str,
    group_by: str = None,
) -> plt.Figure:
    """Draw a bar chart of one metric, one bar per run.

    Args:
        runs_df: Table of runs; must contain a ``run_name`` column and a
            column named after ``metric``.
        metric: Column to plot on the y-axis.
        group_by: Optional column name. When given, each distinct value
            is drawn as its own labelled bar series and a legend is added.

    Returns:
        The figure holding the chart.
    """
    figure, axes = plt.subplots(figsize=(10, 6))

    if not group_by:
        # Single series: every run in one bar call, no legend.
        axes.bar(runs_df["run_name"], runs_df[metric])
    else:
        for key, subset in runs_df.groupby(group_by):
            axes.bar(subset["run_name"], subset[metric], label=str(key))
        axes.legend(title=group_by)

    axes.set_xlabel("Run")
    axes.set_ylabel(metric)
    axes.set_title(f"Comparison of {metric}")

    # Slant the run names so long labels do not overlap.
    plt.xticks(rotation=45, ha="right")
    plt.tight_layout()

    return figure
|
|
763
|
+
```
|
|
764
|
+
|
|
765
|
+
---
|
|
766
|
+
|
|
767
|
+
## Best Practices
|
|
768
|
+
|
|
769
|
+
### What to Track
|
|
770
|
+
|
|
771
|
+
```python
|
|
772
|
+
# Always track the following for every run.

# Hyperparameters and provenance needed to reproduce a run.
REQUIRED_PARAMS = [
    "learning_rate", "batch_size", "epochs",
    "model_architecture", "optimizer",
    "random_seed", "dataset_version",
]

# Loss and accuracy on both the training and validation splits.
REQUIRED_METRICS = ["train_loss", "val_loss", "train_accuracy", "val_accuracy"]

# Files to store alongside the run.
REQUIRED_ARTIFACTS = ["model_checkpoint", "training_config", "requirements.txt"]

# Recommended tags (the values shown are placeholders).
RECOMMENDED_TAGS = dict(
    author="username",
    environment="dev|staging|prod",
    model_type="classification|regression|etc",
    dataset="dataset_name",
    git_commit="commit_hash",
)
|
|
804
|
+
```
|
|
805
|
+
|
|
806
|
+
### Experiment Naming Conventions
|
|
807
|
+
|
|
808
|
+
```python
|
|
809
|
+
# Good naming patterns
|
|
810
|
+
run_name = f"{model_type}_{dataset}_{timestamp}"
|
|
811
|
+
run_name = f"exp_{experiment_number:03d}_{description}"
|
|
812
|
+
run_name = f"{feature_flag}_{ablation_type}_{seed}"
|
|
813
|
+
|
|
814
|
+
# Organize with groups and tags
|
|
815
|
+
tags = {
|
|
816
|
+
"project": "recommendation_engine",
|
|
817
|
+
"sprint": "sprint_42",
|
|
818
|
+
"hypothesis": "larger_embedding_helps",
|
|
819
|
+
}
|
|
820
|
+
```
|
|
821
|
+
|
|
822
|
+
---
|
|
823
|
+
|
|
824
|
+
## Related References
|
|
825
|
+
|
|
826
|
+
- `training-pipelines.md` - Integrating tracking with training
|
|
827
|
+
- `model-validation.md` - Validating tracked models
|
|
828
|
+
- `pipeline-orchestration.md` - Tracking in automated pipelines
|
|
829
|
+
|
|
830
|
+
## Cross-Reference Skills
|
|
831
|
+
|
|
832
|
+
- **DevOps Engineer** - MLflow server deployment
|
|
833
|
+
- **Data Engineer** - Artifact storage integration
|