aigroup-workflow 2.2.1 → 2.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/commands/fix-build.md +10 -5
- package/.claude/commands/init-project.md +13 -8
- package/.claude/commands/plan.md +15 -8
- package/.claude/commands/review.md +12 -6
- package/.claude/commands/tdd.md +11 -5
- package/.claude/commands/workflow-start.md +20 -11
- package/.claude/settings.json +28 -0
- package/.codex/agents/architect.toml +207 -0
- package/.codex/agents/build-error-resolver.toml +110 -0
- package/.codex/agents/code-reviewer.toml +233 -0
- package/.codex/agents/doc-updater.toml +103 -0
- package/.codex/agents/e2e-runner.toml +103 -0
- package/.codex/agents/get-current-datetime.toml +23 -0
- package/.codex/agents/init-architect.toml +181 -0
- package/.codex/agents/planner.toml +208 -0
- package/.codex/agents/refactor-cleaner.toml +81 -0
- package/.codex/agents/rust-reviewer.toml +90 -0
- package/.codex/agents/security-reviewer.toml +104 -0
- package/.codex/agents/tdd-guide.toml +87 -0
- package/.codex/config.toml +22 -39
- package/AGENTS.md +2 -2
- package/CLAUDE.md +23 -1
- package/LICENSE +20 -20
- package/README.md +333 -333
- package/agents/a11y-architect.md +141 -141
- package/agents/architect.md +211 -211
- package/agents/build-error-resolver.md +114 -114
- package/agents/chief-of-staff.md +151 -151
- package/agents/code-architect.md +71 -71
- package/agents/code-explorer.md +69 -69
- package/agents/code-reviewer.md +237 -237
- package/agents/code-simplifier.md +47 -47
- package/agents/comment-analyzer.md +45 -45
- package/agents/conversation-analyzer.md +52 -52
- package/agents/cpp-build-resolver.md +90 -90
- package/agents/cpp-reviewer.md +72 -72
- package/agents/csharp-reviewer.md +101 -101
- package/agents/dart-build-resolver.md +201 -201
- package/agents/database-reviewer.md +91 -91
- package/agents/doc-updater.md +107 -107
- package/agents/docs-lookup.md +68 -68
- package/agents/e2e-runner.md +107 -107
- package/agents/flutter-reviewer.md +243 -243
- package/agents/gan-evaluator.md +209 -209
- package/agents/gan-generator.md +131 -131
- package/agents/gan-planner.md +99 -99
- package/agents/get-current-datetime.md +26 -26
- package/agents/go-build-resolver.md +94 -94
- package/agents/go-reviewer.md +76 -76
- package/agents/harness-optimizer.md +35 -35
- package/agents/healthcare-reviewer.md +83 -83
- package/agents/java-build-resolver.md +153 -153
- package/agents/java-reviewer.md +92 -92
- package/agents/kotlin-build-resolver.md +118 -118
- package/agents/kotlin-reviewer.md +159 -159
- package/agents/loop-operator.md +36 -36
- package/agents/opensource-forker.md +198 -198
- package/agents/opensource-packager.md +249 -249
- package/agents/opensource-sanitizer.md +188 -188
- package/agents/performance-optimizer.md +446 -446
- package/agents/planner.md +212 -212
- package/agents/pr-test-analyzer.md +45 -45
- package/agents/python-reviewer.md +98 -98
- package/agents/pytorch-build-resolver.md +120 -120
- package/agents/refactor-cleaner.md +85 -85
- package/agents/rust-build-resolver.md +148 -148
- package/agents/rust-reviewer.md +94 -94
- package/agents/security-reviewer.md +108 -108
- package/agents/seo-specialist.md +59 -59
- package/agents/silent-failure-hunter.md +50 -50
- package/agents/tdd-guide.md +91 -91
- package/agents/type-design-analyzer.md +41 -41
- package/agents/typescript-reviewer.md +112 -112
- package/cli/commands/update.mjs +1 -1
- package/cli/utils/scaffold.mjs +53 -0
- package/docs/rules/agents.md +166 -50
- package/docs/rules/cpp/coding-style.md +44 -44
- package/docs/rules/cpp/hooks.md +39 -39
- package/docs/rules/cpp/patterns.md +51 -51
- package/docs/rules/cpp/security.md +51 -51
- package/docs/rules/cpp/testing.md +44 -44
- package/docs/rules/csharp/coding-style.md +72 -72
- package/docs/rules/csharp/hooks.md +25 -25
- package/docs/rules/csharp/patterns.md +50 -50
- package/docs/rules/csharp/security.md +58 -58
- package/docs/rules/csharp/testing.md +46 -46
- package/docs/rules/dart/coding-style.md +159 -159
- package/docs/rules/dart/hooks.md +66 -66
- package/docs/rules/dart/patterns.md +261 -261
- package/docs/rules/dart/security.md +135 -135
- package/docs/rules/dart/testing.md +215 -215
- package/docs/rules/golang/coding-style.md +32 -32
- package/docs/rules/golang/hooks.md +17 -17
- package/docs/rules/golang/patterns.md +45 -45
- package/docs/rules/golang/security.md +34 -34
- package/docs/rules/golang/testing.md +31 -31
- package/docs/rules/java/coding-style.md +114 -114
- package/docs/rules/java/hooks.md +18 -18
- package/docs/rules/java/patterns.md +146 -146
- package/docs/rules/java/security.md +100 -100
- package/docs/rules/java/testing.md +131 -131
- package/docs/rules/java_zn/coding-style.md +169 -0
- package/docs/rules/java_zn/mybatis.md +102 -0
- package/docs/rules/kotlin/coding-style.md +86 -86
- package/docs/rules/kotlin/hooks.md +17 -17
- package/docs/rules/kotlin/patterns.md +146 -146
- package/docs/rules/kotlin/security.md +82 -82
- package/docs/rules/kotlin/testing.md +128 -128
- package/docs/rules/perl/coding-style.md +46 -46
- package/docs/rules/perl/hooks.md +22 -22
- package/docs/rules/perl/patterns.md +76 -76
- package/docs/rules/perl/security.md +69 -69
- package/docs/rules/perl/testing.md +54 -54
- package/docs/rules/php/coding-style.md +40 -40
- package/docs/rules/php/hooks.md +24 -24
- package/docs/rules/php/patterns.md +33 -33
- package/docs/rules/php/security.md +37 -37
- package/docs/rules/php/testing.md +39 -39
- package/docs/rules/python/coding-style.md +42 -42
- package/docs/rules/python/hooks.md +19 -19
- package/docs/rules/python/patterns.md +39 -39
- package/docs/rules/python/security.md +30 -30
- package/docs/rules/python/testing.md +38 -38
- package/docs/rules/rust/coding-style.md +151 -151
- package/docs/rules/rust/hooks.md +16 -16
- package/docs/rules/rust/patterns.md +168 -168
- package/docs/rules/rust/security.md +141 -141
- package/docs/rules/rust/testing.md +154 -154
- package/docs/rules/swift/coding-style.md +47 -47
- package/docs/rules/swift/hooks.md +20 -20
- package/docs/rules/swift/patterns.md +66 -66
- package/docs/rules/swift/security.md +33 -33
- package/docs/rules/swift/testing.md +45 -45
- package/docs/rules/typescript/coding-style.md +199 -199
- package/docs/rules/typescript/hooks.md +22 -22
- package/docs/rules/typescript/patterns.md +52 -52
- package/docs/rules/typescript/security.md +28 -28
- package/docs/rules/typescript/testing.md +18 -18
- package/docs/rules/web/coding-style.md +96 -96
- package/docs/rules/web/design-quality.md +62 -62
- package/docs/rules/web/hooks.md +120 -120
- package/docs/rules/web/patterns.md +79 -79
- package/docs/rules/web/performance.md +64 -64
- package/docs/rules/web/security.md +57 -57
- package/docs/rules/web/testing.md +55 -55
- package/docs/templates/README.md +36 -36
- package/docs/templates/ai-project-final.md +124 -124
- package/docs/templates/ai-project.md +105 -105
- package/docs/templates/api.md +157 -157
- package/docs/templates/bug.md +62 -62
- package/docs/templates/code-review.md +87 -87
- package/docs/templates/generic.md +116 -116
- package/docs/templates/implementation-plan.md +1 -1
- package/docs/templates/meeting.md +68 -68
- package/docs/templates/prd.md +98 -98
- package/docs/templates/ui.md +134 -134
- package/docs/workflow-pipeline.md +5 -5
- package/package.json +40 -39
- package/skills/SUPERPOWERS-LICENSE +21 -21
- package/skills/ai-ml/fine-tuning-expert/SKILL.md +162 -162
- package/skills/ai-ml/fine-tuning-expert/references/dataset-preparation.md +540 -540
- package/skills/ai-ml/fine-tuning-expert/references/deployment-optimization.md +673 -673
- package/skills/ai-ml/fine-tuning-expert/references/evaluation-metrics.md +597 -597
- package/skills/ai-ml/fine-tuning-expert/references/hyperparameter-tuning.md +565 -565
- package/skills/ai-ml/fine-tuning-expert/references/lora-peft.md +347 -347
- package/skills/ai-ml/ml-pipeline/SKILL.md +159 -159
- package/skills/ai-ml/ml-pipeline/references/experiment-tracking.md +833 -833
- package/skills/ai-ml/ml-pipeline/references/feature-engineering.md +631 -631
- package/skills/ai-ml/ml-pipeline/references/model-validation.md +978 -978
- package/skills/ai-ml/ml-pipeline/references/pipeline-orchestration.md +907 -907
- package/skills/ai-ml/ml-pipeline/references/training-pipelines.md +782 -782
- package/skills/ai-ml/rag-architect/SKILL.md +194 -194
- package/skills/ai-ml/rag-architect/references/chunking-strategies.md +878 -878
- package/skills/ai-ml/rag-architect/references/embedding-models.md +561 -561
- package/skills/ai-ml/rag-architect/references/rag-evaluation.md +833 -833
- package/skills/ai-ml/rag-architect/references/retrieval-optimization.md +795 -795
- package/skills/ai-ml/rag-architect/references/vector-databases.md +589 -589
- package/skills/ai-ml/spark-engineer/SKILL.md +148 -148
- package/skills/ai-ml/spark-engineer/references/partitioning-caching.md +543 -543
- package/skills/ai-ml/spark-engineer/references/performance-tuning.md +544 -544
- package/skills/ai-ml/spark-engineer/references/rdd-operations.md +599 -599
- package/skills/ai-ml/spark-engineer/references/spark-sql-dataframes.md +474 -474
- package/skills/ai-ml/spark-engineer/references/streaming-patterns.md +786 -786
- package/skills/backend/api-designer/SKILL.md +217 -217
- package/skills/backend/api-designer/references/error-handling.md +541 -541
- package/skills/backend/api-designer/references/openapi.md +824 -824
- package/skills/backend/api-designer/references/pagination.md +494 -494
- package/skills/backend/api-designer/references/rest-patterns.md +335 -335
- package/skills/backend/api-designer/references/versioning.md +391 -391
- package/skills/backend/architecture-designer/SKILL.md +117 -117
- package/skills/backend/architecture-designer/references/adr-template.md +116 -116
- package/skills/backend/architecture-designer/references/architecture-patterns.md +111 -111
- package/skills/backend/architecture-designer/references/database-selection.md +102 -102
- package/skills/backend/architecture-designer/references/nfr-checklist.md +112 -112
- package/skills/backend/architecture-designer/references/system-design.md +100 -100
- package/skills/backend/code-documenter/SKILL.md +147 -147
- package/skills/backend/code-documenter/references/api-docs-fastapi-django.md +166 -166
- package/skills/backend/code-documenter/references/api-docs-nestjs-express.md +220 -220
- package/skills/backend/code-documenter/references/coverage-reports.md +125 -125
- package/skills/backend/code-documenter/references/documentation-systems.md +333 -333
- package/skills/backend/code-documenter/references/interactive-api-docs.md +531 -531
- package/skills/backend/code-documenter/references/python-docstrings.md +121 -121
- package/skills/backend/code-documenter/references/typescript-jsdoc.md +145 -145
- package/skills/backend/code-documenter/references/user-guides-tutorials.md +530 -530
- package/skills/backend/debugging-wizard/SKILL.md +105 -105
- package/skills/backend/debugging-wizard/references/common-patterns.md +132 -132
- package/skills/backend/debugging-wizard/references/debugging-tools.md +140 -140
- package/skills/backend/debugging-wizard/references/quick-fixes.md +177 -177
- package/skills/backend/debugging-wizard/references/strategies.md +142 -142
- package/skills/backend/debugging-wizard/references/systematic-debugging.md +367 -367
- package/skills/backend/feature-forge/SKILL.md +98 -98
- package/skills/backend/feature-forge/references/acceptance-criteria.md +104 -104
- package/skills/backend/feature-forge/references/ears-syntax.md +99 -99
- package/skills/backend/feature-forge/references/interview-questions.md +150 -150
- package/skills/backend/feature-forge/references/pre-discovery-subagents.md +54 -54
- package/skills/backend/feature-forge/references/specification-template.md +103 -103
- package/skills/backend/fullstack-guardian/SKILL.md +105 -105
- package/skills/backend/fullstack-guardian/references/api-design-standards.md +307 -307
- package/skills/backend/fullstack-guardian/references/architecture-decisions.md +350 -350
- package/skills/backend/fullstack-guardian/references/backend-patterns.md +237 -237
- package/skills/backend/fullstack-guardian/references/common-patterns.md +134 -134
- package/skills/backend/fullstack-guardian/references/deliverables-checklist.md +354 -354
- package/skills/backend/fullstack-guardian/references/design-template.md +91 -91
- package/skills/backend/fullstack-guardian/references/error-handling.md +135 -135
- package/skills/backend/fullstack-guardian/references/frontend-patterns.md +340 -340
- package/skills/backend/fullstack-guardian/references/integration-patterns.md +333 -333
- package/skills/backend/fullstack-guardian/references/security-checklist.md +106 -106
- package/skills/backend/graphql-architect/SKILL.md +146 -146
- package/skills/backend/graphql-architect/references/federation.md +418 -418
- package/skills/backend/graphql-architect/references/migration-from-rest.md +1141 -1141
- package/skills/backend/graphql-architect/references/resolvers.md +425 -425
- package/skills/backend/graphql-architect/references/schema-design.md +393 -393
- package/skills/backend/graphql-architect/references/security.md +569 -569
- package/skills/backend/graphql-architect/references/subscriptions.md +510 -510
- package/skills/backend/legacy-modernizer/SKILL.md +137 -137
- package/skills/backend/legacy-modernizer/references/legacy-testing.md +381 -381
- package/skills/backend/legacy-modernizer/references/migration-strategies.md +423 -423
- package/skills/backend/legacy-modernizer/references/refactoring-patterns.md +395 -395
- package/skills/backend/legacy-modernizer/references/strangler-fig-pattern.md +281 -281
- package/skills/backend/legacy-modernizer/references/system-assessment.md +487 -487
- package/skills/backend/microservices-architect/SKILL.md +164 -164
- package/skills/backend/microservices-architect/references/communication.md +499 -499
- package/skills/backend/microservices-architect/references/data.md +721 -721
- package/skills/backend/microservices-architect/references/decomposition.md +344 -344
- package/skills/backend/microservices-architect/references/observability.md +805 -805
- package/skills/backend/microservices-architect/references/patterns.md +603 -603
- package/skills/database/database-optimizer/SKILL.md +147 -147
- package/skills/database/database-optimizer/references/index-strategies.md +331 -331
- package/skills/database/database-optimizer/references/monitoring-analysis.md +501 -501
- package/skills/database/database-optimizer/references/mysql-tuning.md +452 -452
- package/skills/database/database-optimizer/references/postgresql-tuning.md +413 -413
- package/skills/database/database-optimizer/references/query-optimization.md +251 -251
- package/skills/database/postgres-pro/SKILL.md +152 -152
- package/skills/database/postgres-pro/references/extensions.md +404 -404
- package/skills/database/postgres-pro/references/jsonb.md +321 -321
- package/skills/database/postgres-pro/references/maintenance.md +481 -481
- package/skills/database/postgres-pro/references/performance.md +265 -265
- package/skills/database/postgres-pro/references/replication.md +446 -446
- package/skills/database/sql-pro/SKILL.md +129 -129
- package/skills/database/sql-pro/references/database-design.md +402 -402
- package/skills/database/sql-pro/references/dialect-differences.md +419 -419
- package/skills/database/sql-pro/references/optimization.md +384 -384
- package/skills/database/sql-pro/references/query-patterns.md +285 -285
- package/skills/database/sql-pro/references/window-functions.md +328 -328
- package/skills/dotnet/csharp-developer/SKILL.md +125 -125
- package/skills/dotnet/csharp-developer/references/aspnet-core.md +394 -394
- package/skills/dotnet/csharp-developer/references/blazor.md +553 -553
- package/skills/dotnet/csharp-developer/references/entity-framework.md +409 -409
- package/skills/dotnet/csharp-developer/references/modern-csharp.md +248 -248
- package/skills/dotnet/csharp-developer/references/performance.md +498 -498
- package/skills/dotnet/dotnet-core-expert/SKILL.md +138 -138
- package/skills/dotnet/dotnet-core-expert/references/authentication.md +546 -546
- package/skills/dotnet/dotnet-core-expert/references/clean-architecture.md +455 -455
- package/skills/dotnet/dotnet-core-expert/references/cloud-native.md +548 -548
- package/skills/dotnet/dotnet-core-expert/references/entity-framework.md +440 -440
- package/skills/dotnet/dotnet-core-expert/references/minimal-apis.md +319 -319
- package/skills/frontend/angular-architect/SKILL.md +152 -152
- package/skills/frontend/angular-architect/references/components.md +297 -297
- package/skills/frontend/angular-architect/references/ngrx.md +401 -401
- package/skills/frontend/angular-architect/references/routing.md +361 -361
- package/skills/frontend/angular-architect/references/rxjs.md +319 -319
- package/skills/frontend/angular-architect/references/testing.md +405 -405
- package/skills/frontend/design-commands/design.md +91 -91
- package/skills/frontend/design-commands/handoff.md +97 -97
- package/skills/frontend/design-commands/prototype.md +120 -120
- package/skills/frontend/design-commands/spec.md +160 -160
- package/skills/frontend/design-commands/style.md +78 -78
- package/skills/frontend/flutter-expert/SKILL.md +138 -138
- package/skills/frontend/flutter-expert/references/bloc-state.md +259 -259
- package/skills/frontend/flutter-expert/references/gorouter-navigation.md +119 -119
- package/skills/frontend/flutter-expert/references/performance.md +99 -99
- package/skills/frontend/flutter-expert/references/project-structure.md +118 -118
- package/skills/frontend/flutter-expert/references/riverpod-state.md +130 -130
- package/skills/frontend/flutter-expert/references/widget-patterns.md +123 -123
- package/skills/frontend/nextjs-developer/SKILL.md +143 -143
- package/skills/frontend/nextjs-developer/references/app-router.md +311 -311
- package/skills/frontend/nextjs-developer/references/data-fetching.md +482 -482
- package/skills/frontend/nextjs-developer/references/deployment.md +545 -545
- package/skills/frontend/nextjs-developer/references/server-actions.md +462 -462
- package/skills/frontend/nextjs-developer/references/server-components.md +384 -384
- package/skills/frontend/react-expert/SKILL.md +149 -149
- package/skills/frontend/react-expert/references/hooks-patterns.md +162 -162
- package/skills/frontend/react-expert/references/migration-class-to-modern.md +1119 -1119
- package/skills/frontend/react-expert/references/performance.md +168 -168
- package/skills/frontend/react-expert/references/react-19-features.md +174 -174
- package/skills/frontend/react-expert/references/server-components.md +143 -143
- package/skills/frontend/react-expert/references/state-management.md +171 -171
- package/skills/frontend/react-expert/references/testing-react.md +174 -174
- package/skills/frontend/react-native-expert/SKILL.md +185 -185
- package/skills/frontend/react-native-expert/references/expo-router.md +187 -187
- package/skills/frontend/react-native-expert/references/list-optimization.md +204 -204
- package/skills/frontend/react-native-expert/references/platform-handling.md +188 -188
- package/skills/frontend/react-native-expert/references/project-structure.md +171 -171
- package/skills/frontend/react-native-expert/references/storage-hooks.md +173 -173
- package/skills/frontend/senior-frontend/SKILL.md +477 -477
- package/skills/frontend/senior-frontend/references/frontend_best_practices.md +806 -806
- package/skills/frontend/senior-frontend/references/nextjs_optimization_guide.md +724 -724
- package/skills/frontend/senior-frontend/references/react_patterns.md +746 -746
- package/skills/frontend/senior-frontend/scripts/bundle_analyzer.py +407 -407
- package/skills/frontend/senior-frontend/scripts/component_generator.py +329 -329
- package/skills/frontend/senior-frontend/scripts/frontend_scaffolder.py +1005 -1005
- package/skills/frontend/ui-ux-pro-max/SKILL.md +386 -386
- package/skills/frontend/ui-ux-pro-max/data/charts.csv +26 -26
- package/skills/frontend/ui-ux-pro-max/data/colors.csv +97 -97
- package/skills/frontend/ui-ux-pro-max/data/icons.csv +101 -101
- package/skills/frontend/ui-ux-pro-max/data/landing.csv +31 -31
- package/skills/frontend/ui-ux-pro-max/data/products.csv +96 -96
- package/skills/frontend/ui-ux-pro-max/data/react-performance.csv +45 -45
- package/skills/frontend/ui-ux-pro-max/data/stacks/astro.csv +54 -54
- package/skills/frontend/ui-ux-pro-max/data/stacks/flutter.csv +53 -53
- package/skills/frontend/ui-ux-pro-max/data/stacks/html-tailwind.csv +56 -56
- package/skills/frontend/ui-ux-pro-max/data/stacks/jetpack-compose.csv +53 -53
- package/skills/frontend/ui-ux-pro-max/data/stacks/nextjs.csv +53 -53
- package/skills/frontend/ui-ux-pro-max/data/stacks/nuxt-ui.csv +51 -51
- package/skills/frontend/ui-ux-pro-max/data/stacks/nuxtjs.csv +59 -59
- package/skills/frontend/ui-ux-pro-max/data/stacks/react-native.csv +52 -52
- package/skills/frontend/ui-ux-pro-max/data/stacks/react.csv +54 -54
- package/skills/frontend/ui-ux-pro-max/data/stacks/shadcn.csv +61 -61
- package/skills/frontend/ui-ux-pro-max/data/stacks/svelte.csv +54 -54
- package/skills/frontend/ui-ux-pro-max/data/stacks/swiftui.csv +51 -51
- package/skills/frontend/ui-ux-pro-max/data/stacks/vue.csv +50 -50
- package/skills/frontend/ui-ux-pro-max/data/styles.csv +68 -68
- package/skills/frontend/ui-ux-pro-max/data/typography.csv +57 -57
- package/skills/frontend/ui-ux-pro-max/data/ui-reasoning.csv +101 -101
- package/skills/frontend/ui-ux-pro-max/data/ux-guidelines.csv +99 -99
- package/skills/frontend/ui-ux-pro-max/data/web-interface.csv +31 -31
- package/skills/frontend/ui-ux-pro-max/scripts/core.py +253 -253
- package/skills/frontend/ui-ux-pro-max/scripts/design_system.py +1067 -1067
- package/skills/frontend/ui-ux-pro-max/scripts/search.py +114 -114
- package/skills/frontend/vue-expert/SKILL.md +98 -98
- package/skills/frontend/vue-expert/references/build-tooling.md +480 -480
- package/skills/frontend/vue-expert/references/components.md +448 -448
- package/skills/frontend/vue-expert/references/composition-api.md +299 -299
- package/skills/frontend/vue-expert/references/mobile-hybrid.md +636 -636
- package/skills/frontend/vue-expert/references/nuxt.md +669 -669
- package/skills/frontend/vue-expert/references/state-management.md +449 -449
- package/skills/frontend/vue-expert/references/typescript.md +584 -584
- package/skills/frontend/vue-expert-js/SKILL.md +167 -167
- package/skills/frontend/vue-expert-js/references/component-architecture.md +219 -219
- package/skills/frontend/vue-expert-js/references/composables-patterns.md +183 -183
- package/skills/frontend/vue-expert-js/references/jsdoc-typing.md +535 -535
- package/skills/frontend/vue-expert-js/references/state-management.md +249 -249
- package/skills/frontend/vue-expert-js/references/testing-patterns.md +237 -237
- package/skills/go-rust-cpp/cpp-pro/SKILL.md +115 -115
- package/skills/go-rust-cpp/cpp-pro/references/build-tooling.md +440 -440
- package/skills/go-rust-cpp/cpp-pro/references/concurrency.md +437 -437
- package/skills/go-rust-cpp/cpp-pro/references/memory-performance.md +397 -397
- package/skills/go-rust-cpp/cpp-pro/references/modern-cpp.md +304 -304
- package/skills/go-rust-cpp/cpp-pro/references/templates.md +357 -357
- package/skills/go-rust-cpp/golang-pro/SKILL.md +122 -122
- package/skills/go-rust-cpp/golang-pro/references/concurrency.md +329 -329
- package/skills/go-rust-cpp/golang-pro/references/generics.md +442 -442
- package/skills/go-rust-cpp/golang-pro/references/interfaces.md +432 -432
- package/skills/go-rust-cpp/golang-pro/references/project-structure.md +477 -477
- package/skills/go-rust-cpp/golang-pro/references/testing.md +451 -451
- package/skills/go-rust-cpp/rust-engineer/SKILL.md +167 -167
- package/skills/go-rust-cpp/rust-engineer/references/async.md +458 -458
- package/skills/go-rust-cpp/rust-engineer/references/error-handling.md +334 -334
- package/skills/go-rust-cpp/rust-engineer/references/ownership.md +278 -278
- package/skills/go-rust-cpp/rust-engineer/references/testing.md +470 -470
- package/skills/go-rust-cpp/rust-engineer/references/traits.md +413 -413
- package/skills/infra/cli-developer/SKILL.md +113 -113
- package/skills/infra/cli-developer/references/design-patterns.md +221 -221
- package/skills/infra/cli-developer/references/go-cli.md +540 -540
- package/skills/infra/cli-developer/references/node-cli.md +383 -383
- package/skills/infra/cli-developer/references/python-cli.md +422 -422
- package/skills/infra/cli-developer/references/ux-patterns.md +448 -448
- package/skills/infra/cloud-architect/SKILL.md +216 -216
- package/skills/infra/cloud-architect/references/aws.md +394 -394
- package/skills/infra/cloud-architect/references/azure.md +562 -562
- package/skills/infra/cloud-architect/references/cost.md +582 -582
- package/skills/infra/cloud-architect/references/gcp.md +633 -633
- package/skills/infra/cloud-architect/references/multi-cloud.md +483 -483
- package/skills/infra/devops-engineer/SKILL.md +144 -144
- package/skills/infra/devops-engineer/references/deployment-strategies.md +241 -241
- package/skills/infra/devops-engineer/references/docker-patterns.md +113 -113
- package/skills/infra/devops-engineer/references/github-actions.md +139 -139
- package/skills/infra/devops-engineer/references/incident-response.md +331 -331
- package/skills/infra/devops-engineer/references/kubernetes.md +154 -154
- package/skills/infra/devops-engineer/references/platform-engineering.md +417 -417
- package/skills/infra/devops-engineer/references/release-automation.md +527 -527
- package/skills/infra/devops-engineer/references/terraform-iac.md +141 -141
- package/skills/infra/kubernetes-specialist/SKILL.md +241 -241
- package/skills/infra/kubernetes-specialist/references/configuration.md +452 -452
- package/skills/infra/kubernetes-specialist/references/cost-optimization.md +458 -458
- package/skills/infra/kubernetes-specialist/references/custom-operators.md +563 -563
- package/skills/infra/kubernetes-specialist/references/gitops.md +530 -530
- package/skills/infra/kubernetes-specialist/references/helm-charts.md +912 -912
- package/skills/infra/kubernetes-specialist/references/multi-cluster.md +507 -507
- package/skills/infra/kubernetes-specialist/references/networking.md +447 -447
- package/skills/infra/kubernetes-specialist/references/service-mesh.md +459 -459
- package/skills/infra/kubernetes-specialist/references/storage.md +535 -535
- package/skills/infra/kubernetes-specialist/references/troubleshooting.md +414 -414
- package/skills/infra/kubernetes-specialist/references/workloads.md +377 -377
- package/skills/infra/mcp-developer/SKILL.md +143 -143
- package/skills/infra/mcp-developer/references/protocol.md +244 -244
- package/skills/infra/mcp-developer/references/python-sdk.md +367 -367
- package/skills/infra/mcp-developer/references/resources.md +554 -554
- package/skills/infra/mcp-developer/references/tools.md +480 -480
- package/skills/infra/mcp-developer/references/typescript-sdk.md +350 -350
- package/skills/infra/monitoring-expert/SKILL.md +176 -176
- package/skills/infra/monitoring-expert/references/alerting-rules.md +141 -141
- package/skills/infra/monitoring-expert/references/application-profiling.md +331 -331
- package/skills/infra/monitoring-expert/references/capacity-planning.md +344 -344
- package/skills/infra/monitoring-expert/references/dashboards.md +126 -126
- package/skills/infra/monitoring-expert/references/opentelemetry.md +123 -123
- package/skills/infra/monitoring-expert/references/performance-testing.md +269 -269
- package/skills/infra/monitoring-expert/references/prometheus-metrics.md +136 -136
- package/skills/infra/monitoring-expert/references/structured-logging.md +142 -142
- package/skills/infra/sre-engineer/SKILL.md +181 -181
- package/skills/infra/sre-engineer/references/automation-toil.md +492 -492
- package/skills/infra/sre-engineer/references/error-budget-policy.md +334 -334
- package/skills/infra/sre-engineer/references/incident-chaos.md +576 -576
- package/skills/infra/sre-engineer/references/monitoring-alerting.md +424 -424
- package/skills/infra/sre-engineer/references/slo-sli-management.md +238 -238
- package/skills/infra/terraform-engineer/SKILL.md +143 -143
- package/skills/infra/terraform-engineer/references/best-practices.md +583 -583
- package/skills/infra/terraform-engineer/references/module-patterns.md +297 -297
- package/skills/infra/terraform-engineer/references/providers.md +452 -452
- package/skills/infra/terraform-engineer/references/state-management.md +371 -371
- package/skills/infra/terraform-engineer/references/testing.md +486 -486
- package/skills/infra/websocket-engineer/SKILL.md +168 -168
- package/skills/infra/websocket-engineer/references/alternatives.md +391 -391
- package/skills/infra/websocket-engineer/references/patterns.md +400 -400
- package/skills/infra/websocket-engineer/references/protocol.md +195 -195
- package/skills/infra/websocket-engineer/references/scaling.md +333 -333
- package/skills/infra/websocket-engineer/references/security.md +474 -474
- package/skills/java/java-architect/SKILL.md +132 -132
- package/skills/java/java-architect/references/jpa-optimization.md +393 -393
- package/skills/java/java-architect/references/reactive-webflux.md +356 -356
- package/skills/java/java-architect/references/spring-boot-setup.md +269 -269
- package/skills/java/java-architect/references/spring-security.md +445 -445
- package/skills/java/java-architect/references/testing-patterns.md +500 -500
- package/skills/java/kotlin-specialist/SKILL.md +147 -147
- package/skills/java/kotlin-specialist/references/android-compose.md +419 -419
- package/skills/java/kotlin-specialist/references/coroutines-flow.md +276 -276
- package/skills/java/kotlin-specialist/references/dsl-idioms.md +421 -421
- package/skills/java/kotlin-specialist/references/ktor-server.md +426 -426
- package/skills/java/kotlin-specialist/references/multiplatform-kmp.md +380 -380
- package/skills/java/spring-boot-engineer/SKILL.md +196 -195
- package/skills/java/spring-boot-engineer/references/cloud.md +498 -498
- package/skills/java/spring-boot-engineer/references/data.md +381 -381
- package/skills/java/spring-boot-engineer/references/mybatis-plus.md +592 -0
- package/skills/java/spring-boot-engineer/references/security.md +459 -459
- package/skills/java/spring-boot-engineer/references/testing.md +545 -545
- package/skills/java/spring-boot-engineer/references/web.md +295 -295
- package/skills/java/spring-boot-engineer_zn/SKILL.md +129 -0
- package/skills/java/spring-boot-engineer_zn/references/architecture.md +23 -0
- package/skills/java/spring-boot-engineer_zn/references/concurrency.md +9 -0
- package/skills/java/spring-boot-engineer_zn/references/exception-logging.md +31 -0
- package/skills/java/spring-boot-engineer_zn/references/persistence.md +13 -0
- package/skills/java/spring-boot-engineer_zn/references/pojo-lombok.md +48 -0
- package/skills/java/spring-boot-engineer_zn/references/security.md +9 -0
- package/skills/java/spring-boot-engineer_zn/references/testing.md +7 -0
- package/skills/java/spring-boot-engineer_zn/references/validation.md +80 -0
- package/skills/javascript/javascript-pro/SKILL.md +132 -132
- package/skills/javascript/javascript-pro/references/async-patterns.md +334 -334
- package/skills/javascript/javascript-pro/references/browser-apis.md +398 -398
- package/skills/javascript/javascript-pro/references/modern-syntax.md +272 -272
- package/skills/javascript/javascript-pro/references/modules.md +357 -357
- package/skills/javascript/javascript-pro/references/node-essentials.md +471 -471
- package/skills/javascript/nestjs-expert/SKILL.md +206 -206
- package/skills/javascript/nestjs-expert/references/authentication.md +166 -166
- package/skills/javascript/nestjs-expert/references/controllers-routing.md +111 -111
- package/skills/javascript/nestjs-expert/references/dtos-validation.md +153 -153
- package/skills/javascript/nestjs-expert/references/migration-from-express.md +1237 -1237
- package/skills/javascript/nestjs-expert/references/services-di.md +140 -140
- package/skills/javascript/nestjs-expert/references/testing-patterns.md +186 -186
- package/skills/javascript/typescript-pro/SKILL.md +145 -145
- package/skills/javascript/typescript-pro/references/advanced-types.md +259 -259
- package/skills/javascript/typescript-pro/references/configuration.md +445 -445
- package/skills/javascript/typescript-pro/references/patterns.md +484 -484
- package/skills/javascript/typescript-pro/references/type-guards.md +352 -352
- package/skills/javascript/typescript-pro/references/utility-types.md +329 -329
- package/skills/php/laravel-specialist/SKILL.md +262 -262
- package/skills/php/laravel-specialist/references/eloquent.md +351 -351
- package/skills/php/laravel-specialist/references/livewire.md +512 -512
- package/skills/php/laravel-specialist/references/queues.md +423 -423
- package/skills/php/laravel-specialist/references/routing.md +362 -362
- package/skills/php/laravel-specialist/references/testing.md +522 -522
- package/skills/php/php-pro/SKILL.md +206 -206
- package/skills/php/php-pro/references/async-patterns.md +412 -412
- package/skills/php/php-pro/references/laravel-patterns.md +377 -377
- package/skills/php/php-pro/references/modern-php-features.md +323 -323
- package/skills/php/php-pro/references/symfony-patterns.md +466 -466
- package/skills/php/php-pro/references/testing-quality.md +466 -466
- package/skills/product/competitive-analysis/SKILL.md +257 -257
- package/skills/product/meeting-notes/SKILL.md +266 -266
- package/skills/product/prd-template/SKILL.md +150 -150
- package/skills/product/stakeholder-update/SKILL.md +225 -225
- package/skills/product/user-research-synthesis/SKILL.md +235 -235
- package/skills/python/django-expert/SKILL.md +162 -162
- package/skills/python/django-expert/references/authentication.md +145 -145
- package/skills/python/django-expert/references/drf-serializers.md +148 -148
- package/skills/python/django-expert/references/models-orm.md +151 -151
- package/skills/python/django-expert/references/testing-django.md +204 -204
- package/skills/python/django-expert/references/viewsets-views.md +153 -153
- package/skills/python/fastapi-expert/SKILL.md +185 -185
- package/skills/python/fastapi-expert/references/async-sqlalchemy.md +146 -146
- package/skills/python/fastapi-expert/references/authentication.md +159 -159
- package/skills/python/fastapi-expert/references/endpoints-routing.md +142 -142
- package/skills/python/fastapi-expert/references/migration-from-django.md +996 -996
- package/skills/python/fastapi-expert/references/pydantic-v2.md +135 -135
- package/skills/python/fastapi-expert/references/testing-async.md +159 -159
- package/skills/python/pandas-pro/SKILL.md +178 -178
- package/skills/python/pandas-pro/references/aggregation-groupby.md +545 -545
- package/skills/python/pandas-pro/references/data-cleaning.md +500 -500
- package/skills/python/pandas-pro/references/dataframe-operations.md +420 -420
- package/skills/python/pandas-pro/references/merging-joining.md +596 -596
- package/skills/python/pandas-pro/references/performance-optimization.md +597 -597
- package/skills/python/python-pro/SKILL.md +177 -177
- package/skills/python/python-pro/references/async-patterns.md +356 -356
- package/skills/python/python-pro/references/packaging.md +460 -460
- package/skills/python/python-pro/references/standard-library.md +378 -378
- package/skills/python/python-pro/references/testing.md +404 -404
- package/skills/python/python-pro/references/type-system.md +290 -290
- package/skills/quality/chaos-engineer/SKILL.md +182 -182
- package/skills/quality/chaos-engineer/references/chaos-tools.md +511 -511
- package/skills/quality/chaos-engineer/references/experiment-design.md +229 -229
- package/skills/quality/chaos-engineer/references/game-days.md +434 -434
- package/skills/quality/chaos-engineer/references/infrastructure-chaos.md +348 -348
- package/skills/quality/chaos-engineer/references/kubernetes-chaos.md +432 -432
- package/skills/quality/code-reviewer/SKILL.md +119 -119
- package/skills/quality/code-reviewer/references/common-issues.md +142 -142
- package/skills/quality/code-reviewer/references/feedback-examples.md +144 -144
- package/skills/quality/code-reviewer/references/receiving-feedback.md +238 -238
- package/skills/quality/code-reviewer/references/report-template.md +109 -109
- package/skills/quality/code-reviewer/references/review-checklist.md +88 -88
- package/skills/quality/code-reviewer/references/spec-compliance-review.md +258 -258
- package/skills/quality/playwright-expert/SKILL.md +169 -169
- package/skills/quality/playwright-expert/references/api-mocking.md +140 -140
- package/skills/quality/playwright-expert/references/configuration.md +155 -155
- package/skills/quality/playwright-expert/references/debugging-flaky.md +150 -150
- package/skills/quality/playwright-expert/references/page-object-model.md +152 -152
- package/skills/quality/playwright-expert/references/selectors-locators.md +119 -119
- package/skills/quality/secure-code-guardian/SKILL.md +191 -191
- package/skills/quality/secure-code-guardian/references/authentication.md +136 -136
- package/skills/quality/secure-code-guardian/references/input-validation.md +146 -146
- package/skills/quality/secure-code-guardian/references/owasp-prevention.md +135 -135
- package/skills/quality/secure-code-guardian/references/security-headers.md +133 -133
- package/skills/quality/secure-code-guardian/references/xss-csrf.md +157 -157
- package/skills/quality/security-reviewer/SKILL.md +103 -103
- package/skills/quality/security-reviewer/references/infrastructure-security.md +268 -268
- package/skills/quality/security-reviewer/references/penetration-testing.md +268 -268
- package/skills/quality/security-reviewer/references/report-template.md +170 -170
- package/skills/quality/security-reviewer/references/sast-tools.md +117 -117
- package/skills/quality/security-reviewer/references/secret-scanning.md +125 -125
- package/skills/quality/security-reviewer/references/vulnerability-patterns.md +152 -152
- package/skills/quality/senior-qa/README.md +196 -196
- package/skills/quality/senior-qa/SKILL.md +399 -399
- package/skills/quality/senior-qa/references/qa_best_practices.md +964 -964
- package/skills/quality/senior-qa/references/test_automation_patterns.md +1009 -1009
- package/skills/quality/senior-qa/references/testing_strategies.md +649 -649
- package/skills/quality/senior-qa/scripts/coverage_analyzer.py +836 -836
- package/skills/quality/senior-qa/scripts/e2e_test_scaffolder.py +820 -820
- package/skills/quality/senior-qa/scripts/test_suite_generator.py +605 -605
- package/skills/quality/tdd-guide/HOW_TO_USE.md +313 -313
- package/skills/quality/tdd-guide/README.md +680 -680
- package/skills/quality/tdd-guide/SKILL.md +122 -122
- package/skills/quality/tdd-guide/assets/expected_output.json +77 -77
- package/skills/quality/tdd-guide/assets/sample_input_python.json +39 -39
- package/skills/quality/tdd-guide/assets/sample_input_typescript.json +36 -36
- package/skills/quality/tdd-guide/references/ci-integration.md +195 -195
- package/skills/quality/tdd-guide/references/framework-guide.md +206 -206
- package/skills/quality/tdd-guide/references/tdd-best-practices.md +128 -128
- package/skills/quality/tdd-guide/scripts/coverage_analyzer.py +434 -434
- package/skills/quality/tdd-guide/scripts/fixture_generator.py +440 -440
- package/skills/quality/tdd-guide/scripts/format_detector.py +384 -384
- package/skills/quality/tdd-guide/scripts/framework_adapter.py +428 -428
- package/skills/quality/tdd-guide/scripts/metrics_calculator.py +456 -456
- package/skills/quality/tdd-guide/scripts/output_formatter.py +354 -354
- package/skills/quality/tdd-guide/scripts/tdd_workflow.py +474 -474
- package/skills/quality/tdd-guide/scripts/test_generator.py +438 -438
- package/skills/quality/test-master/SKILL.md +94 -94
- package/skills/quality/test-master/references/automation-frameworks.md +294 -294
- package/skills/quality/test-master/references/e2e-testing.md +128 -128
- package/skills/quality/test-master/references/integration-testing.md +120 -120
- package/skills/quality/test-master/references/performance-testing.md +118 -118
- package/skills/quality/test-master/references/qa-methodology.md +247 -247
- package/skills/quality/test-master/references/security-testing.md +127 -127
- package/skills/quality/test-master/references/tdd-iron-laws.md +174 -174
- package/skills/quality/test-master/references/test-reports.md +104 -104
- package/skills/quality/test-master/references/testing-anti-patterns.md +231 -231
- package/skills/quality/test-master/references/unit-testing.md +113 -113
- package/skills/ruby/rails-expert/SKILL.md +154 -154
- package/skills/ruby/rails-expert/references/active-record.md +244 -244
- package/skills/ruby/rails-expert/references/api-development.md +401 -401
- package/skills/ruby/rails-expert/references/background-jobs.md +272 -272
- package/skills/ruby/rails-expert/references/hotwire-turbo.md +228 -228
- package/skills/ruby/rails-expert/references/rspec-testing.md +367 -367
- package/skills/swift/swift-expert/SKILL.md +163 -163
- package/skills/swift/swift-expert/references/async-concurrency.md +360 -360
- package/skills/swift/swift-expert/references/memory-performance.md +377 -377
- package/skills/swift/swift-expert/references/protocol-oriented.md +354 -354
- package/skills/swift/swift-expert/references/swiftui-patterns.md +291 -291
- package/skills/swift/swift-expert/references/testing-patterns.md +399 -399
- package/skills/workflow/brainstorming/SKILL.md +164 -164
- package/skills/workflow/brainstorming/scripts/frame-template.html +214 -214
- package/skills/workflow/brainstorming/scripts/helper.js +88 -88
- package/skills/workflow/brainstorming/scripts/server.cjs +354 -354
- package/skills/workflow/brainstorming/scripts/start-server.sh +148 -148
- package/skills/workflow/brainstorming/scripts/stop-server.sh +56 -56
- package/skills/workflow/brainstorming/spec-document-reviewer-prompt.md +49 -49
- package/skills/workflow/brainstorming/visual-companion.md +287 -287
- package/skills/workflow/documentation/SKILL.md +45 -45
- package/skills/workflow/entropy-management/SKILL.md +115 -115
- package/skills/workflow/executing-plans/SKILL.md +70 -70
- package/skills/workflow/finishing-a-development-branch/SKILL.md +200 -200
- package/skills/workflow/receiving-code-review/SKILL.md +213 -213
- package/skills/workflow/requesting-code-review/SKILL.md +105 -105
- package/skills/workflow/requesting-code-review/code-reviewer.md +146 -146
- package/skills/workflow/requirement-engineering/SKILL.md +111 -111
- package/skills/workflow/systematic-debugging/CREATION-LOG.md +119 -119
- package/skills/workflow/systematic-debugging/SKILL.md +296 -296
- package/skills/workflow/systematic-debugging/condition-based-waiting-example.ts +158 -158
- package/skills/workflow/systematic-debugging/condition-based-waiting.md +115 -115
- package/skills/workflow/systematic-debugging/defense-in-depth.md +122 -122
- package/skills/workflow/systematic-debugging/find-polluter.sh +63 -63
- package/skills/workflow/systematic-debugging/root-cause-tracing.md +169 -169
- package/skills/workflow/systematic-debugging/test-academic.md +14 -14
- package/skills/workflow/systematic-debugging/test-pressure-1.md +58 -58
- package/skills/workflow/systematic-debugging/test-pressure-2.md +68 -68
- package/skills/workflow/systematic-debugging/test-pressure-3.md +69 -69
- package/skills/workflow/using-git-worktrees/SKILL.md +218 -218
- package/skills/workflow/verification-before-completion/SKILL.md +139 -139
- package/skills/workflow/writing-plans/SKILL.md +151 -151
- package/skills/workflow/writing-plans/plan-document-reviewer-prompt.md +49 -49
- package/skills/workflow/writing-skills/SKILL.md +655 -655
- package/skills/workflow/writing-skills/anthropic-best-practices.md +1150 -1150
- package/skills/workflow/writing-skills/examples/CLAUDE_MD_TESTING.md +189 -189
- package/skills/workflow/writing-skills/persuasion-principles.md +187 -187
- package/skills/workflow/writing-skills/render-graphs.js +168 -168
- package/skills/workflow/writing-skills/testing-skills-with-subagents.md +384 -384
|
@@ -1,833 +1,833 @@
|
|
|
1
|
-
# Experiment Tracking
|
|
2
|
-
|
|
3
|
-
---
|
|
4
|
-
|
|
5
|
-
## Overview
|
|
6
|
-
|
|
7
|
-
Experiment tracking enables reproducibility, comparison, and collaboration in ML development. It captures hyperparameters, metrics, artifacts, and model versions to ensure every experiment can be reproduced and compared.
|
|
8
|
-
|
|
9
|
-
## When to Use This Reference
|
|
10
|
-
|
|
11
|
-
- Setting up MLflow for experiment tracking
|
|
12
|
-
- Implementing Weights & Biases integration
|
|
13
|
-
- Creating model registries and versioning
|
|
14
|
-
- Comparing experiments and selecting models
|
|
15
|
-
- Building custom tracking solutions
|
|
16
|
-
|
|
17
|
-
## When NOT to Use
|
|
18
|
-
|
|
19
|
-
- Quick one-off experiments without reproducibility needs
|
|
20
|
-
- Simple scripts without hyperparameters
|
|
21
|
-
- Non-ML projects
|
|
22
|
-
|
|
23
|
-
---
|
|
24
|
-
|
|
25
|
-
## MLflow Integration
|
|
26
|
-
|
|
27
|
-
### Basic Experiment Tracking
|
|
28
|
-
|
|
29
|
-
```python
|
|
30
|
-
import mlflow
|
|
31
|
-
from mlflow.tracking import MlflowClient
|
|
32
|
-
from pathlib import Path
|
|
33
|
-
import json
|
|
34
|
-
|
|
35
|
-
class MLflowTracker:
|
|
36
|
-
"""MLflow experiment tracking wrapper."""
|
|
37
|
-
|
|
38
|
-
def __init__(
|
|
39
|
-
self,
|
|
40
|
-
experiment_name: str,
|
|
41
|
-
tracking_uri: str = "http://localhost:5000",
|
|
42
|
-
artifact_location: str = None,
|
|
43
|
-
):
|
|
44
|
-
mlflow.set_tracking_uri(tracking_uri)
|
|
45
|
-
|
|
46
|
-
# Create or get experiment
|
|
47
|
-
experiment = mlflow.get_experiment_by_name(experiment_name)
|
|
48
|
-
if experiment is None:
|
|
49
|
-
self.experiment_id = mlflow.create_experiment(
|
|
50
|
-
experiment_name,
|
|
51
|
-
artifact_location=artifact_location,
|
|
52
|
-
)
|
|
53
|
-
else:
|
|
54
|
-
self.experiment_id = experiment.experiment_id
|
|
55
|
-
|
|
56
|
-
mlflow.set_experiment(experiment_name)
|
|
57
|
-
self.client = MlflowClient()
|
|
58
|
-
self.run = None
|
|
59
|
-
|
|
60
|
-
def start_run(
|
|
61
|
-
self,
|
|
62
|
-
run_name: str = None,
|
|
63
|
-
tags: dict = None,
|
|
64
|
-
nested: bool = False,
|
|
65
|
-
) -> str:
|
|
66
|
-
"""Start a new MLflow run."""
|
|
67
|
-
self.run = mlflow.start_run(
|
|
68
|
-
run_name=run_name,
|
|
69
|
-
experiment_id=self.experiment_id,
|
|
70
|
-
nested=nested,
|
|
71
|
-
)
|
|
72
|
-
|
|
73
|
-
if tags:
|
|
74
|
-
mlflow.set_tags(tags)
|
|
75
|
-
|
|
76
|
-
return self.run.info.run_id
|
|
77
|
-
|
|
78
|
-
def end_run(self, status: str = "FINISHED") -> None:
|
|
79
|
-
"""End the current run."""
|
|
80
|
-
mlflow.end_run(status=status)
|
|
81
|
-
self.run = None
|
|
82
|
-
|
|
83
|
-
def log_params(self, params: dict) -> None:
|
|
84
|
-
"""Log hyperparameters."""
|
|
85
|
-
mlflow.log_params(params)
|
|
86
|
-
|
|
87
|
-
def log_metrics(self, metrics: dict, step: int = None) -> None:
|
|
88
|
-
"""Log metrics with optional step."""
|
|
89
|
-
for key, value in metrics.items():
|
|
90
|
-
mlflow.log_metric(key, value, step=step)
|
|
91
|
-
|
|
92
|
-
def log_artifact(self, local_path: str, artifact_path: str = None) -> None:
|
|
93
|
-
"""Log file or directory as artifact."""
|
|
94
|
-
mlflow.log_artifact(local_path, artifact_path)
|
|
95
|
-
|
|
96
|
-
def log_model(
|
|
97
|
-
self,
|
|
98
|
-
model,
|
|
99
|
-
artifact_path: str,
|
|
100
|
-
registered_model_name: str = None,
|
|
101
|
-
signature=None,
|
|
102
|
-
input_example=None,
|
|
103
|
-
) -> str:
|
|
104
|
-
"""Log model with optional registration."""
|
|
105
|
-
from mlflow.models import infer_signature
|
|
106
|
-
|
|
107
|
-
if signature is None and input_example is not None:
|
|
108
|
-
signature = infer_signature(input_example, model.predict(input_example))
|
|
109
|
-
|
|
110
|
-
model_info = mlflow.sklearn.log_model(
|
|
111
|
-
model,
|
|
112
|
-
artifact_path=artifact_path,
|
|
113
|
-
registered_model_name=registered_model_name,
|
|
114
|
-
signature=signature,
|
|
115
|
-
input_example=input_example,
|
|
116
|
-
)
|
|
117
|
-
|
|
118
|
-
return model_info.model_uri
|
|
119
|
-
|
|
120
|
-
# Usage example
|
|
121
|
-
def train_with_mlflow(
|
|
122
|
-
model,
|
|
123
|
-
X_train,
|
|
124
|
-
y_train,
|
|
125
|
-
X_val,
|
|
126
|
-
y_val,
|
|
127
|
-
params: dict,
|
|
128
|
-
):
|
|
129
|
-
"""Complete training run with MLflow tracking."""
|
|
130
|
-
tracker = MLflowTracker("my_experiment")
|
|
131
|
-
|
|
132
|
-
tracker.start_run(
|
|
133
|
-
run_name=f"run_{params['model_type']}",
|
|
134
|
-
tags={
|
|
135
|
-
"model_type": params["model_type"],
|
|
136
|
-
"dataset_version": "v1.0",
|
|
137
|
-
"author": "ml-team",
|
|
138
|
-
},
|
|
139
|
-
)
|
|
140
|
-
|
|
141
|
-
try:
|
|
142
|
-
# Log parameters
|
|
143
|
-
tracker.log_params(params)
|
|
144
|
-
|
|
145
|
-
# Train model
|
|
146
|
-
model.fit(X_train, y_train)
|
|
147
|
-
|
|
148
|
-
# Evaluate and log metrics
|
|
149
|
-
train_score = model.score(X_train, y_train)
|
|
150
|
-
val_score = model.score(X_val, y_val)
|
|
151
|
-
|
|
152
|
-
tracker.log_metrics({
|
|
153
|
-
"train_accuracy": train_score,
|
|
154
|
-
"val_accuracy": val_score,
|
|
155
|
-
})
|
|
156
|
-
|
|
157
|
-
# Log model
|
|
158
|
-
model_uri = tracker.log_model(
|
|
159
|
-
model,
|
|
160
|
-
artifact_path="model",
|
|
161
|
-
registered_model_name="my_model",
|
|
162
|
-
input_example=X_train[:5],
|
|
163
|
-
)
|
|
164
|
-
|
|
165
|
-
tracker.end_run()
|
|
166
|
-
return model_uri
|
|
167
|
-
|
|
168
|
-
except Exception as e:
|
|
169
|
-
tracker.end_run(status="FAILED")
|
|
170
|
-
raise
|
|
171
|
-
```
|
|
172
|
-
|
|
173
|
-
### PyTorch Model Logging
|
|
174
|
-
|
|
175
|
-
```python
|
|
176
|
-
import mlflow.pytorch
|
|
177
|
-
import torch
|
|
178
|
-
|
|
179
|
-
def log_pytorch_model(
|
|
180
|
-
model: torch.nn.Module,
|
|
181
|
-
artifact_path: str,
|
|
182
|
-
registered_model_name: str = None,
|
|
183
|
-
sample_input: torch.Tensor = None,
|
|
184
|
-
) -> str:
|
|
185
|
-
"""Log PyTorch model with signature inference."""
|
|
186
|
-
from mlflow.models import infer_signature
|
|
187
|
-
|
|
188
|
-
# Create signature from sample input
|
|
189
|
-
signature = None
|
|
190
|
-
if sample_input is not None:
|
|
191
|
-
model.eval()
|
|
192
|
-
with torch.no_grad():
|
|
193
|
-
sample_output = model(sample_input)
|
|
194
|
-
|
|
195
|
-
signature = infer_signature(
|
|
196
|
-
sample_input.numpy(),
|
|
197
|
-
sample_output.numpy(),
|
|
198
|
-
)
|
|
199
|
-
|
|
200
|
-
model_info = mlflow.pytorch.log_model(
|
|
201
|
-
model,
|
|
202
|
-
artifact_path=artifact_path,
|
|
203
|
-
registered_model_name=registered_model_name,
|
|
204
|
-
signature=signature,
|
|
205
|
-
)
|
|
206
|
-
|
|
207
|
-
return model_info.model_uri
|
|
208
|
-
|
|
209
|
-
def load_pytorch_model(model_uri: str, device: str = "cpu") -> torch.nn.Module:
|
|
210
|
-
"""Load PyTorch model from MLflow."""
|
|
211
|
-
model = mlflow.pytorch.load_model(model_uri, map_location=device)
|
|
212
|
-
return model
|
|
213
|
-
```
|
|
214
|
-
|
|
215
|
-
### Model Registry Operations
|
|
216
|
-
|
|
217
|
-
```python
|
|
218
|
-
from mlflow.tracking import MlflowClient
|
|
219
|
-
from mlflow.entities.model_registry import ModelVersion
|
|
220
|
-
|
|
221
|
-
class ModelRegistry:
|
|
222
|
-
"""MLflow Model Registry wrapper."""
|
|
223
|
-
|
|
224
|
-
def __init__(self, tracking_uri: str = "http://localhost:5000"):
|
|
225
|
-
mlflow.set_tracking_uri(tracking_uri)
|
|
226
|
-
self.client = MlflowClient()
|
|
227
|
-
|
|
228
|
-
def register_model(
|
|
229
|
-
self,
|
|
230
|
-
model_uri: str,
|
|
231
|
-
name: str,
|
|
232
|
-
tags: dict = None,
|
|
233
|
-
description: str = None,
|
|
234
|
-
) -> ModelVersion:
|
|
235
|
-
"""Register a new model version."""
|
|
236
|
-
result = mlflow.register_model(model_uri, name)
|
|
237
|
-
|
|
238
|
-
if tags:
|
|
239
|
-
for key, value in tags.items():
|
|
240
|
-
self.client.set_model_version_tag(name, result.version, key, value)
|
|
241
|
-
|
|
242
|
-
if description:
|
|
243
|
-
self.client.update_model_version(
|
|
244
|
-
name,
|
|
245
|
-
result.version,
|
|
246
|
-
description=description,
|
|
247
|
-
)
|
|
248
|
-
|
|
249
|
-
return result
|
|
250
|
-
|
|
251
|
-
def transition_model_stage(
|
|
252
|
-
self,
|
|
253
|
-
name: str,
|
|
254
|
-
version: str,
|
|
255
|
-
stage: str,
|
|
256
|
-
archive_existing: bool = True,
|
|
257
|
-
) -> ModelVersion:
|
|
258
|
-
"""Transition model to new stage (Staging, Production, Archived)."""
|
|
259
|
-
return self.client.transition_model_version_stage(
|
|
260
|
-
name=name,
|
|
261
|
-
version=version,
|
|
262
|
-
stage=stage,
|
|
263
|
-
archive_existing_versions=archive_existing,
|
|
264
|
-
)
|
|
265
|
-
|
|
266
|
-
def get_latest_version(
|
|
267
|
-
self,
|
|
268
|
-
name: str,
|
|
269
|
-
stages: list[str] = None,
|
|
270
|
-
) -> list[ModelVersion]:
|
|
271
|
-
"""Get latest model versions by stage."""
|
|
272
|
-
return self.client.get_latest_versions(name, stages=stages)
|
|
273
|
-
|
|
274
|
-
def load_production_model(self, name: str) -> any:
|
|
275
|
-
"""Load the production model."""
|
|
276
|
-
model_uri = f"models:/{name}/Production"
|
|
277
|
-
return mlflow.pyfunc.load_model(model_uri)
|
|
278
|
-
|
|
279
|
-
def compare_versions(
|
|
280
|
-
self,
|
|
281
|
-
name: str,
|
|
282
|
-
version_a: str,
|
|
283
|
-
version_b: str,
|
|
284
|
-
) -> dict:
|
|
285
|
-
"""Compare two model versions."""
|
|
286
|
-
v_a = self.client.get_model_version(name, version_a)
|
|
287
|
-
v_b = self.client.get_model_version(name, version_b)
|
|
288
|
-
|
|
289
|
-
run_a = self.client.get_run(v_a.run_id)
|
|
290
|
-
run_b = self.client.get_run(v_b.run_id)
|
|
291
|
-
|
|
292
|
-
return {
|
|
293
|
-
"version_a": {
|
|
294
|
-
"version": version_a,
|
|
295
|
-
"metrics": run_a.data.metrics,
|
|
296
|
-
"params": run_a.data.params,
|
|
297
|
-
},
|
|
298
|
-
"version_b": {
|
|
299
|
-
"version": version_b,
|
|
300
|
-
"metrics": run_b.data.metrics,
|
|
301
|
-
"params": run_b.data.params,
|
|
302
|
-
},
|
|
303
|
-
}
|
|
304
|
-
```
|
|
305
|
-
|
|
306
|
-
---
|
|
307
|
-
|
|
308
|
-
## Weights & Biases Integration
|
|
309
|
-
|
|
310
|
-
### Basic W&B Tracking
|
|
311
|
-
|
|
312
|
-
```python
|
|
313
|
-
import wandb
|
|
314
|
-
from pathlib import Path
|
|
315
|
-
|
|
316
|
-
class WandbTracker:
|
|
317
|
-
"""Weights & Biases experiment tracking wrapper."""
|
|
318
|
-
|
|
319
|
-
def __init__(
|
|
320
|
-
self,
|
|
321
|
-
project: str,
|
|
322
|
-
entity: str = None,
|
|
323
|
-
config: dict = None,
|
|
324
|
-
):
|
|
325
|
-
self.project = project
|
|
326
|
-
self.entity = entity
|
|
327
|
-
self.config = config
|
|
328
|
-
self.run = None
|
|
329
|
-
|
|
330
|
-
def start_run(
|
|
331
|
-
self,
|
|
332
|
-
name: str = None,
|
|
333
|
-
tags: list[str] = None,
|
|
334
|
-
group: str = None,
|
|
335
|
-
job_type: str = "train",
|
|
336
|
-
resume: str = None,
|
|
337
|
-
) -> wandb.Run:
|
|
338
|
-
"""Initialize W&B run."""
|
|
339
|
-
self.run = wandb.init(
|
|
340
|
-
project=self.project,
|
|
341
|
-
entity=self.entity,
|
|
342
|
-
name=name,
|
|
343
|
-
config=self.config,
|
|
344
|
-
tags=tags,
|
|
345
|
-
group=group,
|
|
346
|
-
job_type=job_type,
|
|
347
|
-
resume=resume,
|
|
348
|
-
)
|
|
349
|
-
return self.run
|
|
350
|
-
|
|
351
|
-
def log(self, data: dict, step: int = None, commit: bool = True) -> None:
|
|
352
|
-
"""Log metrics and data."""
|
|
353
|
-
wandb.log(data, step=step, commit=commit)
|
|
354
|
-
|
|
355
|
-
def log_artifact(
|
|
356
|
-
self,
|
|
357
|
-
name: str,
|
|
358
|
-
artifact_type: str,
|
|
359
|
-
path: str,
|
|
360
|
-
metadata: dict = None,
|
|
361
|
-
) -> wandb.Artifact:
|
|
362
|
-
"""Log artifact (model, dataset, etc.)."""
|
|
363
|
-
artifact = wandb.Artifact(
|
|
364
|
-
name=name,
|
|
365
|
-
type=artifact_type,
|
|
366
|
-
metadata=metadata,
|
|
367
|
-
)
|
|
368
|
-
|
|
369
|
-
if Path(path).is_dir():
|
|
370
|
-
artifact.add_dir(path)
|
|
371
|
-
else:
|
|
372
|
-
artifact.add_file(path)
|
|
373
|
-
|
|
374
|
-
self.run.log_artifact(artifact)
|
|
375
|
-
return artifact
|
|
376
|
-
|
|
377
|
-
def log_model(
|
|
378
|
-
self,
|
|
379
|
-
model_path: str,
|
|
380
|
-
name: str,
|
|
381
|
-
metadata: dict = None,
|
|
382
|
-
aliases: list[str] = None,
|
|
383
|
-
) -> wandb.Artifact:
|
|
384
|
-
"""Log model artifact with aliases."""
|
|
385
|
-
artifact = wandb.Artifact(
|
|
386
|
-
name=name,
|
|
387
|
-
type="model",
|
|
388
|
-
metadata=metadata,
|
|
389
|
-
)
|
|
390
|
-
|
|
391
|
-
if Path(model_path).is_dir():
|
|
392
|
-
artifact.add_dir(model_path)
|
|
393
|
-
else:
|
|
394
|
-
artifact.add_file(model_path)
|
|
395
|
-
|
|
396
|
-
self.run.log_artifact(artifact, aliases=aliases or ["latest"])
|
|
397
|
-
return artifact
|
|
398
|
-
|
|
399
|
-
def watch_model(
|
|
400
|
-
self,
|
|
401
|
-
model,
|
|
402
|
-
log: str = "all",
|
|
403
|
-
log_freq: int = 100,
|
|
404
|
-
) -> None:
|
|
405
|
-
"""Watch model for gradient and parameter logging."""
|
|
406
|
-
wandb.watch(model, log=log, log_freq=log_freq)
|
|
407
|
-
|
|
408
|
-
def finish(self, exit_code: int = 0) -> None:
|
|
409
|
-
"""Finish the run."""
|
|
410
|
-
wandb.finish(exit_code=exit_code)
|
|
411
|
-
|
|
412
|
-
# Usage with PyTorch
|
|
413
|
-
def train_with_wandb(
|
|
414
|
-
model: torch.nn.Module,
|
|
415
|
-
train_loader,
|
|
416
|
-
val_loader,
|
|
417
|
-
config: dict,
|
|
418
|
-
):
|
|
419
|
-
"""Training with W&B tracking."""
|
|
420
|
-
tracker = WandbTracker(
|
|
421
|
-
project="my-project",
|
|
422
|
-
config=config,
|
|
423
|
-
)
|
|
424
|
-
|
|
425
|
-
tracker.start_run(
|
|
426
|
-
name=f"experiment_{config['model_type']}",
|
|
427
|
-
tags=["baseline", config["model_type"]],
|
|
428
|
-
group="hyperparameter_search",
|
|
429
|
-
)
|
|
430
|
-
|
|
431
|
-
# Watch model gradients
|
|
432
|
-
tracker.watch_model(model)
|
|
433
|
-
|
|
434
|
-
for epoch in range(config["epochs"]):
|
|
435
|
-
model.train()
|
|
436
|
-
for batch_idx, (data, target) in enumerate(train_loader):
|
|
437
|
-
# Training step
|
|
438
|
-
loss = train_step(model, data, target)
|
|
439
|
-
|
|
440
|
-
tracker.log({
|
|
441
|
-
"train/loss": loss,
|
|
442
|
-
"train/epoch": epoch,
|
|
443
|
-
})
|
|
444
|
-
|
|
445
|
-
# Validation
|
|
446
|
-
val_metrics = evaluate(model, val_loader)
|
|
447
|
-
tracker.log({
|
|
448
|
-
"val/loss": val_metrics["loss"],
|
|
449
|
-
"val/accuracy": val_metrics["accuracy"],
|
|
450
|
-
"epoch": epoch,
|
|
451
|
-
})
|
|
452
|
-
|
|
453
|
-
# Save and log model
|
|
454
|
-
torch.save(model.state_dict(), "model.pt")
|
|
455
|
-
tracker.log_model(
|
|
456
|
-
"model.pt",
|
|
457
|
-
name="trained_model",
|
|
458
|
-
metadata={"accuracy": val_metrics["accuracy"]},
|
|
459
|
-
aliases=["latest", "best"],
|
|
460
|
-
)
|
|
461
|
-
|
|
462
|
-
tracker.finish()
|
|
463
|
-
```
|
|
464
|
-
|
|
465
|
-
### W&B Sweeps for Hyperparameter Tuning
|
|
466
|
-
|
|
467
|
-
```python
|
|
468
|
-
import wandb
|
|
469
|
-
|
|
470
|
-
sweep_config = {
|
|
471
|
-
"method": "bayes", # bayes, grid, random
|
|
472
|
-
"metric": {
|
|
473
|
-
"name": "val/loss",
|
|
474
|
-
"goal": "minimize",
|
|
475
|
-
},
|
|
476
|
-
"parameters": {
|
|
477
|
-
"learning_rate": {
|
|
478
|
-
"distribution": "log_uniform_values",
|
|
479
|
-
"min": 1e-5,
|
|
480
|
-
"max": 1e-2,
|
|
481
|
-
},
|
|
482
|
-
"batch_size": {
|
|
483
|
-
"values": [16, 32, 64, 128],
|
|
484
|
-
},
|
|
485
|
-
"hidden_size": {
|
|
486
|
-
"values": [128, 256, 512],
|
|
487
|
-
},
|
|
488
|
-
"dropout": {
|
|
489
|
-
"distribution": "uniform",
|
|
490
|
-
"min": 0.1,
|
|
491
|
-
"max": 0.5,
|
|
492
|
-
},
|
|
493
|
-
},
|
|
494
|
-
"early_terminate": {
|
|
495
|
-
"type": "hyperband",
|
|
496
|
-
"min_iter": 3,
|
|
497
|
-
},
|
|
498
|
-
}
|
|
499
|
-
|
|
500
|
-
def sweep_train():
|
|
501
|
-
"""Training function for sweep."""
|
|
502
|
-
with wandb.init() as run:
|
|
503
|
-
config = wandb.config
|
|
504
|
-
|
|
505
|
-
model = build_model(
|
|
506
|
-
hidden_size=config.hidden_size,
|
|
507
|
-
dropout=config.dropout,
|
|
508
|
-
)
|
|
509
|
-
|
|
510
|
-
optimizer = torch.optim.Adam(
|
|
511
|
-
model.parameters(),
|
|
512
|
-
lr=config.learning_rate,
|
|
513
|
-
)
|
|
514
|
-
|
|
515
|
-
train_loader = DataLoader(train_dataset, batch_size=config.batch_size)
|
|
516
|
-
|
|
517
|
-
for epoch in range(10):
|
|
518
|
-
loss = train_epoch(model, train_loader, optimizer)
|
|
519
|
-
val_loss = evaluate(model, val_loader)
|
|
520
|
-
|
|
521
|
-
wandb.log({
|
|
522
|
-
"train/loss": loss,
|
|
523
|
-
"val/loss": val_loss,
|
|
524
|
-
"epoch": epoch,
|
|
525
|
-
})
|
|
526
|
-
|
|
527
|
-
# Run sweep
|
|
528
|
-
sweep_id = wandb.sweep(sweep_config, project="my-project")
|
|
529
|
-
wandb.agent(sweep_id, function=sweep_train, count=50)
|
|
530
|
-
```
|
|
531
|
-
|
|
532
|
-
---
|
|
533
|
-
|
|
534
|
-
## Custom Experiment Tracking
|
|
535
|
-
|
|
536
|
-
### Lightweight Tracker
|
|
537
|
-
|
|
538
|
-
```python
|
|
539
|
-
import json
|
|
540
|
-
from datetime import datetime
|
|
541
|
-
from pathlib import Path
|
|
542
|
-
from dataclasses import dataclass, field, asdict
|
|
543
|
-
from typing import Optional
|
|
544
|
-
import hashlib
|
|
545
|
-
import uuid
|
|
546
|
-
|
|
547
|
-
@dataclass
|
|
548
|
-
class Experiment:
|
|
549
|
-
"""Experiment metadata and results."""
|
|
550
|
-
experiment_id: str
|
|
551
|
-
name: str
|
|
552
|
-
params: dict
|
|
553
|
-
metrics: dict = field(default_factory=dict)
|
|
554
|
-
artifacts: list = field(default_factory=list)
|
|
555
|
-
tags: dict = field(default_factory=dict)
|
|
556
|
-
start_time: str = field(default_factory=lambda: datetime.utcnow().isoformat())
|
|
557
|
-
end_time: Optional[str] = None
|
|
558
|
-
status: str = "running"
|
|
559
|
-
|
|
560
|
-
def to_dict(self) -> dict:
|
|
561
|
-
return asdict(self)
|
|
562
|
-
|
|
563
|
-
class SimpleTracker:
|
|
564
|
-
"""Lightweight file-based experiment tracker."""
|
|
565
|
-
|
|
566
|
-
def __init__(self, experiments_dir: str = "./experiments"):
|
|
567
|
-
self.experiments_dir = Path(experiments_dir)
|
|
568
|
-
self.experiments_dir.mkdir(parents=True, exist_ok=True)
|
|
569
|
-
self.current_experiment: Optional[Experiment] = None
|
|
570
|
-
|
|
571
|
-
def start_experiment(
|
|
572
|
-
self,
|
|
573
|
-
name: str,
|
|
574
|
-
params: dict,
|
|
575
|
-
tags: dict = None,
|
|
576
|
-
) -> Experiment:
|
|
577
|
-
"""Start a new experiment."""
|
|
578
|
-
experiment_id = str(uuid.uuid4())[:8]
|
|
579
|
-
|
|
580
|
-
self.current_experiment = Experiment(
|
|
581
|
-
experiment_id=experiment_id,
|
|
582
|
-
name=name,
|
|
583
|
-
params=params,
|
|
584
|
-
tags=tags or {},
|
|
585
|
-
)
|
|
586
|
-
|
|
587
|
-
# Create experiment directory
|
|
588
|
-
exp_dir = self.experiments_dir / experiment_id
|
|
589
|
-
exp_dir.mkdir(exist_ok=True)
|
|
590
|
-
|
|
591
|
-
self._save_experiment()
|
|
592
|
-
return self.current_experiment
|
|
593
|
-
|
|
594
|
-
def log_metrics(self, metrics: dict, step: int = None) -> None:
|
|
595
|
-
"""Log metrics to current experiment."""
|
|
596
|
-
if self.current_experiment is None:
|
|
597
|
-
raise ValueError("No active experiment")
|
|
598
|
-
|
|
599
|
-
for key, value in metrics.items():
|
|
600
|
-
if key not in self.current_experiment.metrics:
|
|
601
|
-
self.current_experiment.metrics[key] = []
|
|
602
|
-
|
|
603
|
-
self.current_experiment.metrics[key].append({
|
|
604
|
-
"value": value,
|
|
605
|
-
"step": step,
|
|
606
|
-
"timestamp": datetime.utcnow().isoformat(),
|
|
607
|
-
})
|
|
608
|
-
|
|
609
|
-
self._save_experiment()
|
|
610
|
-
|
|
611
|
-
def log_artifact(self, path: str, name: str = None) -> str:
|
|
612
|
-
"""Copy artifact to experiment directory."""
|
|
613
|
-
if self.current_experiment is None:
|
|
614
|
-
raise ValueError("No active experiment")
|
|
615
|
-
|
|
616
|
-
import shutil
|
|
617
|
-
|
|
618
|
-
source = Path(path)
|
|
619
|
-
artifact_name = name or source.name
|
|
620
|
-
exp_dir = self.experiments_dir / self.current_experiment.experiment_id
|
|
621
|
-
dest = exp_dir / "artifacts" / artifact_name
|
|
622
|
-
|
|
623
|
-
dest.parent.mkdir(parents=True, exist_ok=True)
|
|
624
|
-
|
|
625
|
-
if source.is_dir():
|
|
626
|
-
shutil.copytree(source, dest)
|
|
627
|
-
else:
|
|
628
|
-
shutil.copy2(source, dest)
|
|
629
|
-
|
|
630
|
-
self.current_experiment.artifacts.append(str(dest))
|
|
631
|
-
self._save_experiment()
|
|
632
|
-
|
|
633
|
-
return str(dest)
|
|
634
|
-
|
|
635
|
-
def end_experiment(self, status: str = "completed") -> None:
|
|
636
|
-
"""End current experiment."""
|
|
637
|
-
if self.current_experiment is None:
|
|
638
|
-
return
|
|
639
|
-
|
|
640
|
-
self.current_experiment.status = status
|
|
641
|
-
self.current_experiment.end_time = datetime.utcnow().isoformat()
|
|
642
|
-
self._save_experiment()
|
|
643
|
-
self.current_experiment = None
|
|
644
|
-
|
|
645
|
-
def _save_experiment(self) -> None:
|
|
646
|
-
"""Save experiment to JSON file."""
|
|
647
|
-
if self.current_experiment is None:
|
|
648
|
-
return
|
|
649
|
-
|
|
650
|
-
exp_dir = self.experiments_dir / self.current_experiment.experiment_id
|
|
651
|
-
with open(exp_dir / "experiment.json", "w") as f:
|
|
652
|
-
json.dump(self.current_experiment.to_dict(), f, indent=2)
|
|
653
|
-
|
|
654
|
-
def load_experiment(self, experiment_id: str) -> Experiment:
|
|
655
|
-
"""Load experiment by ID."""
|
|
656
|
-
exp_file = self.experiments_dir / experiment_id / "experiment.json"
|
|
657
|
-
with open(exp_file) as f:
|
|
658
|
-
data = json.load(f)
|
|
659
|
-
return Experiment(**data)
|
|
660
|
-
|
|
661
|
-
def list_experiments(self, tags: dict = None) -> list[Experiment]:
|
|
662
|
-
"""List all experiments, optionally filtered by tags."""
|
|
663
|
-
experiments = []
|
|
664
|
-
|
|
665
|
-
for exp_dir in self.experiments_dir.iterdir():
|
|
666
|
-
if not exp_dir.is_dir():
|
|
667
|
-
continue
|
|
668
|
-
|
|
669
|
-
exp_file = exp_dir / "experiment.json"
|
|
670
|
-
if not exp_file.exists():
|
|
671
|
-
continue
|
|
672
|
-
|
|
673
|
-
exp = self.load_experiment(exp_dir.name)
|
|
674
|
-
|
|
675
|
-
if tags:
|
|
676
|
-
if not all(exp.tags.get(k) == v for k, v in tags.items()):
|
|
677
|
-
continue
|
|
678
|
-
|
|
679
|
-
experiments.append(exp)
|
|
680
|
-
|
|
681
|
-
return sorted(experiments, key=lambda x: x.start_time, reverse=True)
|
|
682
|
-
|
|
683
|
-
def compare_experiments(self, experiment_ids: list[str]) -> dict:
|
|
684
|
-
"""Compare metrics across experiments."""
|
|
685
|
-
comparison = {}
|
|
686
|
-
|
|
687
|
-
for exp_id in experiment_ids:
|
|
688
|
-
exp = self.load_experiment(exp_id)
|
|
689
|
-
comparison[exp_id] = {
|
|
690
|
-
"name": exp.name,
|
|
691
|
-
"params": exp.params,
|
|
692
|
-
"final_metrics": {
|
|
693
|
-
k: v[-1]["value"] if v else None
|
|
694
|
-
for k, v in exp.metrics.items()
|
|
695
|
-
},
|
|
696
|
-
}
|
|
697
|
-
|
|
698
|
-
return comparison
|
|
699
|
-
```
|
|
700
|
-
|
|
701
|
-
---
|
|
702
|
-
|
|
703
|
-
## Experiment Comparison and Analysis
|
|
704
|
-
|
|
705
|
-
### Metrics Comparison
|
|
706
|
-
|
|
707
|
-
```python
|
|
708
|
-
import pandas as pd
|
|
709
|
-
import matplotlib.pyplot as plt
|
|
710
|
-
from mlflow.tracking import MlflowClient
|
|
711
|
-
|
|
712
|
-
def compare_runs(
|
|
713
|
-
experiment_name: str,
|
|
714
|
-
metric_keys: list[str],
|
|
715
|
-
n_runs: int = 10,
|
|
716
|
-
) -> pd.DataFrame:
|
|
717
|
-
"""Compare recent runs in an experiment."""
|
|
718
|
-
client = MlflowClient()
|
|
719
|
-
experiment = client.get_experiment_by_name(experiment_name)
|
|
720
|
-
|
|
721
|
-
runs = client.search_runs(
|
|
722
|
-
experiment_ids=[experiment.experiment_id],
|
|
723
|
-
order_by=["start_time DESC"],
|
|
724
|
-
max_results=n_runs,
|
|
725
|
-
)
|
|
726
|
-
|
|
727
|
-
data = []
|
|
728
|
-
for run in runs:
|
|
729
|
-
row = {
|
|
730
|
-
"run_id": run.info.run_id,
|
|
731
|
-
"run_name": run.info.run_name,
|
|
732
|
-
"status": run.info.status,
|
|
733
|
-
"start_time": run.info.start_time,
|
|
734
|
-
}
|
|
735
|
-
row.update(run.data.params)
|
|
736
|
-
row.update({k: run.data.metrics.get(k) for k in metric_keys})
|
|
737
|
-
data.append(row)
|
|
738
|
-
|
|
739
|
-
return pd.DataFrame(data)
|
|
740
|
-
|
|
741
|
-
def plot_metric_comparison(
|
|
742
|
-
runs_df: pd.DataFrame,
|
|
743
|
-
metric: str,
|
|
744
|
-
group_by: str = None,
|
|
745
|
-
) -> plt.Figure:
|
|
746
|
-
"""Plot metric comparison across runs."""
|
|
747
|
-
fig, ax = plt.subplots(figsize=(10, 6))
|
|
748
|
-
|
|
749
|
-
if group_by:
|
|
750
|
-
for group, group_df in runs_df.groupby(group_by):
|
|
751
|
-
ax.bar(group_df["run_name"], group_df[metric], label=str(group))
|
|
752
|
-
ax.legend(title=group_by)
|
|
753
|
-
else:
|
|
754
|
-
ax.bar(runs_df["run_name"], runs_df[metric])
|
|
755
|
-
|
|
756
|
-
ax.set_xlabel("Run")
|
|
757
|
-
ax.set_ylabel(metric)
|
|
758
|
-
ax.set_title(f"Comparison of {metric}")
|
|
759
|
-
plt.xticks(rotation=45, ha="right")
|
|
760
|
-
plt.tight_layout()
|
|
761
|
-
|
|
762
|
-
return fig
|
|
763
|
-
```
|
|
764
|
-
|
|
765
|
-
---
|
|
766
|
-
|
|
767
|
-
## Best Practices
|
|
768
|
-
|
|
769
|
-
### What to Track
|
|
770
|
-
|
|
771
|
-
```python
|
|
772
|
-
# Always track:
|
|
773
|
-
REQUIRED_PARAMS = [
|
|
774
|
-
"learning_rate",
|
|
775
|
-
"batch_size",
|
|
776
|
-
"epochs",
|
|
777
|
-
"model_architecture",
|
|
778
|
-
"optimizer",
|
|
779
|
-
"random_seed",
|
|
780
|
-
"dataset_version",
|
|
781
|
-
]
|
|
782
|
-
|
|
783
|
-
REQUIRED_METRICS = [
|
|
784
|
-
"train_loss",
|
|
785
|
-
"val_loss",
|
|
786
|
-
"train_accuracy",
|
|
787
|
-
"val_accuracy",
|
|
788
|
-
]
|
|
789
|
-
|
|
790
|
-
REQUIRED_ARTIFACTS = [
|
|
791
|
-
"model_checkpoint",
|
|
792
|
-
"training_config",
|
|
793
|
-
"requirements.txt",
|
|
794
|
-
]
|
|
795
|
-
|
|
796
|
-
# Recommended tags
|
|
797
|
-
RECOMMENDED_TAGS = {
|
|
798
|
-
"author": "username",
|
|
799
|
-
"environment": "dev|staging|prod",
|
|
800
|
-
"model_type": "classification|regression|etc",
|
|
801
|
-
"dataset": "dataset_name",
|
|
802
|
-
"git_commit": "commit_hash",
|
|
803
|
-
}
|
|
804
|
-
```
|
|
805
|
-
|
|
806
|
-
### Experiment Naming Conventions
|
|
807
|
-
|
|
808
|
-
```python
|
|
809
|
-
# Good naming patterns
|
|
810
|
-
run_name = f"{model_type}_{dataset}_{timestamp}"
|
|
811
|
-
run_name = f"exp_{experiment_number:03d}_{description}"
|
|
812
|
-
run_name = f"{feature_flag}_{ablation_type}_{seed}"
|
|
813
|
-
|
|
814
|
-
# Organize with groups and tags
|
|
815
|
-
tags = {
|
|
816
|
-
"project": "recommendation_engine",
|
|
817
|
-
"sprint": "sprint_42",
|
|
818
|
-
"hypothesis": "larger_embedding_helps",
|
|
819
|
-
}
|
|
820
|
-
```
|
|
821
|
-
|
|
822
|
-
---
|
|
823
|
-
|
|
824
|
-
## Related References
|
|
825
|
-
|
|
826
|
-
- `training-pipelines.md` - Integrating tracking with training
|
|
827
|
-
- `model-validation.md` - Validating tracked models
|
|
828
|
-
- `pipeline-orchestration.md` - Tracking in automated pipelines
|
|
829
|
-
|
|
830
|
-
## Cross-Reference Skills
|
|
831
|
-
|
|
832
|
-
- **DevOps Engineer** - MLflow server deployment
|
|
833
|
-
- **Data Engineer** - Artifact storage integration
|
|
1
|
+
# Experiment Tracking
|
|
2
|
+
|
|
3
|
+
---
|
|
4
|
+
|
|
5
|
+
## Overview
|
|
6
|
+
|
|
7
|
+
Experiment tracking enables reproducibility, comparison, and collaboration in ML development. It captures hyperparameters, metrics, artifacts, and model versions to ensure every experiment can be reproduced and compared.
|
|
8
|
+
|
|
9
|
+
## When to Use This Reference
|
|
10
|
+
|
|
11
|
+
- Setting up MLflow for experiment tracking
|
|
12
|
+
- Implementing Weights & Biases integration
|
|
13
|
+
- Creating model registries and versioning
|
|
14
|
+
- Comparing experiments and selecting models
|
|
15
|
+
- Building custom tracking solutions
|
|
16
|
+
|
|
17
|
+
## When NOT to Use
|
|
18
|
+
|
|
19
|
+
- Quick one-off experiments without reproducibility needs
|
|
20
|
+
- Simple scripts without hyperparameters
|
|
21
|
+
- Non-ML projects
|
|
22
|
+
|
|
23
|
+
---
|
|
24
|
+
|
|
25
|
+
## MLflow Integration
|
|
26
|
+
|
|
27
|
+
### Basic Experiment Tracking
|
|
28
|
+
|
|
29
|
+
```python
|
|
30
|
+
import mlflow
|
|
31
|
+
from mlflow.tracking import MlflowClient
|
|
32
|
+
from pathlib import Path
|
|
33
|
+
import json
|
|
34
|
+
|
|
35
|
+
class MLflowTracker:
    """Thin wrapper around the MLflow fluent tracking API.

    The constructor points MLflow at ``tracking_uri``, resolves (or creates)
    the named experiment and makes it the active one. The logging methods
    delegate to the module-level ``mlflow`` functions, which act on MLflow's
    currently active run, so call ``start_run`` before logging.
    """

    def __init__(
        self,
        experiment_name: str,
        tracking_uri: str = "http://localhost:5000",
        artifact_location: str = None,
    ):
        """Configure the tracking server and select the experiment.

        Args:
            experiment_name: Name of the experiment to use; created if missing.
            tracking_uri: URI of the MLflow tracking server.
            artifact_location: Artifact root, used only when the experiment
                has to be created.
        """
        mlflow.set_tracking_uri(tracking_uri)

        # Create or get experiment
        experiment = mlflow.get_experiment_by_name(experiment_name)
        if experiment is None:
            self.experiment_id = mlflow.create_experiment(
                experiment_name,
                artifact_location=artifact_location,
            )
        else:
            self.experiment_id = experiment.experiment_id

        mlflow.set_experiment(experiment_name)
        # Bind the client to the same server explicitly instead of relying on
        # whatever the process-global tracking URI is when it is first used.
        self.client = MlflowClient(tracking_uri=tracking_uri)
        self.run = None

    def start_run(
        self,
        run_name: str = None,
        tags: dict = None,
        nested: bool = False,
    ) -> str:
        """Start a new MLflow run in this tracker's experiment.

        Args:
            run_name: Optional display name for the run.
            tags: Optional tags applied to the run right after it starts.
            nested: Passed through to ``mlflow.start_run``.

        Returns:
            The id of the run that was started.
        """
        self.run = mlflow.start_run(
            run_name=run_name,
            experiment_id=self.experiment_id,
            nested=nested,
        )

        if tags:
            mlflow.set_tags(tags)

        return self.run.info.run_id

    def end_run(self, status: str = "FINISHED") -> None:
        """End the current run with ``status`` and forget the run handle."""
        mlflow.end_run(status=status)
        self.run = None

    def log_params(self, params: dict) -> None:
        """Log hyperparameters."""
        mlflow.log_params(params)

    def log_metrics(self, metrics: dict, step: int = None) -> None:
        """Log all ``metrics`` at the optional ``step`` in one batched call."""
        if not metrics:
            return
        # One batched call instead of one log_metric call per key.
        mlflow.log_metrics(metrics, step=step)

    def log_artifact(self, local_path: str, artifact_path: str = None) -> None:
        """Log the file at ``local_path`` under the optional ``artifact_path``."""
        mlflow.log_artifact(local_path, artifact_path)

    def log_model(
        self,
        model,
        artifact_path: str,
        registered_model_name: str = None,
        signature=None,
        input_example=None,
    ) -> str:
        """Log ``model`` with the scikit-learn flavor, optionally registering it.

        When no ``signature`` is given but an ``input_example`` is, the
        signature is inferred from the example and ``model.predict`` on it.

        Args:
            model: Model object handed to ``mlflow.sklearn.log_model``; it must
                provide ``predict`` if the signature is to be inferred.
            artifact_path: Run-relative path under which the model is stored.
            registered_model_name: If set, forwarded so the model is also
                registered under this name.
            signature: Explicit model signature; inferred when omitted.
            input_example: Sample input stored with the model.

        Returns:
            The URI of the logged model.
        """
        if signature is None and input_example is not None:
            # Imported lazily: only needed when a signature must be inferred.
            from mlflow.models import infer_signature

            signature = infer_signature(input_example, model.predict(input_example))

        model_info = mlflow.sklearn.log_model(
            model,
            artifact_path=artifact_path,
            registered_model_name=registered_model_name,
            signature=signature,
            input_example=input_example,
        )

        return model_info.model_uri
|
|
119
|
+
|
|
120
|
+
# Usage example
|
|
121
|
+
def train_with_mlflow(
    model,
    X_train,
    y_train,
    X_val,
    y_val,
    params: dict,
    experiment_name: str = "my_experiment",
    registered_model_name: str = "my_model",
):
    """Fit ``model`` and record the run with ``MLflowTracker``.

    Logs ``params``, the train/validation ``model.score`` values and the
    fitted model, then ends the run. If anything fails after the run has
    started, the run is ended with status ``FAILED`` and the exception is
    re-raised.

    Args:
        model: Estimator exposing ``fit``, ``score`` and ``predict``.
        X_train, y_train: Training data passed to ``fit`` and ``score``.
        X_val, y_val: Validation data passed to ``score``.
        params: Hyperparameters to log; must contain ``"model_type"``.
        experiment_name: Experiment to log into.
        registered_model_name: Registry name the model is logged under.

    Returns:
        The URI of the logged model.
    """
    tracker = MLflowTracker(experiment_name)

    tracker.start_run(
        run_name=f"run_{params['model_type']}",
        tags={
            "model_type": params["model_type"],
            "dataset_version": "v1.0",
            "author": "ml-team",
        },
    )

    try:
        # Log parameters
        tracker.log_params(params)

        # Train model
        model.fit(X_train, y_train)

        # Evaluate and log metrics
        train_score = model.score(X_train, y_train)
        val_score = model.score(X_val, y_val)

        tracker.log_metrics({
            "train_accuracy": train_score,
            "val_accuracy": val_score,
        })

        # Log model
        model_uri = tracker.log_model(
            model,
            artifact_path="model",
            registered_model_name=registered_model_name,
            input_example=X_train[:5],
        )
    except Exception:
        tracker.end_run(status="FAILED")
        raise
    else:
        # Closed outside the try body so a failure while closing the run does
        # not trigger a second end_run(status="FAILED") call.
        tracker.end_run()
        return model_uri
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
### PyTorch Model Logging
|
|
174
|
+
|
|
175
|
+
```python
|
|
176
|
+
import mlflow.pytorch
|
|
177
|
+
import torch
|
|
178
|
+
|
|
179
|
+
def log_pytorch_model(
    model: torch.nn.Module,
    artifact_path: str,
    registered_model_name: str = None,
    sample_input: torch.Tensor = None,
) -> str:
    """Log a PyTorch model to MLflow, inferring a signature when possible.

    When ``sample_input`` is given, the model is run on it once in eval mode
    without gradients, and the input/output pair is used to infer the model
    signature. The train/eval mode of every submodule is restored afterwards.

    Args:
        model: Module to log.
        artifact_path: Run-relative path under which the model is stored.
        registered_model_name: If set, forwarded so the model is also
            registered under this name.
        sample_input: Example input tensor used for signature inference.

    Returns:
        The URI of the logged model.
    """
    # Create signature from sample input
    signature = None
    if sample_input is not None:
        # Imported lazily: only needed when a signature must be inferred.
        from mlflow.models import infer_signature

        # Remember each submodule's mode so logging does not leave the
        # caller's model switched to eval mode.
        previous_modes = [(module, module.training) for module in model.modules()]
        model.eval()
        try:
            with torch.no_grad():
                sample_output = model(sample_input)
        finally:
            for module, was_training in previous_modes:
                module.training = was_training

        # .numpy() raises for tensors on an accelerator or tensors that
        # require grad, so detach and move to host memory first.
        signature = infer_signature(
            sample_input.detach().cpu().numpy(),
            sample_output.detach().cpu().numpy(),
        )

    model_info = mlflow.pytorch.log_model(
        model,
        artifact_path=artifact_path,
        registered_model_name=registered_model_name,
        signature=signature,
    )

    return model_info.model_uri
|
|
208
|
+
|
|
209
|
+
def load_pytorch_model(model_uri: str, device: str = "cpu") -> torch.nn.Module:
    """Fetch the PyTorch model stored at ``model_uri`` from MLflow.

    ``device`` is forwarded as ``map_location`` to the MLflow loader.
    """
    return mlflow.pytorch.load_model(model_uri, map_location=device)
|
|
213
|
+
```
|
|
214
|
+
|
|
215
|
+
### Model Registry Operations
|
|
216
|
+
|
|
217
|
+
```python
|
|
218
|
+
from mlflow.tracking import MlflowClient
|
|
219
|
+
from mlflow.entities.model_registry import ModelVersion
|
|
220
|
+
|
|
221
|
+
class ModelRegistry:
    """MLflow Model Registry wrapper.

    NOTE(review): ``transition_model_stage`` and ``get_latest_version`` use the
    stage-based registry API, which recent MLflow releases mark as deprecated
    in favour of aliases — confirm against the MLflow version in use.
    """

    def __init__(self, tracking_uri: str = "http://localhost:5000"):
        """Point MLflow at ``tracking_uri`` and create a client bound to it."""
        # This snippet's header imports only MlflowClient and ModelVersion, so
        # the top-level package is imported where it is used.
        import mlflow

        mlflow.set_tracking_uri(tracking_uri)
        self.client = MlflowClient(tracking_uri=tracking_uri)

    def register_model(
        self,
        model_uri: str,
        name: str,
        tags: dict = None,
        description: str = None,
    ) -> ModelVersion:
        """Register ``model_uri`` as a new version of the model ``name``.

        Args:
            model_uri: URI of a logged model.
            name: Registered model name.
            tags: Optional tags set on the new version, one call per tag.
            description: Optional description set on the new version.

        Returns:
            The created model version.
        """
        import mlflow

        result = mlflow.register_model(model_uri, name)

        if tags:
            for key, value in tags.items():
                self.client.set_model_version_tag(name, result.version, key, value)

        if description:
            self.client.update_model_version(
                name,
                result.version,
                description=description,
            )

        return result

    def transition_model_stage(
        self,
        name: str,
        version: str,
        stage: str,
        archive_existing: bool = True,
    ) -> ModelVersion:
        """Transition model to new stage (Staging, Production, Archived).

        ``archive_existing`` is forwarded as ``archive_existing_versions``.
        """
        return self.client.transition_model_version_stage(
            name=name,
            version=version,
            stage=stage,
            archive_existing_versions=archive_existing,
        )

    def get_latest_version(
        self,
        name: str,
        stages: list[str] = None,
    ) -> list[ModelVersion]:
        """Return the client's latest versions of ``name`` for ``stages``."""
        return self.client.get_latest_versions(name, stages=stages)

    def load_production_model(self, name: str) -> "mlflow.pyfunc.PyFuncModel":
        """Load the ``Production``-stage version of ``name`` as a pyfunc model."""
        import mlflow.pyfunc

        model_uri = f"models:/{name}/Production"
        return mlflow.pyfunc.load_model(model_uri)

    def compare_versions(
        self,
        name: str,
        version_a: str,
        version_b: str,
    ) -> dict:
        """Return the metrics and params of the runs behind two versions.

        Returns:
            A dict with keys ``"version_a"`` and ``"version_b"``, each mapping
            to ``{"version", "metrics", "params"}`` for that version's run.
        """
        v_a = self.client.get_model_version(name, version_a)
        v_b = self.client.get_model_version(name, version_b)

        run_a = self.client.get_run(v_a.run_id)
        run_b = self.client.get_run(v_b.run_id)

        return {
            "version_a": {
                "version": version_a,
                "metrics": run_a.data.metrics,
                "params": run_a.data.params,
            },
            "version_b": {
                "version": version_b,
                "metrics": run_b.data.metrics,
                "params": run_b.data.params,
            },
        }
|
|
304
|
+
```
|
|
305
|
+
|
|
306
|
+
---
|
|
307
|
+
|
|
308
|
+
## Weights & Biases Integration
|
|
309
|
+
|
|
310
|
+
### Basic W&B Tracking
|
|
311
|
+
|
|
312
|
+
```python
|
|
313
|
+
import wandb
|
|
314
|
+
from pathlib import Path
|
|
315
|
+
|
|
316
|
+
class WandbTracker:
    """Weights & Biases experiment tracking wrapper.

    Holds the project/entity/config used for ``wandb.init`` and the handle of
    the run started through ``start_run``.
    """

    def __init__(
        self,
        project: str,
        entity: str = None,
        config: dict = None,
    ):
        """Store the settings later passed to ``wandb.init``."""
        self.project = project
        self.entity = entity
        self.config = config
        self.run = None

    def _require_run(self):
        """Return the active run or raise a clear error if none was started."""
        if self.run is None:
            raise RuntimeError("No active W&B run; call start_run() first")
        return self.run

    @staticmethod
    def _build_artifact(name: str, artifact_type: str, path: str, metadata: dict = None):
        """Create an artifact and add ``path`` to it as a directory or a file."""
        artifact = wandb.Artifact(
            name=name,
            type=artifact_type,
            metadata=metadata,
        )

        if Path(path).is_dir():
            artifact.add_dir(path)
        else:
            artifact.add_file(path)

        return artifact

    def start_run(
        self,
        name: str = None,
        tags: list[str] = None,
        group: str = None,
        job_type: str = "train",
        resume: str = None,
    ) -> wandb.Run:
        """Initialize a W&B run with the stored settings and return it."""
        self.run = wandb.init(
            project=self.project,
            entity=self.entity,
            name=name,
            config=self.config,
            tags=tags,
            group=group,
            job_type=job_type,
            resume=resume,
        )
        return self.run

    def log(self, data: dict, step: int = None, commit: bool = True) -> None:
        """Log metrics and data via ``wandb.log``."""
        wandb.log(data, step=step, commit=commit)

    def log_artifact(
        self,
        name: str,
        artifact_type: str,
        path: str,
        metadata: dict = None,
    ) -> wandb.Artifact:
        """Log the file or directory at ``path`` as an artifact.

        Raises:
            RuntimeError: If no run has been started with ``start_run``.
        """
        run = self._require_run()
        artifact = self._build_artifact(name, artifact_type, path, metadata)
        run.log_artifact(artifact)
        return artifact

    def log_model(
        self,
        model_path: str,
        name: str,
        metadata: dict = None,
        aliases: list[str] = None,
    ) -> wandb.Artifact:
        """Log ``model_path`` as a ``"model"`` artifact with aliases.

        ``aliases`` defaults to ``["latest"]`` when empty or omitted.

        Raises:
            RuntimeError: If no run has been started with ``start_run``.
        """
        run = self._require_run()
        artifact = self._build_artifact(name, "model", model_path, metadata)
        run.log_artifact(artifact, aliases=aliases or ["latest"])
        return artifact

    def watch_model(
        self,
        model,
        log: str = "all",
        log_freq: int = 100,
    ) -> None:
        """Watch model for gradient and parameter logging."""
        wandb.watch(model, log=log, log_freq=log_freq)

    def finish(self, exit_code: int = 0) -> None:
        """Finish the run and drop the stored run handle."""
        wandb.finish(exit_code=exit_code)
        # Drop the handle so later artifact calls fail with a clear error
        # instead of targeting a finished run.
        self.run = None
|
|
411
|
+
|
|
412
|
+
# Usage with PyTorch
|
|
413
|
+
def train_with_wandb(
    model: "torch.nn.Module",
    train_loader,
    val_loader,
    config: dict,
):
    """Train ``model`` while tracking the run with ``WandbTracker``.

    Per batch, the training loss is logged; per epoch, the validation loss and
    accuracy are logged. After training, the model's ``state_dict`` is saved
    to ``model.pt`` and logged as a model artifact. The W&B run is always
    finished: with exit code 0 on success, 1 if anything raises.

    ``train_step`` and ``evaluate`` are not defined in this snippet and must be
    supplied by the surrounding code; ``evaluate`` is used here as returning a
    mapping with ``"loss"`` and ``"accuracy"`` keys.

    Args:
        model: Module to train.
        train_loader: Iterable yielding ``(data, target)`` batches.
        val_loader: Validation data handed to ``evaluate``.
        config: Run configuration; must contain ``"model_type"`` and ``"epochs"``.
    """
    # This snippet's header does not import torch; the annotation above is
    # quoted for the same reason.
    import torch

    tracker = WandbTracker(
        project="my-project",
        config=config,
    )

    tracker.start_run(
        name=f"experiment_{config['model_type']}",
        tags=["baseline", config["model_type"]],
        group="hyperparameter_search",
    )

    try:
        # Watch model gradients
        tracker.watch_model(model)

        # Stays None when config["epochs"] is 0, so the metadata lookup below
        # cannot hit an unbound name.
        val_metrics = None

        for epoch in range(config["epochs"]):
            model.train()
            for batch_idx, (data, target) in enumerate(train_loader):
                # Training step
                loss = train_step(model, data, target)

                tracker.log({
                    "train/loss": loss,
                    "train/epoch": epoch,
                })

            # Validation
            val_metrics = evaluate(model, val_loader)
            tracker.log({
                "val/loss": val_metrics["loss"],
                "val/accuracy": val_metrics["accuracy"],
                "epoch": epoch,
            })

        # Save and log model
        torch.save(model.state_dict(), "model.pt")
        tracker.log_model(
            "model.pt",
            name="trained_model",
            metadata=(
                {"accuracy": val_metrics["accuracy"]}
                if val_metrics is not None
                else None
            ),
            aliases=["latest", "best"],
        )
    except BaseException:
        # Includes KeyboardInterrupt so an interrupted training still closes
        # the run as failed.
        tracker.finish(exit_code=1)
        raise
    else:
        tracker.finish()
|
|
463
|
+
```
|
|
464
|
+
|
|
465
|
+
### W&B Sweeps for Hyperparameter Tuning
|
|
466
|
+
|
|
467
|
+
```python
|
|
468
|
+
import wandb
|
|
469
|
+
|
|
470
|
+
# Sweep definition handed to wandb.sweep below.
# "method" accepts: bayes, grid, random.
# The metric name must match a key logged by sweep_train.
sweep_config = {
    "method": "bayes",
    "metric": {
        "name": "val/loss",
        "goal": "minimize",
    },
    "parameters": {
        "learning_rate": {
            "distribution": "log_uniform_values",
            "min": 1e-5,
            "max": 1e-2,
        },
        "batch_size": {
            "values": [16, 32, 64, 128],
        },
        "hidden_size": {
            "values": [128, 256, 512],
        },
        "dropout": {
            "distribution": "uniform",
            "min": 0.1,
            "max": 0.5,
        },
    },
    "early_terminate": {
        "type": "hyperband",
        "min_iter": 3,
    },
}


def sweep_train(epochs: int = 10):
    """Training function executed by the sweep agent for one trial.

    Reads the trial's hyperparameters from the run's config, builds the model,
    optimizer and data loader from them, then logs train and validation loss
    once per epoch.

    ``build_model``, ``train_dataset``, ``val_loader``, ``train_epoch`` and
    ``evaluate`` are not defined in this snippet and must be supplied by the
    surrounding code.
    NOTE(review): ``evaluate`` is logged here as a scalar loss, while the
    ``train_with_wandb`` example indexes its result like a dict — confirm
    which contract the real helper has.

    Args:
        epochs: Number of epochs to train; the agent calls this function
            without arguments, so the default applies during a sweep.
    """
    # This snippet's header imports only wandb.
    import torch
    from torch.utils.data import DataLoader

    with wandb.init() as run:
        # Use the run returned by init rather than the module-level globals,
        # so config and logging are tied to this trial's run.
        config = run.config

        model = build_model(
            hidden_size=config.hidden_size,
            dropout=config.dropout,
        )

        optimizer = torch.optim.Adam(
            model.parameters(),
            lr=config.learning_rate,
        )

        train_loader = DataLoader(train_dataset, batch_size=config.batch_size)

        for epoch in range(epochs):
            loss = train_epoch(model, train_loader, optimizer)
            val_loss = evaluate(model, val_loader)

            run.log({
                "train/loss": loss,
                "val/loss": val_loss,
                "epoch": epoch,
            })


# Run sweep
sweep_id = wandb.sweep(sweep_config, project="my-project")
wandb.agent(sweep_id, function=sweep_train, count=50)
|
|
530
|
+
```
|
|
531
|
+
|
|
532
|
+
---
|
|
533
|
+
|
|
534
|
+
## Custom Experiment Tracking
|
|
535
|
+
|
|
536
|
+
### Lightweight Tracker
|
|
537
|
+
|
|
538
|
+
```python
|
|
539
|
+
import json
|
|
540
|
+
from datetime import datetime
|
|
541
|
+
from pathlib import Path
|
|
542
|
+
from dataclasses import dataclass, field, asdict
|
|
543
|
+
from typing import Optional
|
|
544
|
+
import hashlib
|
|
545
|
+
import uuid
|
|
546
|
+
|
|
547
|
+
def _utc_now_iso() -> str:
    """Return the current UTC time as a naive ISO-8601 string.

    Keeps the format that ``datetime.utcnow().isoformat()`` produced (no UTC
    offset suffix) without calling the deprecated ``utcnow``.
    """
    # Local import: the snippet header imports only ``datetime`` itself.
    from datetime import timezone

    return datetime.now(timezone.utc).replace(tzinfo=None).isoformat()


def _json_default(value):
    """Fallback for values ``json`` cannot serialize natively.

    Objects exposing a callable ``item()`` (as NumPy and PyTorch scalars do)
    are converted through it; anything else is stored as ``str(value)``.
    """
    item = getattr(value, "item", None)
    if callable(item):
        try:
            return item()
        except (TypeError, ValueError):
            pass
    return str(value)


@dataclass
class Experiment:
    """Experiment metadata and results."""
    experiment_id: str
    name: str
    params: dict
    # Maps metric name -> list of {"value", "step", "timestamp"} entries.
    metrics: dict = field(default_factory=dict)
    # Paths of artifacts copied into the experiment directory.
    artifacts: list = field(default_factory=list)
    tags: dict = field(default_factory=dict)
    start_time: str = field(default_factory=_utc_now_iso)
    end_time: Optional[str] = None
    status: str = "running"

    def to_dict(self) -> dict:
        """Return the experiment as a plain dict (via ``dataclasses.asdict``)."""
        return asdict(self)


class SimpleTracker:
    """Lightweight file-based experiment tracker.

    Each experiment lives in ``<experiments_dir>/<experiment_id>/`` with its
    state in ``experiment.json`` and copied artifacts under ``artifacts/``.
    The whole JSON file is rewritten after every change.
    """

    def __init__(self, experiments_dir: str = "./experiments"):
        """Create ``experiments_dir`` if needed; no experiment is active yet."""
        self.experiments_dir = Path(experiments_dir)
        self.experiments_dir.mkdir(parents=True, exist_ok=True)
        self.current_experiment: Optional[Experiment] = None

    def start_experiment(
        self,
        name: str,
        params: dict,
        tags: Optional[dict] = None,
    ) -> Experiment:
        """Start a new experiment and make it the current one.

        Args:
            name: Human-readable experiment name.
            params: Parameters stored with the experiment.
            tags: Optional tags; an empty dict is stored when omitted.

        Returns:
            The newly created experiment.
        """
        # The id is only 8 hex characters, so claim the directory with an
        # exclusive mkdir and retry on a clash instead of silently reusing
        # (and overwriting) an existing experiment.
        while True:
            experiment_id = str(uuid.uuid4())[:8]
            exp_dir = self.experiments_dir / experiment_id
            try:
                exp_dir.mkdir()
            except FileExistsError:
                continue
            break

        self.current_experiment = Experiment(
            experiment_id=experiment_id,
            name=name,
            params=params,
            tags=tags or {},
        )

        self._save_experiment()
        return self.current_experiment

    def log_metrics(self, metrics: dict, step: Optional[int] = None) -> None:
        """Append one entry per metric to the current experiment and save.

        Raises:
            ValueError: If no experiment is active.
        """
        if self.current_experiment is None:
            raise ValueError("No active experiment")

        timestamp = _utc_now_iso()
        for key, value in metrics.items():
            self.current_experiment.metrics.setdefault(key, []).append({
                "value": value,
                "step": step,
                "timestamp": timestamp,
            })

        self._save_experiment()

    def log_artifact(self, path: str, name: Optional[str] = None) -> str:
        """Copy a file or directory into the experiment's ``artifacts`` folder.

        Logging the same artifact again overwrites the earlier copy and does
        not add a second entry to the experiment's artifact list.

        Args:
            path: File or directory to copy.
            name: Name to store it under; defaults to the source's own name.

        Returns:
            The destination path as a string.

        Raises:
            ValueError: If no experiment is active.
            FileNotFoundError: If ``path`` does not exist.
        """
        if self.current_experiment is None:
            raise ValueError("No active experiment")

        import shutil

        source = Path(path)
        if not source.exists():
            raise FileNotFoundError(f"Artifact not found: {source}")

        artifact_name = name or source.name
        exp_dir = self.experiments_dir / self.current_experiment.experiment_id
        dest = exp_dir / "artifacts" / artifact_name

        dest.parent.mkdir(parents=True, exist_ok=True)

        if source.is_dir():
            # dirs_exist_ok lets the same directory be logged more than once.
            shutil.copytree(source, dest, dirs_exist_ok=True)
        else:
            shutil.copy2(source, dest)

        recorded = str(dest)
        if recorded not in self.current_experiment.artifacts:
            self.current_experiment.artifacts.append(recorded)
        self._save_experiment()

        return recorded

    def end_experiment(self, status: str = "completed") -> None:
        """Record ``status`` and the end time, save, and clear the current experiment.

        Does nothing when no experiment is active.
        """
        if self.current_experiment is None:
            return

        self.current_experiment.status = status
        self.current_experiment.end_time = _utc_now_iso()
        self._save_experiment()
        self.current_experiment = None

    def _save_experiment(self) -> None:
        """Write the current experiment to its ``experiment.json``."""
        if self.current_experiment is None:
            return

        exp_dir = self.experiments_dir / self.current_experiment.experiment_id
        target = exp_dir / "experiment.json"
        # Write to a sibling temp file and rename it over the target so an
        # interrupted write cannot leave a truncated experiment.json behind.
        tmp = target.with_name(target.name + ".tmp")
        with open(tmp, "w", encoding="utf-8") as f:
            json.dump(
                self.current_experiment.to_dict(),
                f,
                indent=2,
                default=_json_default,
            )
        tmp.replace(target)

    def load_experiment(self, experiment_id: str) -> Experiment:
        """Load the experiment stored under ``experiment_id``.

        Raises:
            FileNotFoundError: If that experiment has no ``experiment.json``.
        """
        exp_file = self.experiments_dir / experiment_id / "experiment.json"
        with open(exp_file, encoding="utf-8") as f:
            data = json.load(f)
        return Experiment(**data)

    def list_experiments(self, tags: Optional[dict] = None) -> list[Experiment]:
        """List stored experiments, newest first, optionally filtered by tags.

        An experiment matches when every key/value pair in ``tags`` equals the
        experiment's tag of the same key. Directories without an
        ``experiment.json`` are ignored.
        """
        experiments = []

        for exp_dir in self.experiments_dir.iterdir():
            if not exp_dir.is_dir():
                continue

            if not (exp_dir / "experiment.json").exists():
                continue

            exp = self.load_experiment(exp_dir.name)

            if tags and not all(exp.tags.get(k) == v for k, v in tags.items()):
                continue

            experiments.append(exp)

        return sorted(experiments, key=lambda x: x.start_time, reverse=True)

    def compare_experiments(self, experiment_ids: list[str]) -> dict:
        """Summarize name, params and last logged metric values per experiment.

        Returns:
            A dict keyed by experiment id; ``"final_metrics"`` maps each metric
            name to the value of its last entry, or ``None`` if it has none.
        """
        comparison = {}

        for exp_id in experiment_ids:
            exp = self.load_experiment(exp_id)
            comparison[exp_id] = {
                "name": exp.name,
                "params": exp.params,
                "final_metrics": {
                    k: v[-1]["value"] if v else None
                    for k, v in exp.metrics.items()
                },
            }

        return comparison
|
|
699
|
+
```
|
|
700
|
+
|
|
701
|
+
---
|
|
702
|
+
|
|
703
|
+
## Experiment Comparison and Analysis
|
|
704
|
+
|
|
705
|
+
### Metrics Comparison
|
|
706
|
+
|
|
707
|
+
```python
|
|
708
|
+
import pandas as pd
|
|
709
|
+
import matplotlib.pyplot as plt
|
|
710
|
+
from mlflow.tracking import MlflowClient
|
|
711
|
+
|
|
712
|
+
def compare_runs(
    experiment_name: str,
    metric_keys: list[str],
    n_runs: int = 10,
) -> pd.DataFrame:
    """Collect the most recent runs of an experiment into a DataFrame.

    Each row holds the run's id, name, status and start time, followed by all
    of its logged params and the requested metrics (``None`` for a metric the
    run did not log). A param whose name equals one of the fixed columns or a
    requested metric is overwritten in that order.

    Args:
        experiment_name: Name of the MLflow experiment to query.
        metric_keys: Metric names to include as columns.
        n_runs: Maximum number of runs, newest first.

    Returns:
        One row per run; empty if the experiment has no runs.

    Raises:
        ValueError: If no experiment named ``experiment_name`` exists.
    """
    client = MlflowClient()
    experiment = client.get_experiment_by_name(experiment_name)
    if experiment is None:
        # The lookup returns None for an unknown name; fail with a clear
        # message instead of an AttributeError on the next line.
        raise ValueError(f"Experiment not found: {experiment_name!r}")

    runs = client.search_runs(
        experiment_ids=[experiment.experiment_id],
        order_by=["start_time DESC"],
        max_results=n_runs,
    )

    data = []
    for run in runs:
        row = {
            "run_id": run.info.run_id,
            "run_name": run.info.run_name,
            "status": run.info.status,
            "start_time": run.info.start_time,
        }
        row.update(run.data.params)
        row.update({k: run.data.metrics.get(k) for k in metric_keys})
        data.append(row)

    return pd.DataFrame(data)
|
|
740
|
+
|
|
741
|
+
def plot_metric_comparison(
    runs_df: pd.DataFrame,
    metric: str,
    group_by: str = None,
) -> plt.Figure:
    """Draw a bar chart of one metric across runs.

    Args:
        runs_df: Table of runs; must contain a ``run_name`` column and a
            column named after ``metric``.
        metric: Column to plot on the y-axis.
        group_by: Optional column to group runs by. When given, each group is
            drawn as its own labelled bar series and a legend is added.

    Returns:
        The figure holding the chart.
    """
    fig, ax = plt.subplots(figsize=(10, 6))

    if not group_by:
        ax.bar(runs_df["run_name"], runs_df[metric])
    else:
        # One bar series per group so each gets its own legend entry.
        for label, subset in runs_df.groupby(group_by):
            ax.bar(subset["run_name"], subset[metric], label=str(label))
        ax.legend(title=group_by)

    ax.set_xlabel("Run")
    ax.set_ylabel(metric)
    ax.set_title(f"Comparison of {metric}")

    # Slant the run names so long labels do not overlap.
    plt.xticks(rotation=45, ha="right")
    plt.tight_layout()

    return fig
|
|
763
|
+
```
|
|
764
|
+
|
|
765
|
+
---
|
|
766
|
+
|
|
767
|
+
## Best Practices
|
|
768
|
+
|
|
769
|
+
### What to Track
|
|
770
|
+
|
|
771
|
+
```python
|
|
772
|
+
# Always track:
# Names of the hyperparameters to record for every run.
REQUIRED_PARAMS = [
    "learning_rate",
    "batch_size",
    "epochs",
    "model_architecture",
    "optimizer",
    "random_seed",
    "dataset_version",
]

# Names of the metrics to record for every run.
REQUIRED_METRICS = [
    "train_loss",
    "val_loss",
    "train_accuracy",
    "val_accuracy",
]

# Names of the artifacts to store with every run.
REQUIRED_ARTIFACTS = [
    "model_checkpoint",
    "training_config",
    "requirements.txt",
]

# Recommended tags
# NOTE(review): the values look like placeholders ("a|b|c" presumably lists
# the allowed choices) rather than literal tag values — confirm before reuse.
RECOMMENDED_TAGS = {
    "author": "username",
    "environment": "dev|staging|prod",
    "model_type": "classification|regression|etc",
    "dataset": "dataset_name",
    "git_commit": "commit_hash",
}
|
|
804
|
+
```
|
|
805
|
+
|
|
806
|
+
### Experiment Naming Conventions
|
|
807
|
+
|
|
808
|
+
```python
|
|
809
|
+
# Good naming patterns
# NOTE(review): the three assignments below appear to be alternative
# templates shown for illustration (each overwrites the previous one); the
# interpolated names are not defined in this snippet.
run_name = f"{model_type}_{dataset}_{timestamp}"
run_name = f"exp_{experiment_number:03d}_{description}"
run_name = f"{feature_flag}_{ablation_type}_{seed}"

# Organize with groups and tags
# Example tag set; the values are sample data.
tags = {
    "project": "recommendation_engine",
    "sprint": "sprint_42",
    "hypothesis": "larger_embedding_helps",
}
|
|
820
|
+
```
|
|
821
|
+
|
|
822
|
+
---
|
|
823
|
+
|
|
824
|
+
## Related References
|
|
825
|
+
|
|
826
|
+
- `training-pipelines.md` - Integrating tracking with training
|
|
827
|
+
- `model-validation.md` - Validating tracked models
|
|
828
|
+
- `pipeline-orchestration.md` - Tracking in automated pipelines
|
|
829
|
+
|
|
830
|
+
## Cross-Reference Skills
|
|
831
|
+
|
|
832
|
+
- **DevOps Engineer** - MLflow server deployment
|
|
833
|
+
- **Data Engineer** - Artifact storage integration
|