aigroup-workflow 2.2.0 → 2.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/commands/fix-build.md +10 -5
- package/.claude/commands/init-project.md +13 -8
- package/.claude/commands/plan.md +15 -8
- package/.claude/commands/review.md +12 -6
- package/.claude/commands/tdd.md +11 -5
- package/.claude/commands/workflow-start.md +20 -11
- package/.claude/settings.json +28 -0
- package/.codex/agents/architect.toml +207 -0
- package/.codex/agents/build-error-resolver.toml +110 -0
- package/.codex/agents/code-reviewer.toml +233 -0
- package/.codex/agents/doc-updater.toml +103 -0
- package/.codex/agents/e2e-runner.toml +103 -0
- package/.codex/agents/get-current-datetime.toml +23 -0
- package/.codex/agents/init-architect.toml +181 -0
- package/.codex/agents/planner.toml +208 -0
- package/.codex/agents/refactor-cleaner.toml +81 -0
- package/.codex/agents/rust-reviewer.toml +90 -0
- package/.codex/agents/security-reviewer.toml +104 -0
- package/.codex/agents/tdd-guide.toml +87 -0
- package/AGENTS.md +2 -2
- package/CLAUDE.md +23 -1
- package/LICENSE +20 -20
- package/README.md +333 -333
- package/agents/a11y-architect.md +141 -141
- package/agents/architect.md +211 -211
- package/agents/build-error-resolver.md +114 -114
- package/agents/chief-of-staff.md +151 -151
- package/agents/code-architect.md +71 -71
- package/agents/code-explorer.md +69 -69
- package/agents/code-reviewer.md +237 -237
- package/agents/code-simplifier.md +47 -47
- package/agents/comment-analyzer.md +45 -45
- package/agents/conversation-analyzer.md +52 -52
- package/agents/cpp-build-resolver.md +90 -90
- package/agents/cpp-reviewer.md +72 -72
- package/agents/csharp-reviewer.md +101 -101
- package/agents/dart-build-resolver.md +201 -201
- package/agents/database-reviewer.md +91 -91
- package/agents/doc-updater.md +107 -107
- package/agents/docs-lookup.md +68 -68
- package/agents/e2e-runner.md +107 -107
- package/agents/flutter-reviewer.md +243 -243
- package/agents/gan-evaluator.md +209 -209
- package/agents/gan-generator.md +131 -131
- package/agents/gan-planner.md +99 -99
- package/agents/get-current-datetime.md +26 -26
- package/agents/go-build-resolver.md +94 -94
- package/agents/go-reviewer.md +76 -76
- package/agents/harness-optimizer.md +35 -35
- package/agents/healthcare-reviewer.md +83 -83
- package/agents/java-build-resolver.md +153 -153
- package/agents/java-reviewer.md +92 -92
- package/agents/kotlin-build-resolver.md +118 -118
- package/agents/kotlin-reviewer.md +159 -159
- package/agents/loop-operator.md +36 -36
- package/agents/opensource-forker.md +198 -198
- package/agents/opensource-packager.md +249 -249
- package/agents/opensource-sanitizer.md +188 -188
- package/agents/performance-optimizer.md +446 -446
- package/agents/planner.md +212 -212
- package/agents/pr-test-analyzer.md +45 -45
- package/agents/python-reviewer.md +98 -98
- package/agents/pytorch-build-resolver.md +120 -120
- package/agents/refactor-cleaner.md +85 -85
- package/agents/rust-build-resolver.md +148 -148
- package/agents/rust-reviewer.md +94 -94
- package/agents/security-reviewer.md +108 -108
- package/agents/seo-specialist.md +59 -59
- package/agents/silent-failure-hunter.md +50 -50
- package/agents/tdd-guide.md +91 -91
- package/agents/type-design-analyzer.md +41 -41
- package/agents/typescript-reviewer.md +112 -112
- package/cli/commands/update.mjs +1 -1
- package/cli/utils/scaffold.mjs +53 -0
- package/docs/rules/agents.md +166 -50
- package/docs/rules/cpp/coding-style.md +44 -44
- package/docs/rules/cpp/hooks.md +39 -39
- package/docs/rules/cpp/patterns.md +51 -51
- package/docs/rules/cpp/security.md +51 -51
- package/docs/rules/cpp/testing.md +44 -44
- package/docs/rules/csharp/coding-style.md +72 -72
- package/docs/rules/csharp/hooks.md +25 -25
- package/docs/rules/csharp/patterns.md +50 -50
- package/docs/rules/csharp/security.md +58 -58
- package/docs/rules/csharp/testing.md +46 -46
- package/docs/rules/dart/coding-style.md +159 -159
- package/docs/rules/dart/hooks.md +66 -66
- package/docs/rules/dart/patterns.md +261 -261
- package/docs/rules/dart/security.md +135 -135
- package/docs/rules/dart/testing.md +215 -215
- package/docs/rules/golang/coding-style.md +32 -32
- package/docs/rules/golang/hooks.md +17 -17
- package/docs/rules/golang/patterns.md +45 -45
- package/docs/rules/golang/security.md +34 -34
- package/docs/rules/golang/testing.md +31 -31
- package/docs/rules/java/coding-style.md +114 -114
- package/docs/rules/java/hooks.md +18 -18
- package/docs/rules/java/patterns.md +146 -146
- package/docs/rules/java/security.md +100 -100
- package/docs/rules/java/testing.md +131 -131
- package/docs/rules/kotlin/coding-style.md +86 -86
- package/docs/rules/kotlin/hooks.md +17 -17
- package/docs/rules/kotlin/patterns.md +146 -146
- package/docs/rules/kotlin/security.md +82 -82
- package/docs/rules/kotlin/testing.md +128 -128
- package/docs/rules/perl/coding-style.md +46 -46
- package/docs/rules/perl/hooks.md +22 -22
- package/docs/rules/perl/patterns.md +76 -76
- package/docs/rules/perl/security.md +69 -69
- package/docs/rules/perl/testing.md +54 -54
- package/docs/rules/php/coding-style.md +40 -40
- package/docs/rules/php/hooks.md +24 -24
- package/docs/rules/php/patterns.md +33 -33
- package/docs/rules/php/security.md +37 -37
- package/docs/rules/php/testing.md +39 -39
- package/docs/rules/python/coding-style.md +42 -42
- package/docs/rules/python/hooks.md +19 -19
- package/docs/rules/python/patterns.md +39 -39
- package/docs/rules/python/security.md +30 -30
- package/docs/rules/python/testing.md +38 -38
- package/docs/rules/rust/coding-style.md +151 -151
- package/docs/rules/rust/hooks.md +16 -16
- package/docs/rules/rust/patterns.md +168 -168
- package/docs/rules/rust/security.md +141 -141
- package/docs/rules/rust/testing.md +154 -154
- package/docs/rules/swift/coding-style.md +47 -47
- package/docs/rules/swift/hooks.md +20 -20
- package/docs/rules/swift/patterns.md +66 -66
- package/docs/rules/swift/security.md +33 -33
- package/docs/rules/swift/testing.md +45 -45
- package/docs/rules/typescript/coding-style.md +199 -199
- package/docs/rules/typescript/hooks.md +22 -22
- package/docs/rules/typescript/patterns.md +52 -52
- package/docs/rules/typescript/security.md +28 -28
- package/docs/rules/typescript/testing.md +18 -18
- package/docs/rules/web/coding-style.md +96 -96
- package/docs/rules/web/design-quality.md +62 -62
- package/docs/rules/web/hooks.md +120 -120
- package/docs/rules/web/patterns.md +79 -79
- package/docs/rules/web/performance.md +64 -64
- package/docs/rules/web/security.md +57 -57
- package/docs/rules/web/testing.md +55 -55
- package/docs/templates/README.md +36 -36
- package/docs/templates/ai-project-final.md +124 -124
- package/docs/templates/ai-project.md +105 -105
- package/docs/templates/api.md +157 -157
- package/docs/templates/bug.md +62 -62
- package/docs/templates/code-review.md +87 -87
- package/docs/templates/generic.md +116 -116
- package/docs/templates/implementation-plan.md +1 -1
- package/docs/templates/meeting.md +68 -68
- package/docs/templates/prd.md +98 -98
- package/docs/templates/ui.md +134 -134
- package/docs/workflow-pipeline.md +11 -10
- package/package.json +40 -39
- package/scripts/hooks/checks/orchestration-artifacts.cjs +28 -23
- package/scripts/hooks/checks/workflow-state.cjs +4 -5
- package/scripts/orchestration/lib/orchestrator.cjs +344 -117
- package/scripts/orchestration/lib/validate.cjs +145 -0
- package/scripts/orchestration/session.cjs +88 -44
- package/skills/SUPERPOWERS-LICENSE +21 -21
- package/skills/ai-ml/fine-tuning-expert/SKILL.md +162 -162
- package/skills/ai-ml/fine-tuning-expert/references/dataset-preparation.md +540 -540
- package/skills/ai-ml/fine-tuning-expert/references/deployment-optimization.md +673 -673
- package/skills/ai-ml/fine-tuning-expert/references/evaluation-metrics.md +597 -597
- package/skills/ai-ml/fine-tuning-expert/references/hyperparameter-tuning.md +565 -565
- package/skills/ai-ml/fine-tuning-expert/references/lora-peft.md +347 -347
- package/skills/ai-ml/ml-pipeline/SKILL.md +159 -159
- package/skills/ai-ml/ml-pipeline/references/experiment-tracking.md +833 -833
- package/skills/ai-ml/ml-pipeline/references/feature-engineering.md +631 -631
- package/skills/ai-ml/ml-pipeline/references/model-validation.md +978 -978
- package/skills/ai-ml/ml-pipeline/references/pipeline-orchestration.md +907 -907
- package/skills/ai-ml/ml-pipeline/references/training-pipelines.md +782 -782
- package/skills/ai-ml/rag-architect/SKILL.md +194 -194
- package/skills/ai-ml/rag-architect/references/chunking-strategies.md +878 -878
- package/skills/ai-ml/rag-architect/references/embedding-models.md +561 -561
- package/skills/ai-ml/rag-architect/references/rag-evaluation.md +833 -833
- package/skills/ai-ml/rag-architect/references/retrieval-optimization.md +795 -795
- package/skills/ai-ml/rag-architect/references/vector-databases.md +589 -589
- package/skills/ai-ml/spark-engineer/SKILL.md +148 -148
- package/skills/ai-ml/spark-engineer/references/partitioning-caching.md +543 -543
- package/skills/ai-ml/spark-engineer/references/performance-tuning.md +544 -544
- package/skills/ai-ml/spark-engineer/references/rdd-operations.md +599 -599
- package/skills/ai-ml/spark-engineer/references/spark-sql-dataframes.md +474 -474
- package/skills/ai-ml/spark-engineer/references/streaming-patterns.md +786 -786
- package/skills/backend/api-designer/SKILL.md +217 -217
- package/skills/backend/api-designer/references/error-handling.md +541 -541
- package/skills/backend/api-designer/references/openapi.md +824 -824
- package/skills/backend/api-designer/references/pagination.md +494 -494
- package/skills/backend/api-designer/references/rest-patterns.md +335 -335
- package/skills/backend/api-designer/references/versioning.md +391 -391
- package/skills/backend/architecture-designer/SKILL.md +117 -117
- package/skills/backend/architecture-designer/references/adr-template.md +116 -116
- package/skills/backend/architecture-designer/references/architecture-patterns.md +111 -111
- package/skills/backend/architecture-designer/references/database-selection.md +102 -102
- package/skills/backend/architecture-designer/references/nfr-checklist.md +112 -112
- package/skills/backend/architecture-designer/references/system-design.md +100 -100
- package/skills/backend/code-documenter/SKILL.md +147 -147
- package/skills/backend/code-documenter/references/api-docs-fastapi-django.md +166 -166
- package/skills/backend/code-documenter/references/api-docs-nestjs-express.md +220 -220
- package/skills/backend/code-documenter/references/coverage-reports.md +125 -125
- package/skills/backend/code-documenter/references/documentation-systems.md +333 -333
- package/skills/backend/code-documenter/references/interactive-api-docs.md +531 -531
- package/skills/backend/code-documenter/references/python-docstrings.md +121 -121
- package/skills/backend/code-documenter/references/typescript-jsdoc.md +145 -145
- package/skills/backend/code-documenter/references/user-guides-tutorials.md +530 -530
- package/skills/backend/debugging-wizard/SKILL.md +105 -105
- package/skills/backend/debugging-wizard/references/common-patterns.md +132 -132
- package/skills/backend/debugging-wizard/references/debugging-tools.md +140 -140
- package/skills/backend/debugging-wizard/references/quick-fixes.md +177 -177
- package/skills/backend/debugging-wizard/references/strategies.md +142 -142
- package/skills/backend/debugging-wizard/references/systematic-debugging.md +367 -367
- package/skills/backend/feature-forge/SKILL.md +98 -98
- package/skills/backend/feature-forge/references/acceptance-criteria.md +104 -104
- package/skills/backend/feature-forge/references/ears-syntax.md +99 -99
- package/skills/backend/feature-forge/references/interview-questions.md +150 -150
- package/skills/backend/feature-forge/references/pre-discovery-subagents.md +54 -54
- package/skills/backend/feature-forge/references/specification-template.md +103 -103
- package/skills/backend/fullstack-guardian/SKILL.md +105 -105
- package/skills/backend/fullstack-guardian/references/api-design-standards.md +307 -307
- package/skills/backend/fullstack-guardian/references/architecture-decisions.md +350 -350
- package/skills/backend/fullstack-guardian/references/backend-patterns.md +237 -237
- package/skills/backend/fullstack-guardian/references/common-patterns.md +134 -134
- package/skills/backend/fullstack-guardian/references/deliverables-checklist.md +354 -354
- package/skills/backend/fullstack-guardian/references/design-template.md +91 -91
- package/skills/backend/fullstack-guardian/references/error-handling.md +135 -135
- package/skills/backend/fullstack-guardian/references/frontend-patterns.md +340 -340
- package/skills/backend/fullstack-guardian/references/integration-patterns.md +333 -333
- package/skills/backend/fullstack-guardian/references/security-checklist.md +106 -106
- package/skills/backend/graphql-architect/SKILL.md +146 -146
- package/skills/backend/graphql-architect/references/federation.md +418 -418
- package/skills/backend/graphql-architect/references/migration-from-rest.md +1141 -1141
- package/skills/backend/graphql-architect/references/resolvers.md +425 -425
- package/skills/backend/graphql-architect/references/schema-design.md +393 -393
- package/skills/backend/graphql-architect/references/security.md +569 -569
- package/skills/backend/graphql-architect/references/subscriptions.md +510 -510
- package/skills/backend/legacy-modernizer/SKILL.md +137 -137
- package/skills/backend/legacy-modernizer/references/legacy-testing.md +381 -381
- package/skills/backend/legacy-modernizer/references/migration-strategies.md +423 -423
- package/skills/backend/legacy-modernizer/references/refactoring-patterns.md +395 -395
- package/skills/backend/legacy-modernizer/references/strangler-fig-pattern.md +281 -281
- package/skills/backend/legacy-modernizer/references/system-assessment.md +487 -487
- package/skills/backend/microservices-architect/SKILL.md +164 -164
- package/skills/backend/microservices-architect/references/communication.md +499 -499
- package/skills/backend/microservices-architect/references/data.md +721 -721
- package/skills/backend/microservices-architect/references/decomposition.md +344 -344
- package/skills/backend/microservices-architect/references/observability.md +805 -805
- package/skills/backend/microservices-architect/references/patterns.md +603 -603
- package/skills/database/database-optimizer/SKILL.md +147 -147
- package/skills/database/database-optimizer/references/index-strategies.md +331 -331
- package/skills/database/database-optimizer/references/monitoring-analysis.md +501 -501
- package/skills/database/database-optimizer/references/mysql-tuning.md +452 -452
- package/skills/database/database-optimizer/references/postgresql-tuning.md +413 -413
- package/skills/database/database-optimizer/references/query-optimization.md +251 -251
- package/skills/database/postgres-pro/SKILL.md +152 -152
- package/skills/database/postgres-pro/references/extensions.md +404 -404
- package/skills/database/postgres-pro/references/jsonb.md +321 -321
- package/skills/database/postgres-pro/references/maintenance.md +481 -481
- package/skills/database/postgres-pro/references/performance.md +265 -265
- package/skills/database/postgres-pro/references/replication.md +446 -446
- package/skills/database/sql-pro/SKILL.md +129 -129
- package/skills/database/sql-pro/references/database-design.md +402 -402
- package/skills/database/sql-pro/references/dialect-differences.md +419 -419
- package/skills/database/sql-pro/references/optimization.md +384 -384
- package/skills/database/sql-pro/references/query-patterns.md +285 -285
- package/skills/database/sql-pro/references/window-functions.md +328 -328
- package/skills/dotnet/csharp-developer/SKILL.md +125 -125
- package/skills/dotnet/csharp-developer/references/aspnet-core.md +394 -394
- package/skills/dotnet/csharp-developer/references/blazor.md +553 -553
- package/skills/dotnet/csharp-developer/references/entity-framework.md +409 -409
- package/skills/dotnet/csharp-developer/references/modern-csharp.md +248 -248
- package/skills/dotnet/csharp-developer/references/performance.md +498 -498
- package/skills/dotnet/dotnet-core-expert/SKILL.md +138 -138
- package/skills/dotnet/dotnet-core-expert/references/authentication.md +546 -546
- package/skills/dotnet/dotnet-core-expert/references/clean-architecture.md +455 -455
- package/skills/dotnet/dotnet-core-expert/references/cloud-native.md +548 -548
- package/skills/dotnet/dotnet-core-expert/references/entity-framework.md +440 -440
- package/skills/dotnet/dotnet-core-expert/references/minimal-apis.md +319 -319
- package/skills/frontend/angular-architect/SKILL.md +152 -152
- package/skills/frontend/angular-architect/references/components.md +297 -297
- package/skills/frontend/angular-architect/references/ngrx.md +401 -401
- package/skills/frontend/angular-architect/references/routing.md +361 -361
- package/skills/frontend/angular-architect/references/rxjs.md +319 -319
- package/skills/frontend/angular-architect/references/testing.md +405 -405
- package/skills/frontend/design-commands/design.md +91 -91
- package/skills/frontend/design-commands/handoff.md +97 -97
- package/skills/frontend/design-commands/prototype.md +120 -120
- package/skills/frontend/design-commands/spec.md +160 -160
- package/skills/frontend/design-commands/style.md +78 -78
- package/skills/frontend/flutter-expert/SKILL.md +138 -138
- package/skills/frontend/flutter-expert/references/bloc-state.md +259 -259
- package/skills/frontend/flutter-expert/references/gorouter-navigation.md +119 -119
- package/skills/frontend/flutter-expert/references/performance.md +99 -99
- package/skills/frontend/flutter-expert/references/project-structure.md +118 -118
- package/skills/frontend/flutter-expert/references/riverpod-state.md +130 -130
- package/skills/frontend/flutter-expert/references/widget-patterns.md +123 -123
- package/skills/frontend/nextjs-developer/SKILL.md +143 -143
- package/skills/frontend/nextjs-developer/references/app-router.md +311 -311
- package/skills/frontend/nextjs-developer/references/data-fetching.md +482 -482
- package/skills/frontend/nextjs-developer/references/deployment.md +545 -545
- package/skills/frontend/nextjs-developer/references/server-actions.md +462 -462
- package/skills/frontend/nextjs-developer/references/server-components.md +384 -384
- package/skills/frontend/react-expert/SKILL.md +149 -149
- package/skills/frontend/react-expert/references/hooks-patterns.md +162 -162
- package/skills/frontend/react-expert/references/migration-class-to-modern.md +1119 -1119
- package/skills/frontend/react-expert/references/performance.md +168 -168
- package/skills/frontend/react-expert/references/react-19-features.md +174 -174
- package/skills/frontend/react-expert/references/server-components.md +143 -143
- package/skills/frontend/react-expert/references/state-management.md +171 -171
- package/skills/frontend/react-expert/references/testing-react.md +174 -174
- package/skills/frontend/react-native-expert/SKILL.md +185 -185
- package/skills/frontend/react-native-expert/references/expo-router.md +187 -187
- package/skills/frontend/react-native-expert/references/list-optimization.md +204 -204
- package/skills/frontend/react-native-expert/references/platform-handling.md +188 -188
- package/skills/frontend/react-native-expert/references/project-structure.md +171 -171
- package/skills/frontend/react-native-expert/references/storage-hooks.md +173 -173
- package/skills/frontend/senior-frontend/SKILL.md +477 -477
- package/skills/frontend/senior-frontend/references/frontend_best_practices.md +806 -806
- package/skills/frontend/senior-frontend/references/nextjs_optimization_guide.md +724 -724
- package/skills/frontend/senior-frontend/references/react_patterns.md +746 -746
- package/skills/frontend/senior-frontend/scripts/bundle_analyzer.py +407 -407
- package/skills/frontend/senior-frontend/scripts/component_generator.py +329 -329
- package/skills/frontend/senior-frontend/scripts/frontend_scaffolder.py +1005 -1005
- package/skills/frontend/ui-ux-pro-max/SKILL.md +386 -386
- package/skills/frontend/ui-ux-pro-max/data/charts.csv +26 -26
- package/skills/frontend/ui-ux-pro-max/data/colors.csv +97 -97
- package/skills/frontend/ui-ux-pro-max/data/icons.csv +101 -101
- package/skills/frontend/ui-ux-pro-max/data/landing.csv +31 -31
- package/skills/frontend/ui-ux-pro-max/data/products.csv +96 -96
- package/skills/frontend/ui-ux-pro-max/data/react-performance.csv +45 -45
- package/skills/frontend/ui-ux-pro-max/data/stacks/astro.csv +54 -54
- package/skills/frontend/ui-ux-pro-max/data/stacks/flutter.csv +53 -53
- package/skills/frontend/ui-ux-pro-max/data/stacks/html-tailwind.csv +56 -56
- package/skills/frontend/ui-ux-pro-max/data/stacks/jetpack-compose.csv +53 -53
- package/skills/frontend/ui-ux-pro-max/data/stacks/nextjs.csv +53 -53
- package/skills/frontend/ui-ux-pro-max/data/stacks/nuxt-ui.csv +51 -51
- package/skills/frontend/ui-ux-pro-max/data/stacks/nuxtjs.csv +59 -59
- package/skills/frontend/ui-ux-pro-max/data/stacks/react-native.csv +52 -52
- package/skills/frontend/ui-ux-pro-max/data/stacks/react.csv +54 -54
- package/skills/frontend/ui-ux-pro-max/data/stacks/shadcn.csv +61 -61
- package/skills/frontend/ui-ux-pro-max/data/stacks/svelte.csv +54 -54
- package/skills/frontend/ui-ux-pro-max/data/stacks/swiftui.csv +51 -51
- package/skills/frontend/ui-ux-pro-max/data/stacks/vue.csv +50 -50
- package/skills/frontend/ui-ux-pro-max/data/styles.csv +68 -68
- package/skills/frontend/ui-ux-pro-max/data/typography.csv +57 -57
- package/skills/frontend/ui-ux-pro-max/data/ui-reasoning.csv +101 -101
- package/skills/frontend/ui-ux-pro-max/data/ux-guidelines.csv +99 -99
- package/skills/frontend/ui-ux-pro-max/data/web-interface.csv +31 -31
- package/skills/frontend/ui-ux-pro-max/scripts/core.py +253 -253
- package/skills/frontend/ui-ux-pro-max/scripts/design_system.py +1067 -1067
- package/skills/frontend/ui-ux-pro-max/scripts/search.py +114 -114
- package/skills/frontend/vue-expert/SKILL.md +98 -98
- package/skills/frontend/vue-expert/references/build-tooling.md +480 -480
- package/skills/frontend/vue-expert/references/components.md +448 -448
- package/skills/frontend/vue-expert/references/composition-api.md +299 -299
- package/skills/frontend/vue-expert/references/mobile-hybrid.md +636 -636
- package/skills/frontend/vue-expert/references/nuxt.md +669 -669
- package/skills/frontend/vue-expert/references/state-management.md +449 -449
- package/skills/frontend/vue-expert/references/typescript.md +584 -584
- package/skills/frontend/vue-expert-js/SKILL.md +167 -167
- package/skills/frontend/vue-expert-js/references/component-architecture.md +219 -219
- package/skills/frontend/vue-expert-js/references/composables-patterns.md +183 -183
- package/skills/frontend/vue-expert-js/references/jsdoc-typing.md +535 -535
- package/skills/frontend/vue-expert-js/references/state-management.md +249 -249
- package/skills/frontend/vue-expert-js/references/testing-patterns.md +237 -237
- package/skills/go-rust-cpp/cpp-pro/SKILL.md +115 -115
- package/skills/go-rust-cpp/cpp-pro/references/build-tooling.md +440 -440
- package/skills/go-rust-cpp/cpp-pro/references/concurrency.md +437 -437
- package/skills/go-rust-cpp/cpp-pro/references/memory-performance.md +397 -397
- package/skills/go-rust-cpp/cpp-pro/references/modern-cpp.md +304 -304
- package/skills/go-rust-cpp/cpp-pro/references/templates.md +357 -357
- package/skills/go-rust-cpp/golang-pro/SKILL.md +122 -122
- package/skills/go-rust-cpp/golang-pro/references/concurrency.md +329 -329
- package/skills/go-rust-cpp/golang-pro/references/generics.md +442 -442
- package/skills/go-rust-cpp/golang-pro/references/interfaces.md +432 -432
- package/skills/go-rust-cpp/golang-pro/references/project-structure.md +477 -477
- package/skills/go-rust-cpp/golang-pro/references/testing.md +451 -451
- package/skills/go-rust-cpp/rust-engineer/SKILL.md +167 -167
- package/skills/go-rust-cpp/rust-engineer/references/async.md +458 -458
- package/skills/go-rust-cpp/rust-engineer/references/error-handling.md +334 -334
- package/skills/go-rust-cpp/rust-engineer/references/ownership.md +278 -278
- package/skills/go-rust-cpp/rust-engineer/references/testing.md +470 -470
- package/skills/go-rust-cpp/rust-engineer/references/traits.md +413 -413
- package/skills/infra/cli-developer/SKILL.md +113 -113
- package/skills/infra/cli-developer/references/design-patterns.md +221 -221
- package/skills/infra/cli-developer/references/go-cli.md +540 -540
- package/skills/infra/cli-developer/references/node-cli.md +383 -383
- package/skills/infra/cli-developer/references/python-cli.md +422 -422
- package/skills/infra/cli-developer/references/ux-patterns.md +448 -448
- package/skills/infra/cloud-architect/SKILL.md +216 -216
- package/skills/infra/cloud-architect/references/aws.md +394 -394
- package/skills/infra/cloud-architect/references/azure.md +562 -562
- package/skills/infra/cloud-architect/references/cost.md +582 -582
- package/skills/infra/cloud-architect/references/gcp.md +633 -633
- package/skills/infra/cloud-architect/references/multi-cloud.md +483 -483
- package/skills/infra/devops-engineer/SKILL.md +144 -144
- package/skills/infra/devops-engineer/references/deployment-strategies.md +241 -241
- package/skills/infra/devops-engineer/references/docker-patterns.md +113 -113
- package/skills/infra/devops-engineer/references/github-actions.md +139 -139
- package/skills/infra/devops-engineer/references/incident-response.md +331 -331
- package/skills/infra/devops-engineer/references/kubernetes.md +154 -154
- package/skills/infra/devops-engineer/references/platform-engineering.md +417 -417
- package/skills/infra/devops-engineer/references/release-automation.md +527 -527
- package/skills/infra/devops-engineer/references/terraform-iac.md +141 -141
- package/skills/infra/kubernetes-specialist/SKILL.md +241 -241
- package/skills/infra/kubernetes-specialist/references/configuration.md +452 -452
- package/skills/infra/kubernetes-specialist/references/cost-optimization.md +458 -458
- package/skills/infra/kubernetes-specialist/references/custom-operators.md +563 -563
- package/skills/infra/kubernetes-specialist/references/gitops.md +530 -530
- package/skills/infra/kubernetes-specialist/references/helm-charts.md +912 -912
- package/skills/infra/kubernetes-specialist/references/multi-cluster.md +507 -507
- package/skills/infra/kubernetes-specialist/references/networking.md +447 -447
- package/skills/infra/kubernetes-specialist/references/service-mesh.md +459 -459
- package/skills/infra/kubernetes-specialist/references/storage.md +535 -535
- package/skills/infra/kubernetes-specialist/references/troubleshooting.md +414 -414
- package/skills/infra/kubernetes-specialist/references/workloads.md +377 -377
- package/skills/infra/mcp-developer/SKILL.md +143 -143
- package/skills/infra/mcp-developer/references/protocol.md +244 -244
- package/skills/infra/mcp-developer/references/python-sdk.md +367 -367
- package/skills/infra/mcp-developer/references/resources.md +554 -554
- package/skills/infra/mcp-developer/references/tools.md +480 -480
- package/skills/infra/mcp-developer/references/typescript-sdk.md +350 -350
- package/skills/infra/monitoring-expert/SKILL.md +176 -176
- package/skills/infra/monitoring-expert/references/alerting-rules.md +141 -141
- package/skills/infra/monitoring-expert/references/application-profiling.md +331 -331
- package/skills/infra/monitoring-expert/references/capacity-planning.md +344 -344
- package/skills/infra/monitoring-expert/references/dashboards.md +126 -126
- package/skills/infra/monitoring-expert/references/opentelemetry.md +123 -123
- package/skills/infra/monitoring-expert/references/performance-testing.md +269 -269
- package/skills/infra/monitoring-expert/references/prometheus-metrics.md +136 -136
- package/skills/infra/monitoring-expert/references/structured-logging.md +142 -142
- package/skills/infra/sre-engineer/SKILL.md +181 -181
- package/skills/infra/sre-engineer/references/automation-toil.md +492 -492
- package/skills/infra/sre-engineer/references/error-budget-policy.md +334 -334
- package/skills/infra/sre-engineer/references/incident-chaos.md +576 -576
- package/skills/infra/sre-engineer/references/monitoring-alerting.md +424 -424
- package/skills/infra/sre-engineer/references/slo-sli-management.md +238 -238
- package/skills/infra/terraform-engineer/SKILL.md +143 -143
- package/skills/infra/terraform-engineer/references/best-practices.md +583 -583
- package/skills/infra/terraform-engineer/references/module-patterns.md +297 -297
- package/skills/infra/terraform-engineer/references/providers.md +452 -452
- package/skills/infra/terraform-engineer/references/state-management.md +371 -371
- package/skills/infra/terraform-engineer/references/testing.md +486 -486
- package/skills/infra/websocket-engineer/SKILL.md +168 -168
- package/skills/infra/websocket-engineer/references/alternatives.md +391 -391
- package/skills/infra/websocket-engineer/references/patterns.md +400 -400
- package/skills/infra/websocket-engineer/references/protocol.md +195 -195
- package/skills/infra/websocket-engineer/references/scaling.md +333 -333
- package/skills/infra/websocket-engineer/references/security.md +474 -474
- package/skills/java/java-architect/SKILL.md +132 -132
- package/skills/java/java-architect/references/jpa-optimization.md +393 -393
- package/skills/java/java-architect/references/reactive-webflux.md +356 -356
- package/skills/java/java-architect/references/spring-boot-setup.md +269 -269
- package/skills/java/java-architect/references/spring-security.md +445 -445
- package/skills/java/java-architect/references/testing-patterns.md +500 -500
- package/skills/java/kotlin-specialist/SKILL.md +147 -147
- package/skills/java/kotlin-specialist/references/android-compose.md +419 -419
- package/skills/java/kotlin-specialist/references/coroutines-flow.md +276 -276
- package/skills/java/kotlin-specialist/references/dsl-idioms.md +421 -421
- package/skills/java/kotlin-specialist/references/ktor-server.md +426 -426
- package/skills/java/kotlin-specialist/references/multiplatform-kmp.md +380 -380
- package/skills/java/spring-boot-engineer/SKILL.md +195 -195
- package/skills/java/spring-boot-engineer/references/cloud.md +498 -498
- package/skills/java/spring-boot-engineer/references/data.md +381 -381
- package/skills/java/spring-boot-engineer/references/security.md +459 -459
- package/skills/java/spring-boot-engineer/references/testing.md +545 -545
- package/skills/java/spring-boot-engineer/references/web.md +295 -295
- package/skills/javascript/javascript-pro/SKILL.md +132 -132
- package/skills/javascript/javascript-pro/references/async-patterns.md +334 -334
- package/skills/javascript/javascript-pro/references/browser-apis.md +398 -398
- package/skills/javascript/javascript-pro/references/modern-syntax.md +272 -272
- package/skills/javascript/javascript-pro/references/modules.md +357 -357
- package/skills/javascript/javascript-pro/references/node-essentials.md +471 -471
- package/skills/javascript/nestjs-expert/SKILL.md +206 -206
- package/skills/javascript/nestjs-expert/references/authentication.md +166 -166
- package/skills/javascript/nestjs-expert/references/controllers-routing.md +111 -111
- package/skills/javascript/nestjs-expert/references/dtos-validation.md +153 -153
- package/skills/javascript/nestjs-expert/references/migration-from-express.md +1237 -1237
- package/skills/javascript/nestjs-expert/references/services-di.md +140 -140
- package/skills/javascript/nestjs-expert/references/testing-patterns.md +186 -186
- package/skills/javascript/typescript-pro/SKILL.md +145 -145
- package/skills/javascript/typescript-pro/references/advanced-types.md +259 -259
- package/skills/javascript/typescript-pro/references/configuration.md +445 -445
- package/skills/javascript/typescript-pro/references/patterns.md +484 -484
- package/skills/javascript/typescript-pro/references/type-guards.md +352 -352
- package/skills/javascript/typescript-pro/references/utility-types.md +329 -329
- package/skills/php/laravel-specialist/SKILL.md +262 -262
- package/skills/php/laravel-specialist/references/eloquent.md +351 -351
- package/skills/php/laravel-specialist/references/livewire.md +512 -512
- package/skills/php/laravel-specialist/references/queues.md +423 -423
- package/skills/php/laravel-specialist/references/routing.md +362 -362
- package/skills/php/laravel-specialist/references/testing.md +522 -522
- package/skills/php/php-pro/SKILL.md +206 -206
- package/skills/php/php-pro/references/async-patterns.md +412 -412
- package/skills/php/php-pro/references/laravel-patterns.md +377 -377
- package/skills/php/php-pro/references/modern-php-features.md +323 -323
- package/skills/php/php-pro/references/symfony-patterns.md +466 -466
- package/skills/php/php-pro/references/testing-quality.md +466 -466
- package/skills/product/competitive-analysis/SKILL.md +257 -257
- package/skills/product/meeting-notes/SKILL.md +266 -266
- package/skills/product/prd-template/SKILL.md +150 -150
- package/skills/product/stakeholder-update/SKILL.md +225 -225
- package/skills/product/user-research-synthesis/SKILL.md +235 -235
- package/skills/python/django-expert/SKILL.md +162 -162
- package/skills/python/django-expert/references/authentication.md +145 -145
- package/skills/python/django-expert/references/drf-serializers.md +148 -148
- package/skills/python/django-expert/references/models-orm.md +151 -151
- package/skills/python/django-expert/references/testing-django.md +204 -204
- package/skills/python/django-expert/references/viewsets-views.md +153 -153
- package/skills/python/fastapi-expert/SKILL.md +185 -185
- package/skills/python/fastapi-expert/references/async-sqlalchemy.md +146 -146
- package/skills/python/fastapi-expert/references/authentication.md +159 -159
- package/skills/python/fastapi-expert/references/endpoints-routing.md +142 -142
- package/skills/python/fastapi-expert/references/migration-from-django.md +996 -996
- package/skills/python/fastapi-expert/references/pydantic-v2.md +135 -135
- package/skills/python/fastapi-expert/references/testing-async.md +159 -159
- package/skills/python/pandas-pro/SKILL.md +178 -178
- package/skills/python/pandas-pro/references/aggregation-groupby.md +545 -545
- package/skills/python/pandas-pro/references/data-cleaning.md +500 -500
- package/skills/python/pandas-pro/references/dataframe-operations.md +420 -420
- package/skills/python/pandas-pro/references/merging-joining.md +596 -596
- package/skills/python/pandas-pro/references/performance-optimization.md +597 -597
- package/skills/python/python-pro/SKILL.md +177 -177
- package/skills/python/python-pro/references/async-patterns.md +356 -356
- package/skills/python/python-pro/references/packaging.md +460 -460
- package/skills/python/python-pro/references/standard-library.md +378 -378
- package/skills/python/python-pro/references/testing.md +404 -404
- package/skills/python/python-pro/references/type-system.md +290 -290
- package/skills/quality/chaos-engineer/SKILL.md +182 -182
- package/skills/quality/chaos-engineer/references/chaos-tools.md +511 -511
- package/skills/quality/chaos-engineer/references/experiment-design.md +229 -229
- package/skills/quality/chaos-engineer/references/game-days.md +434 -434
- package/skills/quality/chaos-engineer/references/infrastructure-chaos.md +348 -348
- package/skills/quality/chaos-engineer/references/kubernetes-chaos.md +432 -432
- package/skills/quality/code-reviewer/SKILL.md +119 -119
- package/skills/quality/code-reviewer/references/common-issues.md +142 -142
- package/skills/quality/code-reviewer/references/feedback-examples.md +144 -144
- package/skills/quality/code-reviewer/references/receiving-feedback.md +238 -238
- package/skills/quality/code-reviewer/references/report-template.md +109 -109
- package/skills/quality/code-reviewer/references/review-checklist.md +88 -88
- package/skills/quality/code-reviewer/references/spec-compliance-review.md +258 -258
- package/skills/quality/playwright-expert/SKILL.md +169 -169
- package/skills/quality/playwright-expert/references/api-mocking.md +140 -140
- package/skills/quality/playwright-expert/references/configuration.md +155 -155
- package/skills/quality/playwright-expert/references/debugging-flaky.md +150 -150
- package/skills/quality/playwright-expert/references/page-object-model.md +152 -152
- package/skills/quality/playwright-expert/references/selectors-locators.md +119 -119
- package/skills/quality/secure-code-guardian/SKILL.md +191 -191
- package/skills/quality/secure-code-guardian/references/authentication.md +136 -136
- package/skills/quality/secure-code-guardian/references/input-validation.md +146 -146
- package/skills/quality/secure-code-guardian/references/owasp-prevention.md +135 -135
- package/skills/quality/secure-code-guardian/references/security-headers.md +133 -133
- package/skills/quality/secure-code-guardian/references/xss-csrf.md +157 -157
- package/skills/quality/security-reviewer/SKILL.md +103 -103
- package/skills/quality/security-reviewer/references/infrastructure-security.md +268 -268
- package/skills/quality/security-reviewer/references/penetration-testing.md +268 -268
- package/skills/quality/security-reviewer/references/report-template.md +170 -170
- package/skills/quality/security-reviewer/references/sast-tools.md +117 -117
- package/skills/quality/security-reviewer/references/secret-scanning.md +125 -125
- package/skills/quality/security-reviewer/references/vulnerability-patterns.md +152 -152
- package/skills/quality/senior-qa/README.md +196 -196
- package/skills/quality/senior-qa/SKILL.md +399 -399
- package/skills/quality/senior-qa/references/qa_best_practices.md +964 -964
- package/skills/quality/senior-qa/references/test_automation_patterns.md +1009 -1009
- package/skills/quality/senior-qa/references/testing_strategies.md +649 -649
- package/skills/quality/senior-qa/scripts/coverage_analyzer.py +836 -836
- package/skills/quality/senior-qa/scripts/e2e_test_scaffolder.py +820 -820
- package/skills/quality/senior-qa/scripts/test_suite_generator.py +605 -605
- package/skills/quality/tdd-guide/HOW_TO_USE.md +313 -313
- package/skills/quality/tdd-guide/README.md +680 -680
- package/skills/quality/tdd-guide/SKILL.md +122 -122
- package/skills/quality/tdd-guide/assets/expected_output.json +77 -77
- package/skills/quality/tdd-guide/assets/sample_input_python.json +39 -39
- package/skills/quality/tdd-guide/assets/sample_input_typescript.json +36 -36
- package/skills/quality/tdd-guide/references/ci-integration.md +195 -195
- package/skills/quality/tdd-guide/references/framework-guide.md +206 -206
- package/skills/quality/tdd-guide/references/tdd-best-practices.md +128 -128
- package/skills/quality/tdd-guide/scripts/coverage_analyzer.py +434 -434
- package/skills/quality/tdd-guide/scripts/fixture_generator.py +440 -440
- package/skills/quality/tdd-guide/scripts/format_detector.py +384 -384
- package/skills/quality/tdd-guide/scripts/framework_adapter.py +428 -428
- package/skills/quality/tdd-guide/scripts/metrics_calculator.py +456 -456
- package/skills/quality/tdd-guide/scripts/output_formatter.py +354 -354
- package/skills/quality/tdd-guide/scripts/tdd_workflow.py +474 -474
- package/skills/quality/tdd-guide/scripts/test_generator.py +438 -438
- package/skills/quality/test-master/SKILL.md +94 -94
- package/skills/quality/test-master/references/automation-frameworks.md +294 -294
- package/skills/quality/test-master/references/e2e-testing.md +128 -128
- package/skills/quality/test-master/references/integration-testing.md +120 -120
- package/skills/quality/test-master/references/performance-testing.md +118 -118
- package/skills/quality/test-master/references/qa-methodology.md +247 -247
- package/skills/quality/test-master/references/security-testing.md +127 -127
- package/skills/quality/test-master/references/tdd-iron-laws.md +174 -174
- package/skills/quality/test-master/references/test-reports.md +104 -104
- package/skills/quality/test-master/references/testing-anti-patterns.md +231 -231
- package/skills/quality/test-master/references/unit-testing.md +113 -113
- package/skills/ruby/rails-expert/SKILL.md +154 -154
- package/skills/ruby/rails-expert/references/active-record.md +244 -244
- package/skills/ruby/rails-expert/references/api-development.md +401 -401
- package/skills/ruby/rails-expert/references/background-jobs.md +272 -272
- package/skills/ruby/rails-expert/references/hotwire-turbo.md +228 -228
- package/skills/ruby/rails-expert/references/rspec-testing.md +367 -367
- package/skills/swift/swift-expert/SKILL.md +163 -163
- package/skills/swift/swift-expert/references/async-concurrency.md +360 -360
- package/skills/swift/swift-expert/references/memory-performance.md +377 -377
- package/skills/swift/swift-expert/references/protocol-oriented.md +354 -354
- package/skills/swift/swift-expert/references/swiftui-patterns.md +291 -291
- package/skills/swift/swift-expert/references/testing-patterns.md +399 -399
- package/skills/workflow/brainstorming/SKILL.md +164 -164
- package/skills/workflow/brainstorming/scripts/frame-template.html +214 -214
- package/skills/workflow/brainstorming/scripts/helper.js +88 -88
- package/skills/workflow/brainstorming/scripts/server.cjs +354 -354
- package/skills/workflow/brainstorming/scripts/start-server.sh +148 -148
- package/skills/workflow/brainstorming/scripts/stop-server.sh +56 -56
- package/skills/workflow/brainstorming/spec-document-reviewer-prompt.md +49 -49
- package/skills/workflow/brainstorming/visual-companion.md +287 -287
- package/skills/workflow/documentation/SKILL.md +45 -45
- package/skills/workflow/entropy-management/SKILL.md +115 -115
- package/skills/workflow/executing-plans/SKILL.md +70 -70
- package/skills/workflow/finishing-a-development-branch/SKILL.md +200 -200
- package/skills/workflow/receiving-code-review/SKILL.md +213 -213
- package/skills/workflow/requesting-code-review/SKILL.md +105 -105
- package/skills/workflow/requesting-code-review/code-reviewer.md +146 -146
- package/skills/workflow/requirement-engineering/SKILL.md +111 -111
- package/skills/workflow/systematic-debugging/CREATION-LOG.md +119 -119
- package/skills/workflow/systematic-debugging/SKILL.md +296 -296
- package/skills/workflow/systematic-debugging/condition-based-waiting-example.ts +158 -158
- package/skills/workflow/systematic-debugging/condition-based-waiting.md +115 -115
- package/skills/workflow/systematic-debugging/defense-in-depth.md +122 -122
- package/skills/workflow/systematic-debugging/find-polluter.sh +63 -63
- package/skills/workflow/systematic-debugging/root-cause-tracing.md +169 -169
- package/skills/workflow/systematic-debugging/test-academic.md +14 -14
- package/skills/workflow/systematic-debugging/test-pressure-1.md +58 -58
- package/skills/workflow/systematic-debugging/test-pressure-2.md +68 -68
- package/skills/workflow/systematic-debugging/test-pressure-3.md +69 -69
- package/skills/workflow/using-git-worktrees/SKILL.md +218 -218
- package/skills/workflow/verification-before-completion/SKILL.md +139 -139
- package/skills/workflow/writing-plans/SKILL.md +151 -151
- package/skills/workflow/writing-plans/plan-document-reviewer-prompt.md +49 -49
- package/skills/workflow/writing-skills/SKILL.md +655 -655
- package/skills/workflow/writing-skills/anthropic-best-practices.md +1150 -1150
- package/skills/workflow/writing-skills/examples/CLAUDE_MD_TESTING.md +189 -189
- package/skills/workflow/writing-skills/persuasion-principles.md +187 -187
- package/skills/workflow/writing-skills/render-graphs.js +168 -168
- package/skills/workflow/writing-skills/testing-skills-with-subagents.md +384 -384
|
@@ -1,500 +1,500 @@
|
|
|
1
|
-
# Data Cleaning
|
|
2
|
-
|
|
3
|
-
---
|
|
4
|
-
|
|
5
|
-
## Overview
|
|
6
|
-
|
|
7
|
-
Data cleaning is critical for reliable analysis. This reference covers handling missing values, duplicates, type conversion, and data validation with pandas 2.0+ patterns.
|
|
8
|
-
|
|
9
|
-
---
|
|
10
|
-
|
|
11
|
-
## Missing Values
|
|
12
|
-
|
|
13
|
-
### Detecting Missing Values
|
|
14
|
-
|
|
15
|
-
```python
|
|
16
|
-
import pandas as pd
|
|
17
|
-
import numpy as np
|
|
18
|
-
|
|
19
|
-
df = pd.DataFrame({
|
|
20
|
-
'name': ['Alice', 'Bob', None, 'Diana'],
|
|
21
|
-
'age': [25, np.nan, 35, 28],
|
|
22
|
-
'salary': [50000, 60000, np.nan, np.nan],
|
|
23
|
-
'department': ['Eng', '', 'Eng', 'Sales']
|
|
24
|
-
})
|
|
25
|
-
|
|
26
|
-
# Check for any missing values
|
|
27
|
-
df.isna().any() # Per column
|
|
28
|
-
df.isna().any().any() # Entire DataFrame
|
|
29
|
-
|
|
30
|
-
# Count missing values
|
|
31
|
-
df.isna().sum() # Per column
|
|
32
|
-
df.isna().sum().sum() # Total
|
|
33
|
-
|
|
34
|
-
# Percentage of missing values
|
|
35
|
-
(df.isna().sum() / len(df) * 100).round(2)
|
|
36
|
-
|
|
37
|
-
# Rows with any missing values
|
|
38
|
-
df[df.isna().any(axis=1)]
|
|
39
|
-
|
|
40
|
-
# Rows with all values present
|
|
41
|
-
df[df.notna().all(axis=1)]
|
|
42
|
-
|
|
43
|
-
# Per-column missing-value summary (count, percent, dtype)
|
|
44
|
-
missing_info = pd.DataFrame({
|
|
45
|
-
'missing': df.isna().sum(),
|
|
46
|
-
'percent': (df.isna().sum() / len(df) * 100).round(2),
|
|
47
|
-
'dtype': df.dtypes
|
|
48
|
-
})
|
|
49
|
-
```
|
|
50
|
-
|
|
51
|
-
### Handling Missing Values - Dropping
|
|
52
|
-
|
|
53
|
-
```python
|
|
54
|
-
# Drop rows with any missing value
|
|
55
|
-
df_clean = df.dropna()
|
|
56
|
-
|
|
57
|
-
# Drop rows where specific columns have missing values
|
|
58
|
-
df_clean = df.dropna(subset=['name', 'age'])
|
|
59
|
-
|
|
60
|
-
# Drop rows where ALL values are missing
|
|
61
|
-
df_clean = df.dropna(how='all')
|
|
62
|
-
|
|
63
|
-
# Drop rows that have fewer than a minimum number of non-null values
|
|
64
|
-
df_clean = df.dropna(thresh=3) # Keep rows with at least 3 non-null
|
|
65
|
-
|
|
66
|
-
# Drop columns with missing values
|
|
67
|
-
df_clean = df.dropna(axis=1)
|
|
68
|
-
|
|
69
|
-
# Drop columns with more than 50% missing
|
|
70
|
-
threshold = len(df) * 0.5
|
|
71
|
-
df_clean = df.dropna(axis=1, thresh=threshold)
|
|
72
|
-
```
|
|
73
|
-
|
|
74
|
-
### Handling Missing Values - Filling
|
|
75
|
-
|
|
76
|
-
```python
|
|
77
|
-
# Fill with constant value
|
|
78
|
-
df['age'] = df['age'].fillna(0)
|
|
79
|
-
|
|
80
|
-
# Fill with column mean/median/mode
|
|
81
|
-
df['age'] = df['age'].fillna(df['age'].mean())
|
|
82
|
-
df['salary'] = df['salary'].fillna(df['salary'].median())
|
|
83
|
-
df['department'] = df['department'].fillna(df['department'].mode()[0])
|
|
84
|
-
|
|
85
|
-
# Forward fill (use previous value)
|
|
86
|
-
df['salary'] = df['salary'].ffill()
|
|
87
|
-
|
|
88
|
-
# Backward fill (use next value)
|
|
89
|
-
df['salary'] = df['salary'].bfill()
|
|
90
|
-
|
|
91
|
-
# Fill with different values per column
|
|
92
|
-
fill_values = {'age': 0, 'salary': df['salary'].median(), 'name': 'Unknown'}
|
|
93
|
-
df = df.fillna(fill_values)
|
|
94
|
-
|
|
95
|
-
# Fill with interpolation (numeric data)
|
|
96
|
-
df['salary'] = df['salary'].interpolate(method='linear')
|
|
97
|
-
|
|
98
|
-
# Group-specific fill (fill with group mean)
|
|
99
|
-
df['salary'] = df.groupby('department')['salary'].transform(
|
|
100
|
-
lambda x: x.fillna(x.mean())
|
|
101
|
-
)
|
|
102
|
-
```
|
|
103
|
-
|
|
104
|
-
### Handling Empty Strings vs NaN
|
|
105
|
-
|
|
106
|
-
```python
|
|
107
|
-
# Empty strings are NOT detected as NaN
|
|
108
|
-
df['department'].isna().sum() # Won't count ''
|
|
109
|
-
|
|
110
|
-
# Replace empty strings with NaN
|
|
111
|
-
df['department'] = df['department'].replace('', np.nan)
|
|
112
|
-
# Or
|
|
113
|
-
df['department'] = df['department'].replace(r'^\s*$', np.nan, regex=True)
|
|
114
|
-
|
|
115
|
-
# Replace multiple values with NaN
|
|
116
|
-
df = df.replace(['', 'N/A', 'null', 'None', '-'], np.nan)
|
|
117
|
-
|
|
118
|
-
# Using na_values when reading files
|
|
119
|
-
df = pd.read_csv('file.csv', na_values=['', 'N/A', 'null', 'None', '-'])
|
|
120
|
-
```
|
|
121
|
-
|
|
122
|
-
---
|
|
123
|
-
|
|
124
|
-
## Handling Duplicates
|
|
125
|
-
|
|
126
|
-
### Detecting Duplicates
|
|
127
|
-
|
|
128
|
-
```python
|
|
129
|
-
df = pd.DataFrame({
|
|
130
|
-
'id': [1, 2, 2, 3, 4, 4],
|
|
131
|
-
'name': ['Alice', 'Bob', 'Bob', 'Charlie', 'Diana', 'Diana'],
|
|
132
|
-
'email': ['a@x.com', 'b@x.com', 'b@x.com', 'c@x.com', 'd@x.com', 'd2@x.com']
|
|
133
|
-
})
|
|
134
|
-
|
|
135
|
-
# Check for duplicate rows (all columns)
|
|
136
|
-
df.duplicated().sum()
|
|
137
|
-
|
|
138
|
-
# Check specific columns
|
|
139
|
-
df.duplicated(subset=['id']).sum()
|
|
140
|
-
df.duplicated(subset=['name', 'email']).sum()
|
|
141
|
-
|
|
142
|
-
# View duplicate rows
|
|
143
|
-
df[df.duplicated(keep=False)] # All duplicates
|
|
144
|
-
df[df.duplicated(keep='first')] # Duplicates except first occurrence
|
|
145
|
-
df[df.duplicated(keep='last')] # Duplicates except last occurrence
|
|
146
|
-
|
|
147
|
-
# Count duplicates per key
|
|
148
|
-
df.groupby('id').size().loc[lambda x: x > 1]
|
|
149
|
-
```
|
|
150
|
-
|
|
151
|
-
### Removing Duplicates
|
|
152
|
-
|
|
153
|
-
```python
|
|
154
|
-
# Remove duplicate rows (keep first)
|
|
155
|
-
df_clean = df.drop_duplicates()
|
|
156
|
-
|
|
157
|
-
# Keep last occurrence
|
|
158
|
-
df_clean = df.drop_duplicates(keep='last')
|
|
159
|
-
|
|
160
|
-
# Remove all duplicates (keep none)
|
|
161
|
-
df_clean = df.drop_duplicates(keep=False)
|
|
162
|
-
|
|
163
|
-
# Based on specific columns
|
|
164
|
-
df_clean = df.drop_duplicates(subset=['id'])
|
|
165
|
-
df_clean = df.drop_duplicates(subset=['name', 'email'], keep='last')
|
|
166
|
-
|
|
167
|
-
# In-place modification
|
|
168
|
-
df.drop_duplicates(inplace=True)
|
|
169
|
-
```
|
|
170
|
-
|
|
171
|
-
### Handling Duplicates with Aggregation
|
|
172
|
-
|
|
173
|
-
```python
|
|
174
|
-
# Instead of dropping, aggregate duplicates
|
|
175
|
-
df_agg = df.groupby('id').agg({
|
|
176
|
-
'name': 'first',
|
|
177
|
-
'email': lambda x: ', '.join(x.unique())
|
|
178
|
-
}).reset_index()
|
|
179
|
-
|
|
180
|
-
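# Keep the row with the max value per id (assumes a numeric 'score' column, which the example df above does not have; use idxmin() for the min)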
|
|
181
|
-
df_best = df.loc[df.groupby('id')['score'].idxmax()]
|
|
182
|
-
|
|
183
|
-
# Rank duplicates
|
|
184
|
-
df['rank'] = df.groupby('id').cumcount() + 1
|
|
185
|
-
```
|
|
186
|
-
|
|
187
|
-
---
|
|
188
|
-
|
|
189
|
-
## Type Conversion
|
|
190
|
-
|
|
191
|
-
### Checking and Converting Types
|
|
192
|
-
|
|
193
|
-
```python
|
|
194
|
-
# Check current types
|
|
195
|
-
df.dtypes
|
|
196
|
-
df.info()
|
|
197
|
-
|
|
198
|
-
# Convert to specific type
|
|
199
|
-
df['age'] = df['age'].astype(int)
|
|
200
|
-
df['salary'] = df['salary'].astype(float)
|
|
201
|
-
df['name'] = df['name'].astype(str)
|
|
202
|
-
|
|
203
|
-
# Safe conversion with errors handling
|
|
204
|
-
df['age'] = pd.to_numeric(df['age'], errors='coerce') # Invalid -> NaN
|
|
205
|
-
df['age'] = pd.to_numeric(df['age'], errors='ignore') # Keep original if invalid (deprecated since pandas 2.2; prefer errors='coerce' or an explicit try/except)
|
|
206
|
-
|
|
207
|
-
# Convert multiple columns
|
|
208
|
-
df = df.astype({'age': 'int64', 'salary': 'float64'})
|
|
209
|
-
|
|
210
|
-
# Convert object to string (pandas 2.0+ StringDtype)
|
|
211
|
-
df['name'] = df['name'].astype('string') # Nullable string type
|
|
212
|
-
```
|
|
213
|
-
|
|
214
|
-
### Datetime Conversion
|
|
215
|
-
|
|
216
|
-
```python
|
|
217
|
-
df = pd.DataFrame({
|
|
218
|
-
'date_str': ['2024-01-15', '2024-02-20', 'invalid', '2024-03-10'],
|
|
219
|
-
'timestamp': [1705276800, 1708387200, 1710028800, 1710028800]
|
|
220
|
-
})
|
|
221
|
-
|
|
222
|
-
# String to datetime
|
|
223
|
-
df['date'] = pd.to_datetime(df['date_str'], errors='coerce')
|
|
224
|
-
|
|
225
|
-
# Specify format for faster parsing
|
|
226
|
-
df['date'] = pd.to_datetime(df['date_str'], format='%Y-%m-%d', errors='coerce')
|
|
227
|
-
|
|
228
|
-
# Unix timestamp to datetime
|
|
229
|
-
df['datetime'] = pd.to_datetime(df['timestamp'], unit='s')
|
|
230
|
-
|
|
231
|
-
# Extract components
|
|
232
|
-
df['year'] = df['date'].dt.year
|
|
233
|
-
df['month'] = df['date'].dt.month
|
|
234
|
-
df['day_of_week'] = df['date'].dt.day_name()
|
|
235
|
-
|
|
236
|
-
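# Handle mixed formats (format='mixed' requires pandas 2.0+; add errors='coerce' if unparseable values such as 'invalid' may be present)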
|
|
237
|
-
df['date'] = pd.to_datetime(df['date_str'], format='mixed', dayfirst=False)
|
|
238
|
-
```
|
|
239
|
-
|
|
240
|
-
### Categorical Conversion
|
|
241
|
-
|
|
242
|
-
```python
|
|
243
|
-
# Convert to categorical (memory efficient for low cardinality)
|
|
244
|
-
df['department'] = df['department'].astype('category')
|
|
245
|
-
|
|
246
|
-
# Ordered categorical
|
|
247
|
-
df['size'] = pd.Categorical(
|
|
248
|
-
df['size'],
|
|
249
|
-
categories=['Small', 'Medium', 'Large'],
|
|
250
|
-
ordered=True
|
|
251
|
-
)
|
|
252
|
-
|
|
253
|
-
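# Check memory savings (nbytes counts only the pointers of an object column, not the strings; use .memory_usage(deep=True) for a true comparison)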
|
|
254
|
-
print(f"Object: {df['department'].nbytes}")
|
|
255
|
-
df['department'] = df['department'].astype('category')
|
|
256
|
-
print(f"Category: {df['department'].nbytes}")
|
|
257
|
-
```
|
|
258
|
-
|
|
259
|
-
### Nullable Integer Types (pandas 2.0+)
|
|
260
|
-
|
|
261
|
-
```python
|
|
262
|
-
# Standard int doesn't support NaN
|
|
263
|
-
# Use nullable integer types
|
|
264
|
-
df['age'] = df['age'].astype('Int64') # Note capital I
|
|
265
|
-
|
|
266
|
-
# All nullable types
|
|
267
|
-
df = df.astype({
|
|
268
|
-
'count': 'Int64', # Nullable integer
|
|
269
|
-
'price': 'Float64', # Nullable float
|
|
270
|
-
'flag': 'boolean', # Nullable boolean
|
|
271
|
-
'name': 'string', # Nullable string
|
|
272
|
-
})
|
|
273
|
-
|
|
274
|
-
# Convert with NA handling
|
|
275
|
-
df['age'] = pd.array([1, 2, None, 4], dtype='Int64')
|
|
276
|
-
```
|
|
277
|
-
|
|
278
|
-
---
|
|
279
|
-
|
|
280
|
-
## String Cleaning
|
|
281
|
-
|
|
282
|
-
### Common String Operations
|
|
283
|
-
|
|
284
|
-
```python
|
|
285
|
-
df = pd.DataFrame({
|
|
286
|
-
'name': [' Alice ', 'BOB', 'charlie', None, 'Diana Smith'],
|
|
287
|
-
'email': ['ALICE@EXAMPLE.COM', 'bob@test', 'invalid', None, 'diana@example.com']
|
|
288
|
-
})
|
|
289
|
-
|
|
290
|
-
# Strip whitespace
|
|
291
|
-
df['name'] = df['name'].str.strip()
|
|
292
|
-
|
|
293
|
-
# Case normalization
|
|
294
|
-
df['name'] = df['name'].str.lower()
|
|
295
|
-
df['name'] = df['name'].str.upper()
|
|
296
|
-
df['name'] = df['name'].str.title() # Title Case
|
|
297
|
-
|
|
298
|
-
# Replace patterns
|
|
299
|
-
df['name'] = df['name'].str.replace(r'\s+', ' ', regex=True) # Multiple spaces to one
|
|
300
|
-
df['phone'] = df['phone'].str.replace(r'[^0-9]', '', regex=True) # Keep only digits
|
|
301
|
-
|
|
302
|
-
# Extract with regex
|
|
303
|
-
df['domain'] = df['email'].str.extract(r'@(.+)$')
|
|
304
|
-
df['first_name'] = df['name'].str.extract(r'^(\w+)')
|
|
305
|
-
|
|
306
|
-
# Split strings
|
|
307
|
-
df[['first', 'last']] = df['name'].str.split(' ', n=1, expand=True)
|
|
308
|
-
```
|
|
309
|
-
|
|
310
|
-
### String Validation
|
|
311
|
-
|
|
312
|
-
```python
|
|
313
|
-
# Check patterns
|
|
314
|
-
df['valid_email'] = df['email'].str.match(r'^[\w.]+@[\w.]+\.\w+$', na=False)
|
|
315
|
-
|
|
316
|
-
# String length
|
|
317
|
-
df['name_length'] = df['name'].str.len()
|
|
318
|
-
df['valid_length'] = df['name'].str.len().between(2, 50)
|
|
319
|
-
|
|
320
|
-
# Contains check
|
|
321
|
-
df['has_domain'] = df['email'].str.contains('@', na=False)
|
|
322
|
-
```
|
|
323
|
-
|
|
324
|
-
---
|
|
325
|
-
|
|
326
|
-
## Data Validation
|
|
327
|
-
|
|
328
|
-
### Validation Functions
|
|
329
|
-
|
|
330
|
-
```python
|
|
331
|
-
def validate_dataframe(df: pd.DataFrame) -> dict:
|
|
332
|
-
"""Comprehensive DataFrame validation."""
|
|
333
|
-
report = {
|
|
334
|
-
'rows': len(df),
|
|
335
|
-
'columns': len(df.columns),
|
|
336
|
-
'duplicates': df.duplicated().sum(),
|
|
337
|
-
'missing_by_column': df.isna().sum().to_dict(),
|
|
338
|
-
'dtypes': df.dtypes.astype(str).to_dict(),
|
|
339
|
-
}
|
|
340
|
-
return report
|
|
341
|
-
|
|
342
|
-
# Range validation
|
|
343
|
-
def validate_range(series: pd.Series, min_val, max_val) -> pd.Series:
|
|
344
|
-
"""Return boolean mask for values in range."""
|
|
345
|
-
return series.between(min_val, max_val)
|
|
346
|
-
|
|
347
|
-
df['valid_age'] = validate_range(df['age'], 0, 120)
|
|
348
|
-
|
|
349
|
-
# Custom validation
|
|
350
|
-
def validate_email(series: pd.Series) -> pd.Series:
|
|
351
|
-
"""Validate email format."""
|
|
352
|
-
pattern = r'^[\w.+-]+@[\w-]+\.[\w.-]+$'
|
|
353
|
-
return series.str.match(pattern, na=False)
|
|
354
|
-
|
|
355
|
-
df['valid_email'] = validate_email(df['email'])
|
|
356
|
-
```
|
|
357
|
-
|
|
358
|
-
### Schema Validation with pandera
|
|
359
|
-
|
|
360
|
-
```python
|
|
361
|
-
# Using pandera for schema validation (recommended for production)
|
|
362
|
-
import pandera as pa
|
|
363
|
-
from pandera import Column, Check
|
|
364
|
-
|
|
365
|
-
schema = pa.DataFrameSchema({
|
|
366
|
-
'name': Column(str, Check.str_length(min_value=1, max_value=100)),
|
|
367
|
-
'age': Column(int, Check.in_range(0, 120)),
|
|
368
|
-
'email': Column(str, Check.str_matches(r'^[\w.+-]+@[\w-]+\.[\w.-]+$')),
|
|
369
|
-
'salary': Column(float, Check.greater_than(0), nullable=True),
|
|
370
|
-
})
|
|
371
|
-
|
|
372
|
-
# Validate DataFrame
|
|
373
|
-
try:
|
|
374
|
-
schema.validate(df)
|
|
375
|
-
except pa.errors.SchemaError as e:
|
|
376
|
-
print(f"Validation failed: {e}")
|
|
377
|
-
```
|
|
378
|
-
|
|
379
|
-
---
|
|
380
|
-
|
|
381
|
-
## Data Cleaning Pipeline
|
|
382
|
-
|
|
383
|
-
### Method Chaining Pattern
|
|
384
|
-
|
|
385
|
-
```python
|
|
386
|
-
def clean_dataframe(df: pd.DataFrame) -> pd.DataFrame:
|
|
387
|
-
"""Complete data cleaning pipeline using method chaining."""
|
|
388
|
-
return (
|
|
389
|
-
df
|
|
390
|
-
# Make a copy
|
|
391
|
-
.copy()
|
|
392
|
-
# Standardize column names
|
|
393
|
-
.rename(columns=lambda x: x.lower().strip().replace(' ', '_'))
|
|
394
|
-
# Drop fully empty rows
|
|
395
|
-
.dropna(how='all')
|
|
396
|
-
# Clean string columns
|
|
397
|
-
.assign(
|
|
398
|
-
name=lambda x: x['name'].str.strip().str.title(),
|
|
399
|
-
email=lambda x: x['email'].str.lower().str.strip(),
|
|
400
|
-
)
|
|
401
|
-
# Handle missing values
|
|
402
|
-
.fillna({'department': 'Unknown'})
|
|
403
|
-
# Convert types
|
|
404
|
-
.astype({'age': 'Int64', 'department': 'category'})
|
|
405
|
-
# Remove duplicates
|
|
406
|
-
.drop_duplicates(subset=['email'])
|
|
407
|
-
# Reset index
|
|
408
|
-
.reset_index(drop=True)
|
|
409
|
-
)
|
|
410
|
-
|
|
411
|
-
df_clean = clean_dataframe(df)
|
|
412
|
-
```
|
|
413
|
-
|
|
414
|
-
### Pipeline with Validation
|
|
415
|
-
|
|
416
|
-
```python
|
|
417
|
-
def clean_and_validate(
|
|
418
|
-
df: pd.DataFrame,
|
|
419
|
-
required_columns: list[str],
|
|
420
|
-
unique_columns: list[str] | None = None,
|
|
421
|
-
) -> tuple[pd.DataFrame, dict]:
|
|
422
|
-
"""Clean DataFrame and return validation report."""
|
|
423
|
-
|
|
424
|
-
# Validate required columns exist
|
|
425
|
-
missing_cols = set(required_columns) - set(df.columns)
|
|
426
|
-
if missing_cols:
|
|
427
|
-
raise ValueError(f"Missing required columns: {missing_cols}")
|
|
428
|
-
|
|
429
|
-
# Track cleaning stats
|
|
430
|
-
stats = {
|
|
431
|
-
'initial_rows': len(df),
|
|
432
|
-
'dropped_empty': 0,
|
|
433
|
-
'dropped_duplicates': 0,
|
|
434
|
-
'filled_missing': {},
|
|
435
|
-
}
|
|
436
|
-
|
|
437
|
-
# Clean
|
|
438
|
-
df = df.copy()
|
|
439
|
-
|
|
440
|
-
# Drop empty rows
|
|
441
|
-
before = len(df)
|
|
442
|
-
df = df.dropna(how='all')
|
|
443
|
-
stats['dropped_empty'] = before - len(df)
|
|
444
|
-
|
|
445
|
-
# Handle duplicates
|
|
446
|
-
if unique_columns:
|
|
447
|
-
before = len(df)
|
|
448
|
-
df = df.drop_duplicates(subset=unique_columns)
|
|
449
|
-
stats['dropped_duplicates'] = before - len(df)
|
|
450
|
-
|
|
451
|
-
stats['final_rows'] = len(df)
|
|
452
|
-
|
|
453
|
-
return df, stats
|
|
454
|
-
```
|
|
455
|
-
|
|
456
|
-
---
|
|
457
|
-
|
|
458
|
-
## Best Practices Summary
|
|
459
|
-
|
|
460
|
-
1. **Always check data quality first** - Use `.info()`, `.describe()`, and missing value analysis
|
|
461
|
-
2. **Document cleaning decisions** - Track what was dropped/filled and why
|
|
462
|
-
3. **Use nullable types** - `Int64`, `string`, `boolean` for proper NA handling
|
|
463
|
-
4. **Validate after cleaning** - Ensure data meets expectations
|
|
464
|
-
5. **Use method chaining** - Readable, maintainable cleaning pipelines
|
|
465
|
-
6. **Copy before modifying** - Avoid SettingWithCopyWarning
|
|
466
|
-
7. **Handle edge cases** - Empty strings, whitespace, invalid formats
|
|
467
|
-
|
|
468
|
-
---
|
|
469
|
-
|
|
470
|
-
## Anti-Patterns to Avoid
|
|
471
|
-
|
|
472
|
-
```python
|
|
473
|
-
# BAD: Dropping NaN without understanding impact
|
|
474
|
-
df = df.dropna() # May lose significant data
|
|
475
|
-
|
|
476
|
-
# GOOD: Investigate first, then decide
|
|
477
|
-
print(f"Missing values: {df.isna().sum()}")
|
|
478
|
-
print(f"Rows affected: {df.isna().any(axis=1).sum()}")
|
|
479
|
-
# Then make informed decision
|
|
480
|
-
|
|
481
|
-
# BAD: Filling without domain knowledge
|
|
482
|
-
df['age'] = df['age'].fillna(0) # Age 0 is not valid
|
|
483
|
-
|
|
484
|
-
# GOOD: Use appropriate fill strategy
|
|
485
|
-
df['age'] = df['age'].fillna(df['age'].median())
|
|
486
|
-
|
|
487
|
-
# BAD: Type conversion without error handling
|
|
488
|
-
df['id'] = df['id'].astype(int) # Will fail on NaN or invalid
|
|
489
|
-
|
|
490
|
-
# GOOD: Safe conversion
|
|
491
|
-
df['id'] = pd.to_numeric(df['id'], errors='coerce').astype('Int64')
|
|
492
|
-
```
|
|
493
|
-
|
|
494
|
-
---
|
|
495
|
-
|
|
496
|
-
## Related References
|
|
497
|
-
|
|
498
|
-
- `dataframe-operations.md` - Selection and filtering for targeted cleaning
|
|
499
|
-
- `aggregation-groupby.md` - Aggregate duplicates instead of dropping
|
|
500
|
-
- `performance-optimization.md` - Efficient cleaning of large datasets
|
|
1
|
+
# Data Cleaning
|
|
2
|
+
|
|
3
|
+
---
|
|
4
|
+
|
|
5
|
+
## Overview
|
|
6
|
+
|
|
7
|
+
Data cleaning is critical for reliable analysis. This reference covers handling missing values, duplicates, type conversion, and data validation with pandas 2.0+ patterns.
|
|
8
|
+
|
|
9
|
+
---
|
|
10
|
+
|
|
11
|
+
## Missing Values
|
|
12
|
+
|
|
13
|
+
### Detecting Missing Values
|
|
14
|
+
|
|
15
|
+
```python
|
|
16
|
+
import pandas as pd
|
|
17
|
+
import numpy as np
|
|
18
|
+
|
|
19
|
+
df = pd.DataFrame({
|
|
20
|
+
'name': ['Alice', 'Bob', None, 'Diana'],
|
|
21
|
+
'age': [25, np.nan, 35, 28],
|
|
22
|
+
'salary': [50000, 60000, np.nan, np.nan],
|
|
23
|
+
'department': ['Eng', '', 'Eng', 'Sales']
|
|
24
|
+
})
|
|
25
|
+
|
|
26
|
+
# Check for any missing values
|
|
27
|
+
df.isna().any() # Per column
|
|
28
|
+
df.isna().any().any() # Entire DataFrame
|
|
29
|
+
|
|
30
|
+
# Count missing values
|
|
31
|
+
df.isna().sum() # Per column
|
|
32
|
+
df.isna().sum().sum() # Total
|
|
33
|
+
|
|
34
|
+
# Percentage of missing values
|
|
35
|
+
(df.isna().sum() / len(df) * 100).round(2)
|
|
36
|
+
|
|
37
|
+
# Rows with any missing values
|
|
38
|
+
df[df.isna().any(axis=1)]
|
|
39
|
+
|
|
40
|
+
# Rows with all values present
|
|
41
|
+
df[df.notna().all(axis=1)]
|
|
42
|
+
|
|
43
|
+
# Missing value heatmap info
|
|
44
|
+
missing_info = pd.DataFrame({
|
|
45
|
+
'missing': df.isna().sum(),
|
|
46
|
+
'percent': (df.isna().sum() / len(df) * 100).round(2),
|
|
47
|
+
'dtype': df.dtypes
|
|
48
|
+
})
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
### Handling Missing Values - Dropping
|
|
52
|
+
|
|
53
|
+
```python
|
|
54
|
+
# Drop rows with any missing value
|
|
55
|
+
df_clean = df.dropna()
|
|
56
|
+
|
|
57
|
+
# Drop rows where specific columns have missing values
|
|
58
|
+
df_clean = df.dropna(subset=['name', 'age'])
|
|
59
|
+
|
|
60
|
+
# Drop rows where ALL values are missing
|
|
61
|
+
df_clean = df.dropna(how='all')
|
|
62
|
+
|
|
63
|
+
# Drop rows with minimum non-null values
|
|
64
|
+
df_clean = df.dropna(thresh=3) # Keep rows with at least 3 non-null
|
|
65
|
+
|
|
66
|
+
# Drop columns with missing values
|
|
67
|
+
df_clean = df.dropna(axis=1)
|
|
68
|
+
|
|
69
|
+
# Drop columns with more than 50% missing
|
|
70
|
+
threshold = len(df) * 0.5
|
|
71
|
+
df_clean = df.dropna(axis=1, thresh=threshold)
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
### Handling Missing Values - Filling
|
|
75
|
+
|
|
76
|
+
```python
|
|
77
|
+
# Fill with constant value
|
|
78
|
+
df['age'] = df['age'].fillna(0)
|
|
79
|
+
|
|
80
|
+
# Fill with column mean/median/mode
|
|
81
|
+
df['age'] = df['age'].fillna(df['age'].mean())
|
|
82
|
+
df['salary'] = df['salary'].fillna(df['salary'].median())
|
|
83
|
+
df['department'] = df['department'].fillna(df['department'].mode()[0])
|
|
84
|
+
|
|
85
|
+
# Forward fill (use previous value)
|
|
86
|
+
df['salary'] = df['salary'].ffill()
|
|
87
|
+
|
|
88
|
+
# Backward fill (use next value)
|
|
89
|
+
df['salary'] = df['salary'].bfill()
|
|
90
|
+
|
|
91
|
+
# Fill with different values per column
|
|
92
|
+
fill_values = {'age': 0, 'salary': df['salary'].median(), 'name': 'Unknown'}
|
|
93
|
+
df = df.fillna(fill_values)
|
|
94
|
+
|
|
95
|
+
# Fill with interpolation (numeric data)
|
|
96
|
+
df['salary'] = df['salary'].interpolate(method='linear')
|
|
97
|
+
|
|
98
|
+
# Group-specific fill (fill with group mean)
|
|
99
|
+
df['salary'] = df.groupby('department')['salary'].transform(
|
|
100
|
+
lambda x: x.fillna(x.mean())
|
|
101
|
+
)
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
### Handling Empty Strings vs NaN
|
|
105
|
+
|
|
106
|
+
```python
|
|
107
|
+
# Empty strings are NOT detected as NaN
|
|
108
|
+
df['department'].isna().sum() # Won't count ''
|
|
109
|
+
|
|
110
|
+
# Replace empty strings with NaN
|
|
111
|
+
df['department'] = df['department'].replace('', np.nan)
|
|
112
|
+
# Or
|
|
113
|
+
df['department'] = df['department'].replace(r'^\s*$', np.nan, regex=True)
|
|
114
|
+
|
|
115
|
+
# Replace multiple values with NaN
|
|
116
|
+
df = df.replace(['', 'N/A', 'null', 'None', '-'], np.nan)
|
|
117
|
+
|
|
118
|
+
# Using na_values when reading files
|
|
119
|
+
df = pd.read_csv('file.csv', na_values=['', 'N/A', 'null', 'None', '-'])
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
---
|
|
123
|
+
|
|
124
|
+
## Handling Duplicates
|
|
125
|
+
|
|
126
|
+
### Detecting Duplicates
|
|
127
|
+
|
|
128
|
+
```python
|
|
129
|
+
df = pd.DataFrame({
|
|
130
|
+
'id': [1, 2, 2, 3, 4, 4],
|
|
131
|
+
'name': ['Alice', 'Bob', 'Bob', 'Charlie', 'Diana', 'Diana'],
|
|
132
|
+
'email': ['a@x.com', 'b@x.com', 'b@x.com', 'c@x.com', 'd@x.com', 'd2@x.com']
|
|
133
|
+
})
|
|
134
|
+
|
|
135
|
+
# Check for duplicate rows (all columns)
|
|
136
|
+
df.duplicated().sum()
|
|
137
|
+
|
|
138
|
+
# Check specific columns
|
|
139
|
+
df.duplicated(subset=['id']).sum()
|
|
140
|
+
df.duplicated(subset=['name', 'email']).sum()
|
|
141
|
+
|
|
142
|
+
# View duplicate rows
|
|
143
|
+
df[df.duplicated(keep=False)] # All duplicates
|
|
144
|
+
df[df.duplicated(keep='first')] # Duplicates except first occurrence
|
|
145
|
+
df[df.duplicated(keep='last')] # Duplicates except last occurrence
|
|
146
|
+
|
|
147
|
+
# Count duplicates per key
|
|
148
|
+
df.groupby('id').size().loc[lambda x: x > 1]
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
### Removing Duplicates
|
|
152
|
+
|
|
153
|
+
```python
|
|
154
|
+
# Remove duplicate rows (keep first)
|
|
155
|
+
df_clean = df.drop_duplicates()
|
|
156
|
+
|
|
157
|
+
# Keep last occurrence
|
|
158
|
+
df_clean = df.drop_duplicates(keep='last')
|
|
159
|
+
|
|
160
|
+
# Remove all duplicates (keep none)
|
|
161
|
+
df_clean = df.drop_duplicates(keep=False)
|
|
162
|
+
|
|
163
|
+
# Based on specific columns
|
|
164
|
+
df_clean = df.drop_duplicates(subset=['id'])
|
|
165
|
+
df_clean = df.drop_duplicates(subset=['name', 'email'], keep='last')
|
|
166
|
+
|
|
167
|
+
# In-place modification
|
|
168
|
+
df.drop_duplicates(inplace=True)
|
|
169
|
+
```
|
|
170
|
+
|
|
171
|
+
### Handling Duplicates with Aggregation
|
|
172
|
+
|
|
173
|
+
```python
|
|
174
|
+
# Instead of dropping, aggregate duplicates
|
|
175
|
+
df_agg = df.groupby('id').agg({
|
|
176
|
+
'name': 'first',
|
|
177
|
+
'email': lambda x: ', '.join(x.unique())
|
|
178
|
+
}).reset_index()
|
|
179
|
+
|
|
180
|
+
# Keep row with max/min value (assumes a numeric 'score' column, which the example df above does not have)
|
|
181
|
+
df_best = df.loc[df.groupby('id')['score'].idxmax()]
|
|
182
|
+
|
|
183
|
+
# Rank duplicates
|
|
184
|
+
df['rank'] = df.groupby('id').cumcount() + 1
|
|
185
|
+
```
|
|
186
|
+
|
|
187
|
+
---
|
|
188
|
+
|
|
189
|
+
## Type Conversion
|
|
190
|
+
|
|
191
|
+
### Checking and Converting Types
|
|
192
|
+
|
|
193
|
+
```python
|
|
194
|
+
# Check current types
|
|
195
|
+
df.dtypes
|
|
196
|
+
df.info()
|
|
197
|
+
|
|
198
|
+
# Convert to specific type
|
|
199
|
+
df['age'] = df['age'].astype(int) # Raises if the column contains NaN - fill/drop first or use 'Int64'
|
|
200
|
+
df['salary'] = df['salary'].astype(float)
|
|
201
|
+
df['name'] = df['name'].astype(str) # Caution: missing values become the literal strings 'None'/'nan'
|
|
202
|
+
|
|
203
|
+
# Safe conversion with errors handling
|
|
204
|
+
df['age'] = pd.to_numeric(df['age'], errors='coerce') # Invalid -> NaN
|
|
205
|
+
df['age'] = pd.to_numeric(df['age'], errors='ignore') # Keep original if invalid (deprecated since pandas 2.2 - catch the exception explicitly instead)
|
|
206
|
+
|
|
207
|
+
# Convert multiple columns
|
|
208
|
+
df = df.astype({'age': 'int64', 'salary': 'float64'})
|
|
209
|
+
|
|
210
|
+
# Convert object to string (pandas 2.0+ StringDtype)
|
|
211
|
+
df['name'] = df['name'].astype('string') # Nullable string type
|
|
212
|
+
```
|
|
213
|
+
|
|
214
|
+
### Datetime Conversion
|
|
215
|
+
|
|
216
|
+
```python
|
|
217
|
+
df = pd.DataFrame({
|
|
218
|
+
'date_str': ['2024-01-15', '2024-02-20', 'invalid', '2024-03-10'],
|
|
219
|
+
'timestamp': [1705276800, 1708387200, 1710028800, 1710028800]
|
|
220
|
+
})
|
|
221
|
+
|
|
222
|
+
# String to datetime
|
|
223
|
+
df['date'] = pd.to_datetime(df['date_str'], errors='coerce')
|
|
224
|
+
|
|
225
|
+
# Specify format for faster parsing
|
|
226
|
+
df['date'] = pd.to_datetime(df['date_str'], format='%Y-%m-%d', errors='coerce')
|
|
227
|
+
|
|
228
|
+
# Unix timestamp to datetime
|
|
229
|
+
df['datetime'] = pd.to_datetime(df['timestamp'], unit='s')
|
|
230
|
+
|
|
231
|
+
# Extract components
|
|
232
|
+
df['year'] = df['date'].dt.year
|
|
233
|
+
df['month'] = df['date'].dt.month
|
|
234
|
+
df['day_of_week'] = df['date'].dt.day_name()
|
|
235
|
+
|
|
236
|
+
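# Handle mixed formats (pandas 2.0+; raises on unparseable values such as 'invalid' unless errors='coerce' is also passed)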
|
|
237
|
+
df['date'] = pd.to_datetime(df['date_str'], format='mixed', dayfirst=False)
|
|
238
|
+
```
|
|
239
|
+
|
|
240
|
+
### Categorical Conversion
|
|
241
|
+
|
|
242
|
+
```python
|
|
243
|
+
# Convert to categorical (memory efficient for low cardinality)
|
|
244
|
+
df['department'] = df['department'].astype('category')
|
|
245
|
+
|
|
246
|
+
# Ordered categorical
|
|
247
|
+
df['size'] = pd.Categorical(
|
|
248
|
+
df['size'],
|
|
249
|
+
categories=['Small', 'Medium', 'Large'],
|
|
250
|
+
ordered=True
|
|
251
|
+
)
|
|
252
|
+
|
|
253
|
+
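# Check memory savings (note: .nbytes on an object column counts only the pointers, not the strings; use .memory_usage(deep=True) for a true comparison)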
|
|
254
|
+
print(f"Object: {df['department'].nbytes}")
|
|
255
|
+
df['department'] = df['department'].astype('category')
|
|
256
|
+
print(f"Category: {df['department'].nbytes}")
|
|
257
|
+
```
|
|
258
|
+
|
|
259
|
+
### Nullable Integer Types (pandas 2.0+)
|
|
260
|
+
|
|
261
|
+
```python
|
|
262
|
+
# Standard int doesn't support NaN
|
|
263
|
+
# Use nullable integer types
|
|
264
|
+
df['age'] = df['age'].astype('Int64') # Note capital I
|
|
265
|
+
|
|
266
|
+
# All nullable types
|
|
267
|
+
df = df.astype({
|
|
268
|
+
'count': 'Int64', # Nullable integer
|
|
269
|
+
'price': 'Float64', # Nullable float
|
|
270
|
+
'flag': 'boolean', # Nullable boolean
|
|
271
|
+
'name': 'string', # Nullable string
|
|
272
|
+
})
|
|
273
|
+
|
|
274
|
+
# Convert with NA handling
|
|
275
|
+
df['age'] = pd.array([1, 2, None, 4], dtype='Int64')
|
|
276
|
+
```
|
|
277
|
+
|
|
278
|
+
---
|
|
279
|
+
|
|
280
|
+
## String Cleaning
|
|
281
|
+
|
|
282
|
+
### Common String Operations
|
|
283
|
+
|
|
284
|
+
```python
|
|
285
|
+
df = pd.DataFrame({
|
|
286
|
+
'name': [' Alice ', 'BOB', 'charlie', None, 'Diana Smith'],
|
|
287
|
+
'email': ['ALICE@EXAMPLE.COM', 'bob@test', 'invalid', None, 'diana@example.com']
|
|
288
|
+
})
|
|
289
|
+
|
|
290
|
+
# Strip whitespace
|
|
291
|
+
df['name'] = df['name'].str.strip()
|
|
292
|
+
|
|
293
|
+
# Case normalization
|
|
294
|
+
df['name'] = df['name'].str.lower()
|
|
295
|
+
df['name'] = df['name'].str.upper()
|
|
296
|
+
df['name'] = df['name'].str.title() # Title Case
|
|
297
|
+
|
|
298
|
+
# Replace patterns
|
|
299
|
+
df['name'] = df['name'].str.replace(r'\s+', ' ', regex=True) # Multiple spaces to one
|
|
300
|
+
df['phone'] = df['phone'].str.replace(r'[^0-9]', '', regex=True) # Keep only digits (assumes a 'phone' column exists)
|
|
301
|
+
|
|
302
|
+
# Extract with regex
|
|
303
|
+
df['domain'] = df['email'].str.extract(r'@(.+)$')
|
|
304
|
+
df['first_name'] = df['name'].str.extract(r'^(\w+)')
|
|
305
|
+
|
|
306
|
+
# Split strings
|
|
307
|
+
df[['first', 'last']] = df['name'].str.split(' ', n=1, expand=True)
|
|
308
|
+
```
|
|
309
|
+
|
|
310
|
+
### String Validation
|
|
311
|
+
|
|
312
|
+
```python
|
|
313
|
+
# Check patterns
|
|
314
|
+
df['valid_email'] = df['email'].str.match(r'^[\w.]+@[\w.]+\.\w+$', na=False)
|
|
315
|
+
|
|
316
|
+
# String length
|
|
317
|
+
df['name_length'] = df['name'].str.len()
|
|
318
|
+
df['valid_length'] = df['name'].str.len().between(2, 50)
|
|
319
|
+
|
|
320
|
+
# Contains check
|
|
321
|
+
df['has_domain'] = df['email'].str.contains('@', na=False)
|
|
322
|
+
```
|
|
323
|
+
|
|
324
|
+
---
|
|
325
|
+
|
|
326
|
+
## Data Validation
|
|
327
|
+
|
|
328
|
+
### Validation Functions
|
|
329
|
+
|
|
330
|
+
```python
|
|
331
|
+
def validate_dataframe(df: pd.DataFrame) -> dict:
|
|
332
|
+
"""Comprehensive DataFrame validation."""
|
|
333
|
+
report = {
|
|
334
|
+
'rows': len(df),
|
|
335
|
+
'columns': len(df.columns),
|
|
336
|
+
'duplicates': df.duplicated().sum(),
|
|
337
|
+
'missing_by_column': df.isna().sum().to_dict(),
|
|
338
|
+
'dtypes': df.dtypes.astype(str).to_dict(),
|
|
339
|
+
}
|
|
340
|
+
return report
|
|
341
|
+
|
|
342
|
+
# Range validation
|
|
343
|
+
def validate_range(series: pd.Series, min_val, max_val) -> pd.Series:
|
|
344
|
+
"""Return boolean mask for values in range."""
|
|
345
|
+
return series.between(min_val, max_val)
|
|
346
|
+
|
|
347
|
+
df['valid_age'] = validate_range(df['age'], 0, 120)
|
|
348
|
+
|
|
349
|
+
# Custom validation
|
|
350
|
+
def validate_email(series: pd.Series) -> pd.Series:
|
|
351
|
+
"""Validate email format."""
|
|
352
|
+
pattern = r'^[\w.+-]+@[\w-]+\.[\w.-]+$'
|
|
353
|
+
return series.str.match(pattern, na=False)
|
|
354
|
+
|
|
355
|
+
df['valid_email'] = validate_email(df['email'])
|
|
356
|
+
```
|
|
357
|
+
|
|
358
|
+
### Schema Validation with pandera
|
|
359
|
+
|
|
360
|
+
```python
|
|
361
|
+
# Using pandera for schema validation (recommended for production)
|
|
362
|
+
import pandera as pa
|
|
363
|
+
from pandera import Column, Check
|
|
364
|
+
|
|
365
|
+
schema = pa.DataFrameSchema({
|
|
366
|
+
'name': Column(str, Check.str_length(min_value=1, max_value=100)),
|
|
367
|
+
'age': Column(int, Check.in_range(0, 120)),
|
|
368
|
+
'email': Column(str, Check.str_matches(r'^[\w.+-]+@[\w-]+\.[\w.-]+$')),
|
|
369
|
+
'salary': Column(float, Check.greater_than(0), nullable=True),
|
|
370
|
+
})
|
|
371
|
+
|
|
372
|
+
# Validate DataFrame
|
|
373
|
+
try:
|
|
374
|
+
schema.validate(df)
|
|
375
|
+
except pa.errors.SchemaError as e:
|
|
376
|
+
print(f"Validation failed: {e}")
|
|
377
|
+
```
|
|
378
|
+
|
|
379
|
+
---
|
|
380
|
+
|
|
381
|
+
## Data Cleaning Pipeline
|
|
382
|
+
|
|
383
|
+
### Method Chaining Pattern
|
|
384
|
+
|
|
385
|
+
```python
|
|
386
|
+
def clean_dataframe(df: pd.DataFrame) -> pd.DataFrame:
|
|
387
|
+
"""Complete data cleaning pipeline using method chaining."""
|
|
388
|
+
return (
|
|
389
|
+
df
|
|
390
|
+
# Make a copy
|
|
391
|
+
.copy()
|
|
392
|
+
# Standardize column names
|
|
393
|
+
.rename(columns=lambda x: x.lower().strip().replace(' ', '_'))
|
|
394
|
+
# Drop fully empty rows
|
|
395
|
+
.dropna(how='all')
|
|
396
|
+
# Clean string columns
|
|
397
|
+
.assign(
|
|
398
|
+
name=lambda x: x['name'].str.strip().str.title(),
|
|
399
|
+
email=lambda x: x['email'].str.lower().str.strip(),
|
|
400
|
+
)
|
|
401
|
+
# Handle missing values
|
|
402
|
+
.fillna({'department': 'Unknown'})
|
|
403
|
+
# Convert types
|
|
404
|
+
.astype({'age': 'Int64', 'department': 'category'})
|
|
405
|
+
# Remove duplicates
|
|
406
|
+
.drop_duplicates(subset=['email'])
|
|
407
|
+
# Reset index
|
|
408
|
+
.reset_index(drop=True)
|
|
409
|
+
)
|
|
410
|
+
|
|
411
|
+
df_clean = clean_dataframe(df)
|
|
412
|
+
```
|
|
413
|
+
|
|
414
|
+
### Pipeline with Validation
|
|
415
|
+
|
|
416
|
+
```python
|
|
417
|
+
def clean_and_validate(
|
|
418
|
+
df: pd.DataFrame,
|
|
419
|
+
required_columns: list[str],
|
|
420
|
+
unique_columns: list[str] | None = None,
|
|
421
|
+
) -> tuple[pd.DataFrame, dict]:
|
|
422
|
+
"""Clean DataFrame and return validation report."""
|
|
423
|
+
|
|
424
|
+
# Validate required columns exist
|
|
425
|
+
missing_cols = set(required_columns) - set(df.columns)
|
|
426
|
+
if missing_cols:
|
|
427
|
+
raise ValueError(f"Missing required columns: {missing_cols}")
|
|
428
|
+
|
|
429
|
+
# Track cleaning stats
|
|
430
|
+
stats = {
|
|
431
|
+
'initial_rows': len(df),
|
|
432
|
+
'dropped_empty': 0,
|
|
433
|
+
'dropped_duplicates': 0,
|
|
434
|
+
'filled_missing': {},
|
|
435
|
+
}
|
|
436
|
+
|
|
437
|
+
# Clean
|
|
438
|
+
df = df.copy()
|
|
439
|
+
|
|
440
|
+
# Drop empty rows
|
|
441
|
+
before = len(df)
|
|
442
|
+
df = df.dropna(how='all')
|
|
443
|
+
stats['dropped_empty'] = before - len(df)
|
|
444
|
+
|
|
445
|
+
# Handle duplicates
|
|
446
|
+
if unique_columns:
|
|
447
|
+
before = len(df)
|
|
448
|
+
df = df.drop_duplicates(subset=unique_columns)
|
|
449
|
+
stats['dropped_duplicates'] = before - len(df)
|
|
450
|
+
|
|
451
|
+
stats['final_rows'] = len(df)
|
|
452
|
+
|
|
453
|
+
return df, stats
|
|
454
|
+
```
|
|
455
|
+
|
|
456
|
+
---
|
|
457
|
+
|
|
458
|
+
## Best Practices Summary
|
|
459
|
+
|
|
460
|
+
1. **Always check data quality first** - Use `.info()`, `.describe()`, and missing value analysis
|
|
461
|
+
2. **Document cleaning decisions** - Track what was dropped/filled and why
|
|
462
|
+
3. **Use nullable types** - `Int64`, `string`, `boolean` for proper NA handling
|
|
463
|
+
4. **Validate after cleaning** - Ensure data meets expectations
|
|
464
|
+
5. **Use method chaining** - Readable, maintainable cleaning pipelines
|
|
465
|
+
6. **Copy before modifying** - Avoid SettingWithCopyWarning
|
|
466
|
+
7. **Handle edge cases** - Empty strings, whitespace, invalid formats
|
|
467
|
+
|
|
468
|
+
---
|
|
469
|
+
|
|
470
|
+
## Anti-Patterns to Avoid
|
|
471
|
+
|
|
472
|
+
```python
|
|
473
|
+
# BAD: Dropping NaN without understanding impact
|
|
474
|
+
df = df.dropna() # May lose significant data
|
|
475
|
+
|
|
476
|
+
# GOOD: Investigate first, then decide
|
|
477
|
+
print(f"Missing values: {df.isna().sum()}")
|
|
478
|
+
print(f"Rows affected: {df.isna().any(axis=1).sum()}")
|
|
479
|
+
# Then make informed decision
|
|
480
|
+
|
|
481
|
+
# BAD: Filling without domain knowledge
|
|
482
|
+
df['age'] = df['age'].fillna(0) # Age 0 is not valid
|
|
483
|
+
|
|
484
|
+
# GOOD: Use appropriate fill strategy
|
|
485
|
+
df['age'] = df['age'].fillna(df['age'].median())
|
|
486
|
+
|
|
487
|
+
# BAD: Type conversion without error handling
|
|
488
|
+
df['id'] = df['id'].astype(int) # Will fail on NaN or invalid
|
|
489
|
+
|
|
490
|
+
# GOOD: Safe conversion
|
|
491
|
+
df['id'] = pd.to_numeric(df['id'], errors='coerce').astype('Int64')
|
|
492
|
+
```
|
|
493
|
+
|
|
494
|
+
---
|
|
495
|
+
|
|
496
|
+
## Related References
|
|
497
|
+
|
|
498
|
+
- `dataframe-operations.md` - Selection and filtering for targeted cleaning
|
|
499
|
+
- `aggregation-groupby.md` - Aggregate duplicates instead of dropping
|
|
500
|
+
- `performance-optimization.md` - Efficient cleaning of large datasets
|