aigroup-workflow 2.2.0 → 2.2.2
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the contents of those package versions as they appear in their respective public registries.
- package/.claude/commands/fix-build.md +10 -5
- package/.claude/commands/init-project.md +13 -8
- package/.claude/commands/plan.md +15 -8
- package/.claude/commands/review.md +12 -6
- package/.claude/commands/tdd.md +11 -5
- package/.claude/commands/workflow-start.md +20 -11
- package/.claude/settings.json +28 -0
- package/.codex/agents/architect.toml +207 -0
- package/.codex/agents/build-error-resolver.toml +110 -0
- package/.codex/agents/code-reviewer.toml +233 -0
- package/.codex/agents/doc-updater.toml +103 -0
- package/.codex/agents/e2e-runner.toml +103 -0
- package/.codex/agents/get-current-datetime.toml +23 -0
- package/.codex/agents/init-architect.toml +181 -0
- package/.codex/agents/planner.toml +208 -0
- package/.codex/agents/refactor-cleaner.toml +81 -0
- package/.codex/agents/rust-reviewer.toml +90 -0
- package/.codex/agents/security-reviewer.toml +104 -0
- package/.codex/agents/tdd-guide.toml +87 -0
- package/AGENTS.md +2 -2
- package/CLAUDE.md +23 -1
- package/LICENSE +20 -20
- package/README.md +333 -333
- package/agents/a11y-architect.md +141 -141
- package/agents/architect.md +211 -211
- package/agents/build-error-resolver.md +114 -114
- package/agents/chief-of-staff.md +151 -151
- package/agents/code-architect.md +71 -71
- package/agents/code-explorer.md +69 -69
- package/agents/code-reviewer.md +237 -237
- package/agents/code-simplifier.md +47 -47
- package/agents/comment-analyzer.md +45 -45
- package/agents/conversation-analyzer.md +52 -52
- package/agents/cpp-build-resolver.md +90 -90
- package/agents/cpp-reviewer.md +72 -72
- package/agents/csharp-reviewer.md +101 -101
- package/agents/dart-build-resolver.md +201 -201
- package/agents/database-reviewer.md +91 -91
- package/agents/doc-updater.md +107 -107
- package/agents/docs-lookup.md +68 -68
- package/agents/e2e-runner.md +107 -107
- package/agents/flutter-reviewer.md +243 -243
- package/agents/gan-evaluator.md +209 -209
- package/agents/gan-generator.md +131 -131
- package/agents/gan-planner.md +99 -99
- package/agents/get-current-datetime.md +26 -26
- package/agents/go-build-resolver.md +94 -94
- package/agents/go-reviewer.md +76 -76
- package/agents/harness-optimizer.md +35 -35
- package/agents/healthcare-reviewer.md +83 -83
- package/agents/java-build-resolver.md +153 -153
- package/agents/java-reviewer.md +92 -92
- package/agents/kotlin-build-resolver.md +118 -118
- package/agents/kotlin-reviewer.md +159 -159
- package/agents/loop-operator.md +36 -36
- package/agents/opensource-forker.md +198 -198
- package/agents/opensource-packager.md +249 -249
- package/agents/opensource-sanitizer.md +188 -188
- package/agents/performance-optimizer.md +446 -446
- package/agents/planner.md +212 -212
- package/agents/pr-test-analyzer.md +45 -45
- package/agents/python-reviewer.md +98 -98
- package/agents/pytorch-build-resolver.md +120 -120
- package/agents/refactor-cleaner.md +85 -85
- package/agents/rust-build-resolver.md +148 -148
- package/agents/rust-reviewer.md +94 -94
- package/agents/security-reviewer.md +108 -108
- package/agents/seo-specialist.md +59 -59
- package/agents/silent-failure-hunter.md +50 -50
- package/agents/tdd-guide.md +91 -91
- package/agents/type-design-analyzer.md +41 -41
- package/agents/typescript-reviewer.md +112 -112
- package/cli/commands/update.mjs +1 -1
- package/cli/utils/scaffold.mjs +53 -0
- package/docs/rules/agents.md +166 -50
- package/docs/rules/cpp/coding-style.md +44 -44
- package/docs/rules/cpp/hooks.md +39 -39
- package/docs/rules/cpp/patterns.md +51 -51
- package/docs/rules/cpp/security.md +51 -51
- package/docs/rules/cpp/testing.md +44 -44
- package/docs/rules/csharp/coding-style.md +72 -72
- package/docs/rules/csharp/hooks.md +25 -25
- package/docs/rules/csharp/patterns.md +50 -50
- package/docs/rules/csharp/security.md +58 -58
- package/docs/rules/csharp/testing.md +46 -46
- package/docs/rules/dart/coding-style.md +159 -159
- package/docs/rules/dart/hooks.md +66 -66
- package/docs/rules/dart/patterns.md +261 -261
- package/docs/rules/dart/security.md +135 -135
- package/docs/rules/dart/testing.md +215 -215
- package/docs/rules/golang/coding-style.md +32 -32
- package/docs/rules/golang/hooks.md +17 -17
- package/docs/rules/golang/patterns.md +45 -45
- package/docs/rules/golang/security.md +34 -34
- package/docs/rules/golang/testing.md +31 -31
- package/docs/rules/java/coding-style.md +114 -114
- package/docs/rules/java/hooks.md +18 -18
- package/docs/rules/java/patterns.md +146 -146
- package/docs/rules/java/security.md +100 -100
- package/docs/rules/java/testing.md +131 -131
- package/docs/rules/kotlin/coding-style.md +86 -86
- package/docs/rules/kotlin/hooks.md +17 -17
- package/docs/rules/kotlin/patterns.md +146 -146
- package/docs/rules/kotlin/security.md +82 -82
- package/docs/rules/kotlin/testing.md +128 -128
- package/docs/rules/perl/coding-style.md +46 -46
- package/docs/rules/perl/hooks.md +22 -22
- package/docs/rules/perl/patterns.md +76 -76
- package/docs/rules/perl/security.md +69 -69
- package/docs/rules/perl/testing.md +54 -54
- package/docs/rules/php/coding-style.md +40 -40
- package/docs/rules/php/hooks.md +24 -24
- package/docs/rules/php/patterns.md +33 -33
- package/docs/rules/php/security.md +37 -37
- package/docs/rules/php/testing.md +39 -39
- package/docs/rules/python/coding-style.md +42 -42
- package/docs/rules/python/hooks.md +19 -19
- package/docs/rules/python/patterns.md +39 -39
- package/docs/rules/python/security.md +30 -30
- package/docs/rules/python/testing.md +38 -38
- package/docs/rules/rust/coding-style.md +151 -151
- package/docs/rules/rust/hooks.md +16 -16
- package/docs/rules/rust/patterns.md +168 -168
- package/docs/rules/rust/security.md +141 -141
- package/docs/rules/rust/testing.md +154 -154
- package/docs/rules/swift/coding-style.md +47 -47
- package/docs/rules/swift/hooks.md +20 -20
- package/docs/rules/swift/patterns.md +66 -66
- package/docs/rules/swift/security.md +33 -33
- package/docs/rules/swift/testing.md +45 -45
- package/docs/rules/typescript/coding-style.md +199 -199
- package/docs/rules/typescript/hooks.md +22 -22
- package/docs/rules/typescript/patterns.md +52 -52
- package/docs/rules/typescript/security.md +28 -28
- package/docs/rules/typescript/testing.md +18 -18
- package/docs/rules/web/coding-style.md +96 -96
- package/docs/rules/web/design-quality.md +62 -62
- package/docs/rules/web/hooks.md +120 -120
- package/docs/rules/web/patterns.md +79 -79
- package/docs/rules/web/performance.md +64 -64
- package/docs/rules/web/security.md +57 -57
- package/docs/rules/web/testing.md +55 -55
- package/docs/templates/README.md +36 -36
- package/docs/templates/ai-project-final.md +124 -124
- package/docs/templates/ai-project.md +105 -105
- package/docs/templates/api.md +157 -157
- package/docs/templates/bug.md +62 -62
- package/docs/templates/code-review.md +87 -87
- package/docs/templates/generic.md +116 -116
- package/docs/templates/implementation-plan.md +1 -1
- package/docs/templates/meeting.md +68 -68
- package/docs/templates/prd.md +98 -98
- package/docs/templates/ui.md +134 -134
- package/docs/workflow-pipeline.md +11 -10
- package/package.json +40 -39
- package/scripts/hooks/checks/orchestration-artifacts.cjs +28 -23
- package/scripts/hooks/checks/workflow-state.cjs +4 -5
- package/scripts/orchestration/lib/orchestrator.cjs +344 -117
- package/scripts/orchestration/lib/validate.cjs +145 -0
- package/scripts/orchestration/session.cjs +88 -44
- package/skills/SUPERPOWERS-LICENSE +21 -21
- package/skills/ai-ml/fine-tuning-expert/SKILL.md +162 -162
- package/skills/ai-ml/fine-tuning-expert/references/dataset-preparation.md +540 -540
- package/skills/ai-ml/fine-tuning-expert/references/deployment-optimization.md +673 -673
- package/skills/ai-ml/fine-tuning-expert/references/evaluation-metrics.md +597 -597
- package/skills/ai-ml/fine-tuning-expert/references/hyperparameter-tuning.md +565 -565
- package/skills/ai-ml/fine-tuning-expert/references/lora-peft.md +347 -347
- package/skills/ai-ml/ml-pipeline/SKILL.md +159 -159
- package/skills/ai-ml/ml-pipeline/references/experiment-tracking.md +833 -833
- package/skills/ai-ml/ml-pipeline/references/feature-engineering.md +631 -631
- package/skills/ai-ml/ml-pipeline/references/model-validation.md +978 -978
- package/skills/ai-ml/ml-pipeline/references/pipeline-orchestration.md +907 -907
- package/skills/ai-ml/ml-pipeline/references/training-pipelines.md +782 -782
- package/skills/ai-ml/rag-architect/SKILL.md +194 -194
- package/skills/ai-ml/rag-architect/references/chunking-strategies.md +878 -878
- package/skills/ai-ml/rag-architect/references/embedding-models.md +561 -561
- package/skills/ai-ml/rag-architect/references/rag-evaluation.md +833 -833
- package/skills/ai-ml/rag-architect/references/retrieval-optimization.md +795 -795
- package/skills/ai-ml/rag-architect/references/vector-databases.md +589 -589
- package/skills/ai-ml/spark-engineer/SKILL.md +148 -148
- package/skills/ai-ml/spark-engineer/references/partitioning-caching.md +543 -543
- package/skills/ai-ml/spark-engineer/references/performance-tuning.md +544 -544
- package/skills/ai-ml/spark-engineer/references/rdd-operations.md +599 -599
- package/skills/ai-ml/spark-engineer/references/spark-sql-dataframes.md +474 -474
- package/skills/ai-ml/spark-engineer/references/streaming-patterns.md +786 -786
- package/skills/backend/api-designer/SKILL.md +217 -217
- package/skills/backend/api-designer/references/error-handling.md +541 -541
- package/skills/backend/api-designer/references/openapi.md +824 -824
- package/skills/backend/api-designer/references/pagination.md +494 -494
- package/skills/backend/api-designer/references/rest-patterns.md +335 -335
- package/skills/backend/api-designer/references/versioning.md +391 -391
- package/skills/backend/architecture-designer/SKILL.md +117 -117
- package/skills/backend/architecture-designer/references/adr-template.md +116 -116
- package/skills/backend/architecture-designer/references/architecture-patterns.md +111 -111
- package/skills/backend/architecture-designer/references/database-selection.md +102 -102
- package/skills/backend/architecture-designer/references/nfr-checklist.md +112 -112
- package/skills/backend/architecture-designer/references/system-design.md +100 -100
- package/skills/backend/code-documenter/SKILL.md +147 -147
- package/skills/backend/code-documenter/references/api-docs-fastapi-django.md +166 -166
- package/skills/backend/code-documenter/references/api-docs-nestjs-express.md +220 -220
- package/skills/backend/code-documenter/references/coverage-reports.md +125 -125
- package/skills/backend/code-documenter/references/documentation-systems.md +333 -333
- package/skills/backend/code-documenter/references/interactive-api-docs.md +531 -531
- package/skills/backend/code-documenter/references/python-docstrings.md +121 -121
- package/skills/backend/code-documenter/references/typescript-jsdoc.md +145 -145
- package/skills/backend/code-documenter/references/user-guides-tutorials.md +530 -530
- package/skills/backend/debugging-wizard/SKILL.md +105 -105
- package/skills/backend/debugging-wizard/references/common-patterns.md +132 -132
- package/skills/backend/debugging-wizard/references/debugging-tools.md +140 -140
- package/skills/backend/debugging-wizard/references/quick-fixes.md +177 -177
- package/skills/backend/debugging-wizard/references/strategies.md +142 -142
- package/skills/backend/debugging-wizard/references/systematic-debugging.md +367 -367
- package/skills/backend/feature-forge/SKILL.md +98 -98
- package/skills/backend/feature-forge/references/acceptance-criteria.md +104 -104
- package/skills/backend/feature-forge/references/ears-syntax.md +99 -99
- package/skills/backend/feature-forge/references/interview-questions.md +150 -150
- package/skills/backend/feature-forge/references/pre-discovery-subagents.md +54 -54
- package/skills/backend/feature-forge/references/specification-template.md +103 -103
- package/skills/backend/fullstack-guardian/SKILL.md +105 -105
- package/skills/backend/fullstack-guardian/references/api-design-standards.md +307 -307
- package/skills/backend/fullstack-guardian/references/architecture-decisions.md +350 -350
- package/skills/backend/fullstack-guardian/references/backend-patterns.md +237 -237
- package/skills/backend/fullstack-guardian/references/common-patterns.md +134 -134
- package/skills/backend/fullstack-guardian/references/deliverables-checklist.md +354 -354
- package/skills/backend/fullstack-guardian/references/design-template.md +91 -91
- package/skills/backend/fullstack-guardian/references/error-handling.md +135 -135
- package/skills/backend/fullstack-guardian/references/frontend-patterns.md +340 -340
- package/skills/backend/fullstack-guardian/references/integration-patterns.md +333 -333
- package/skills/backend/fullstack-guardian/references/security-checklist.md +106 -106
- package/skills/backend/graphql-architect/SKILL.md +146 -146
- package/skills/backend/graphql-architect/references/federation.md +418 -418
- package/skills/backend/graphql-architect/references/migration-from-rest.md +1141 -1141
- package/skills/backend/graphql-architect/references/resolvers.md +425 -425
- package/skills/backend/graphql-architect/references/schema-design.md +393 -393
- package/skills/backend/graphql-architect/references/security.md +569 -569
- package/skills/backend/graphql-architect/references/subscriptions.md +510 -510
- package/skills/backend/legacy-modernizer/SKILL.md +137 -137
- package/skills/backend/legacy-modernizer/references/legacy-testing.md +381 -381
- package/skills/backend/legacy-modernizer/references/migration-strategies.md +423 -423
- package/skills/backend/legacy-modernizer/references/refactoring-patterns.md +395 -395
- package/skills/backend/legacy-modernizer/references/strangler-fig-pattern.md +281 -281
- package/skills/backend/legacy-modernizer/references/system-assessment.md +487 -487
- package/skills/backend/microservices-architect/SKILL.md +164 -164
- package/skills/backend/microservices-architect/references/communication.md +499 -499
- package/skills/backend/microservices-architect/references/data.md +721 -721
- package/skills/backend/microservices-architect/references/decomposition.md +344 -344
- package/skills/backend/microservices-architect/references/observability.md +805 -805
- package/skills/backend/microservices-architect/references/patterns.md +603 -603
- package/skills/database/database-optimizer/SKILL.md +147 -147
- package/skills/database/database-optimizer/references/index-strategies.md +331 -331
- package/skills/database/database-optimizer/references/monitoring-analysis.md +501 -501
- package/skills/database/database-optimizer/references/mysql-tuning.md +452 -452
- package/skills/database/database-optimizer/references/postgresql-tuning.md +413 -413
- package/skills/database/database-optimizer/references/query-optimization.md +251 -251
- package/skills/database/postgres-pro/SKILL.md +152 -152
- package/skills/database/postgres-pro/references/extensions.md +404 -404
- package/skills/database/postgres-pro/references/jsonb.md +321 -321
- package/skills/database/postgres-pro/references/maintenance.md +481 -481
- package/skills/database/postgres-pro/references/performance.md +265 -265
- package/skills/database/postgres-pro/references/replication.md +446 -446
- package/skills/database/sql-pro/SKILL.md +129 -129
- package/skills/database/sql-pro/references/database-design.md +402 -402
- package/skills/database/sql-pro/references/dialect-differences.md +419 -419
- package/skills/database/sql-pro/references/optimization.md +384 -384
- package/skills/database/sql-pro/references/query-patterns.md +285 -285
- package/skills/database/sql-pro/references/window-functions.md +328 -328
- package/skills/dotnet/csharp-developer/SKILL.md +125 -125
- package/skills/dotnet/csharp-developer/references/aspnet-core.md +394 -394
- package/skills/dotnet/csharp-developer/references/blazor.md +553 -553
- package/skills/dotnet/csharp-developer/references/entity-framework.md +409 -409
- package/skills/dotnet/csharp-developer/references/modern-csharp.md +248 -248
- package/skills/dotnet/csharp-developer/references/performance.md +498 -498
- package/skills/dotnet/dotnet-core-expert/SKILL.md +138 -138
- package/skills/dotnet/dotnet-core-expert/references/authentication.md +546 -546
- package/skills/dotnet/dotnet-core-expert/references/clean-architecture.md +455 -455
- package/skills/dotnet/dotnet-core-expert/references/cloud-native.md +548 -548
- package/skills/dotnet/dotnet-core-expert/references/entity-framework.md +440 -440
- package/skills/dotnet/dotnet-core-expert/references/minimal-apis.md +319 -319
- package/skills/frontend/angular-architect/SKILL.md +152 -152
- package/skills/frontend/angular-architect/references/components.md +297 -297
- package/skills/frontend/angular-architect/references/ngrx.md +401 -401
- package/skills/frontend/angular-architect/references/routing.md +361 -361
- package/skills/frontend/angular-architect/references/rxjs.md +319 -319
- package/skills/frontend/angular-architect/references/testing.md +405 -405
- package/skills/frontend/design-commands/design.md +91 -91
- package/skills/frontend/design-commands/handoff.md +97 -97
- package/skills/frontend/design-commands/prototype.md +120 -120
- package/skills/frontend/design-commands/spec.md +160 -160
- package/skills/frontend/design-commands/style.md +78 -78
- package/skills/frontend/flutter-expert/SKILL.md +138 -138
- package/skills/frontend/flutter-expert/references/bloc-state.md +259 -259
- package/skills/frontend/flutter-expert/references/gorouter-navigation.md +119 -119
- package/skills/frontend/flutter-expert/references/performance.md +99 -99
- package/skills/frontend/flutter-expert/references/project-structure.md +118 -118
- package/skills/frontend/flutter-expert/references/riverpod-state.md +130 -130
- package/skills/frontend/flutter-expert/references/widget-patterns.md +123 -123
- package/skills/frontend/nextjs-developer/SKILL.md +143 -143
- package/skills/frontend/nextjs-developer/references/app-router.md +311 -311
- package/skills/frontend/nextjs-developer/references/data-fetching.md +482 -482
- package/skills/frontend/nextjs-developer/references/deployment.md +545 -545
- package/skills/frontend/nextjs-developer/references/server-actions.md +462 -462
- package/skills/frontend/nextjs-developer/references/server-components.md +384 -384
- package/skills/frontend/react-expert/SKILL.md +149 -149
- package/skills/frontend/react-expert/references/hooks-patterns.md +162 -162
- package/skills/frontend/react-expert/references/migration-class-to-modern.md +1119 -1119
- package/skills/frontend/react-expert/references/performance.md +168 -168
- package/skills/frontend/react-expert/references/react-19-features.md +174 -174
- package/skills/frontend/react-expert/references/server-components.md +143 -143
- package/skills/frontend/react-expert/references/state-management.md +171 -171
- package/skills/frontend/react-expert/references/testing-react.md +174 -174
- package/skills/frontend/react-native-expert/SKILL.md +185 -185
- package/skills/frontend/react-native-expert/references/expo-router.md +187 -187
- package/skills/frontend/react-native-expert/references/list-optimization.md +204 -204
- package/skills/frontend/react-native-expert/references/platform-handling.md +188 -188
- package/skills/frontend/react-native-expert/references/project-structure.md +171 -171
- package/skills/frontend/react-native-expert/references/storage-hooks.md +173 -173
- package/skills/frontend/senior-frontend/SKILL.md +477 -477
- package/skills/frontend/senior-frontend/references/frontend_best_practices.md +806 -806
- package/skills/frontend/senior-frontend/references/nextjs_optimization_guide.md +724 -724
- package/skills/frontend/senior-frontend/references/react_patterns.md +746 -746
- package/skills/frontend/senior-frontend/scripts/bundle_analyzer.py +407 -407
- package/skills/frontend/senior-frontend/scripts/component_generator.py +329 -329
- package/skills/frontend/senior-frontend/scripts/frontend_scaffolder.py +1005 -1005
- package/skills/frontend/ui-ux-pro-max/SKILL.md +386 -386
- package/skills/frontend/ui-ux-pro-max/data/charts.csv +26 -26
- package/skills/frontend/ui-ux-pro-max/data/colors.csv +97 -97
- package/skills/frontend/ui-ux-pro-max/data/icons.csv +101 -101
- package/skills/frontend/ui-ux-pro-max/data/landing.csv +31 -31
- package/skills/frontend/ui-ux-pro-max/data/products.csv +96 -96
- package/skills/frontend/ui-ux-pro-max/data/react-performance.csv +45 -45
- package/skills/frontend/ui-ux-pro-max/data/stacks/astro.csv +54 -54
- package/skills/frontend/ui-ux-pro-max/data/stacks/flutter.csv +53 -53
- package/skills/frontend/ui-ux-pro-max/data/stacks/html-tailwind.csv +56 -56
- package/skills/frontend/ui-ux-pro-max/data/stacks/jetpack-compose.csv +53 -53
- package/skills/frontend/ui-ux-pro-max/data/stacks/nextjs.csv +53 -53
- package/skills/frontend/ui-ux-pro-max/data/stacks/nuxt-ui.csv +51 -51
- package/skills/frontend/ui-ux-pro-max/data/stacks/nuxtjs.csv +59 -59
- package/skills/frontend/ui-ux-pro-max/data/stacks/react-native.csv +52 -52
- package/skills/frontend/ui-ux-pro-max/data/stacks/react.csv +54 -54
- package/skills/frontend/ui-ux-pro-max/data/stacks/shadcn.csv +61 -61
- package/skills/frontend/ui-ux-pro-max/data/stacks/svelte.csv +54 -54
- package/skills/frontend/ui-ux-pro-max/data/stacks/swiftui.csv +51 -51
- package/skills/frontend/ui-ux-pro-max/data/stacks/vue.csv +50 -50
- package/skills/frontend/ui-ux-pro-max/data/styles.csv +68 -68
- package/skills/frontend/ui-ux-pro-max/data/typography.csv +57 -57
- package/skills/frontend/ui-ux-pro-max/data/ui-reasoning.csv +101 -101
- package/skills/frontend/ui-ux-pro-max/data/ux-guidelines.csv +99 -99
- package/skills/frontend/ui-ux-pro-max/data/web-interface.csv +31 -31
- package/skills/frontend/ui-ux-pro-max/scripts/core.py +253 -253
- package/skills/frontend/ui-ux-pro-max/scripts/design_system.py +1067 -1067
- package/skills/frontend/ui-ux-pro-max/scripts/search.py +114 -114
- package/skills/frontend/vue-expert/SKILL.md +98 -98
- package/skills/frontend/vue-expert/references/build-tooling.md +480 -480
- package/skills/frontend/vue-expert/references/components.md +448 -448
- package/skills/frontend/vue-expert/references/composition-api.md +299 -299
- package/skills/frontend/vue-expert/references/mobile-hybrid.md +636 -636
- package/skills/frontend/vue-expert/references/nuxt.md +669 -669
- package/skills/frontend/vue-expert/references/state-management.md +449 -449
- package/skills/frontend/vue-expert/references/typescript.md +584 -584
- package/skills/frontend/vue-expert-js/SKILL.md +167 -167
- package/skills/frontend/vue-expert-js/references/component-architecture.md +219 -219
- package/skills/frontend/vue-expert-js/references/composables-patterns.md +183 -183
- package/skills/frontend/vue-expert-js/references/jsdoc-typing.md +535 -535
- package/skills/frontend/vue-expert-js/references/state-management.md +249 -249
- package/skills/frontend/vue-expert-js/references/testing-patterns.md +237 -237
- package/skills/go-rust-cpp/cpp-pro/SKILL.md +115 -115
- package/skills/go-rust-cpp/cpp-pro/references/build-tooling.md +440 -440
- package/skills/go-rust-cpp/cpp-pro/references/concurrency.md +437 -437
- package/skills/go-rust-cpp/cpp-pro/references/memory-performance.md +397 -397
- package/skills/go-rust-cpp/cpp-pro/references/modern-cpp.md +304 -304
- package/skills/go-rust-cpp/cpp-pro/references/templates.md +357 -357
- package/skills/go-rust-cpp/golang-pro/SKILL.md +122 -122
- package/skills/go-rust-cpp/golang-pro/references/concurrency.md +329 -329
- package/skills/go-rust-cpp/golang-pro/references/generics.md +442 -442
- package/skills/go-rust-cpp/golang-pro/references/interfaces.md +432 -432
- package/skills/go-rust-cpp/golang-pro/references/project-structure.md +477 -477
- package/skills/go-rust-cpp/golang-pro/references/testing.md +451 -451
- package/skills/go-rust-cpp/rust-engineer/SKILL.md +167 -167
- package/skills/go-rust-cpp/rust-engineer/references/async.md +458 -458
- package/skills/go-rust-cpp/rust-engineer/references/error-handling.md +334 -334
- package/skills/go-rust-cpp/rust-engineer/references/ownership.md +278 -278
- package/skills/go-rust-cpp/rust-engineer/references/testing.md +470 -470
- package/skills/go-rust-cpp/rust-engineer/references/traits.md +413 -413
- package/skills/infra/cli-developer/SKILL.md +113 -113
- package/skills/infra/cli-developer/references/design-patterns.md +221 -221
- package/skills/infra/cli-developer/references/go-cli.md +540 -540
- package/skills/infra/cli-developer/references/node-cli.md +383 -383
- package/skills/infra/cli-developer/references/python-cli.md +422 -422
- package/skills/infra/cli-developer/references/ux-patterns.md +448 -448
- package/skills/infra/cloud-architect/SKILL.md +216 -216
- package/skills/infra/cloud-architect/references/aws.md +394 -394
- package/skills/infra/cloud-architect/references/azure.md +562 -562
- package/skills/infra/cloud-architect/references/cost.md +582 -582
- package/skills/infra/cloud-architect/references/gcp.md +633 -633
- package/skills/infra/cloud-architect/references/multi-cloud.md +483 -483
- package/skills/infra/devops-engineer/SKILL.md +144 -144
- package/skills/infra/devops-engineer/references/deployment-strategies.md +241 -241
- package/skills/infra/devops-engineer/references/docker-patterns.md +113 -113
- package/skills/infra/devops-engineer/references/github-actions.md +139 -139
- package/skills/infra/devops-engineer/references/incident-response.md +331 -331
- package/skills/infra/devops-engineer/references/kubernetes.md +154 -154
- package/skills/infra/devops-engineer/references/platform-engineering.md +417 -417
- package/skills/infra/devops-engineer/references/release-automation.md +527 -527
- package/skills/infra/devops-engineer/references/terraform-iac.md +141 -141
- package/skills/infra/kubernetes-specialist/SKILL.md +241 -241
- package/skills/infra/kubernetes-specialist/references/configuration.md +452 -452
- package/skills/infra/kubernetes-specialist/references/cost-optimization.md +458 -458
- package/skills/infra/kubernetes-specialist/references/custom-operators.md +563 -563
- package/skills/infra/kubernetes-specialist/references/gitops.md +530 -530
- package/skills/infra/kubernetes-specialist/references/helm-charts.md +912 -912
- package/skills/infra/kubernetes-specialist/references/multi-cluster.md +507 -507
- package/skills/infra/kubernetes-specialist/references/networking.md +447 -447
- package/skills/infra/kubernetes-specialist/references/service-mesh.md +459 -459
- package/skills/infra/kubernetes-specialist/references/storage.md +535 -535
- package/skills/infra/kubernetes-specialist/references/troubleshooting.md +414 -414
- package/skills/infra/kubernetes-specialist/references/workloads.md +377 -377
- package/skills/infra/mcp-developer/SKILL.md +143 -143
- package/skills/infra/mcp-developer/references/protocol.md +244 -244
- package/skills/infra/mcp-developer/references/python-sdk.md +367 -367
- package/skills/infra/mcp-developer/references/resources.md +554 -554
- package/skills/infra/mcp-developer/references/tools.md +480 -480
- package/skills/infra/mcp-developer/references/typescript-sdk.md +350 -350
- package/skills/infra/monitoring-expert/SKILL.md +176 -176
- package/skills/infra/monitoring-expert/references/alerting-rules.md +141 -141
- package/skills/infra/monitoring-expert/references/application-profiling.md +331 -331
- package/skills/infra/monitoring-expert/references/capacity-planning.md +344 -344
- package/skills/infra/monitoring-expert/references/dashboards.md +126 -126
- package/skills/infra/monitoring-expert/references/opentelemetry.md +123 -123
- package/skills/infra/monitoring-expert/references/performance-testing.md +269 -269
- package/skills/infra/monitoring-expert/references/prometheus-metrics.md +136 -136
- package/skills/infra/monitoring-expert/references/structured-logging.md +142 -142
- package/skills/infra/sre-engineer/SKILL.md +181 -181
- package/skills/infra/sre-engineer/references/automation-toil.md +492 -492
- package/skills/infra/sre-engineer/references/error-budget-policy.md +334 -334
- package/skills/infra/sre-engineer/references/incident-chaos.md +576 -576
- package/skills/infra/sre-engineer/references/monitoring-alerting.md +424 -424
- package/skills/infra/sre-engineer/references/slo-sli-management.md +238 -238
- package/skills/infra/terraform-engineer/SKILL.md +143 -143
- package/skills/infra/terraform-engineer/references/best-practices.md +583 -583
- package/skills/infra/terraform-engineer/references/module-patterns.md +297 -297
- package/skills/infra/terraform-engineer/references/providers.md +452 -452
- package/skills/infra/terraform-engineer/references/state-management.md +371 -371
- package/skills/infra/terraform-engineer/references/testing.md +486 -486
- package/skills/infra/websocket-engineer/SKILL.md +168 -168
- package/skills/infra/websocket-engineer/references/alternatives.md +391 -391
- package/skills/infra/websocket-engineer/references/patterns.md +400 -400
- package/skills/infra/websocket-engineer/references/protocol.md +195 -195
- package/skills/infra/websocket-engineer/references/scaling.md +333 -333
- package/skills/infra/websocket-engineer/references/security.md +474 -474
- package/skills/java/java-architect/SKILL.md +132 -132
- package/skills/java/java-architect/references/jpa-optimization.md +393 -393
- package/skills/java/java-architect/references/reactive-webflux.md +356 -356
- package/skills/java/java-architect/references/spring-boot-setup.md +269 -269
- package/skills/java/java-architect/references/spring-security.md +445 -445
- package/skills/java/java-architect/references/testing-patterns.md +500 -500
- package/skills/java/kotlin-specialist/SKILL.md +147 -147
- package/skills/java/kotlin-specialist/references/android-compose.md +419 -419
- package/skills/java/kotlin-specialist/references/coroutines-flow.md +276 -276
- package/skills/java/kotlin-specialist/references/dsl-idioms.md +421 -421
- package/skills/java/kotlin-specialist/references/ktor-server.md +426 -426
- package/skills/java/kotlin-specialist/references/multiplatform-kmp.md +380 -380
- package/skills/java/spring-boot-engineer/SKILL.md +195 -195
- package/skills/java/spring-boot-engineer/references/cloud.md +498 -498
- package/skills/java/spring-boot-engineer/references/data.md +381 -381
- package/skills/java/spring-boot-engineer/references/security.md +459 -459
- package/skills/java/spring-boot-engineer/references/testing.md +545 -545
- package/skills/java/spring-boot-engineer/references/web.md +295 -295
- package/skills/javascript/javascript-pro/SKILL.md +132 -132
- package/skills/javascript/javascript-pro/references/async-patterns.md +334 -334
- package/skills/javascript/javascript-pro/references/browser-apis.md +398 -398
- package/skills/javascript/javascript-pro/references/modern-syntax.md +272 -272
- package/skills/javascript/javascript-pro/references/modules.md +357 -357
- package/skills/javascript/javascript-pro/references/node-essentials.md +471 -471
- package/skills/javascript/nestjs-expert/SKILL.md +206 -206
- package/skills/javascript/nestjs-expert/references/authentication.md +166 -166
- package/skills/javascript/nestjs-expert/references/controllers-routing.md +111 -111
- package/skills/javascript/nestjs-expert/references/dtos-validation.md +153 -153
- package/skills/javascript/nestjs-expert/references/migration-from-express.md +1237 -1237
- package/skills/javascript/nestjs-expert/references/services-di.md +140 -140
- package/skills/javascript/nestjs-expert/references/testing-patterns.md +186 -186
- package/skills/javascript/typescript-pro/SKILL.md +145 -145
- package/skills/javascript/typescript-pro/references/advanced-types.md +259 -259
- package/skills/javascript/typescript-pro/references/configuration.md +445 -445
- package/skills/javascript/typescript-pro/references/patterns.md +484 -484
- package/skills/javascript/typescript-pro/references/type-guards.md +352 -352
- package/skills/javascript/typescript-pro/references/utility-types.md +329 -329
- package/skills/php/laravel-specialist/SKILL.md +262 -262
- package/skills/php/laravel-specialist/references/eloquent.md +351 -351
- package/skills/php/laravel-specialist/references/livewire.md +512 -512
- package/skills/php/laravel-specialist/references/queues.md +423 -423
- package/skills/php/laravel-specialist/references/routing.md +362 -362
- package/skills/php/laravel-specialist/references/testing.md +522 -522
- package/skills/php/php-pro/SKILL.md +206 -206
- package/skills/php/php-pro/references/async-patterns.md +412 -412
- package/skills/php/php-pro/references/laravel-patterns.md +377 -377
- package/skills/php/php-pro/references/modern-php-features.md +323 -323
- package/skills/php/php-pro/references/symfony-patterns.md +466 -466
- package/skills/php/php-pro/references/testing-quality.md +466 -466
- package/skills/product/competitive-analysis/SKILL.md +257 -257
- package/skills/product/meeting-notes/SKILL.md +266 -266
- package/skills/product/prd-template/SKILL.md +150 -150
- package/skills/product/stakeholder-update/SKILL.md +225 -225
- package/skills/product/user-research-synthesis/SKILL.md +235 -235
- package/skills/python/django-expert/SKILL.md +162 -162
- package/skills/python/django-expert/references/authentication.md +145 -145
- package/skills/python/django-expert/references/drf-serializers.md +148 -148
- package/skills/python/django-expert/references/models-orm.md +151 -151
- package/skills/python/django-expert/references/testing-django.md +204 -204
- package/skills/python/django-expert/references/viewsets-views.md +153 -153
- package/skills/python/fastapi-expert/SKILL.md +185 -185
- package/skills/python/fastapi-expert/references/async-sqlalchemy.md +146 -146
- package/skills/python/fastapi-expert/references/authentication.md +159 -159
- package/skills/python/fastapi-expert/references/endpoints-routing.md +142 -142
- package/skills/python/fastapi-expert/references/migration-from-django.md +996 -996
- package/skills/python/fastapi-expert/references/pydantic-v2.md +135 -135
- package/skills/python/fastapi-expert/references/testing-async.md +159 -159
- package/skills/python/pandas-pro/SKILL.md +178 -178
- package/skills/python/pandas-pro/references/aggregation-groupby.md +545 -545
- package/skills/python/pandas-pro/references/data-cleaning.md +500 -500
- package/skills/python/pandas-pro/references/dataframe-operations.md +420 -420
- package/skills/python/pandas-pro/references/merging-joining.md +596 -596
- package/skills/python/pandas-pro/references/performance-optimization.md +597 -597
- package/skills/python/python-pro/SKILL.md +177 -177
- package/skills/python/python-pro/references/async-patterns.md +356 -356
- package/skills/python/python-pro/references/packaging.md +460 -460
- package/skills/python/python-pro/references/standard-library.md +378 -378
- package/skills/python/python-pro/references/testing.md +404 -404
- package/skills/python/python-pro/references/type-system.md +290 -290
- package/skills/quality/chaos-engineer/SKILL.md +182 -182
- package/skills/quality/chaos-engineer/references/chaos-tools.md +511 -511
- package/skills/quality/chaos-engineer/references/experiment-design.md +229 -229
- package/skills/quality/chaos-engineer/references/game-days.md +434 -434
- package/skills/quality/chaos-engineer/references/infrastructure-chaos.md +348 -348
- package/skills/quality/chaos-engineer/references/kubernetes-chaos.md +432 -432
- package/skills/quality/code-reviewer/SKILL.md +119 -119
- package/skills/quality/code-reviewer/references/common-issues.md +142 -142
- package/skills/quality/code-reviewer/references/feedback-examples.md +144 -144
- package/skills/quality/code-reviewer/references/receiving-feedback.md +238 -238
- package/skills/quality/code-reviewer/references/report-template.md +109 -109
- package/skills/quality/code-reviewer/references/review-checklist.md +88 -88
- package/skills/quality/code-reviewer/references/spec-compliance-review.md +258 -258
- package/skills/quality/playwright-expert/SKILL.md +169 -169
- package/skills/quality/playwright-expert/references/api-mocking.md +140 -140
- package/skills/quality/playwright-expert/references/configuration.md +155 -155
- package/skills/quality/playwright-expert/references/debugging-flaky.md +150 -150
- package/skills/quality/playwright-expert/references/page-object-model.md +152 -152
- package/skills/quality/playwright-expert/references/selectors-locators.md +119 -119
- package/skills/quality/secure-code-guardian/SKILL.md +191 -191
- package/skills/quality/secure-code-guardian/references/authentication.md +136 -136
- package/skills/quality/secure-code-guardian/references/input-validation.md +146 -146
- package/skills/quality/secure-code-guardian/references/owasp-prevention.md +135 -135
- package/skills/quality/secure-code-guardian/references/security-headers.md +133 -133
- package/skills/quality/secure-code-guardian/references/xss-csrf.md +157 -157
- package/skills/quality/security-reviewer/SKILL.md +103 -103
- package/skills/quality/security-reviewer/references/infrastructure-security.md +268 -268
- package/skills/quality/security-reviewer/references/penetration-testing.md +268 -268
- package/skills/quality/security-reviewer/references/report-template.md +170 -170
- package/skills/quality/security-reviewer/references/sast-tools.md +117 -117
- package/skills/quality/security-reviewer/references/secret-scanning.md +125 -125
- package/skills/quality/security-reviewer/references/vulnerability-patterns.md +152 -152
- package/skills/quality/senior-qa/README.md +196 -196
- package/skills/quality/senior-qa/SKILL.md +399 -399
- package/skills/quality/senior-qa/references/qa_best_practices.md +964 -964
- package/skills/quality/senior-qa/references/test_automation_patterns.md +1009 -1009
- package/skills/quality/senior-qa/references/testing_strategies.md +649 -649
- package/skills/quality/senior-qa/scripts/coverage_analyzer.py +836 -836
- package/skills/quality/senior-qa/scripts/e2e_test_scaffolder.py +820 -820
- package/skills/quality/senior-qa/scripts/test_suite_generator.py +605 -605
- package/skills/quality/tdd-guide/HOW_TO_USE.md +313 -313
- package/skills/quality/tdd-guide/README.md +680 -680
- package/skills/quality/tdd-guide/SKILL.md +122 -122
- package/skills/quality/tdd-guide/assets/expected_output.json +77 -77
- package/skills/quality/tdd-guide/assets/sample_input_python.json +39 -39
- package/skills/quality/tdd-guide/assets/sample_input_typescript.json +36 -36
- package/skills/quality/tdd-guide/references/ci-integration.md +195 -195
- package/skills/quality/tdd-guide/references/framework-guide.md +206 -206
- package/skills/quality/tdd-guide/references/tdd-best-practices.md +128 -128
- package/skills/quality/tdd-guide/scripts/coverage_analyzer.py +434 -434
- package/skills/quality/tdd-guide/scripts/fixture_generator.py +440 -440
- package/skills/quality/tdd-guide/scripts/format_detector.py +384 -384
- package/skills/quality/tdd-guide/scripts/framework_adapter.py +428 -428
- package/skills/quality/tdd-guide/scripts/metrics_calculator.py +456 -456
- package/skills/quality/tdd-guide/scripts/output_formatter.py +354 -354
- package/skills/quality/tdd-guide/scripts/tdd_workflow.py +474 -474
- package/skills/quality/tdd-guide/scripts/test_generator.py +438 -438
- package/skills/quality/test-master/SKILL.md +94 -94
- package/skills/quality/test-master/references/automation-frameworks.md +294 -294
- package/skills/quality/test-master/references/e2e-testing.md +128 -128
- package/skills/quality/test-master/references/integration-testing.md +120 -120
- package/skills/quality/test-master/references/performance-testing.md +118 -118
- package/skills/quality/test-master/references/qa-methodology.md +247 -247
- package/skills/quality/test-master/references/security-testing.md +127 -127
- package/skills/quality/test-master/references/tdd-iron-laws.md +174 -174
- package/skills/quality/test-master/references/test-reports.md +104 -104
- package/skills/quality/test-master/references/testing-anti-patterns.md +231 -231
- package/skills/quality/test-master/references/unit-testing.md +113 -113
- package/skills/ruby/rails-expert/SKILL.md +154 -154
- package/skills/ruby/rails-expert/references/active-record.md +244 -244
- package/skills/ruby/rails-expert/references/api-development.md +401 -401
- package/skills/ruby/rails-expert/references/background-jobs.md +272 -272
- package/skills/ruby/rails-expert/references/hotwire-turbo.md +228 -228
- package/skills/ruby/rails-expert/references/rspec-testing.md +367 -367
- package/skills/swift/swift-expert/SKILL.md +163 -163
- package/skills/swift/swift-expert/references/async-concurrency.md +360 -360
- package/skills/swift/swift-expert/references/memory-performance.md +377 -377
- package/skills/swift/swift-expert/references/protocol-oriented.md +354 -354
- package/skills/swift/swift-expert/references/swiftui-patterns.md +291 -291
- package/skills/swift/swift-expert/references/testing-patterns.md +399 -399
- package/skills/workflow/brainstorming/SKILL.md +164 -164
- package/skills/workflow/brainstorming/scripts/frame-template.html +214 -214
- package/skills/workflow/brainstorming/scripts/helper.js +88 -88
- package/skills/workflow/brainstorming/scripts/server.cjs +354 -354
- package/skills/workflow/brainstorming/scripts/start-server.sh +148 -148
- package/skills/workflow/brainstorming/scripts/stop-server.sh +56 -56
- package/skills/workflow/brainstorming/spec-document-reviewer-prompt.md +49 -49
- package/skills/workflow/brainstorming/visual-companion.md +287 -287
- package/skills/workflow/documentation/SKILL.md +45 -45
- package/skills/workflow/entropy-management/SKILL.md +115 -115
- package/skills/workflow/executing-plans/SKILL.md +70 -70
- package/skills/workflow/finishing-a-development-branch/SKILL.md +200 -200
- package/skills/workflow/receiving-code-review/SKILL.md +213 -213
- package/skills/workflow/requesting-code-review/SKILL.md +105 -105
- package/skills/workflow/requesting-code-review/code-reviewer.md +146 -146
- package/skills/workflow/requirement-engineering/SKILL.md +111 -111
- package/skills/workflow/systematic-debugging/CREATION-LOG.md +119 -119
- package/skills/workflow/systematic-debugging/SKILL.md +296 -296
- package/skills/workflow/systematic-debugging/condition-based-waiting-example.ts +158 -158
- package/skills/workflow/systematic-debugging/condition-based-waiting.md +115 -115
- package/skills/workflow/systematic-debugging/defense-in-depth.md +122 -122
- package/skills/workflow/systematic-debugging/find-polluter.sh +63 -63
- package/skills/workflow/systematic-debugging/root-cause-tracing.md +169 -169
- package/skills/workflow/systematic-debugging/test-academic.md +14 -14
- package/skills/workflow/systematic-debugging/test-pressure-1.md +58 -58
- package/skills/workflow/systematic-debugging/test-pressure-2.md +68 -68
- package/skills/workflow/systematic-debugging/test-pressure-3.md +69 -69
- package/skills/workflow/using-git-worktrees/SKILL.md +218 -218
- package/skills/workflow/verification-before-completion/SKILL.md +139 -139
- package/skills/workflow/writing-plans/SKILL.md +151 -151
- package/skills/workflow/writing-plans/plan-document-reviewer-prompt.md +49 -49
- package/skills/workflow/writing-skills/SKILL.md +655 -655
- package/skills/workflow/writing-skills/anthropic-best-practices.md +1150 -1150
- package/skills/workflow/writing-skills/examples/CLAUDE_MD_TESTING.md +189 -189
- package/skills/workflow/writing-skills/persuasion-principles.md +187 -187
- package/skills/workflow/writing-skills/render-graphs.js +168 -168
- package/skills/workflow/writing-skills/testing-skills-with-subagents.md +384 -384
|
@@ -1,544 +1,544 @@
|
|
|
1
|
-
# Performance Tuning
|
|
2
|
-
|
|
3
|
-
---
|
|
4
|
-
|
|
5
|
-
## Cluster Sizing
|
|
6
|
-
|
|
7
|
-
### Executor Configuration
|
|
8
|
-
|
|
9
|
-
```python
|
|
10
|
-
# Key executor configurations
|
|
11
|
-
spark.conf.set("spark.executor.instances", 10) # Number of executors
|
|
12
|
-
spark.conf.set("spark.executor.cores", 4) # Cores per executor
|
|
13
|
-
spark.conf.set("spark.executor.memory", "16g") # Memory per executor
|
|
14
|
-
|
|
15
|
-
# Dynamic allocation (recommended for varying workloads)
|
|
16
|
-
spark.conf.set("spark.dynamicAllocation.enabled", "true")
|
|
17
|
-
spark.conf.set("spark.dynamicAllocation.minExecutors", 2)
|
|
18
|
-
spark.conf.set("spark.dynamicAllocation.maxExecutors", 100)
|
|
19
|
-
spark.conf.set("spark.dynamicAllocation.executorIdleTimeout", "60s")
|
|
20
|
-
```
|
|
21
|
-
|
|
22
|
-
### Sizing Guidelines
|
|
23
|
-
|
|
24
|
-
| Cluster Size | Executor Memory | Executor Cores | Instances |
|
|
25
|
-
|--------------|-----------------|----------------|-----------|
|
|
26
|
-
| Small (dev) | 4-8GB | 2-4 | 2-5 |
|
|
27
|
-
| Medium | 8-16GB | 4-5 | 10-50 |
|
|
28
|
-
| Large | 16-32GB | 5-8 | 50-200 |
|
|
29
|
-
| Very Large | 32-64GB | 8-16 | 200+ |
|
|
30
|
-
|
|
31
|
-
**Rules of thumb:**
|
|
32
|
-
- 5 cores per executor is optimal (avoids HDFS I/O bottleneck)
|
|
33
|
-
- Leave 1 core per node for OS/YARN
|
|
34
|
-
- Leave 1GB per node for overhead
|
|
35
|
-
- executor.memoryOverhead = max(384MB, 10% of executor.memory)
|
|
36
|
-
|
|
37
|
-
### Memory Configuration
|
|
38
|
-
|
|
39
|
-
```python
|
|
40
|
-
# Executor memory breakdown
|
|
41
|
-
spark.conf.set("spark.executor.memory", "16g")
|
|
42
|
-
spark.conf.set("spark.executor.memoryOverhead", "2g") # For off-heap, network buffers
|
|
43
|
-
|
|
44
|
-
# Memory fractions (default values usually good)
|
|
45
|
-
spark.conf.set("spark.memory.fraction", 0.6) # Unified memory pool
|
|
46
|
-
spark.conf.set("spark.memory.storageFraction", 0.5) # Cache vs execution split
|
|
47
|
-
|
|
48
|
-
# Off-heap memory (for large data)
|
|
49
|
-
spark.conf.set("spark.memory.offHeap.enabled", "true")
|
|
50
|
-
spark.conf.set("spark.memory.offHeap.size", "8g")
|
|
51
|
-
```
|
|
52
|
-
|
|
53
|
-
---
|
|
54
|
-
|
|
55
|
-
## Shuffle Optimization
|
|
56
|
-
|
|
57
|
-
### Shuffle Configuration
|
|
58
|
-
|
|
59
|
-
```python
|
|
60
|
-
# Number of shuffle partitions
|
|
61
|
-
spark.conf.set("spark.sql.shuffle.partitions", 200) # Adjust based on data size
|
|
62
|
-
|
|
63
|
-
# Shuffle behavior
|
|
64
|
-
spark.conf.set("spark.shuffle.compress", "true") # Compress shuffle data
|
|
65
|
-
spark.conf.set("spark.shuffle.spill.compress", "true") # Compress spill data
|
|
66
|
-
spark.conf.set("spark.io.compression.codec", "lz4") # Fast compression
|
|
67
|
-
|
|
68
|
-
# Shuffle file management
|
|
69
|
-
spark.conf.set("spark.shuffle.file.buffer", "64k") # Buffer for shuffle writes
|
|
70
|
-
spark.conf.set("spark.shuffle.io.maxRetries", 3) # Retry failed fetches
|
|
71
|
-
spark.conf.set("spark.shuffle.io.retryWait", "5s") # Wait between retries
|
|
72
|
-
|
|
73
|
-
# Sort-based shuffle (default in Spark 2.0+)
|
|
74
|
-
spark.conf.set("spark.shuffle.sort.bypassMergeThreshold", 200)
|
|
75
|
-
```
|
|
76
|
-
|
|
77
|
-
### Reducing Shuffle Size
|
|
78
|
-
|
|
79
|
-
```python
|
|
80
|
-
from pyspark.sql import functions as F
|
|
81
|
-
|
|
82
|
-
# 1. Filter before join/aggregation
|
|
83
|
-
df_filtered = df.filter(F.col("date") >= "2024-01-01")
|
|
84
|
-
result = df_filtered.groupBy("category").count()
|
|
85
|
-
|
|
86
|
-
# 2. Use broadcast for small tables
|
|
87
|
-
from pyspark.sql.functions import broadcast
|
|
88
|
-
result = large_df.join(broadcast(small_df), "key") # No shuffle for small_df
|
|
89
|
-
|
|
90
|
-
# 3. Select only needed columns before shuffle
|
|
91
|
-
df_slim = df.select("key", "value") # Not all 50 columns
|
|
92
|
-
result = df_slim.groupBy("key").sum("value")
|
|
93
|
-
|
|
94
|
-
# 4. Use reduceByKey over groupByKey (RDD)
|
|
95
|
-
# BAD: groupByKey shuffles all values
|
|
96
|
-
counts = rdd.groupByKey().mapValues(len)
|
|
97
|
-
# GOOD: reduceByKey combines locally first
|
|
98
|
-
counts = rdd.map(lambda x: (x, 1)).reduceByKey(lambda a, b: a + b)
|
|
99
|
-
|
|
100
|
-
# 5. Coalesce after a filter that significantly reduces the data
|
|
101
|
-
df_filtered = df.filter(condition).coalesce(50) # Reduce partitions without shuffle
|
|
102
|
-
```
|
|
103
|
-
|
|
104
|
-
### Spark UI Shuffle Metrics
|
|
105
|
-
|
|
106
|
-
In the Stages tab, check:
|
|
107
|
-
- **Shuffle Write Size**: Total data written for shuffle
|
|
108
|
-
- **Shuffle Read Size**: Total data read from shuffle
|
|
109
|
-
- **Shuffle Read Blocked Time**: Time waiting for shuffle data
|
|
110
|
-
- **Shuffle Spill (Memory)**: In-memory (deserialized) size of the data at the time it was spilled
|
|
111
|
-
- **Shuffle Spill (Disk)**: Data spilled to disk (bad, increase memory)
|
|
112
|
-
|
|
113
|
-
---
|
|
114
|
-
|
|
115
|
-
## Data Skew Handling
|
|
116
|
-
|
|
117
|
-
### Identifying Skew
|
|
118
|
-
|
|
119
|
-
```python
|
|
120
|
-
from pyspark.sql import functions as F
|
|
121
|
-
|
|
122
|
-
# Check key distribution
|
|
123
|
-
key_counts = df.groupBy("join_key").count()
|
|
124
|
-
key_counts.orderBy(F.desc("count")).show(20)
|
|
125
|
-
|
|
126
|
-
# Summary statistics
|
|
127
|
-
stats = key_counts.agg(
|
|
128
|
-
F.min("count").alias("min"),
|
|
129
|
-
F.max("count").alias("max"),
|
|
130
|
-
F.avg("count").alias("avg"),
|
|
131
|
-
F.percentile_approx("count", 0.99).alias("p99")
|
|
132
|
-
)
|
|
133
|
-
stats.show()
|
|
134
|
-
|
|
135
|
-
# Skew ratio: max/avg > 10 indicates severe skew
|
|
136
|
-
```
|
|
137
|
-
|
|
138
|
-
**Spark UI indicators:**
|
|
139
|
-
- Few tasks taking much longer than others
|
|
140
|
-
- Task duration histogram shows long tail
|
|
141
|
-
- Some partitions much larger than others
|
|
142
|
-
|
|
143
|
-
### Skew Solutions
|
|
144
|
-
|
|
145
|
-
#### 1. Adaptive Query Execution (Spark 3.x)
|
|
146
|
-
|
|
147
|
-
```python
|
|
148
|
-
# Enable AQE skew handling
|
|
149
|
-
spark.conf.set("spark.sql.adaptive.enabled", "true")
|
|
150
|
-
spark.conf.set("spark.sql.adaptive.skewJoin.enabled", "true")
|
|
151
|
-
spark.conf.set("spark.sql.adaptive.skewJoin.skewedPartitionFactor", 5)
|
|
152
|
-
spark.conf.set("spark.sql.adaptive.skewJoin.skewedPartitionThresholdInBytes", "256MB")
|
|
153
|
-
|
|
154
|
-
# AQE will automatically split skewed partitions
|
|
155
|
-
result = large_df.join(another_df, "key")
|
|
156
|
-
```
|
|
157
|
-
|
|
158
|
-
#### 2. Salting Technique
|
|
159
|
-
|
|
160
|
-
```python
|
|
161
|
-
from pyspark.sql import functions as F
|
|
162
|
-
|
|
163
|
-
# Identify skewed keys
|
|
164
|
-
skewed_keys = ["NULL", "UNKNOWN", "DEFAULT"]
|
|
165
|
-
salt_buckets = 20
|
|
166
|
-
|
|
167
|
-
# Salt the skewed keys in large table
|
|
168
|
-
large_salted = large_df.withColumn(
|
|
169
|
-
"salted_key",
|
|
170
|
-
F.when(
|
|
171
|
-
F.col("join_key").isin(skewed_keys),
|
|
172
|
-
F.concat(F.col("join_key"), F.lit("_"), (F.rand() * salt_buckets).cast("int").cast("string"))
|
|
173
|
-
).otherwise(F.col("join_key"))
|
|
174
|
-
)
|
|
175
|
-
|
|
176
|
-
# Explode small table for skewed keys only
|
|
177
|
-
from pyspark.sql.functions import explode, array, lit, when
|
|
178
|
-
|
|
179
|
-
small_exploded = small_df.withColumn(
|
|
180
|
-
"salted_key",
|
|
181
|
-
F.when(
|
|
182
|
-
F.col("join_key").isin(skewed_keys),
|
|
183
|
-
F.explode(F.array([F.concat(F.col("join_key"), F.lit("_"), F.lit(i)) for i in range(salt_buckets)]))
|
|
184
|
-
).otherwise(F.col("join_key"))
|
|
185
|
-
)
|
|
186
|
-
|
|
187
|
-
# Join on salted key
|
|
188
|
-
result = large_salted.join(small_exploded, "salted_key")
|
|
189
|
-
```
|
|
190
|
-
|
|
191
|
-
#### 3. Broadcast Join for Skewed Keys
|
|
192
|
-
|
|
193
|
-
```python
|
|
194
|
-
from pyspark.sql.functions import broadcast
|
|
195
|
-
|
|
196
|
-
# Separate skewed and non-skewed data
|
|
197
|
-
skewed_keys = ["NULL", "UNKNOWN"]
|
|
198
|
-
|
|
199
|
-
large_skewed = large_df.filter(F.col("join_key").isin(skewed_keys))
|
|
200
|
-
large_normal = large_df.filter(~F.col("join_key").isin(skewed_keys))
|
|
201
|
-
|
|
202
|
-
small_skewed = small_df.filter(F.col("join_key").isin(skewed_keys))
|
|
203
|
-
small_normal = small_df.filter(~F.col("join_key").isin(skewed_keys))
|
|
204
|
-
|
|
205
|
-
# Broadcast join for skewed (small result expected)
|
|
206
|
-
result_skewed = large_skewed.join(broadcast(small_skewed), "join_key")
|
|
207
|
-
|
|
208
|
-
# Regular join for non-skewed
|
|
209
|
-
result_normal = large_normal.join(small_normal, "join_key")
|
|
210
|
-
|
|
211
|
-
# Union results
|
|
212
|
-
final_result = result_skewed.union(result_normal)
|
|
213
|
-
```
|
|
214
|
-
|
|
215
|
-
#### 4. Iterative Broadcast for Large Skewed Keys
|
|
216
|
-
|
|
217
|
-
```python
|
|
218
|
-
# For extremely skewed single keys
|
|
219
|
-
skewed_key_value = "NULL"
|
|
220
|
-
|
|
221
|
-
# Process skewed key separately with broadcast
|
|
222
|
-
skewed_large = large_df.filter(F.col("join_key") == skewed_key_value)
|
|
223
|
-
skewed_small = small_df.filter(F.col("join_key") == skewed_key_value)
|
|
224
|
-
result_skewed = skewed_large.crossJoin(broadcast(skewed_small))
|
|
225
|
-
|
|
226
|
-
# Process rest normally
|
|
227
|
-
normal_large = large_df.filter(F.col("join_key") != skewed_key_value)
|
|
228
|
-
normal_small = small_df.filter(F.col("join_key") != skewed_key_value)
|
|
229
|
-
result_normal = normal_large.join(normal_small, "join_key")
|
|
230
|
-
|
|
231
|
-
# Combine
|
|
232
|
-
final = result_skewed.union(result_normal)
|
|
233
|
-
```
|
|
234
|
-
|
|
235
|
-
---
|
|
236
|
-
|
|
237
|
-
## Memory Tuning
|
|
238
|
-
|
|
239
|
-
### Memory Pressure Symptoms
|
|
240
|
-
|
|
241
|
-
| Symptom | Cause | Solution |
|
|
242
|
-
|---------|-------|----------|
|
|
243
|
-
| Long GC pauses | Too much cached data | Reduce cache, use serialized storage |
|
|
244
|
-
| Spill to disk | Partitions too large | Increase partitions, add memory |
|
|
245
|
-
| OOM on driver | Large collect/broadcast | Reduce data to driver |
|
|
246
|
-
| OOM on executor | Large partitions | Repartition, increase memory |
|
|
247
|
-
|
|
248
|
-
### Garbage Collection Tuning
|
|
249
|
-
|
|
250
|
-
```python
|
|
251
|
-
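# GC options: JVM flags are launch-time settings, so pass them via spark-submit --conf (or the session builder) rather than on a running session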
|
|
252
|
-
# For executor JVM
|
|
253
|
-
spark.conf.set("spark.executor.extraJavaOptions",
|
|
254
|
-
"-XX:+UseG1GC -XX:InitiatingHeapOccupancyPercent=35 -XX:ConcGCThreads=4")
|
|
255
|
-
|
|
256
|
-
# For driver JVM
|
|
257
|
-
spark.conf.set("spark.driver.extraJavaOptions",
|
|
258
|
-
"-XX:+UseG1GC -XX:InitiatingHeapOccupancyPercent=35")
|
|
259
|
-
|
|
260
|
-
# Monitor GC in Spark UI
|
|
261
|
-
# Executors tab shows GC Time for each executor
|
|
262
|
-
# Target: GC Time < 10% of total task time
|
|
263
|
-
```
|
|
264
|
-
|
|
265
|
-
### Reducing Memory Pressure
|
|
266
|
-
|
|
267
|
-
```python
|
|
268
|
-
# 1. Use serialized caching
|
|
269
|
-
from pyspark import StorageLevel
|
|
270
|
-
df.persist(StorageLevel.MEMORY_AND_DISK_SER)
|
|
271
|
-
|
|
272
|
-
# 2. Kryo serialization (faster, more compact)
|
|
273
|
-
spark.conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
|
|
274
|
-
|
|
275
|
-
# 3. Avoid UDFs that create objects
|
|
276
|
-
# BAD: Creates Python objects
|
|
277
|
-
@udf("string")
|
|
278
|
-
def process(x):
|
|
279
|
-
return x.upper() # String allocation
|
|
280
|
-
|
|
281
|
-
# GOOD: Use built-in
|
|
282
|
-
df.withColumn("upper", F.upper("column"))
|
|
283
|
-
|
|
284
|
-
# 4. Use mapPartitions with generators
|
|
285
|
-
def efficient_process(iterator):
|
|
286
|
-
for row in iterator:
|
|
287
|
-
yield transform(row) # No list allocation
|
|
288
|
-
|
|
289
|
-
result = df.rdd.mapPartitions(efficient_process)
|
|
290
|
-
|
|
291
|
-
# 5. Release cached data promptly
|
|
292
|
-
df.unpersist()
|
|
293
|
-
```
|
|
294
|
-
|
|
295
|
-
### Driver Memory Issues
|
|
296
|
-
|
|
297
|
-
```python
|
|
298
|
-
# Increase driver memory
|
|
299
|
-
spark.conf.set("spark.driver.memory", "8g")
|
|
300
|
-
spark.conf.set("spark.driver.maxResultSize", "4g")
|
|
301
|
-
|
|
302
|
-
# Avoid large collects
|
|
303
|
-
# BAD
|
|
304
|
-
all_data = df.collect() # Pulls everything to driver
|
|
305
|
-
|
|
306
|
-
# GOOD
|
|
307
|
-
sample = df.take(1000) # Small sample
|
|
308
|
-
df.write.parquet("s3://output/") # Write distributed
|
|
309
|
-
```
|
|
310
|
-
|
|
311
|
-
---
|
|
312
|
-
|
|
313
|
-
## Join Optimization
|
|
314
|
-
|
|
315
|
-
### Join Strategy Selection
|
|
316
|
-
|
|
317
|
-
```python
|
|
318
|
-
# Broadcast Hash Join - small table (< 200MB)
|
|
319
|
-
from pyspark.sql.functions import broadcast
|
|
320
|
-
result = large.join(broadcast(small), "key")
|
|
321
|
-
|
|
322
|
-
# Sort Merge Join - large tables, equi-join
|
|
323
|
-
# Default for non-broadcast joins
|
|
324
|
-
result = large1.join(large2, "key")
|
|
325
|
-
|
|
326
|
-
# Shuffle Hash Join - medium tables, memory-constrained
|
|
327
|
-
spark.conf.set("spark.sql.join.preferSortMergeJoin", "false")
|
|
328
|
-
|
|
329
|
-
# Cartesian Product - cross join (avoid if possible)
|
|
330
|
-
result = df1.crossJoin(df2)
|
|
331
|
-
|
|
332
|
-
# Bucket Join - pre-bucketed tables (no shuffle)
|
|
333
|
-
# Requires saveAsTable with bucketBy
|
|
334
|
-
```
|
|
335
|
-
|
|
336
|
-
### Join Hints (Spark 3.0+)
|
|
337
|
-
|
|
338
|
-
```python
|
|
339
|
-
# Broadcast hint
|
|
340
|
-
result = df1.join(df2.hint("broadcast"), "key")
|
|
341
|
-
|
|
342
|
-
# Shuffle merge hint
|
|
343
|
-
result = df1.hint("merge").join(df2, "key")
|
|
344
|
-
|
|
345
|
-
# Shuffle hash hint
|
|
346
|
-
result = df1.hint("shuffle_hash").join(df2, "key")
|
|
347
|
-
|
|
348
|
-
# Shuffle replicate NL hint (for small-large joins)
|
|
349
|
-
result = df1.hint("shuffle_replicate_nl").join(df2, "key")
|
|
350
|
-
```
|
|
351
|
-
|
|
352
|
-
### Checking Join Plan
|
|
353
|
-
|
|
354
|
-
```python
|
|
355
|
-
# View physical plan
|
|
356
|
-
df1.join(df2, "key").explain(True)
|
|
357
|
-
|
|
358
|
-
# Look for:
|
|
359
|
-
# - BroadcastHashJoin (best for small tables)
|
|
360
|
-
# - SortMergeJoin (good for large-large joins)
|
|
361
|
-
# - BroadcastNestedLoopJoin (avoid, expensive)
|
|
362
|
-
# - CartesianProduct (avoid unless intentional)
|
|
363
|
-
```
|
|
364
|
-
|
|
365
|
-
---
|
|
366
|
-
|
|
367
|
-
## I/O Optimization
|
|
368
|
-
|
|
369
|
-
### Reading Data
|
|
370
|
-
|
|
371
|
-
```python
|
|
372
|
-
# Parquet (best for Spark)
|
|
373
|
-
df = spark.read.parquet("s3://bucket/data/")
|
|
374
|
-
|
|
375
|
-
# Optimize Parquet reading
|
|
376
|
-
spark.conf.set("spark.sql.parquet.filterPushdown", "true")
|
|
377
|
-
spark.conf.set("spark.sql.parquet.mergeSchema", "false") # Faster if schema consistent
|
|
378
|
-
|
|
379
|
-
# Partition pruning - filter on partition columns
|
|
380
|
-
df = spark.read.parquet("s3://bucket/data/") \
|
|
381
|
-
.filter(F.col("date") >= "2024-01-01") # Only reads matching partitions
|
|
382
|
-
|
|
383
|
-
# Column pruning - select only needed columns
|
|
384
|
-
df = spark.read.parquet("s3://bucket/data/").select("id", "name", "amount")
|
|
385
|
-
|
|
386
|
-
# Explicit schema (avoid inference)
|
|
387
|
-
df = spark.read.schema(my_schema).json("s3://bucket/data/")
|
|
388
|
-
```
|
|
389
|
-
|
|
390
|
-
### Writing Data
|
|
391
|
-
|
|
392
|
-
```python
|
|
393
|
-
# Aim for 128MB-256MB files; maxRecordsPerFile caps rows (not bytes) per file, so tune it to your average row size
|
|
394
|
-
spark.conf.set("spark.sql.files.maxRecordsPerFile", 1000000)
|
|
395
|
-
|
|
396
|
-
# Compaction for small files
|
|
397
|
-
df.coalesce(100).write.parquet("s3://bucket/output/")
|
|
398
|
-
|
|
399
|
-
# Partitioned writes
|
|
400
|
-
df.write.partitionBy("date").parquet("s3://bucket/output/")
|
|
401
|
-
|
|
402
|
-
# Bucketed writes (requires Hive metastore)
|
|
403
|
-
df.write.bucketBy(100, "user_id").sortBy("timestamp").saveAsTable("table")
|
|
404
|
-
|
|
405
|
-
# Compression
|
|
406
|
-
df.write.option("compression", "snappy").parquet("s3://bucket/output/")
|
|
407
|
-
```
|
|
408
|
-
|
|
409
|
-
### Small File Problem
|
|
410
|
-
|
|
411
|
-
```python
|
|
412
|
-
# Detect small files
|
|
413
|
-
file_list = spark.sparkContext._jvm.org.apache.hadoop.fs.FileSystem \
|
|
414
|
-
.get(spark.sparkContext._jsc.hadoopConfiguration()) \
|
|
415
|
-
.listStatus(spark.sparkContext._jvm.org.apache.hadoop.fs.Path("s3://bucket/data/"))
|
|
416
|
-
|
|
417
|
-
# Compact small files
|
|
418
|
-
df = spark.read.parquet("s3://bucket/small_files/")
|
|
419
|
-
df.coalesce(optimal_partition_count).write.parquet("s3://bucket/compacted/")
|
|
420
|
-
|
|
421
|
-
# Or use repartition for even distribution
|
|
422
|
-
df.repartition(100).write.parquet("s3://bucket/compacted/")
|
|
423
|
-
```
|
|
424
|
-
|
|
425
|
-
---
|
|
426
|
-
|
|
427
|
-
## Spark UI Deep Dive
|
|
428
|
-
|
|
429
|
-
### Jobs Tab
|
|
430
|
-
|
|
431
|
-
- **Job Duration**: Identify slow jobs
|
|
432
|
-
- **Stages**: Number of stages (more stages = more shuffles)
|
|
433
|
-
- **DAG Visualization**: Understand data flow
|
|
434
|
-
|
|
435
|
-
### Stages Tab
|
|
436
|
-
|
|
437
|
-
| Metric | Healthy | Action if Abnormal |
|
|
438
|
-
|--------|---------|-------------------|
|
|
439
|
-
| Duration | < 5 min per stage | Break up large stages |
|
|
440
|
-
| Tasks | Even distribution | Address skew |
|
|
441
|
-
| Shuffle Write | Minimize | Filter earlier, select fewer columns |
|
|
442
|
-
| Shuffle Read Blocked Time | Near 0 | Check network, increase parallelism |
|
|
443
|
-
| Spill (Disk) | 0 | Increase memory or partitions |
|
|
444
|
-
| GC Time | < 10% of task time | Tune GC, reduce cached data |
|
|
445
|
-
|
|
446
|
-
### Executors Tab
|
|
447
|
-
|
|
448
|
-
- **Storage Memory**: Cache usage
|
|
449
|
-
- **Shuffle Read/Write**: I/O patterns
|
|
450
|
-
- **GC Time**: Garbage collection overhead
|
|
451
|
-
- **Failed Tasks**: Executor failures
|
|
452
|
-
|
|
453
|
-
### SQL Tab
|
|
454
|
-
|
|
455
|
-
- **Duration**: Query execution time
|
|
456
|
-
- **Details**: Physical plan details
|
|
457
|
-
- **Metrics**: Input/output rows at each stage
|
|
458
|
-
|
|
459
|
-
### Storage Tab
|
|
460
|
-
|
|
461
|
-
- **Cached RDDs/DataFrames**: Size and partition distribution
|
|
462
|
-
- **Fraction Cached**: Should be 100%; a lower value means some partitions did not fit in memory or were evicted
|
|
463
|
-
|
|
464
|
-
---
|
|
465
|
-
|
|
466
|
-
## Common Configuration Template
|
|
467
|
-
|
|
468
|
-
```python
|
|
469
|
-
# Production configuration template
|
|
470
|
-
spark_configs = {
|
|
471
|
-
# Executor configuration
|
|
472
|
-
"spark.executor.instances": 50,
|
|
473
|
-
"spark.executor.cores": 5,
|
|
474
|
-
"spark.executor.memory": "16g",
|
|
475
|
-
"spark.executor.memoryOverhead": "2g",
|
|
476
|
-
|
|
477
|
-
# Driver configuration
|
|
478
|
-
"spark.driver.memory": "8g",
|
|
479
|
-
"spark.driver.maxResultSize": "4g",
|
|
480
|
-
|
|
481
|
-
# Shuffle configuration
|
|
482
|
-
"spark.sql.shuffle.partitions": 500,
|
|
483
|
-
"spark.shuffle.compress": "true",
|
|
484
|
-
"spark.io.compression.codec": "lz4",
|
|
485
|
-
|
|
486
|
-
# SQL optimization
|
|
487
|
-
"spark.sql.adaptive.enabled": "true",
|
|
488
|
-
"spark.sql.adaptive.coalescePartitions.enabled": "true",
|
|
489
|
-
"spark.sql.adaptive.skewJoin.enabled": "true",
|
|
490
|
-
"spark.sql.autoBroadcastJoinThreshold": str(200 * 1024 * 1024), # 200MB
|
|
491
|
-
|
|
492
|
-
# Serialization
|
|
493
|
-
"spark.serializer": "org.apache.spark.serializer.KryoSerializer",
|
|
494
|
-
|
|
495
|
-
# Dynamic allocation
|
|
496
|
-
"spark.dynamicAllocation.enabled": "true",
|
|
497
|
-
"spark.dynamicAllocation.minExecutors": 5,
|
|
498
|
-
"spark.dynamicAllocation.maxExecutors": 100,
|
|
499
|
-
}
|
|
500
|
-
|
|
501
|
-
for key, value in spark_configs.items():
|
|
502
|
-
spark.conf.set(key, value)
|
|
503
|
-
```
|
|
504
|
-
|
|
505
|
-
---
|
|
506
|
-
|
|
507
|
-
## Troubleshooting Decision Tree
|
|
508
|
-
|
|
509
|
-
```
|
|
510
|
-
Slow Spark Job
|
|
511
|
-
├── Long GC Time (> 10%)?
|
|
512
|
-
│ ├── Yes → Increase executor memory or reduce cache
|
|
513
|
-
│ └── No → Continue
|
|
514
|
-
├── Shuffle Spill to Disk?
|
|
515
|
-
│ ├── Yes → Increase partitions or memory
|
|
516
|
-
│ └── No → Continue
|
|
517
|
-
├── Uneven Task Duration?
|
|
518
|
-
│ ├── Yes → Data skew, use salting or AQE
|
|
519
|
-
│ └── No → Continue
|
|
520
|
-
├── Long Shuffle Read Time?
|
|
521
|
-
│ ├── Yes → Network bottleneck, increase locality
|
|
522
|
-
│ └── No → Continue
|
|
523
|
-
├── Large Shuffle Size?
|
|
524
|
-
│ ├── Yes → Filter earlier, broadcast small tables
|
|
525
|
-
│ └── No → Continue
|
|
526
|
-
└── Too Many Small Tasks?
|
|
527
|
-
├── Yes → Reduce partitions with coalesce
|
|
528
|
-
└── No → Check for code-level optimizations
|
|
529
|
-
```
|
|
530
|
-
|
|
531
|
-
---
|
|
532
|
-
|
|
533
|
-
## Best Practices Summary
|
|
534
|
-
|
|
535
|
-
1. **Size executors appropriately** - 5 cores, 16GB memory typical
|
|
536
|
-
2. **Enable AQE (Spark 3.x)** - Automatic optimization for partitions and skew
|
|
537
|
-
3. **Tune shuffle partitions** - Based on data size, not default 200
|
|
538
|
-
4. **Address data skew** - Salt keys or use AQE automatic handling
|
|
539
|
-
5. **Monitor Spark UI** - Check shuffle, spill, GC metrics
|
|
540
|
-
6. **Use broadcast joins** - For tables under 200MB
|
|
541
|
-
7. **Filter and select early** - Reduce data before shuffle
|
|
542
|
-
8. **Avoid UDFs** - Use built-in functions (10-100x faster)
|
|
543
|
-
9. **Cache strategically** - Only reused data, unpersist when done
|
|
544
|
-
10. **Test at scale** - Performance varies significantly with data volume
|
|
1
|
+
# Performance Tuning
|
|
2
|
+
|
|
3
|
+
---
|
|
4
|
+
|
|
5
|
+
## Cluster Sizing
|
|
6
|
+
|
|
7
|
+
### Executor Configuration
|
|
8
|
+
|
|
9
|
+
```python
|
|
10
|
+
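# Key executor configurations (launch-time settings: pass them via spark-submit --conf or SparkSession.builder.config(); Spark 3.x rejects changing them with spark.conf.set() on a running session)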
|
|
11
|
+
spark.conf.set("spark.executor.instances", 10) # Number of executors
|
|
12
|
+
spark.conf.set("spark.executor.cores", 4) # Cores per executor
|
|
13
|
+
spark.conf.set("spark.executor.memory", "16g") # Memory per executor
|
|
14
|
+
|
|
15
|
+
# Dynamic allocation (recommended for varying workloads)
|
|
16
|
+
spark.conf.set("spark.dynamicAllocation.enabled", "true")
|
|
17
|
+
spark.conf.set("spark.dynamicAllocation.minExecutors", 2)
|
|
18
|
+
spark.conf.set("spark.dynamicAllocation.maxExecutors", 100)
|
|
19
|
+
spark.conf.set("spark.dynamicAllocation.executorIdleTimeout", "60s")
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
### Sizing Guidelines
|
|
23
|
+
|
|
24
|
+
| Cluster Size | Executor Memory | Executor Cores | Instances |
|
|
25
|
+
|--------------|-----------------|----------------|-----------|
|
|
26
|
+
| Small (dev) | 4-8GB | 2-4 | 2-5 |
|
|
27
|
+
| Medium | 8-16GB | 4-5 | 10-50 |
|
|
28
|
+
| Large | 16-32GB | 5-8 | 50-200 |
|
|
29
|
+
| Very Large | 32-64GB | 8-16 | 200+ |
|
|
30
|
+
|
|
31
|
+
**Rules of thumb:**
|
|
32
|
+
- 5 cores per executor is optimal (avoids HDFS I/O bottleneck)
|
|
33
|
+
- Leave 1 core per node for OS/YARN
|
|
34
|
+
- Leave 1GB per node for overhead
|
|
35
|
+
- executor.memoryOverhead = max(384MB, 10% of executor.memory)
|
|
36
|
+
|
|
37
|
+
### Memory Configuration
|
|
38
|
+
|
|
39
|
+
```python
|
|
40
|
+
# Executor memory breakdown
|
|
41
|
+
spark.conf.set("spark.executor.memory", "16g")
|
|
42
|
+
spark.conf.set("spark.executor.memoryOverhead", "2g") # For off-heap, network buffers
|
|
43
|
+
|
|
44
|
+
# Memory fractions (default values usually good)
|
|
45
|
+
spark.conf.set("spark.memory.fraction", 0.6) # Unified memory pool
|
|
46
|
+
spark.conf.set("spark.memory.storageFraction", 0.5) # Cache vs execution split
|
|
47
|
+
|
|
48
|
+
# Off-heap memory (for large data)
|
|
49
|
+
spark.conf.set("spark.memory.offHeap.enabled", "true")
|
|
50
|
+
spark.conf.set("spark.memory.offHeap.size", "8g")
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
---
|
|
54
|
+
|
|
55
|
+
## Shuffle Optimization
|
|
56
|
+
|
|
57
|
+
### Shuffle Configuration
|
|
58
|
+
|
|
59
|
+
```python
|
|
60
|
+
# Number of shuffle partitions
|
|
61
|
+
spark.conf.set("spark.sql.shuffle.partitions", 200) # Adjust based on data size
|
|
62
|
+
|
|
63
|
+
# Shuffle behavior
|
|
64
|
+
spark.conf.set("spark.shuffle.compress", "true") # Compress shuffle data
|
|
65
|
+
spark.conf.set("spark.shuffle.spill.compress", "true") # Compress spill data
|
|
66
|
+
spark.conf.set("spark.io.compression.codec", "lz4") # Fast compression
|
|
67
|
+
|
|
68
|
+
# Shuffle file management
|
|
69
|
+
spark.conf.set("spark.shuffle.file.buffer", "64k") # Buffer for shuffle writes
|
|
70
|
+
spark.conf.set("spark.shuffle.io.maxRetries", 3) # Retry failed fetches
|
|
71
|
+
spark.conf.set("spark.shuffle.io.retryWait", "5s") # Wait between retries
|
|
72
|
+
|
|
73
|
+
# Sort-based shuffle (default in Spark 2.0+)
|
|
74
|
+
spark.conf.set("spark.shuffle.sort.bypassMergeThreshold", 200)
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
### Reducing Shuffle Size
|
|
78
|
+
|
|
79
|
+
```python
|
|
80
|
+
from pyspark.sql import functions as F
|
|
81
|
+
|
|
82
|
+
# 1. Filter before join/aggregation
|
|
83
|
+
df_filtered = df.filter(F.col("date") >= "2024-01-01")
|
|
84
|
+
result = df_filtered.groupBy("category").count()
|
|
85
|
+
|
|
86
|
+
# 2. Use broadcast for small tables
|
|
87
|
+
from pyspark.sql.functions import broadcast
|
|
88
|
+
result = large_df.join(broadcast(small_df), "key") # No shuffle for small_df
|
|
89
|
+
|
|
90
|
+
# 3. Select only needed columns before shuffle
|
|
91
|
+
df_slim = df.select("key", "value") # Not all 50 columns
|
|
92
|
+
result = df_slim.groupBy("key").sum("value")
|
|
93
|
+
|
|
94
|
+
# 4. Use reduceByKey over groupByKey (RDD)
|
|
95
|
+
# BAD: groupByKey shuffles all values
|
|
96
|
+
counts = rdd.groupByKey().mapValues(len)
|
|
97
|
+
# GOOD: reduceByKey combines locally first
|
|
98
|
+
counts = rdd.mapValues(lambda _: 1).reduceByKey(lambda a, b: a + b)
|
|
99
|
+
|
|
100
|
+
# 5. Coalesce after a filter that shrinks the data
|
|
101
|
+
df_filtered = df.filter(condition).coalesce(50) # Reduce partitions without shuffle
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
### Spark UI Shuffle Metrics
|
|
105
|
+
|
|
106
|
+
In Stages tab, check:
|
|
107
|
+
- **Shuffle Write Size**: Total data written for shuffle
|
|
108
|
+
- **Shuffle Read Size**: Total data read from shuffle
|
|
109
|
+
- **Shuffle Read Blocked Time**: Time waiting for shuffle data
|
|
110
|
+
- **Shuffle Spill (Memory)**: In-memory (deserialized) size of the data that had to be spilled
|
|
111
|
+
- **Shuffle Spill (Disk)**: Data spilled to disk (bad, increase memory)
|
|
112
|
+
|
|
113
|
+
---
|
|
114
|
+
|
|
115
|
+
## Data Skew Handling
|
|
116
|
+
|
|
117
|
+
### Identifying Skew
|
|
118
|
+
|
|
119
|
+
```python
|
|
120
|
+
from pyspark.sql import functions as F
|
|
121
|
+
|
|
122
|
+
# Check key distribution
|
|
123
|
+
key_counts = df.groupBy("join_key").count()
|
|
124
|
+
key_counts.orderBy(F.desc("count")).show(20)
|
|
125
|
+
|
|
126
|
+
# Summary statistics
|
|
127
|
+
stats = key_counts.agg(
|
|
128
|
+
F.min("count").alias("min"),
|
|
129
|
+
F.max("count").alias("max"),
|
|
130
|
+
F.avg("count").alias("avg"),
|
|
131
|
+
F.percentile_approx("count", 0.99).alias("p99")
|
|
132
|
+
)
|
|
133
|
+
stats.show()
|
|
134
|
+
|
|
135
|
+
# Skew ratio: max/avg > 10 indicates severe skew
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
**Spark UI indicators:**
|
|
139
|
+
- Few tasks taking much longer than others
|
|
140
|
+
- Task duration histogram shows long tail
|
|
141
|
+
- Some partitions much larger than others
|
|
142
|
+
|
|
143
|
+
### Skew Solutions
|
|
144
|
+
|
|
145
|
+
#### 1. Adaptive Query Execution (Spark 3.x)
|
|
146
|
+
|
|
147
|
+
```python
|
|
148
|
+
# Enable AQE skew handling
|
|
149
|
+
spark.conf.set("spark.sql.adaptive.enabled", "true")
|
|
150
|
+
spark.conf.set("spark.sql.adaptive.skewJoin.enabled", "true")
|
|
151
|
+
spark.conf.set("spark.sql.adaptive.skewJoin.skewedPartitionFactor", 5)
|
|
152
|
+
spark.conf.set("spark.sql.adaptive.skewJoin.skewedPartitionThresholdInBytes", "256MB")
|
|
153
|
+
|
|
154
|
+
# AQE will automatically split skewed partitions
|
|
155
|
+
result = large_df.join(another_df, "key")
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
#### 2. Salting Technique
|
|
159
|
+
|
|
160
|
+
```python
|
|
161
|
+
from pyspark.sql import functions as F
|
|
162
|
+
|
|
163
|
+
# Identify skewed keys
|
|
164
|
+
skewed_keys = ["NULL", "UNKNOWN", "DEFAULT"]
|
|
165
|
+
salt_buckets = 20
|
|
166
|
+
|
|
167
|
+
# Salt the skewed keys in large table
|
|
168
|
+
large_salted = large_df.withColumn(
|
|
169
|
+
"salted_key",
|
|
170
|
+
F.when(
|
|
171
|
+
F.col("join_key").isin(skewed_keys),
|
|
172
|
+
F.concat(F.col("join_key"), F.lit("_"), (F.rand() * salt_buckets).cast("int").cast("string"))
|
|
173
|
+
).otherwise(F.col("join_key"))
|
|
174
|
+
)
|
|
175
|
+
|
|
176
|
+
# Explode small table for skewed keys only
|
|
177
|
+
from pyspark.sql.functions import explode, array, lit, when
|
|
178
|
+
|
|
179
|
+
small_exploded = small_df.withColumn(
|
|
180
|
+
"salted_key",
|
|
181
|
+
F.explode(F.when(
|
|
182
|
+
F.col("join_key").isin(skewed_keys),
|
|
183
|
+
F.array([F.concat(F.col("join_key"), F.lit("_"), F.lit(str(i))) for i in range(salt_buckets)])
|
|
184
|
+
).otherwise(F.array(F.col("join_key"))))
|
|
185
|
+
)
|
|
186
|
+
|
|
187
|
+
# Join on salted key
|
|
188
|
+
result = large_salted.join(small_exploded, "salted_key")
|
|
189
|
+
```
|
|
190
|
+
|
|
191
|
+
#### 3. Broadcast Join for Skewed Keys
|
|
192
|
+
|
|
193
|
+
```python
|
|
194
|
+
from pyspark.sql.functions import broadcast
|
|
195
|
+
|
|
196
|
+
# Separate skewed and non-skewed data
|
|
197
|
+
skewed_keys = ["NULL", "UNKNOWN"]
|
|
198
|
+
|
|
199
|
+
large_skewed = large_df.filter(F.col("join_key").isin(skewed_keys))
|
|
200
|
+
large_normal = large_df.filter(~F.col("join_key").isin(skewed_keys))
|
|
201
|
+
|
|
202
|
+
small_skewed = small_df.filter(F.col("join_key").isin(skewed_keys))
|
|
203
|
+
small_normal = small_df.filter(~F.col("join_key").isin(skewed_keys))
|
|
204
|
+
|
|
205
|
+
# Broadcast join for skewed keys (the filtered small side is tiny, so the large side is not shuffled)
|
|
206
|
+
result_skewed = large_skewed.join(broadcast(small_skewed), "join_key")
|
|
207
|
+
|
|
208
|
+
# Regular join for non-skewed
|
|
209
|
+
result_normal = large_normal.join(small_normal, "join_key")
|
|
210
|
+
|
|
211
|
+
# Union results
|
|
212
|
+
final_result = result_skewed.union(result_normal)
|
|
213
|
+
```
|
|
214
|
+
|
|
215
|
+
#### 4. Iterative Broadcast for Large Skewed Keys
|
|
216
|
+
|
|
217
|
+
```python
|
|
218
|
+
# For extremely skewed single keys
|
|
219
|
+
skewed_key_value = "NULL"
|
|
220
|
+
|
|
221
|
+
# Process skewed key separately with broadcast
|
|
222
|
+
skewed_large = large_df.filter(F.col("join_key") == skewed_key_value)
|
|
223
|
+
skewed_small = small_df.filter(F.col("join_key") == skewed_key_value)
|
|
224
|
+
result_skewed = skewed_large.join(broadcast(skewed_small), "join_key")
|
|
225
|
+
|
|
226
|
+
# Process rest normally
|
|
227
|
+
normal_large = large_df.filter(F.col("join_key") != skewed_key_value)
|
|
228
|
+
normal_small = small_df.filter(F.col("join_key") != skewed_key_value)
|
|
229
|
+
result_normal = normal_large.join(normal_small, "join_key")
|
|
230
|
+
|
|
231
|
+
# Combine
|
|
232
|
+
final = result_skewed.union(result_normal)
|
|
233
|
+
```
|
|
234
|
+
|
|
235
|
+
---
|
|
236
|
+
|
|
237
|
+
## Memory Tuning
|
|
238
|
+
|
|
239
|
+
### Memory Pressure Symptoms
|
|
240
|
+
|
|
241
|
+
| Symptom | Cause | Solution |
|
|
242
|
+
|---------|-------|----------|
|
|
243
|
+
| Long GC pauses | Too much cached data | Reduce cache, use serialized storage |
|
|
244
|
+
| Spill to disk | Partitions too large | Increase partitions, add memory |
|
|
245
|
+
| OOM on driver | Large collect/broadcast | Reduce data to driver |
|
|
246
|
+
| OOM on executor | Large partitions | Repartition, increase memory |
|
|
247
|
+
|
|
248
|
+
### Garbage Collection Tuning
|
|
249
|
+
|
|
250
|
+
```python
|
|
251
|
+
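# GC options are JVM launch flags: pass them with spark-submit --conf (or SparkSession.builder.config() before the session starts); spark.conf.set() on a running session cannot apply them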
|
|
252
|
+
# For executor JVM
|
|
253
|
+
spark.conf.set("spark.executor.extraJavaOptions",
|
|
254
|
+
"-XX:+UseG1GC -XX:InitiatingHeapOccupancyPercent=35 -XX:ConcGCThreads=4")
|
|
255
|
+
|
|
256
|
+
# For driver JVM
|
|
257
|
+
spark.conf.set("spark.driver.extraJavaOptions",
|
|
258
|
+
"-XX:+UseG1GC -XX:InitiatingHeapOccupancyPercent=35")
|
|
259
|
+
|
|
260
|
+
# Monitor GC in Spark UI
|
|
261
|
+
# Executors tab shows GC Time for each executor
|
|
262
|
+
# Target: GC Time < 10% of total task time
|
|
263
|
+
```
|
|
264
|
+
|
|
265
|
+
### Reducing Memory Pressure
|
|
266
|
+
|
|
267
|
+
```python
|
|
268
|
+
# 1. Use serialized caching
|
|
269
|
+
from pyspark import StorageLevel
|
|
270
|
+
df.persist(StorageLevel.MEMORY_AND_DISK)  # PySpark always stores data serialized; the *_SER levels do not exist in PySpark 3.x
|
|
271
|
+
|
|
272
|
+
# 2. Kryo serialization (faster, more compact)
|
|
273
|
+
spark.conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
|
|
274
|
+
|
|
275
|
+
# 3. Avoid UDFs that create objects
|
|
276
|
+
# BAD: Creates Python objects
|
|
277
|
+
@udf("string")
|
|
278
|
+
def process(x):
|
|
279
|
+
return x.upper() # String allocation
|
|
280
|
+
|
|
281
|
+
# GOOD: Use built-in
|
|
282
|
+
df.withColumn("upper", F.upper("column"))
|
|
283
|
+
|
|
284
|
+
# 4. Use mapPartitions with generators
|
|
285
|
+
def efficient_process(iterator):
|
|
286
|
+
for row in iterator:
|
|
287
|
+
yield transform(row) # No list allocation
|
|
288
|
+
|
|
289
|
+
result = df.rdd.mapPartitions(efficient_process)
|
|
290
|
+
|
|
291
|
+
# 5. Release cached data promptly
|
|
292
|
+
df.unpersist()
|
|
293
|
+
```
|
|
294
|
+
|
|
295
|
+
### Driver Memory Issues
|
|
296
|
+
|
|
297
|
+
```python
|
|
298
|
+
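# Increase driver memory (must be set at launch, e.g. spark-submit --driver-memory 8g; it cannot be changed on a running session)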
|
|
299
|
+
spark.conf.set("spark.driver.memory", "8g")
|
|
300
|
+
spark.conf.set("spark.driver.maxResultSize", "4g")
|
|
301
|
+
|
|
302
|
+
# Avoid large collects
|
|
303
|
+
# BAD
|
|
304
|
+
all_data = df.collect() # Pulls everything to driver
|
|
305
|
+
|
|
306
|
+
# GOOD
|
|
307
|
+
sample = df.take(1000) # Small sample
|
|
308
|
+
df.write.parquet("s3://output/") # Write distributed
|
|
309
|
+
```
|
|
310
|
+
|
|
311
|
+
---
|
|
312
|
+
|
|
313
|
+
## Join Optimization
|
|
314
|
+
|
|
315
|
+
### Join Strategy Selection
|
|
316
|
+
|
|
317
|
+
```python
|
|
318
|
+
# Broadcast Hash Join - small table (< 200MB)
|
|
319
|
+
from pyspark.sql.functions import broadcast
|
|
320
|
+
result = large.join(broadcast(small), "key")
|
|
321
|
+
|
|
322
|
+
# Sort Merge Join - large tables, equi-join
|
|
323
|
+
# Default for non-broadcast joins
|
|
324
|
+
result = large1.join(large2, "key")
|
|
325
|
+
|
|
326
|
+
# Shuffle Hash Join - medium tables, memory-constrained
|
|
327
|
+
spark.conf.set("spark.sql.join.preferSortMergeJoin", "false")
|
|
328
|
+
|
|
329
|
+
# Cartesian Product - cross join (avoid if possible)
|
|
330
|
+
result = df1.crossJoin(df2)
|
|
331
|
+
|
|
332
|
+
# Bucket Join - pre-bucketed tables (no shuffle)
|
|
333
|
+
# Requires saveAsTable with bucketBy
|
|
334
|
+
```
|
|
335
|
+
|
|
336
|
+
### Join Hints (Spark 3.0+)
|
|
337
|
+
|
|
338
|
+
```python
|
|
339
|
+
# Broadcast hint
|
|
340
|
+
result = df1.join(df2.hint("broadcast"), "key")
|
|
341
|
+
|
|
342
|
+
# Shuffle merge hint
|
|
343
|
+
result = df1.hint("merge").join(df2, "key")
|
|
344
|
+
|
|
345
|
+
# Shuffle hash hint
|
|
346
|
+
result = df1.hint("shuffle_hash").join(df2, "key")
|
|
347
|
+
|
|
348
|
+
# Shuffle replicate NL hint (for small-large joins)
|
|
349
|
+
result = df1.hint("shuffle_replicate_nl").join(df2, "key")
|
|
350
|
+
```
|
|
351
|
+
|
|
352
|
+
### Checking Join Plan
|
|
353
|
+
|
|
354
|
+
```python
|
|
355
|
+
# View physical plan
|
|
356
|
+
df1.join(df2, "key").explain(True)
|
|
357
|
+
|
|
358
|
+
# Look for:
|
|
359
|
+
# - BroadcastHashJoin (best for small tables)
|
|
360
|
+
# - SortMergeJoin (good for large-large joins)
|
|
361
|
+
# - BroadcastNestedLoopJoin (avoid, expensive)
|
|
362
|
+
# - CartesianProduct (avoid unless intentional)
|
|
363
|
+
```
|
|
364
|
+
|
|
365
|
+
---
|
|
366
|
+
|
|
367
|
+
## I/O Optimization
|
|
368
|
+
|
|
369
|
+
### Reading Data
|
|
370
|
+
|
|
371
|
+
```python
|
|
372
|
+
# Parquet (best for Spark)
|
|
373
|
+
df = spark.read.parquet("s3://bucket/data/")
|
|
374
|
+
|
|
375
|
+
# Optimize Parquet reading
|
|
376
|
+
spark.conf.set("spark.sql.parquet.filterPushdown", "true")
|
|
377
|
+
spark.conf.set("spark.sql.parquet.mergeSchema", "false") # Faster if schema consistent
|
|
378
|
+
|
|
379
|
+
# Partition pruning - filter on partition columns
|
|
380
|
+
df = spark.read.parquet("s3://bucket/data/") \
|
|
381
|
+
.filter(F.col("date") >= "2024-01-01") # Only reads matching partitions
|
|
382
|
+
|
|
383
|
+
# Column pruning - select only needed columns
|
|
384
|
+
df = spark.read.parquet("s3://bucket/data/").select("id", "name", "amount")
|
|
385
|
+
|
|
386
|
+
# Explicit schema (avoid inference)
|
|
387
|
+
df = spark.read.schema(my_schema).json("s3://bucket/data/")
|
|
388
|
+
```
|
|
389
|
+
|
|
390
|
+
### Writing Data
|
|
391
|
+
|
|
392
|
+
```python
|
|
393
|
+
# Cap rows per output file (pick a value that yields roughly 128MB-256MB files for your row width)
|
|
394
|
+
spark.conf.set("spark.sql.files.maxRecordsPerFile", 1000000)
|
|
395
|
+
|
|
396
|
+
# Compaction for small files
|
|
397
|
+
df.coalesce(100).write.parquet("s3://bucket/output/")
|
|
398
|
+
|
|
399
|
+
# Partitioned writes
|
|
400
|
+
df.write.partitionBy("date").parquet("s3://bucket/output/")
|
|
401
|
+
|
|
402
|
+
# Bucketed writes (requires Hive metastore)
|
|
403
|
+
df.write.bucketBy(100, "user_id").sortBy("timestamp").saveAsTable("table")
|
|
404
|
+
|
|
405
|
+
# Compression
|
|
406
|
+
df.write.option("compression", "snappy").parquet("s3://bucket/output/")
|
|
407
|
+
```
|
|
408
|
+
|
|
409
|
+
### Small File Problem
|
|
410
|
+
|
|
411
|
+
```python
|
|
412
|
+
# Detect small files
|
|
413
|
+
data_path = spark.sparkContext._jvm.org.apache.hadoop.fs.Path("s3://bucket/data/")
|
|
414
|
+
file_list = data_path.getFileSystem(spark.sparkContext._jsc.hadoopConfiguration()) \
|
|
415
|
+
.listStatus(data_path)  # resolve the filesystem from the path; FileSystem.get(conf) returns the default FS
|
|
416
|
+
|
|
417
|
+
# Compact small files
|
|
418
|
+
df = spark.read.parquet("s3://bucket/small_files/")
|
|
419
|
+
df.coalesce(optimal_partition_count).write.parquet("s3://bucket/compacted/")
|
|
420
|
+
|
|
421
|
+
# Or use repartition for even distribution
|
|
422
|
+
df.repartition(100).write.parquet("s3://bucket/compacted/")
|
|
423
|
+
```
|
|
424
|
+
|
|
425
|
+
---
|
|
426
|
+
|
|
427
|
+
## Spark UI Deep Dive
|
|
428
|
+
|
|
429
|
+
### Jobs Tab
|
|
430
|
+
|
|
431
|
+
- **Job Duration**: Identify slow jobs
|
|
432
|
+
- **Stages**: Number of stages (more stages = more shuffles)
|
|
433
|
+
- **DAG Visualization**: Understand data flow
|
|
434
|
+
|
|
435
|
+
### Stages Tab
|
|
436
|
+
|
|
437
|
+
| Metric | Healthy | Action if Abnormal |
|
|
438
|
+
|--------|---------|-------------------|
|
|
439
|
+
| Duration | < 5 min per stage | Break up large stages |
|
|
440
|
+
| Tasks | Even distribution | Address skew |
|
|
441
|
+
| Shuffle Write | Minimize | Filter earlier, select fewer columns |
|
|
442
|
+
| Shuffle Read Blocked Time | Near 0 | Check network, increase parallelism |
|
|
443
|
+
| Spill (Disk) | 0 | Increase memory or partitions |
|
|
444
|
+
| GC Time | < 10% of task time | Tune GC, reduce cached data |
|
|
445
|
+
|
|
446
|
+
### Executors Tab
|
|
447
|
+
|
|
448
|
+
- **Storage Memory**: Cache usage
|
|
449
|
+
- **Shuffle Read/Write**: I/O patterns
|
|
450
|
+
- **GC Time**: Garbage collection overhead
|
|
451
|
+
- **Failed Tasks**: Executor failures
|
|
452
|
+
|
|
453
|
+
### SQL Tab
|
|
454
|
+
|
|
455
|
+
- **Duration**: Query execution time
|
|
456
|
+
- **Details**: Physical plan details
|
|
457
|
+
- **Metrics**: Input/output rows at each stage
|
|
458
|
+
|
|
459
|
+
### Storage Tab
|
|
460
|
+
|
|
461
|
+
- **Cached RDDs/DataFrames**: Size and partition distribution
|
|
462
|
+
- **Fraction Cached**: Should be 100%
|
|
463
|
+
|
|
464
|
+
---
|
|
465
|
+
|
|
466
|
+
## Common Configuration Template
|
|
467
|
+
|
|
468
|
+
```python
|
|
469
|
+
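# Production configuration template. Executor, driver, serializer and dynamic-allocation keys are fixed at launch: supply them via spark-submit --conf or SparkSession.builder.config(); only runtime SQL keys (spark.sql.*) can be changed with spark.conf.set() on a running session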
|
|
470
|
+
spark_configs = {
|
|
471
|
+
# Executor configuration
|
|
472
|
+
"spark.executor.instances": 50,
|
|
473
|
+
"spark.executor.cores": 5,
|
|
474
|
+
"spark.executor.memory": "16g",
|
|
475
|
+
"spark.executor.memoryOverhead": "2g",
|
|
476
|
+
|
|
477
|
+
# Driver configuration
|
|
478
|
+
"spark.driver.memory": "8g",
|
|
479
|
+
"spark.driver.maxResultSize": "4g",
|
|
480
|
+
|
|
481
|
+
# Shuffle configuration
|
|
482
|
+
"spark.sql.shuffle.partitions": 500,
|
|
483
|
+
"spark.shuffle.compress": "true",
|
|
484
|
+
"spark.io.compression.codec": "lz4",
|
|
485
|
+
|
|
486
|
+
# SQL optimization
|
|
487
|
+
"spark.sql.adaptive.enabled": "true",
|
|
488
|
+
"spark.sql.adaptive.coalescePartitions.enabled": "true",
|
|
489
|
+
"spark.sql.adaptive.skewJoin.enabled": "true",
|
|
490
|
+
"spark.sql.autoBroadcastJoinThreshold": str(200 * 1024 * 1024), # 200MB
|
|
491
|
+
|
|
492
|
+
# Serialization
|
|
493
|
+
"spark.serializer": "org.apache.spark.serializer.KryoSerializer",
|
|
494
|
+
|
|
495
|
+
# Dynamic allocation
|
|
496
|
+
"spark.dynamicAllocation.enabled": "true",
|
|
497
|
+
"spark.dynamicAllocation.minExecutors": 5,
|
|
498
|
+
"spark.dynamicAllocation.maxExecutors": 100,
|
|
499
|
+
}
|
|
500
|
+
|
|
501
|
+
for key, value in spark_configs.items():
|
|
502
|
+
spark.conf.set(key, value)
|
|
503
|
+
```
|
|
504
|
+
|
|
505
|
+
---
|
|
506
|
+
|
|
507
|
+
## Troubleshooting Decision Tree
|
|
508
|
+
|
|
509
|
+
```
|
|
510
|
+
Slow Spark Job
|
|
511
|
+
├── Long GC Time (> 10%)?
|
|
512
|
+
│ ├── Yes → Increase executor memory or reduce cache
|
|
513
|
+
│ └── No → Continue
|
|
514
|
+
├── Shuffle Spill to Disk?
|
|
515
|
+
│ ├── Yes → Increase partitions or memory
|
|
516
|
+
│ └── No → Continue
|
|
517
|
+
├── Uneven Task Duration?
|
|
518
|
+
│ ├── Yes → Data skew, use salting or AQE
|
|
519
|
+
│ └── No → Continue
|
|
520
|
+
├── Long Shuffle Read Time?
|
|
521
|
+
│ ├── Yes → Network bottleneck, increase locality
|
|
522
|
+
│ └── No → Continue
|
|
523
|
+
├── Large Shuffle Size?
|
|
524
|
+
│ ├── Yes → Filter earlier, broadcast small tables
|
|
525
|
+
│ └── No → Continue
|
|
526
|
+
└── Too Many Small Tasks?
|
|
527
|
+
├── Yes → Reduce partitions with coalesce
|
|
528
|
+
└── No → Check for code-level optimizations
|
|
529
|
+
```
|
|
530
|
+
|
|
531
|
+
---
|
|
532
|
+
|
|
533
|
+
## Best Practices Summary
|
|
534
|
+
|
|
535
|
+
1. **Size executors appropriately** - 5 cores, 16GB memory typical
|
|
536
|
+
2. **Enable AQE (Spark 3.x)** - Automatic optimization for partitions and skew
|
|
537
|
+
3. **Tune shuffle partitions** - Based on data size, not default 200
|
|
538
|
+
4. **Address data skew** - Salt keys or use AQE automatic handling
|
|
539
|
+
5. **Monitor Spark UI** - Check shuffle, spill, GC metrics
|
|
540
|
+
6. **Use broadcast joins** - For tables under 200MB
|
|
541
|
+
7. **Filter and select early** - Reduce data before shuffle
|
|
542
|
+
8. **Avoid UDFs** - Use built-in functions (10-100x faster)
|
|
543
|
+
9. **Cache strategically** - Only reused data, unpersist when done
|
|
544
|
+
10. **Test at scale** - Performance varies significantly with data volume
|