aigroup-workflow 2.2.1 → 2.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/commands/fix-build.md +10 -5
- package/.claude/commands/init-project.md +13 -8
- package/.claude/commands/plan.md +15 -8
- package/.claude/commands/review.md +12 -6
- package/.claude/commands/tdd.md +11 -5
- package/.claude/commands/workflow-start.md +20 -11
- package/.claude/settings.json +28 -0
- package/.codex/agents/architect.toml +207 -0
- package/.codex/agents/build-error-resolver.toml +110 -0
- package/.codex/agents/code-reviewer.toml +233 -0
- package/.codex/agents/doc-updater.toml +103 -0
- package/.codex/agents/e2e-runner.toml +103 -0
- package/.codex/agents/get-current-datetime.toml +23 -0
- package/.codex/agents/init-architect.toml +181 -0
- package/.codex/agents/planner.toml +208 -0
- package/.codex/agents/refactor-cleaner.toml +81 -0
- package/.codex/agents/rust-reviewer.toml +90 -0
- package/.codex/agents/security-reviewer.toml +104 -0
- package/.codex/agents/tdd-guide.toml +87 -0
- package/AGENTS.md +2 -2
- package/CLAUDE.md +23 -1
- package/LICENSE +20 -20
- package/README.md +333 -333
- package/agents/a11y-architect.md +141 -141
- package/agents/architect.md +211 -211
- package/agents/build-error-resolver.md +114 -114
- package/agents/chief-of-staff.md +151 -151
- package/agents/code-architect.md +71 -71
- package/agents/code-explorer.md +69 -69
- package/agents/code-reviewer.md +237 -237
- package/agents/code-simplifier.md +47 -47
- package/agents/comment-analyzer.md +45 -45
- package/agents/conversation-analyzer.md +52 -52
- package/agents/cpp-build-resolver.md +90 -90
- package/agents/cpp-reviewer.md +72 -72
- package/agents/csharp-reviewer.md +101 -101
- package/agents/dart-build-resolver.md +201 -201
- package/agents/database-reviewer.md +91 -91
- package/agents/doc-updater.md +107 -107
- package/agents/docs-lookup.md +68 -68
- package/agents/e2e-runner.md +107 -107
- package/agents/flutter-reviewer.md +243 -243
- package/agents/gan-evaluator.md +209 -209
- package/agents/gan-generator.md +131 -131
- package/agents/gan-planner.md +99 -99
- package/agents/get-current-datetime.md +26 -26
- package/agents/go-build-resolver.md +94 -94
- package/agents/go-reviewer.md +76 -76
- package/agents/harness-optimizer.md +35 -35
- package/agents/healthcare-reviewer.md +83 -83
- package/agents/java-build-resolver.md +153 -153
- package/agents/java-reviewer.md +92 -92
- package/agents/kotlin-build-resolver.md +118 -118
- package/agents/kotlin-reviewer.md +159 -159
- package/agents/loop-operator.md +36 -36
- package/agents/opensource-forker.md +198 -198
- package/agents/opensource-packager.md +249 -249
- package/agents/opensource-sanitizer.md +188 -188
- package/agents/performance-optimizer.md +446 -446
- package/agents/planner.md +212 -212
- package/agents/pr-test-analyzer.md +45 -45
- package/agents/python-reviewer.md +98 -98
- package/agents/pytorch-build-resolver.md +120 -120
- package/agents/refactor-cleaner.md +85 -85
- package/agents/rust-build-resolver.md +148 -148
- package/agents/rust-reviewer.md +94 -94
- package/agents/security-reviewer.md +108 -108
- package/agents/seo-specialist.md +59 -59
- package/agents/silent-failure-hunter.md +50 -50
- package/agents/tdd-guide.md +91 -91
- package/agents/type-design-analyzer.md +41 -41
- package/agents/typescript-reviewer.md +112 -112
- package/cli/commands/update.mjs +1 -1
- package/cli/utils/scaffold.mjs +53 -0
- package/docs/rules/agents.md +166 -50
- package/docs/rules/cpp/coding-style.md +44 -44
- package/docs/rules/cpp/hooks.md +39 -39
- package/docs/rules/cpp/patterns.md +51 -51
- package/docs/rules/cpp/security.md +51 -51
- package/docs/rules/cpp/testing.md +44 -44
- package/docs/rules/csharp/coding-style.md +72 -72
- package/docs/rules/csharp/hooks.md +25 -25
- package/docs/rules/csharp/patterns.md +50 -50
- package/docs/rules/csharp/security.md +58 -58
- package/docs/rules/csharp/testing.md +46 -46
- package/docs/rules/dart/coding-style.md +159 -159
- package/docs/rules/dart/hooks.md +66 -66
- package/docs/rules/dart/patterns.md +261 -261
- package/docs/rules/dart/security.md +135 -135
- package/docs/rules/dart/testing.md +215 -215
- package/docs/rules/golang/coding-style.md +32 -32
- package/docs/rules/golang/hooks.md +17 -17
- package/docs/rules/golang/patterns.md +45 -45
- package/docs/rules/golang/security.md +34 -34
- package/docs/rules/golang/testing.md +31 -31
- package/docs/rules/java/coding-style.md +114 -114
- package/docs/rules/java/hooks.md +18 -18
- package/docs/rules/java/patterns.md +146 -146
- package/docs/rules/java/security.md +100 -100
- package/docs/rules/java/testing.md +131 -131
- package/docs/rules/kotlin/coding-style.md +86 -86
- package/docs/rules/kotlin/hooks.md +17 -17
- package/docs/rules/kotlin/patterns.md +146 -146
- package/docs/rules/kotlin/security.md +82 -82
- package/docs/rules/kotlin/testing.md +128 -128
- package/docs/rules/perl/coding-style.md +46 -46
- package/docs/rules/perl/hooks.md +22 -22
- package/docs/rules/perl/patterns.md +76 -76
- package/docs/rules/perl/security.md +69 -69
- package/docs/rules/perl/testing.md +54 -54
- package/docs/rules/php/coding-style.md +40 -40
- package/docs/rules/php/hooks.md +24 -24
- package/docs/rules/php/patterns.md +33 -33
- package/docs/rules/php/security.md +37 -37
- package/docs/rules/php/testing.md +39 -39
- package/docs/rules/python/coding-style.md +42 -42
- package/docs/rules/python/hooks.md +19 -19
- package/docs/rules/python/patterns.md +39 -39
- package/docs/rules/python/security.md +30 -30
- package/docs/rules/python/testing.md +38 -38
- package/docs/rules/rust/coding-style.md +151 -151
- package/docs/rules/rust/hooks.md +16 -16
- package/docs/rules/rust/patterns.md +168 -168
- package/docs/rules/rust/security.md +141 -141
- package/docs/rules/rust/testing.md +154 -154
- package/docs/rules/swift/coding-style.md +47 -47
- package/docs/rules/swift/hooks.md +20 -20
- package/docs/rules/swift/patterns.md +66 -66
- package/docs/rules/swift/security.md +33 -33
- package/docs/rules/swift/testing.md +45 -45
- package/docs/rules/typescript/coding-style.md +199 -199
- package/docs/rules/typescript/hooks.md +22 -22
- package/docs/rules/typescript/patterns.md +52 -52
- package/docs/rules/typescript/security.md +28 -28
- package/docs/rules/typescript/testing.md +18 -18
- package/docs/rules/web/coding-style.md +96 -96
- package/docs/rules/web/design-quality.md +62 -62
- package/docs/rules/web/hooks.md +120 -120
- package/docs/rules/web/patterns.md +79 -79
- package/docs/rules/web/performance.md +64 -64
- package/docs/rules/web/security.md +57 -57
- package/docs/rules/web/testing.md +55 -55
- package/docs/templates/README.md +36 -36
- package/docs/templates/ai-project-final.md +124 -124
- package/docs/templates/ai-project.md +105 -105
- package/docs/templates/api.md +157 -157
- package/docs/templates/bug.md +62 -62
- package/docs/templates/code-review.md +87 -87
- package/docs/templates/generic.md +116 -116
- package/docs/templates/implementation-plan.md +1 -1
- package/docs/templates/meeting.md +68 -68
- package/docs/templates/prd.md +98 -98
- package/docs/templates/ui.md +134 -134
- package/docs/workflow-pipeline.md +5 -5
- package/package.json +40 -39
- package/skills/SUPERPOWERS-LICENSE +21 -21
- package/skills/ai-ml/fine-tuning-expert/SKILL.md +162 -162
- package/skills/ai-ml/fine-tuning-expert/references/dataset-preparation.md +540 -540
- package/skills/ai-ml/fine-tuning-expert/references/deployment-optimization.md +673 -673
- package/skills/ai-ml/fine-tuning-expert/references/evaluation-metrics.md +597 -597
- package/skills/ai-ml/fine-tuning-expert/references/hyperparameter-tuning.md +565 -565
- package/skills/ai-ml/fine-tuning-expert/references/lora-peft.md +347 -347
- package/skills/ai-ml/ml-pipeline/SKILL.md +159 -159
- package/skills/ai-ml/ml-pipeline/references/experiment-tracking.md +833 -833
- package/skills/ai-ml/ml-pipeline/references/feature-engineering.md +631 -631
- package/skills/ai-ml/ml-pipeline/references/model-validation.md +978 -978
- package/skills/ai-ml/ml-pipeline/references/pipeline-orchestration.md +907 -907
- package/skills/ai-ml/ml-pipeline/references/training-pipelines.md +782 -782
- package/skills/ai-ml/rag-architect/SKILL.md +194 -194
- package/skills/ai-ml/rag-architect/references/chunking-strategies.md +878 -878
- package/skills/ai-ml/rag-architect/references/embedding-models.md +561 -561
- package/skills/ai-ml/rag-architect/references/rag-evaluation.md +833 -833
- package/skills/ai-ml/rag-architect/references/retrieval-optimization.md +795 -795
- package/skills/ai-ml/rag-architect/references/vector-databases.md +589 -589
- package/skills/ai-ml/spark-engineer/SKILL.md +148 -148
- package/skills/ai-ml/spark-engineer/references/partitioning-caching.md +543 -543
- package/skills/ai-ml/spark-engineer/references/performance-tuning.md +544 -544
- package/skills/ai-ml/spark-engineer/references/rdd-operations.md +599 -599
- package/skills/ai-ml/spark-engineer/references/spark-sql-dataframes.md +474 -474
- package/skills/ai-ml/spark-engineer/references/streaming-patterns.md +786 -786
- package/skills/backend/api-designer/SKILL.md +217 -217
- package/skills/backend/api-designer/references/error-handling.md +541 -541
- package/skills/backend/api-designer/references/openapi.md +824 -824
- package/skills/backend/api-designer/references/pagination.md +494 -494
- package/skills/backend/api-designer/references/rest-patterns.md +335 -335
- package/skills/backend/api-designer/references/versioning.md +391 -391
- package/skills/backend/architecture-designer/SKILL.md +117 -117
- package/skills/backend/architecture-designer/references/adr-template.md +116 -116
- package/skills/backend/architecture-designer/references/architecture-patterns.md +111 -111
- package/skills/backend/architecture-designer/references/database-selection.md +102 -102
- package/skills/backend/architecture-designer/references/nfr-checklist.md +112 -112
- package/skills/backend/architecture-designer/references/system-design.md +100 -100
- package/skills/backend/code-documenter/SKILL.md +147 -147
- package/skills/backend/code-documenter/references/api-docs-fastapi-django.md +166 -166
- package/skills/backend/code-documenter/references/api-docs-nestjs-express.md +220 -220
- package/skills/backend/code-documenter/references/coverage-reports.md +125 -125
- package/skills/backend/code-documenter/references/documentation-systems.md +333 -333
- package/skills/backend/code-documenter/references/interactive-api-docs.md +531 -531
- package/skills/backend/code-documenter/references/python-docstrings.md +121 -121
- package/skills/backend/code-documenter/references/typescript-jsdoc.md +145 -145
- package/skills/backend/code-documenter/references/user-guides-tutorials.md +530 -530
- package/skills/backend/debugging-wizard/SKILL.md +105 -105
- package/skills/backend/debugging-wizard/references/common-patterns.md +132 -132
- package/skills/backend/debugging-wizard/references/debugging-tools.md +140 -140
- package/skills/backend/debugging-wizard/references/quick-fixes.md +177 -177
- package/skills/backend/debugging-wizard/references/strategies.md +142 -142
- package/skills/backend/debugging-wizard/references/systematic-debugging.md +367 -367
- package/skills/backend/feature-forge/SKILL.md +98 -98
- package/skills/backend/feature-forge/references/acceptance-criteria.md +104 -104
- package/skills/backend/feature-forge/references/ears-syntax.md +99 -99
- package/skills/backend/feature-forge/references/interview-questions.md +150 -150
- package/skills/backend/feature-forge/references/pre-discovery-subagents.md +54 -54
- package/skills/backend/feature-forge/references/specification-template.md +103 -103
- package/skills/backend/fullstack-guardian/SKILL.md +105 -105
- package/skills/backend/fullstack-guardian/references/api-design-standards.md +307 -307
- package/skills/backend/fullstack-guardian/references/architecture-decisions.md +350 -350
- package/skills/backend/fullstack-guardian/references/backend-patterns.md +237 -237
- package/skills/backend/fullstack-guardian/references/common-patterns.md +134 -134
- package/skills/backend/fullstack-guardian/references/deliverables-checklist.md +354 -354
- package/skills/backend/fullstack-guardian/references/design-template.md +91 -91
- package/skills/backend/fullstack-guardian/references/error-handling.md +135 -135
- package/skills/backend/fullstack-guardian/references/frontend-patterns.md +340 -340
- package/skills/backend/fullstack-guardian/references/integration-patterns.md +333 -333
- package/skills/backend/fullstack-guardian/references/security-checklist.md +106 -106
- package/skills/backend/graphql-architect/SKILL.md +146 -146
- package/skills/backend/graphql-architect/references/federation.md +418 -418
- package/skills/backend/graphql-architect/references/migration-from-rest.md +1141 -1141
- package/skills/backend/graphql-architect/references/resolvers.md +425 -425
- package/skills/backend/graphql-architect/references/schema-design.md +393 -393
- package/skills/backend/graphql-architect/references/security.md +569 -569
- package/skills/backend/graphql-architect/references/subscriptions.md +510 -510
- package/skills/backend/legacy-modernizer/SKILL.md +137 -137
- package/skills/backend/legacy-modernizer/references/legacy-testing.md +381 -381
- package/skills/backend/legacy-modernizer/references/migration-strategies.md +423 -423
- package/skills/backend/legacy-modernizer/references/refactoring-patterns.md +395 -395
- package/skills/backend/legacy-modernizer/references/strangler-fig-pattern.md +281 -281
- package/skills/backend/legacy-modernizer/references/system-assessment.md +487 -487
- package/skills/backend/microservices-architect/SKILL.md +164 -164
- package/skills/backend/microservices-architect/references/communication.md +499 -499
- package/skills/backend/microservices-architect/references/data.md +721 -721
- package/skills/backend/microservices-architect/references/decomposition.md +344 -344
- package/skills/backend/microservices-architect/references/observability.md +805 -805
- package/skills/backend/microservices-architect/references/patterns.md +603 -603
- package/skills/database/database-optimizer/SKILL.md +147 -147
- package/skills/database/database-optimizer/references/index-strategies.md +331 -331
- package/skills/database/database-optimizer/references/monitoring-analysis.md +501 -501
- package/skills/database/database-optimizer/references/mysql-tuning.md +452 -452
- package/skills/database/database-optimizer/references/postgresql-tuning.md +413 -413
- package/skills/database/database-optimizer/references/query-optimization.md +251 -251
- package/skills/database/postgres-pro/SKILL.md +152 -152
- package/skills/database/postgres-pro/references/extensions.md +404 -404
- package/skills/database/postgres-pro/references/jsonb.md +321 -321
- package/skills/database/postgres-pro/references/maintenance.md +481 -481
- package/skills/database/postgres-pro/references/performance.md +265 -265
- package/skills/database/postgres-pro/references/replication.md +446 -446
- package/skills/database/sql-pro/SKILL.md +129 -129
- package/skills/database/sql-pro/references/database-design.md +402 -402
- package/skills/database/sql-pro/references/dialect-differences.md +419 -419
- package/skills/database/sql-pro/references/optimization.md +384 -384
- package/skills/database/sql-pro/references/query-patterns.md +285 -285
- package/skills/database/sql-pro/references/window-functions.md +328 -328
- package/skills/dotnet/csharp-developer/SKILL.md +125 -125
- package/skills/dotnet/csharp-developer/references/aspnet-core.md +394 -394
- package/skills/dotnet/csharp-developer/references/blazor.md +553 -553
- package/skills/dotnet/csharp-developer/references/entity-framework.md +409 -409
- package/skills/dotnet/csharp-developer/references/modern-csharp.md +248 -248
- package/skills/dotnet/csharp-developer/references/performance.md +498 -498
- package/skills/dotnet/dotnet-core-expert/SKILL.md +138 -138
- package/skills/dotnet/dotnet-core-expert/references/authentication.md +546 -546
- package/skills/dotnet/dotnet-core-expert/references/clean-architecture.md +455 -455
- package/skills/dotnet/dotnet-core-expert/references/cloud-native.md +548 -548
- package/skills/dotnet/dotnet-core-expert/references/entity-framework.md +440 -440
- package/skills/dotnet/dotnet-core-expert/references/minimal-apis.md +319 -319
- package/skills/frontend/angular-architect/SKILL.md +152 -152
- package/skills/frontend/angular-architect/references/components.md +297 -297
- package/skills/frontend/angular-architect/references/ngrx.md +401 -401
- package/skills/frontend/angular-architect/references/routing.md +361 -361
- package/skills/frontend/angular-architect/references/rxjs.md +319 -319
- package/skills/frontend/angular-architect/references/testing.md +405 -405
- package/skills/frontend/design-commands/design.md +91 -91
- package/skills/frontend/design-commands/handoff.md +97 -97
- package/skills/frontend/design-commands/prototype.md +120 -120
- package/skills/frontend/design-commands/spec.md +160 -160
- package/skills/frontend/design-commands/style.md +78 -78
- package/skills/frontend/flutter-expert/SKILL.md +138 -138
- package/skills/frontend/flutter-expert/references/bloc-state.md +259 -259
- package/skills/frontend/flutter-expert/references/gorouter-navigation.md +119 -119
- package/skills/frontend/flutter-expert/references/performance.md +99 -99
- package/skills/frontend/flutter-expert/references/project-structure.md +118 -118
- package/skills/frontend/flutter-expert/references/riverpod-state.md +130 -130
- package/skills/frontend/flutter-expert/references/widget-patterns.md +123 -123
- package/skills/frontend/nextjs-developer/SKILL.md +143 -143
- package/skills/frontend/nextjs-developer/references/app-router.md +311 -311
- package/skills/frontend/nextjs-developer/references/data-fetching.md +482 -482
- package/skills/frontend/nextjs-developer/references/deployment.md +545 -545
- package/skills/frontend/nextjs-developer/references/server-actions.md +462 -462
- package/skills/frontend/nextjs-developer/references/server-components.md +384 -384
- package/skills/frontend/react-expert/SKILL.md +149 -149
- package/skills/frontend/react-expert/references/hooks-patterns.md +162 -162
- package/skills/frontend/react-expert/references/migration-class-to-modern.md +1119 -1119
- package/skills/frontend/react-expert/references/performance.md +168 -168
- package/skills/frontend/react-expert/references/react-19-features.md +174 -174
- package/skills/frontend/react-expert/references/server-components.md +143 -143
- package/skills/frontend/react-expert/references/state-management.md +171 -171
- package/skills/frontend/react-expert/references/testing-react.md +174 -174
- package/skills/frontend/react-native-expert/SKILL.md +185 -185
- package/skills/frontend/react-native-expert/references/expo-router.md +187 -187
- package/skills/frontend/react-native-expert/references/list-optimization.md +204 -204
- package/skills/frontend/react-native-expert/references/platform-handling.md +188 -188
- package/skills/frontend/react-native-expert/references/project-structure.md +171 -171
- package/skills/frontend/react-native-expert/references/storage-hooks.md +173 -173
- package/skills/frontend/senior-frontend/SKILL.md +477 -477
- package/skills/frontend/senior-frontend/references/frontend_best_practices.md +806 -806
- package/skills/frontend/senior-frontend/references/nextjs_optimization_guide.md +724 -724
- package/skills/frontend/senior-frontend/references/react_patterns.md +746 -746
- package/skills/frontend/senior-frontend/scripts/bundle_analyzer.py +407 -407
- package/skills/frontend/senior-frontend/scripts/component_generator.py +329 -329
- package/skills/frontend/senior-frontend/scripts/frontend_scaffolder.py +1005 -1005
- package/skills/frontend/ui-ux-pro-max/SKILL.md +386 -386
- package/skills/frontend/ui-ux-pro-max/data/charts.csv +26 -26
- package/skills/frontend/ui-ux-pro-max/data/colors.csv +97 -97
- package/skills/frontend/ui-ux-pro-max/data/icons.csv +101 -101
- package/skills/frontend/ui-ux-pro-max/data/landing.csv +31 -31
- package/skills/frontend/ui-ux-pro-max/data/products.csv +96 -96
- package/skills/frontend/ui-ux-pro-max/data/react-performance.csv +45 -45
- package/skills/frontend/ui-ux-pro-max/data/stacks/astro.csv +54 -54
- package/skills/frontend/ui-ux-pro-max/data/stacks/flutter.csv +53 -53
- package/skills/frontend/ui-ux-pro-max/data/stacks/html-tailwind.csv +56 -56
- package/skills/frontend/ui-ux-pro-max/data/stacks/jetpack-compose.csv +53 -53
- package/skills/frontend/ui-ux-pro-max/data/stacks/nextjs.csv +53 -53
- package/skills/frontend/ui-ux-pro-max/data/stacks/nuxt-ui.csv +51 -51
- package/skills/frontend/ui-ux-pro-max/data/stacks/nuxtjs.csv +59 -59
- package/skills/frontend/ui-ux-pro-max/data/stacks/react-native.csv +52 -52
- package/skills/frontend/ui-ux-pro-max/data/stacks/react.csv +54 -54
- package/skills/frontend/ui-ux-pro-max/data/stacks/shadcn.csv +61 -61
- package/skills/frontend/ui-ux-pro-max/data/stacks/svelte.csv +54 -54
- package/skills/frontend/ui-ux-pro-max/data/stacks/swiftui.csv +51 -51
- package/skills/frontend/ui-ux-pro-max/data/stacks/vue.csv +50 -50
- package/skills/frontend/ui-ux-pro-max/data/styles.csv +68 -68
- package/skills/frontend/ui-ux-pro-max/data/typography.csv +57 -57
- package/skills/frontend/ui-ux-pro-max/data/ui-reasoning.csv +101 -101
- package/skills/frontend/ui-ux-pro-max/data/ux-guidelines.csv +99 -99
- package/skills/frontend/ui-ux-pro-max/data/web-interface.csv +31 -31
- package/skills/frontend/ui-ux-pro-max/scripts/core.py +253 -253
- package/skills/frontend/ui-ux-pro-max/scripts/design_system.py +1067 -1067
- package/skills/frontend/ui-ux-pro-max/scripts/search.py +114 -114
- package/skills/frontend/vue-expert/SKILL.md +98 -98
- package/skills/frontend/vue-expert/references/build-tooling.md +480 -480
- package/skills/frontend/vue-expert/references/components.md +448 -448
- package/skills/frontend/vue-expert/references/composition-api.md +299 -299
- package/skills/frontend/vue-expert/references/mobile-hybrid.md +636 -636
- package/skills/frontend/vue-expert/references/nuxt.md +669 -669
- package/skills/frontend/vue-expert/references/state-management.md +449 -449
- package/skills/frontend/vue-expert/references/typescript.md +584 -584
- package/skills/frontend/vue-expert-js/SKILL.md +167 -167
- package/skills/frontend/vue-expert-js/references/component-architecture.md +219 -219
- package/skills/frontend/vue-expert-js/references/composables-patterns.md +183 -183
- package/skills/frontend/vue-expert-js/references/jsdoc-typing.md +535 -535
- package/skills/frontend/vue-expert-js/references/state-management.md +249 -249
- package/skills/frontend/vue-expert-js/references/testing-patterns.md +237 -237
- package/skills/go-rust-cpp/cpp-pro/SKILL.md +115 -115
- package/skills/go-rust-cpp/cpp-pro/references/build-tooling.md +440 -440
- package/skills/go-rust-cpp/cpp-pro/references/concurrency.md +437 -437
- package/skills/go-rust-cpp/cpp-pro/references/memory-performance.md +397 -397
- package/skills/go-rust-cpp/cpp-pro/references/modern-cpp.md +304 -304
- package/skills/go-rust-cpp/cpp-pro/references/templates.md +357 -357
- package/skills/go-rust-cpp/golang-pro/SKILL.md +122 -122
- package/skills/go-rust-cpp/golang-pro/references/concurrency.md +329 -329
- package/skills/go-rust-cpp/golang-pro/references/generics.md +442 -442
- package/skills/go-rust-cpp/golang-pro/references/interfaces.md +432 -432
- package/skills/go-rust-cpp/golang-pro/references/project-structure.md +477 -477
- package/skills/go-rust-cpp/golang-pro/references/testing.md +451 -451
- package/skills/go-rust-cpp/rust-engineer/SKILL.md +167 -167
- package/skills/go-rust-cpp/rust-engineer/references/async.md +458 -458
- package/skills/go-rust-cpp/rust-engineer/references/error-handling.md +334 -334
- package/skills/go-rust-cpp/rust-engineer/references/ownership.md +278 -278
- package/skills/go-rust-cpp/rust-engineer/references/testing.md +470 -470
- package/skills/go-rust-cpp/rust-engineer/references/traits.md +413 -413
- package/skills/infra/cli-developer/SKILL.md +113 -113
- package/skills/infra/cli-developer/references/design-patterns.md +221 -221
- package/skills/infra/cli-developer/references/go-cli.md +540 -540
- package/skills/infra/cli-developer/references/node-cli.md +383 -383
- package/skills/infra/cli-developer/references/python-cli.md +422 -422
- package/skills/infra/cli-developer/references/ux-patterns.md +448 -448
- package/skills/infra/cloud-architect/SKILL.md +216 -216
- package/skills/infra/cloud-architect/references/aws.md +394 -394
- package/skills/infra/cloud-architect/references/azure.md +562 -562
- package/skills/infra/cloud-architect/references/cost.md +582 -582
- package/skills/infra/cloud-architect/references/gcp.md +633 -633
- package/skills/infra/cloud-architect/references/multi-cloud.md +483 -483
- package/skills/infra/devops-engineer/SKILL.md +144 -144
- package/skills/infra/devops-engineer/references/deployment-strategies.md +241 -241
- package/skills/infra/devops-engineer/references/docker-patterns.md +113 -113
- package/skills/infra/devops-engineer/references/github-actions.md +139 -139
- package/skills/infra/devops-engineer/references/incident-response.md +331 -331
- package/skills/infra/devops-engineer/references/kubernetes.md +154 -154
- package/skills/infra/devops-engineer/references/platform-engineering.md +417 -417
- package/skills/infra/devops-engineer/references/release-automation.md +527 -527
- package/skills/infra/devops-engineer/references/terraform-iac.md +141 -141
- package/skills/infra/kubernetes-specialist/SKILL.md +241 -241
- package/skills/infra/kubernetes-specialist/references/configuration.md +452 -452
- package/skills/infra/kubernetes-specialist/references/cost-optimization.md +458 -458
- package/skills/infra/kubernetes-specialist/references/custom-operators.md +563 -563
- package/skills/infra/kubernetes-specialist/references/gitops.md +530 -530
- package/skills/infra/kubernetes-specialist/references/helm-charts.md +912 -912
- package/skills/infra/kubernetes-specialist/references/multi-cluster.md +507 -507
- package/skills/infra/kubernetes-specialist/references/networking.md +447 -447
- package/skills/infra/kubernetes-specialist/references/service-mesh.md +459 -459
- package/skills/infra/kubernetes-specialist/references/storage.md +535 -535
- package/skills/infra/kubernetes-specialist/references/troubleshooting.md +414 -414
- package/skills/infra/kubernetes-specialist/references/workloads.md +377 -377
- package/skills/infra/mcp-developer/SKILL.md +143 -143
- package/skills/infra/mcp-developer/references/protocol.md +244 -244
- package/skills/infra/mcp-developer/references/python-sdk.md +367 -367
- package/skills/infra/mcp-developer/references/resources.md +554 -554
- package/skills/infra/mcp-developer/references/tools.md +480 -480
- package/skills/infra/mcp-developer/references/typescript-sdk.md +350 -350
- package/skills/infra/monitoring-expert/SKILL.md +176 -176
- package/skills/infra/monitoring-expert/references/alerting-rules.md +141 -141
- package/skills/infra/monitoring-expert/references/application-profiling.md +331 -331
- package/skills/infra/monitoring-expert/references/capacity-planning.md +344 -344
- package/skills/infra/monitoring-expert/references/dashboards.md +126 -126
- package/skills/infra/monitoring-expert/references/opentelemetry.md +123 -123
- package/skills/infra/monitoring-expert/references/performance-testing.md +269 -269
- package/skills/infra/monitoring-expert/references/prometheus-metrics.md +136 -136
- package/skills/infra/monitoring-expert/references/structured-logging.md +142 -142
- package/skills/infra/sre-engineer/SKILL.md +181 -181
- package/skills/infra/sre-engineer/references/automation-toil.md +492 -492
- package/skills/infra/sre-engineer/references/error-budget-policy.md +334 -334
- package/skills/infra/sre-engineer/references/incident-chaos.md +576 -576
- package/skills/infra/sre-engineer/references/monitoring-alerting.md +424 -424
- package/skills/infra/sre-engineer/references/slo-sli-management.md +238 -238
- package/skills/infra/terraform-engineer/SKILL.md +143 -143
- package/skills/infra/terraform-engineer/references/best-practices.md +583 -583
- package/skills/infra/terraform-engineer/references/module-patterns.md +297 -297
- package/skills/infra/terraform-engineer/references/providers.md +452 -452
- package/skills/infra/terraform-engineer/references/state-management.md +371 -371
- package/skills/infra/terraform-engineer/references/testing.md +486 -486
- package/skills/infra/websocket-engineer/SKILL.md +168 -168
- package/skills/infra/websocket-engineer/references/alternatives.md +391 -391
- package/skills/infra/websocket-engineer/references/patterns.md +400 -400
- package/skills/infra/websocket-engineer/references/protocol.md +195 -195
- package/skills/infra/websocket-engineer/references/scaling.md +333 -333
- package/skills/infra/websocket-engineer/references/security.md +474 -474
- package/skills/java/java-architect/SKILL.md +132 -132
- package/skills/java/java-architect/references/jpa-optimization.md +393 -393
- package/skills/java/java-architect/references/reactive-webflux.md +356 -356
- package/skills/java/java-architect/references/spring-boot-setup.md +269 -269
- package/skills/java/java-architect/references/spring-security.md +445 -445
- package/skills/java/java-architect/references/testing-patterns.md +500 -500
- package/skills/java/kotlin-specialist/SKILL.md +147 -147
- package/skills/java/kotlin-specialist/references/android-compose.md +419 -419
- package/skills/java/kotlin-specialist/references/coroutines-flow.md +276 -276
- package/skills/java/kotlin-specialist/references/dsl-idioms.md +421 -421
- package/skills/java/kotlin-specialist/references/ktor-server.md +426 -426
- package/skills/java/kotlin-specialist/references/multiplatform-kmp.md +380 -380
- package/skills/java/spring-boot-engineer/SKILL.md +195 -195
- package/skills/java/spring-boot-engineer/references/cloud.md +498 -498
- package/skills/java/spring-boot-engineer/references/data.md +381 -381
- package/skills/java/spring-boot-engineer/references/security.md +459 -459
- package/skills/java/spring-boot-engineer/references/testing.md +545 -545
- package/skills/java/spring-boot-engineer/references/web.md +295 -295
- package/skills/javascript/javascript-pro/SKILL.md +132 -132
- package/skills/javascript/javascript-pro/references/async-patterns.md +334 -334
- package/skills/javascript/javascript-pro/references/browser-apis.md +398 -398
- package/skills/javascript/javascript-pro/references/modern-syntax.md +272 -272
- package/skills/javascript/javascript-pro/references/modules.md +357 -357
- package/skills/javascript/javascript-pro/references/node-essentials.md +471 -471
- package/skills/javascript/nestjs-expert/SKILL.md +206 -206
- package/skills/javascript/nestjs-expert/references/authentication.md +166 -166
- package/skills/javascript/nestjs-expert/references/controllers-routing.md +111 -111
- package/skills/javascript/nestjs-expert/references/dtos-validation.md +153 -153
- package/skills/javascript/nestjs-expert/references/migration-from-express.md +1237 -1237
- package/skills/javascript/nestjs-expert/references/services-di.md +140 -140
- package/skills/javascript/nestjs-expert/references/testing-patterns.md +186 -186
- package/skills/javascript/typescript-pro/SKILL.md +145 -145
- package/skills/javascript/typescript-pro/references/advanced-types.md +259 -259
- package/skills/javascript/typescript-pro/references/configuration.md +445 -445
- package/skills/javascript/typescript-pro/references/patterns.md +484 -484
- package/skills/javascript/typescript-pro/references/type-guards.md +352 -352
- package/skills/javascript/typescript-pro/references/utility-types.md +329 -329
- package/skills/php/laravel-specialist/SKILL.md +262 -262
- package/skills/php/laravel-specialist/references/eloquent.md +351 -351
- package/skills/php/laravel-specialist/references/livewire.md +512 -512
- package/skills/php/laravel-specialist/references/queues.md +423 -423
- package/skills/php/laravel-specialist/references/routing.md +362 -362
- package/skills/php/laravel-specialist/references/testing.md +522 -522
- package/skills/php/php-pro/SKILL.md +206 -206
- package/skills/php/php-pro/references/async-patterns.md +412 -412
- package/skills/php/php-pro/references/laravel-patterns.md +377 -377
- package/skills/php/php-pro/references/modern-php-features.md +323 -323
- package/skills/php/php-pro/references/symfony-patterns.md +466 -466
- package/skills/php/php-pro/references/testing-quality.md +466 -466
- package/skills/product/competitive-analysis/SKILL.md +257 -257
- package/skills/product/meeting-notes/SKILL.md +266 -266
- package/skills/product/prd-template/SKILL.md +150 -150
- package/skills/product/stakeholder-update/SKILL.md +225 -225
- package/skills/product/user-research-synthesis/SKILL.md +235 -235
- package/skills/python/django-expert/SKILL.md +162 -162
- package/skills/python/django-expert/references/authentication.md +145 -145
- package/skills/python/django-expert/references/drf-serializers.md +148 -148
- package/skills/python/django-expert/references/models-orm.md +151 -151
- package/skills/python/django-expert/references/testing-django.md +204 -204
- package/skills/python/django-expert/references/viewsets-views.md +153 -153
- package/skills/python/fastapi-expert/SKILL.md +185 -185
- package/skills/python/fastapi-expert/references/async-sqlalchemy.md +146 -146
- package/skills/python/fastapi-expert/references/authentication.md +159 -159
- package/skills/python/fastapi-expert/references/endpoints-routing.md +142 -142
- package/skills/python/fastapi-expert/references/migration-from-django.md +996 -996
- package/skills/python/fastapi-expert/references/pydantic-v2.md +135 -135
- package/skills/python/fastapi-expert/references/testing-async.md +159 -159
- package/skills/python/pandas-pro/SKILL.md +178 -178
- package/skills/python/pandas-pro/references/aggregation-groupby.md +545 -545
- package/skills/python/pandas-pro/references/data-cleaning.md +500 -500
- package/skills/python/pandas-pro/references/dataframe-operations.md +420 -420
- package/skills/python/pandas-pro/references/merging-joining.md +596 -596
- package/skills/python/pandas-pro/references/performance-optimization.md +597 -597
- package/skills/python/python-pro/SKILL.md +177 -177
- package/skills/python/python-pro/references/async-patterns.md +356 -356
- package/skills/python/python-pro/references/packaging.md +460 -460
- package/skills/python/python-pro/references/standard-library.md +378 -378
- package/skills/python/python-pro/references/testing.md +404 -404
- package/skills/python/python-pro/references/type-system.md +290 -290
- package/skills/quality/chaos-engineer/SKILL.md +182 -182
- package/skills/quality/chaos-engineer/references/chaos-tools.md +511 -511
- package/skills/quality/chaos-engineer/references/experiment-design.md +229 -229
- package/skills/quality/chaos-engineer/references/game-days.md +434 -434
- package/skills/quality/chaos-engineer/references/infrastructure-chaos.md +348 -348
- package/skills/quality/chaos-engineer/references/kubernetes-chaos.md +432 -432
- package/skills/quality/code-reviewer/SKILL.md +119 -119
- package/skills/quality/code-reviewer/references/common-issues.md +142 -142
- package/skills/quality/code-reviewer/references/feedback-examples.md +144 -144
- package/skills/quality/code-reviewer/references/receiving-feedback.md +238 -238
- package/skills/quality/code-reviewer/references/report-template.md +109 -109
- package/skills/quality/code-reviewer/references/review-checklist.md +88 -88
- package/skills/quality/code-reviewer/references/spec-compliance-review.md +258 -258
- package/skills/quality/playwright-expert/SKILL.md +169 -169
- package/skills/quality/playwright-expert/references/api-mocking.md +140 -140
- package/skills/quality/playwright-expert/references/configuration.md +155 -155
- package/skills/quality/playwright-expert/references/debugging-flaky.md +150 -150
- package/skills/quality/playwright-expert/references/page-object-model.md +152 -152
- package/skills/quality/playwright-expert/references/selectors-locators.md +119 -119
- package/skills/quality/secure-code-guardian/SKILL.md +191 -191
- package/skills/quality/secure-code-guardian/references/authentication.md +136 -136
- package/skills/quality/secure-code-guardian/references/input-validation.md +146 -146
- package/skills/quality/secure-code-guardian/references/owasp-prevention.md +135 -135
- package/skills/quality/secure-code-guardian/references/security-headers.md +133 -133
- package/skills/quality/secure-code-guardian/references/xss-csrf.md +157 -157
- package/skills/quality/security-reviewer/SKILL.md +103 -103
- package/skills/quality/security-reviewer/references/infrastructure-security.md +268 -268
- package/skills/quality/security-reviewer/references/penetration-testing.md +268 -268
- package/skills/quality/security-reviewer/references/report-template.md +170 -170
- package/skills/quality/security-reviewer/references/sast-tools.md +117 -117
- package/skills/quality/security-reviewer/references/secret-scanning.md +125 -125
- package/skills/quality/security-reviewer/references/vulnerability-patterns.md +152 -152
- package/skills/quality/senior-qa/README.md +196 -196
- package/skills/quality/senior-qa/SKILL.md +399 -399
- package/skills/quality/senior-qa/references/qa_best_practices.md +964 -964
- package/skills/quality/senior-qa/references/test_automation_patterns.md +1009 -1009
- package/skills/quality/senior-qa/references/testing_strategies.md +649 -649
- package/skills/quality/senior-qa/scripts/coverage_analyzer.py +836 -836
- package/skills/quality/senior-qa/scripts/e2e_test_scaffolder.py +820 -820
- package/skills/quality/senior-qa/scripts/test_suite_generator.py +605 -605
- package/skills/quality/tdd-guide/HOW_TO_USE.md +313 -313
- package/skills/quality/tdd-guide/README.md +680 -680
- package/skills/quality/tdd-guide/SKILL.md +122 -122
- package/skills/quality/tdd-guide/assets/expected_output.json +77 -77
- package/skills/quality/tdd-guide/assets/sample_input_python.json +39 -39
- package/skills/quality/tdd-guide/assets/sample_input_typescript.json +36 -36
- package/skills/quality/tdd-guide/references/ci-integration.md +195 -195
- package/skills/quality/tdd-guide/references/framework-guide.md +206 -206
- package/skills/quality/tdd-guide/references/tdd-best-practices.md +128 -128
- package/skills/quality/tdd-guide/scripts/coverage_analyzer.py +434 -434
- package/skills/quality/tdd-guide/scripts/fixture_generator.py +440 -440
- package/skills/quality/tdd-guide/scripts/format_detector.py +384 -384
- package/skills/quality/tdd-guide/scripts/framework_adapter.py +428 -428
- package/skills/quality/tdd-guide/scripts/metrics_calculator.py +456 -456
- package/skills/quality/tdd-guide/scripts/output_formatter.py +354 -354
- package/skills/quality/tdd-guide/scripts/tdd_workflow.py +474 -474
- package/skills/quality/tdd-guide/scripts/test_generator.py +438 -438
- package/skills/quality/test-master/SKILL.md +94 -94
- package/skills/quality/test-master/references/automation-frameworks.md +294 -294
- package/skills/quality/test-master/references/e2e-testing.md +128 -128
- package/skills/quality/test-master/references/integration-testing.md +120 -120
- package/skills/quality/test-master/references/performance-testing.md +118 -118
- package/skills/quality/test-master/references/qa-methodology.md +247 -247
- package/skills/quality/test-master/references/security-testing.md +127 -127
- package/skills/quality/test-master/references/tdd-iron-laws.md +174 -174
- package/skills/quality/test-master/references/test-reports.md +104 -104
- package/skills/quality/test-master/references/testing-anti-patterns.md +231 -231
- package/skills/quality/test-master/references/unit-testing.md +113 -113
- package/skills/ruby/rails-expert/SKILL.md +154 -154
- package/skills/ruby/rails-expert/references/active-record.md +244 -244
- package/skills/ruby/rails-expert/references/api-development.md +401 -401
- package/skills/ruby/rails-expert/references/background-jobs.md +272 -272
- package/skills/ruby/rails-expert/references/hotwire-turbo.md +228 -228
- package/skills/ruby/rails-expert/references/rspec-testing.md +367 -367
- package/skills/swift/swift-expert/SKILL.md +163 -163
- package/skills/swift/swift-expert/references/async-concurrency.md +360 -360
- package/skills/swift/swift-expert/references/memory-performance.md +377 -377
- package/skills/swift/swift-expert/references/protocol-oriented.md +354 -354
- package/skills/swift/swift-expert/references/swiftui-patterns.md +291 -291
- package/skills/swift/swift-expert/references/testing-patterns.md +399 -399
- package/skills/workflow/brainstorming/SKILL.md +164 -164
- package/skills/workflow/brainstorming/scripts/frame-template.html +214 -214
- package/skills/workflow/brainstorming/scripts/helper.js +88 -88
- package/skills/workflow/brainstorming/scripts/server.cjs +354 -354
- package/skills/workflow/brainstorming/scripts/start-server.sh +148 -148
- package/skills/workflow/brainstorming/scripts/stop-server.sh +56 -56
- package/skills/workflow/brainstorming/spec-document-reviewer-prompt.md +49 -49
- package/skills/workflow/brainstorming/visual-companion.md +287 -287
- package/skills/workflow/documentation/SKILL.md +45 -45
- package/skills/workflow/entropy-management/SKILL.md +115 -115
- package/skills/workflow/executing-plans/SKILL.md +70 -70
- package/skills/workflow/finishing-a-development-branch/SKILL.md +200 -200
- package/skills/workflow/receiving-code-review/SKILL.md +213 -213
- package/skills/workflow/requesting-code-review/SKILL.md +105 -105
- package/skills/workflow/requesting-code-review/code-reviewer.md +146 -146
- package/skills/workflow/requirement-engineering/SKILL.md +111 -111
- package/skills/workflow/systematic-debugging/CREATION-LOG.md +119 -119
- package/skills/workflow/systematic-debugging/SKILL.md +296 -296
- package/skills/workflow/systematic-debugging/condition-based-waiting-example.ts +158 -158
- package/skills/workflow/systematic-debugging/condition-based-waiting.md +115 -115
- package/skills/workflow/systematic-debugging/defense-in-depth.md +122 -122
- package/skills/workflow/systematic-debugging/find-polluter.sh +63 -63
- package/skills/workflow/systematic-debugging/root-cause-tracing.md +169 -169
- package/skills/workflow/systematic-debugging/test-academic.md +14 -14
- package/skills/workflow/systematic-debugging/test-pressure-1.md +58 -58
- package/skills/workflow/systematic-debugging/test-pressure-2.md +68 -68
- package/skills/workflow/systematic-debugging/test-pressure-3.md +69 -69
- package/skills/workflow/using-git-worktrees/SKILL.md +218 -218
- package/skills/workflow/verification-before-completion/SKILL.md +139 -139
- package/skills/workflow/writing-plans/SKILL.md +151 -151
- package/skills/workflow/writing-plans/plan-document-reviewer-prompt.md +49 -49
- package/skills/workflow/writing-skills/SKILL.md +655 -655
- package/skills/workflow/writing-skills/anthropic-best-practices.md +1150 -1150
- package/skills/workflow/writing-skills/examples/CLAUDE_MD_TESTING.md +189 -189
- package/skills/workflow/writing-skills/persuasion-principles.md +187 -187
- package/skills/workflow/writing-skills/render-graphs.js +168 -168
- package/skills/workflow/writing-skills/testing-skills-with-subagents.md +384 -384
|
@@ -1,786 +1,786 @@
|
|
|
1
|
-
# Streaming Patterns
|
|
2
|
-
|
|
3
|
-
---
|
|
4
|
-
|
|
5
|
-
## Structured Streaming Overview
|
|
6
|
-
|
|
7
|
-
### When to Use Structured Streaming
|
|
8
|
-
|
|
9
|
-
**Use when:**
|
|
10
|
-
- Processing continuous data streams (Kafka, files, sockets)
|
|
11
|
-
- Need exactly-once processing guarantees
|
|
12
|
-
- Real-time analytics and dashboards
|
|
13
|
-
- Event-driven architectures
|
|
14
|
-
- Incremental ETL from streaming sources
|
|
15
|
-
|
|
16
|
-
**Consider alternatives when:**
|
|
17
|
-
- Batch processing is sufficient (lower complexity)
|
|
18
|
-
- Sub-second latency required (consider Flink)
|
|
19
|
-
- Very simple event processing (Kafka Streams may suffice)
|
|
20
|
-
|
|
21
|
-
---
|
|
22
|
-
|
|
23
|
-
## Reading from Streaming Sources
|
|
24
|
-
|
|
25
|
-
### Kafka Source
|
|
26
|
-
|
|
27
|
-
```python
|
|
28
|
-
# Read from Kafka
|
|
29
|
-
df = spark.readStream \
|
|
30
|
-
.format("kafka") \
|
|
31
|
-
.option("kafka.bootstrap.servers", "broker1:9092,broker2:9092") \
|
|
32
|
-
.option("subscribe", "topic1,topic2") \
|
|
33
|
-
.option("startingOffsets", "latest") \
|
|
34
|
-
.option("maxOffsetsPerTrigger", 100000) \
|
|
35
|
-
.option("kafka.security.protocol", "SASL_SSL") \
|
|
36
|
-
.option("kafka.sasl.mechanism", "PLAIN") \
|
|
37
|
-
.load()
|
|
38
|
-
|
|
39
|
-
# Kafka provides key, value as bytes
|
|
40
|
-
# Parse JSON value
|
|
41
|
-
from pyspark.sql import functions as F
|
|
42
|
-
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, DoubleType
|
|
43
|
-
|
|
44
|
-
schema = StructType([
|
|
45
|
-
StructField("event_id", StringType()),
|
|
46
|
-
StructField("user_id", StringType()),
|
|
47
|
-
StructField("event_time", TimestampType()),
|
|
48
|
-
StructField("amount", DoubleType())
|
|
49
|
-
])
|
|
50
|
-
|
|
51
|
-
parsed_df = df.select(
|
|
52
|
-
F.col("key").cast("string").alias("kafka_key"),
|
|
53
|
-
F.from_json(F.col("value").cast("string"), schema).alias("data"),
|
|
54
|
-
F.col("timestamp").alias("kafka_timestamp"),
|
|
55
|
-
F.col("partition"),
|
|
56
|
-
F.col("offset")
|
|
57
|
-
).select("kafka_key", "data.*", "kafka_timestamp", "partition", "offset")
|
|
58
|
-
```
|
|
59
|
-
|
|
60
|
-
```scala
|
|
61
|
-
// Scala Kafka source
|
|
62
|
-
val df = spark.readStream
|
|
63
|
-
.format("kafka")
|
|
64
|
-
.option("kafka.bootstrap.servers", "broker1:9092,broker2:9092")
|
|
65
|
-
.option("subscribe", "topic1")
|
|
66
|
-
.option("startingOffsets", "latest")
|
|
67
|
-
.load()
|
|
68
|
-
|
|
69
|
-
val parsed = df.select(
|
|
70
|
-
col("key").cast("string"),
|
|
71
|
-
from_json(col("value").cast("string"), schema).as("data")
|
|
72
|
-
).select("key", "data.*")
|
|
73
|
-
```
|
|
74
|
-
|
|
75
|
-
### File Source (Auto-Discovery)
|
|
76
|
-
|
|
77
|
-
```python
|
|
78
|
-
# Read new files as they arrive
|
|
79
|
-
df = spark.readStream \
|
|
80
|
-
.format("parquet") \
|
|
81
|
-
.schema(my_schema) \
|
|
82
|
-
.option("path", "s3://bucket/incoming/") \
|
|
83
|
-
.option("maxFilesPerTrigger", 100) \
|
|
84
|
-
.load()
|
|
85
|
-
|
|
86
|
-
# For JSON files
|
|
87
|
-
df = spark.readStream \
|
|
88
|
-
.format("json") \
|
|
89
|
-
.schema(my_schema) \
|
|
90
|
-
.option("path", "s3://bucket/incoming/") \
|
|
91
|
-
.load()
|
|
92
|
-
|
|
93
|
-
# CSV with header
|
|
94
|
-
df = spark.readStream \
|
|
95
|
-
.format("csv") \
|
|
96
|
-
.schema(my_schema) \
|
|
97
|
-
.option("path", "s3://bucket/incoming/") \
|
|
98
|
-
.option("header", "true") \
|
|
99
|
-
.load()
|
|
100
|
-
```
|
|
101
|
-
|
|
102
|
-
### Rate Source (Testing)
|
|
103
|
-
|
|
104
|
-
```python
|
|
105
|
-
# Generate test data at specified rate
|
|
106
|
-
df = spark.readStream \
|
|
107
|
-
.format("rate") \
|
|
108
|
-
.option("rowsPerSecond", 1000) \
|
|
109
|
-
.option("numPartitions", 10) \
|
|
110
|
-
.load()
|
|
111
|
-
|
|
112
|
-
# Columns: timestamp, value (incrementing long)
|
|
113
|
-
```
|
|
114
|
-
|
|
115
|
-
---
|
|
116
|
-
|
|
117
|
-
## Output Modes
|
|
118
|
-
|
|
119
|
-
### Append Mode (Default)
|
|
120
|
-
|
|
121
|
-
```python
|
|
122
|
-
# Only new rows added since last trigger
|
|
123
|
-
# Use when: No aggregations, or windowed aggregations with watermark
|
|
124
|
-
query = df.writeStream \
|
|
125
|
-
.outputMode("append") \
|
|
126
|
-
.format("parquet") \
|
|
127
|
-
.option("path", "s3://bucket/output/") \
|
|
128
|
-
.option("checkpointLocation", "s3://bucket/checkpoints/") \
|
|
129
|
-
.start()
|
|
130
|
-
```
|
|
131
|
-
|
|
132
|
-
### Update Mode
|
|
133
|
-
|
|
134
|
-
```python
|
|
135
|
-
# Only rows that changed since last trigger
|
|
136
|
-
# Use when: Aggregations, want incremental updates
|
|
137
|
-
query = df.groupBy("user_id").count() \
|
|
138
|
-
.writeStream \
|
|
139
|
-
.outputMode("update") \
|
|
140
|
-
.format("console") \
|
|
141
|
-
.start()
|
|
142
|
-
```
|
|
143
|
-
|
|
144
|
-
### Complete Mode
|
|
145
|
-
|
|
146
|
-
```python
|
|
147
|
-
# Entire result table every trigger
|
|
148
|
-
# Use when: Need full aggregation result each time
|
|
149
|
-
# Warning: Can be expensive for large state
|
|
150
|
-
query = df.groupBy("user_id").count() \
|
|
151
|
-
.writeStream \
|
|
152
|
-
.outputMode("complete") \
|
|
153
|
-
.format("console") \
|
|
154
|
-
.start()
|
|
155
|
-
```
|
|
156
|
-
|
|
157
|
-
### Mode Selection Guide
|
|
158
|
-
|
|
159
|
-
| Use Case | Output Mode | Notes |
|
|
160
|
-
|----------|-------------|-------|
|
|
161
|
-
| ETL to files | append | Default, efficient |
|
|
162
|
-
| Windowed aggregations | append | With watermark |
|
|
163
|
-
| Running counts/sums | update | Incremental |
|
|
164
|
-
| Dashboards needing full state | complete | Expensive |
|
|
165
|
-
| Deduplication | append | With dropDuplicates |
|
|
166
|
-
|
|
167
|
-
---
|
|
168
|
-
|
|
169
|
-
## Watermarks and Event Time
|
|
170
|
-
|
|
171
|
-
### Understanding Watermarks
|
|
172
|
-
|
|
173
|
-
Watermarks define how late data can arrive before being dropped. They enable Spark to:
|
|
174
|
-
- Clean up old state (bounded memory)
|
|
175
|
-
- Emit results at appropriate times
|
|
176
|
-
- Handle out-of-order events
|
|
177
|
-
|
|
178
|
-
### Setting Watermarks
|
|
179
|
-
|
|
180
|
-
```python
|
|
181
|
-
from pyspark.sql import functions as F
|
|
182
|
-
|
|
183
|
-
# Define watermark on event time column
|
|
184
|
-
df_with_watermark = df \
|
|
185
|
-
.withWatermark("event_time", "10 minutes")
|
|
186
|
-
|
|
187
|
-
# Watermark threshold: max_event_time - 10 minutes
|
|
188
|
-
# Events older than watermark are dropped
|
|
189
|
-
# State older than watermark is cleaned up
|
|
190
|
-
```
|
|
191
|
-
|
|
192
|
-
### Watermark Guidelines
|
|
193
|
-
|
|
194
|
-
| Scenario | Watermark Duration | Reasoning |
|
|
195
|
-
|----------|-------------------|-----------|
|
|
196
|
-
| Real-time analytics | 1-5 minutes | Low latency, tolerate minimal late data |
|
|
197
|
-
| Standard ETL | 10-30 minutes | Balance latency and late data |
|
|
198
|
-
| Late-arriving data common | 1-24 hours | Accommodate delayed events |
|
|
199
|
-
| Best-effort real-time | 0 minutes | No late data tolerance |
|
|
200
|
-
|
|
201
|
-
### Example with Windowed Aggregation
|
|
202
|
-
|
|
203
|
-
```python
|
|
204
|
-
from pyspark.sql import functions as F
|
|
205
|
-
from pyspark.sql.window import Window
|
|
206
|
-
|
|
207
|
-
# Streaming aggregation with watermark
|
|
208
|
-
result = df \
|
|
209
|
-
.withWatermark("event_time", "10 minutes") \
|
|
210
|
-
.groupBy(
|
|
211
|
-
F.window("event_time", "5 minutes", "1 minute"), # 5-min sliding window, 1-min slide
|
|
212
|
-
"user_id"
|
|
213
|
-
) \
|
|
214
|
-
.agg(
|
|
215
|
-
F.count("*").alias("event_count"),
|
|
216
|
-
F.sum("amount").alias("total_amount")
|
|
217
|
-
)
|
|
218
|
-
|
|
219
|
-
# Output schema includes window struct: window.start, window.end
|
|
220
|
-
query = result \
|
|
221
|
-
.select(
|
|
222
|
-
F.col("window.start").alias("window_start"),
|
|
223
|
-
F.col("window.end").alias("window_end"),
|
|
224
|
-
"user_id",
|
|
225
|
-
"event_count",
|
|
226
|
-
"total_amount"
|
|
227
|
-
) \
|
|
228
|
-
.writeStream \
|
|
229
|
-
.outputMode("append") \
|
|
230
|
-
.format("parquet") \
|
|
231
|
-
.option("path", "s3://bucket/windowed_output/") \
|
|
232
|
-
.option("checkpointLocation", "s3://bucket/checkpoints/") \
|
|
233
|
-
.start()
|
|
234
|
-
```
|
|
235
|
-
|
|
236
|
-
---
|
|
237
|
-
|
|
238
|
-
## Windowed Operations
|
|
239
|
-
|
|
240
|
-
### Tumbling Windows (Non-Overlapping)
|
|
241
|
-
|
|
242
|
-
```python
|
|
243
|
-
from pyspark.sql import functions as F
|
|
244
|
-
|
|
245
|
-
# 5-minute tumbling windows
|
|
246
|
-
result = df \
|
|
247
|
-
.withWatermark("event_time", "10 minutes") \
|
|
248
|
-
.groupBy(
|
|
249
|
-
F.window("event_time", "5 minutes"),
|
|
250
|
-
"category"
|
|
251
|
-
) \
|
|
252
|
-
.agg(F.sum("amount").alias("total"))
|
|
253
|
-
|
|
254
|
-
# Windows: [00:00-00:05), [00:05-00:10), [00:10-00:15), ...
|
|
255
|
-
```
|
|
256
|
-
|
|
257
|
-
### Sliding Windows (Overlapping)
|
|
258
|
-
|
|
259
|
-
```python
|
|
260
|
-
# 10-minute windows, sliding every 2 minutes
|
|
261
|
-
result = df \
|
|
262
|
-
.withWatermark("event_time", "10 minutes") \
|
|
263
|
-
.groupBy(
|
|
264
|
-
F.window("event_time", "10 minutes", "2 minutes"),
|
|
265
|
-
"category"
|
|
266
|
-
) \
|
|
267
|
-
.agg(F.sum("amount").alias("total"))
|
|
268
|
-
|
|
269
|
-
# Windows: [00:00-00:10), [00:02-00:12), [00:04-00:14), ...
|
|
270
|
-
```
|
|
271
|
-
|
|
272
|
-
### Session Windows (Gap-Based)
|
|
273
|
-
|
|
274
|
-
```python
|
|
275
|
-
# Session windows with 5-minute gap threshold
|
|
276
|
-
result = df \
|
|
277
|
-
.withWatermark("event_time", "10 minutes") \
|
|
278
|
-
.groupBy(
|
|
279
|
-
F.session_window("event_time", "5 minutes"), # Spark 3.2+
|
|
280
|
-
"user_id"
|
|
281
|
-
) \
|
|
282
|
-
.agg(
|
|
283
|
-
F.count("*").alias("events_in_session"),
|
|
284
|
-
F.first("event_time").alias("session_start"),
|
|
285
|
-
F.last("event_time").alias("session_end")
|
|
286
|
-
)
|
|
287
|
-
```
|
|
288
|
-
|
|
289
|
-
---
|
|
290
|
-
|
|
291
|
-
## Stateful Operations
|
|
292
|
-
|
|
293
|
-
### Aggregations (Built-in State)
|
|
294
|
-
|
|
295
|
-
```python
|
|
296
|
-
# Running count by key
|
|
297
|
-
running_counts = df \
|
|
298
|
-
.withWatermark("event_time", "1 hour") \
|
|
299
|
-
.groupBy("user_id") \
|
|
300
|
-
.agg(F.count("*").alias("total_events"))
|
|
301
|
-
|
|
302
|
-
# State stored per user_id
|
|
303
|
-
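# NOTE: the watermark only purges aggregation state when event_time (or a window on it) is part of the grouping key; grouping by user_id alone keeps state indefinitely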
|
|
304
|
-
```
|
|
305
|
-
|
|
306
|
-
### Deduplication
|
|
307
|
-
|
|
308
|
-
```python
|
|
309
|
-
# Drop duplicates within watermark window
|
|
310
|
-
deduped = df \
|
|
311
|
-
.withWatermark("event_time", "10 minutes") \
|
|
312
|
-
.dropDuplicates(["event_id"]) # Keep first occurrence
|
|
313
|
-
|
|
314
|
-
# Can also dedupe by multiple columns
|
|
315
|
-
deduped = df \
|
|
316
|
-
.withWatermark("event_time", "10 minutes") \
|
|
317
|
-
.dropDuplicates(["user_id", "event_type", "event_time"])
|
|
318
|
-
```
|
|
319
|
-
|
|
320
|
-
### Custom Stateful Processing (applyInPandasWithState / flatMapGroupsWithState)
|
|
321
|
-
|
|
322
|
-
```python
|
|
323
|
-
# PySpark - Custom state using applyInPandasWithState (Spark 3.4+)
|
|
324
|
-
from pyspark.sql.streaming.state import GroupState, GroupStateTimeout
|
|
325
|
-
|
|
326
|
-
def update_session_state(
|
|
327
|
-
key: tuple,
|
|
328
|
-
pdf_iter: Iterator[pd.DataFrame],
|
|
329
|
-
state: GroupState
|
|
330
|
-
) -> Iterator[pd.DataFrame]:
|
|
331
|
-
# Get or initialize state
|
|
332
|
-
if state.exists:
|
|
333
|
-
session_data = state.get
|
|
334
|
-
else:
|
|
335
|
-
session_data = {"count": 0, "total": 0.0}
|
|
336
|
-
|
|
337
|
-
# Process input data
|
|
338
|
-
for pdf in pdf_iter:
|
|
339
|
-
session_data["count"] += len(pdf)
|
|
340
|
-
session_data["total"] += pdf["amount"].sum()
|
|
341
|
-
|
|
342
|
-
# Update state
|
|
343
|
-
state.update(session_data)
|
|
344
|
-
|
|
345
|
-
# Optionally set timeout
|
|
346
|
-
state.setTimeoutDuration(10 * 60 * 1000) # 10 minutes
|
|
347
|
-
|
|
348
|
-
# Yield output
|
|
349
|
-
yield pd.DataFrame([{
|
|
350
|
-
"user_id": key[0],
|
|
351
|
-
"event_count": session_data["count"],
|
|
352
|
-
"total_amount": session_data["total"]
|
|
353
|
-
}])
|
|
354
|
-
|
|
355
|
-
# Apply stateful function
|
|
356
|
-
result = df \
|
|
357
|
-
.withWatermark("event_time", "10 minutes") \
|
|
358
|
-
.groupBy("user_id") \
|
|
359
|
-
.applyInPandasWithState(
|
|
360
|
-
update_session_state,
|
|
361
|
-
outputStructType=output_schema,
|
|
362
|
-
stateStructType=state_schema,
|
|
363
|
-
outputMode="update",
|
|
364
|
-
timeoutConf=GroupStateTimeout.ProcessingTimeTimeout
|
|
365
|
-
)
|
|
366
|
-
```
|
|
367
|
-
|
|
368
|
-
```scala
|
|
369
|
-
// Scala flatMapGroupsWithState
|
|
370
|
-
import org.apache.spark.sql.streaming.{GroupState, GroupStateTimeout}
|
|
371
|
-
|
|
372
|
-
case class UserState(count: Long, totalAmount: Double)
|
|
373
|
-
case class UserOutput(userId: String, count: Long, totalAmount: Double)
|
|
374
|
-
|
|
375
|
-
def updateState(
|
|
376
|
-
userId: String,
|
|
377
|
-
events: Iterator[Event],
|
|
378
|
-
state: GroupState[UserState]
|
|
379
|
-
): Iterator[UserOutput] = {
|
|
380
|
-
|
|
381
|
-
val currentState = state.getOption.getOrElse(UserState(0, 0.0))
|
|
382
|
-
|
|
383
|
-
var newCount = currentState.count
|
|
384
|
-
var newTotal = currentState.totalAmount
|
|
385
|
-
|
|
386
|
-
events.foreach { event =>
|
|
387
|
-
newCount += 1
|
|
388
|
-
newTotal += event.amount
|
|
389
|
-
}
|
|
390
|
-
|
|
391
|
-
val newState = UserState(newCount, newTotal)
|
|
392
|
-
state.update(newState)
|
|
393
|
-
state.setTimeoutDuration("10 minutes")
|
|
394
|
-
|
|
395
|
-
Iterator(UserOutput(userId, newCount, newTotal))
|
|
396
|
-
}
|
|
397
|
-
|
|
398
|
-
val result = df
|
|
399
|
-
.withWatermark("event_time", "10 minutes")
|
|
400
|
-
.as[Event]
|
|
401
|
-
.groupByKey(_.userId)
|
|
402
|
-
.flatMapGroupsWithState(
|
|
403
|
-
OutputMode.Update,
|
|
404
|
-
GroupStateTimeout.ProcessingTimeTimeout
|
|
405
|
-
)(updateState)
|
|
406
|
-
```
|
|
407
|
-
|
|
408
|
-
---
|
|
409
|
-
|
|
410
|
-
## Streaming Joins
|
|
411
|
-
|
|
412
|
-
### Stream-Static Join
|
|
413
|
-
|
|
414
|
-
```python
|
|
415
|
-
# Join streaming data with static lookup table
|
|
416
|
-
static_df = spark.read.parquet("s3://bucket/lookup/")
|
|
417
|
-
|
|
418
|
-
# Streaming df joined with static - no watermark needed
|
|
419
|
-
result = streaming_df.join(static_df, "join_key", "left")
|
|
420
|
-
|
|
421
|
-
# Static table can be periodically refreshed
|
|
422
|
-
# Use broadcast for small static tables
|
|
423
|
-
from pyspark.sql.functions import broadcast
|
|
424
|
-
result = streaming_df.join(broadcast(static_df), "join_key")
|
|
425
|
-
```
|
|
426
|
-
|
|
427
|
-
### Stream-Stream Join
|
|
428
|
-
|
|
429
|
-
```python
|
|
430
|
-
# Join two streams - requires watermarks on both
|
|
431
|
-
from pyspark.sql import functions as F
|
|
432
|
-
|
|
433
|
-
stream1 = spark.readStream.format("kafka")...
|
|
434
|
-
stream2 = spark.readStream.format("kafka")...
|
|
435
|
-
|
|
436
|
-
# Both streams need watermarks
|
|
437
|
-
stream1_wm = stream1.withWatermark("event_time", "10 minutes")
|
|
438
|
-
stream2_wm = stream2.withWatermark("event_time", "10 minutes")
|
|
439
|
-
|
|
440
|
-
# Inner join with time constraint
|
|
441
|
-
result = stream1_wm.join(
|
|
442
|
-
stream2_wm,
|
|
443
|
-
F.expr("""
|
|
444
|
-
stream1.user_id = stream2.user_id AND
|
|
445
|
-
stream1.event_time >= stream2.event_time AND
|
|
446
|
-
stream1.event_time <= stream2.event_time + INTERVAL 5 MINUTES
|
|
447
|
-
"""),
|
|
448
|
-
"inner"
|
|
449
|
-
)
|
|
450
|
-
|
|
451
|
-
# Left outer join (Spark 2.3+)
|
|
452
|
-
result = stream1_wm.join(
|
|
453
|
-
stream2_wm,
|
|
454
|
-
F.expr("""
|
|
455
|
-
stream1.user_id = stream2.user_id AND
|
|
456
|
-
stream1.event_time >= stream2.event_time - INTERVAL 5 MINUTES AND
|
|
457
|
-
stream1.event_time <= stream2.event_time + INTERVAL 5 MINUTES
|
|
458
|
-
"""),
|
|
459
|
-
"leftOuter"
|
|
460
|
-
)
|
|
461
|
-
```
|
|
462
|
-
|
|
463
|
-
### Join Type Support
|
|
464
|
-
|
|
465
|
-
| Join Type | Stream-Static | Stream-Stream |
|
|
466
|
-
|-----------|---------------|---------------|
|
|
467
|
-
| Inner | Yes | Yes |
|
|
468
|
-
| Left Outer | Yes (stream on left side) | Yes (Spark 2.3+) |
|
|
469
|
-
| Right Outer | Yes (stream on right side) | Yes (Spark 2.3+) |
|
|
470
|
-
| Full Outer | Not supported | Yes (Spark 3.1+) |
|
|
471
|
-
| Left Semi | Yes | Yes (Spark 3.1+) |
|
|
472
|
-
| Left Anti | Yes | Not supported |
|
|
473
|
-
|
|
474
|
-
---
|
|
475
|
-
|
|
476
|
-
## Sinks
|
|
477
|
-
|
|
478
|
-
### Kafka Sink
|
|
479
|
-
|
|
480
|
-
```python
|
|
481
|
-
# Write to Kafka
|
|
482
|
-
query = df \
|
|
483
|
-
.select(
|
|
484
|
-
F.col("user_id").alias("key"),
|
|
485
|
-
F.to_json(F.struct("*")).alias("value")
|
|
486
|
-
) \
|
|
487
|
-
.writeStream \
|
|
488
|
-
.format("kafka") \
|
|
489
|
-
.option("kafka.bootstrap.servers", "broker1:9092") \
|
|
490
|
-
.option("topic", "output_topic") \
|
|
491
|
-
.option("checkpointLocation", "s3://bucket/checkpoints/") \
|
|
492
|
-
.start()
|
|
493
|
-
```
|
|
494
|
-
|
|
495
|
-
### File Sink (Parquet, JSON, CSV)
|
|
496
|
-
|
|
497
|
-
```python
|
|
498
|
-
# Parquet sink with partitioning
|
|
499
|
-
query = df.writeStream \
|
|
500
|
-
.format("parquet") \
|
|
501
|
-
.option("path", "s3://bucket/output/") \
|
|
502
|
-
.option("checkpointLocation", "s3://bucket/checkpoints/") \
|
|
503
|
-
.partitionBy("date", "hour") \
|
|
504
|
-
.trigger(processingTime="1 minute") \
|
|
505
|
-
.start()
|
|
506
|
-
|
|
507
|
-
# JSON sink
|
|
508
|
-
query = df.writeStream \
|
|
509
|
-
.format("json") \
|
|
510
|
-
.option("path", "s3://bucket/output/") \
|
|
511
|
-
.option("checkpointLocation", "s3://bucket/checkpoints/") \
|
|
512
|
-
.start()
|
|
513
|
-
```
|
|
514
|
-
|
|
515
|
-
### Delta Lake Sink
|
|
516
|
-
|
|
517
|
-
```python
|
|
518
|
-
# Delta Lake (ACID transactions, schema evolution)
|
|
519
|
-
query = df.writeStream \
|
|
520
|
-
.format("delta") \
|
|
521
|
-
.outputMode("append") \
|
|
522
|
-
.option("path", "s3://bucket/delta_table/") \
|
|
523
|
-
.option("checkpointLocation", "s3://bucket/checkpoints/") \
|
|
524
|
-
.option("mergeSchema", "true") \
|
|
525
|
-
.start()
|
|
526
|
-
|
|
527
|
-
# Upsert with foreachBatch
|
|
528
|
-
def upsert_to_delta(batch_df, batch_id):
|
|
529
|
-
delta_table = DeltaTable.forPath(spark, "s3://bucket/delta_table/")
|
|
530
|
-
delta_table.alias("target").merge(
|
|
531
|
-
batch_df.alias("source"),
|
|
532
|
-
"target.id = source.id"
|
|
533
|
-
).whenMatchedUpdateAll() \
|
|
534
|
-
.whenNotMatchedInsertAll() \
|
|
535
|
-
.execute()
|
|
536
|
-
|
|
537
|
-
query = df.writeStream \
|
|
538
|
-
.foreachBatch(upsert_to_delta) \
|
|
539
|
-
.option("checkpointLocation", "s3://bucket/checkpoints/") \
|
|
540
|
-
.start()
|
|
541
|
-
```
|
|
542
|
-
|
|
543
|
-
### Custom Sink (foreachBatch)
|
|
544
|
-
|
|
545
|
-
```python
|
|
546
|
-
def write_to_database(batch_df, batch_id):
|
|
547
|
-
"""Write each micro-batch to external database."""
|
|
548
|
-
batch_df.write \
|
|
549
|
-
.format("jdbc") \
|
|
550
|
-
.option("url", "jdbc:postgresql://host:5432/db") \
|
|
551
|
-
.option("dbtable", "output_table") \
|
|
552
|
-
.option("user", "user") \
|
|
553
|
-
.option("password", "password") \
|
|
554
|
-
.mode("append") \
|
|
555
|
-
.save()
|
|
556
|
-
|
|
557
|
-
query = df.writeStream \
|
|
558
|
-
.foreachBatch(write_to_database) \
|
|
559
|
-
.option("checkpointLocation", "s3://bucket/checkpoints/") \
|
|
560
|
-
.trigger(processingTime="30 seconds") \
|
|
561
|
-
.start()
|
|
562
|
-
```
|
|
563
|
-
|
|
564
|
-
### foreach (Row-by-Row)
|
|
565
|
-
|
|
566
|
-
```python
|
|
567
|
-
# For custom processing of each row
|
|
568
|
-
class ForeachWriter:
|
|
569
|
-
def open(self, partition_id, epoch_id):
|
|
570
|
-
# Initialize connection
|
|
571
|
-
self.connection = create_connection()
|
|
572
|
-
return True
|
|
573
|
-
|
|
574
|
-
def process(self, row):
|
|
575
|
-
# Process each row
|
|
576
|
-
self.connection.insert(row.asDict())
|
|
577
|
-
|
|
578
|
-
def close(self, error):
|
|
579
|
-
# Clean up
|
|
580
|
-
self.connection.close()
|
|
581
|
-
|
|
582
|
-
query = df.writeStream \
|
|
583
|
-
.foreach(ForeachWriter()) \
|
|
584
|
-
.start()
|
|
585
|
-
```
|
|
586
|
-
|
|
587
|
-
---
|
|
588
|
-
|
|
589
|
-
## Triggers
|
|
590
|
-
|
|
591
|
-
### Available Trigger Types
|
|
592
|
-
|
|
593
|
-
```python
|
|
594
|
-
# Process as fast as possible (default)
|
|
595
|
-
query = df.writeStream.trigger(processingTime="0 seconds").start()
|
|
596
|
-
|
|
597
|
-
# Fixed interval
|
|
598
|
-
query = df.writeStream.trigger(processingTime="1 minute").start()
|
|
599
|
-
|
|
600
|
-
# Once - process all available data, then stop
|
|
601
|
-
query = df.writeStream.trigger(once=True).start()
|
|
602
|
-
|
|
603
|
-
# Available now - process all available data (Spark 3.3+)
|
|
604
|
-
query = df.writeStream.trigger(availableNow=True).start()
|
|
605
|
-
|
|
606
|
-
# Continuous processing (experimental, low latency)
|
|
607
|
-
query = df.writeStream.trigger(continuous="1 second").start()
|
|
608
|
-
```
|
|
609
|
-
|
|
610
|
-
### Trigger Selection Guide
|
|
611
|
-
|
|
612
|
-
| Trigger | Use Case |
|
|
613
|
-
|---------|----------|
|
|
614
|
-
| processingTime="0 seconds" | Maximum throughput |
|
|
615
|
-
| processingTime="N seconds" | Controlled resource usage |
|
|
616
|
-
| once=True | Batch-style processing |
|
|
617
|
-
| availableNow=True | Catch-up processing |
|
|
618
|
-
| continuous="N ms" | Ultra-low latency (experimental) |
|
|
619
|
-
|
|
620
|
-
---
|
|
621
|
-
|
|
622
|
-
## Monitoring and Management
|
|
623
|
-
|
|
624
|
-
### Query Management
|
|
625
|
-
|
|
626
|
-
```python
|
|
627
|
-
# Start query and get handle
|
|
628
|
-
query = df.writeStream.format("console").start()
|
|
629
|
-
|
|
630
|
-
# Query properties
|
|
631
|
-
print(f"Query ID: {query.id}")
|
|
632
|
-
print(f"Run ID: {query.runId}")
|
|
633
|
-
print(f"Name: {query.name}")
|
|
634
|
-
print(f"Is Active: {query.isActive}")
|
|
635
|
-
print(f"Status: {query.status}")
|
|
636
|
-
print(f"Last Progress: {query.lastProgress}")
|
|
637
|
-
print(f"Recent Progress: {query.recentProgress}")
|
|
638
|
-
|
|
639
|
-
# Wait for termination
|
|
640
|
-
query.awaitTermination()
|
|
641
|
-
query.awaitTermination(timeout=60) # With timeout
|
|
642
|
-
|
|
643
|
-
# Stop query
|
|
644
|
-
query.stop()
|
|
645
|
-
|
|
646
|
-
# Get exception if failed
|
|
647
|
-
exception = query.exception()
|
|
648
|
-
```
|
|
649
|
-
|
|
650
|
-
### Progress Monitoring
|
|
651
|
-
|
|
652
|
-
```python
|
|
653
|
-
# Get latest progress
|
|
654
|
-
progress = query.lastProgress
|
|
655
|
-
if progress:
|
|
656
|
-
print(f"Input rows/sec: {progress['inputRowsPerSecond']}")
|
|
657
|
-
print(f"Processed rows/sec: {progress['processedRowsPerSecond']}")
|
|
658
|
-
print(f"Batch ID: {progress['batchId']}")
|
|
659
|
-
print(f"Duration: {progress['batchDuration']} ms")
|
|
660
|
-
print(f"State rows: {progress['stateOperators']}")
|
|
661
|
-
|
|
662
|
-
# Custom progress listener
|
|
663
|
-
class ProgressListener:
|
|
664
|
-
def onQueryProgress(self, event):
|
|
665
|
-
print(f"Progress: {event.progress}")
|
|
666
|
-
|
|
667
|
-
def onQueryTerminated(self, event):
|
|
668
|
-
print(f"Terminated: {event.exception}")
|
|
669
|
-
|
|
670
|
-
spark.streams.addListener(ProgressListener())
|
|
671
|
-
```
|
|
672
|
-
|
|
673
|
-
### Checkpointing
|
|
674
|
-
|
|
675
|
-
```python
|
|
676
|
-
# Checkpoint location is required for fault tolerance
|
|
677
|
-
query = df.writeStream \
|
|
678
|
-
.format("parquet") \
|
|
679
|
-
.option("path", "s3://bucket/output/") \
|
|
680
|
-
.option("checkpointLocation", "s3://bucket/checkpoints/query_name/") \
|
|
681
|
-
.start()
|
|
682
|
-
|
|
683
|
-
# Checkpoint contains:
|
|
684
|
-
# - Offsets (what data has been processed)
|
|
685
|
-
# - State (for stateful operations)
|
|
686
|
-
# - Commits (what batches completed)
|
|
687
|
-
|
|
688
|
-
# Recovery: Query restarts from last checkpoint automatically
|
|
689
|
-
# Clean start: Delete checkpoint directory (loses state!)
|
|
690
|
-
```
|
|
691
|
-
|
|
692
|
-
---
|
|
693
|
-
|
|
694
|
-
## Performance Patterns
|
|
695
|
-
|
|
696
|
-
### Optimizing Throughput
|
|
697
|
-
|
|
698
|
-
```python
|
|
699
|
-
# 1. Increase Kafka partitions for parallelism
|
|
700
|
-
# Consumer parallelism = Kafka partitions
|
|
701
|
-
|
|
702
|
-
# 2. Tune maxOffsetsPerTrigger
|
|
703
|
-
query = df.readStream \
|
|
704
|
-
.format("kafka") \
|
|
705
|
-
.option("maxOffsetsPerTrigger", 500000) \ # More data per batch
|
|
706
|
-
.load()
|
|
707
|
-
|
|
708
|
-
# 3. Optimize shuffle partitions
|
|
709
|
-
spark.conf.set("spark.sql.shuffle.partitions", 100)
|
|
710
|
-
|
|
711
|
-
# 4. Use appropriate trigger interval
|
|
712
|
-
query = df.writeStream \
|
|
713
|
-
.trigger(processingTime="30 seconds") \
|
|
714
|
-
.start()
|
|
715
|
-
|
|
716
|
-
# 5. Enable AQE for dynamic optimization
|
|
717
|
-
spark.conf.set("spark.sql.adaptive.enabled", "true")
|
|
718
|
-
```
|
|
719
|
-
|
|
720
|
-
### Managing State Size
|
|
721
|
-
|
|
722
|
-
```python
|
|
723
|
-
# 1. Always use watermarks for stateful operations
|
|
724
|
-
df.withWatermark("event_time", "1 hour")
|
|
725
|
-
|
|
726
|
-
# 2. Monitor state size in progress
|
|
727
|
-
progress = query.lastProgress
|
|
728
|
-
for operator in progress["stateOperators"]:
|
|
729
|
-
print(f"State rows: {operator['numRowsTotal']}")
|
|
730
|
-
print(f"Memory used: {operator['memoryUsedBytes']}")
|
|
731
|
-
|
|
732
|
-
# 3. Configure state store
|
|
733
|
-
spark.conf.set("spark.sql.streaming.stateStore.providerClass",
|
|
734
|
-
"org.apache.spark.sql.execution.streaming.state.RocksDBStateStoreProvider")
|
|
735
|
-
# RocksDB handles larger state better than in-memory default
|
|
736
|
-
|
|
737
|
-
# 4. Set state cleanup mode
|
|
738
|
-
spark.conf.set("spark.sql.streaming.stateStore.stateSchemaCheck", "false")
|
|
739
|
-
```
|
|
740
|
-
|
|
741
|
-
---
|
|
742
|
-
|
|
743
|
-
## Common Anti-Patterns
|
|
744
|
-
|
|
745
|
-
```python
|
|
746
|
-
# BAD: No watermark with aggregation
|
|
747
|
-
df.groupBy("user_id").count() # Unbounded state growth!
|
|
748
|
-
|
|
749
|
-
# GOOD: Always use watermark
|
|
750
|
-
df.withWatermark("event_time", "1 hour").groupBy("user_id").count()
|
|
751
|
-
|
|
752
|
-
# BAD: Complete mode with large state
|
|
753
|
-
df.groupBy("user_id").count().writeStream.outputMode("complete") # Outputs entire state
|
|
754
|
-
|
|
755
|
-
# GOOD: Update mode for incremental
|
|
756
|
-
df.groupBy("user_id").count().writeStream.outputMode("update")
|
|
757
|
-
|
|
758
|
-
# BAD: No checkpoint location
|
|
759
|
-
query = df.writeStream.format("console").start() # No fault tolerance!
|
|
760
|
-
|
|
761
|
-
# GOOD: Always specify checkpoint
|
|
762
|
-
query = df.writeStream.format("console") \
|
|
763
|
-
.option("checkpointLocation", "/checkpoints/query") \
|
|
764
|
-
.start()
|
|
765
|
-
|
|
766
|
-
# BAD: foreach for high-throughput
|
|
767
|
-
df.writeStream.foreach(process_row).start() # Row-by-row overhead
|
|
768
|
-
|
|
769
|
-
# GOOD: foreachBatch for batched processing
|
|
770
|
-
df.writeStream.foreachBatch(process_batch).start() # Batch-level efficiency
|
|
771
|
-
```
|
|
772
|
-
|
|
773
|
-
---
|
|
774
|
-
|
|
775
|
-
## Best Practices Summary
|
|
776
|
-
|
|
777
|
-
1. **Always use watermarks** - Prevents unbounded state growth
|
|
778
|
-
2. **Choose appropriate output mode** - Append for ETL, Update for aggregations
|
|
779
|
-
3. **Set checkpoint locations** - Required for fault tolerance
|
|
780
|
-
4. **Use foreachBatch over foreach** - Better performance for custom sinks
|
|
781
|
-
5. **Monitor state size** - Watch for memory growth in progress metrics
|
|
782
|
-
6. **Tune trigger intervals** - Balance latency vs throughput
|
|
783
|
-
7. **Match Kafka partitions to parallelism** - Consumer tasks = Kafka partitions
|
|
784
|
-
8. **Use stream-static joins when possible** - Simpler than stream-stream
|
|
785
|
-
9. **Test with production data rates** - Performance varies with volume
|
|
786
|
-
10. **Enable structured streaming UI** - Detailed metrics in Spark UI
|
|
1
|
+
# Streaming Patterns
|
|
2
|
+
|
|
3
|
+
---
|
|
4
|
+
|
|
5
|
+
## Structured Streaming Overview
|
|
6
|
+
|
|
7
|
+
### When to Use Structured Streaming
|
|
8
|
+
|
|
9
|
+
**Use when:**
|
|
10
|
+
- Processing continuous data streams (Kafka, files, sockets)
|
|
11
|
+
- Need exactly-once processing guarantees
|
|
12
|
+
- Real-time analytics and dashboards
|
|
13
|
+
- Event-driven architectures
|
|
14
|
+
- Incremental ETL from streaming sources
|
|
15
|
+
|
|
16
|
+
**Consider alternatives when:**
|
|
17
|
+
- Batch processing is sufficient (lower complexity)
|
|
18
|
+
- Sub-second latency required (consider Flink)
|
|
19
|
+
- Very simple event processing (Kafka Streams may suffice)
|
|
20
|
+
|
|
21
|
+
---
|
|
22
|
+
|
|
23
|
+
## Reading from Streaming Sources
|
|
24
|
+
|
|
25
|
+
### Kafka Source
|
|
26
|
+
|
|
27
|
+
```python
|
|
28
|
+
# Read from Kafka
|
|
29
|
+
df = spark.readStream \
|
|
30
|
+
.format("kafka") \
|
|
31
|
+
.option("kafka.bootstrap.servers", "broker1:9092,broker2:9092") \
|
|
32
|
+
.option("subscribe", "topic1,topic2") \
|
|
33
|
+
.option("startingOffsets", "latest") \
|
|
34
|
+
.option("maxOffsetsPerTrigger", 100000) \
|
|
35
|
+
.option("kafka.security.protocol", "SASL_SSL") \
|
|
36
|
+
.option("kafka.sasl.mechanism", "PLAIN") \
|
|
37
|
+
.load()
|
|
38
|
+
|
|
39
|
+
# Kafka provides key, value as bytes
|
|
40
|
+
# Parse JSON value
|
|
41
|
+
from pyspark.sql import functions as F
|
|
42
|
+
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, DoubleType
|
|
43
|
+
|
|
44
|
+
schema = StructType([
|
|
45
|
+
StructField("event_id", StringType()),
|
|
46
|
+
StructField("user_id", StringType()),
|
|
47
|
+
StructField("event_time", TimestampType()),
|
|
48
|
+
StructField("amount", DoubleType())
|
|
49
|
+
])
|
|
50
|
+
|
|
51
|
+
parsed_df = df.select(
|
|
52
|
+
F.col("key").cast("string").alias("kafka_key"),
|
|
53
|
+
F.from_json(F.col("value").cast("string"), schema).alias("data"),
|
|
54
|
+
F.col("timestamp").alias("kafka_timestamp"),
|
|
55
|
+
F.col("partition"),
|
|
56
|
+
F.col("offset")
|
|
57
|
+
).select("kafka_key", "data.*", "kafka_timestamp", "partition", "offset")
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
```scala
|
|
61
|
+
// Scala Kafka source
|
|
62
|
+
val df = spark.readStream
|
|
63
|
+
.format("kafka")
|
|
64
|
+
.option("kafka.bootstrap.servers", "broker1:9092,broker2:9092")
|
|
65
|
+
.option("subscribe", "topic1")
|
|
66
|
+
.option("startingOffsets", "latest")
|
|
67
|
+
.load()
|
|
68
|
+
|
|
69
|
+
val parsed = df.select(
|
|
70
|
+
col("key").cast("string"),
|
|
71
|
+
from_json(col("value").cast("string"), schema).as("data")
|
|
72
|
+
).select("key", "data.*")
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
### File Source (Auto-Discovery)
|
|
76
|
+
|
|
77
|
+
```python
|
|
78
|
+
# Read new files as they arrive
|
|
79
|
+
df = spark.readStream \
|
|
80
|
+
.format("parquet") \
|
|
81
|
+
.schema(my_schema) \
|
|
82
|
+
.option("path", "s3://bucket/incoming/") \
|
|
83
|
+
.option("maxFilesPerTrigger", 100) \
|
|
84
|
+
.load()
|
|
85
|
+
|
|
86
|
+
# For JSON files
|
|
87
|
+
df = spark.readStream \
|
|
88
|
+
.format("json") \
|
|
89
|
+
.schema(my_schema) \
|
|
90
|
+
.option("path", "s3://bucket/incoming/") \
|
|
91
|
+
.load()
|
|
92
|
+
|
|
93
|
+
# CSV with header
|
|
94
|
+
df = spark.readStream \
|
|
95
|
+
.format("csv") \
|
|
96
|
+
.schema(my_schema) \
|
|
97
|
+
.option("path", "s3://bucket/incoming/") \
|
|
98
|
+
.option("header", "true") \
|
|
99
|
+
.load()
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
### Rate Source (Testing)
|
|
103
|
+
|
|
104
|
+
```python
|
|
105
|
+
# Generate test data at specified rate
|
|
106
|
+
df = spark.readStream \
|
|
107
|
+
.format("rate") \
|
|
108
|
+
.option("rowsPerSecond", 1000) \
|
|
109
|
+
.option("numPartitions", 10) \
|
|
110
|
+
.load()
|
|
111
|
+
|
|
112
|
+
# Columns: timestamp, value (incrementing long)
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
---
|
|
116
|
+
|
|
117
|
+
## Output Modes
|
|
118
|
+
|
|
119
|
+
### Append Mode (Default)
|
|
120
|
+
|
|
121
|
+
```python
|
|
122
|
+
# Only new rows added since last trigger
|
|
123
|
+
# Use when: No aggregations, or windowed aggregations with watermark
|
|
124
|
+
query = df.writeStream \
|
|
125
|
+
.outputMode("append") \
|
|
126
|
+
.format("parquet") \
|
|
127
|
+
.option("path", "s3://bucket/output/") \
|
|
128
|
+
.option("checkpointLocation", "s3://bucket/checkpoints/") \
|
|
129
|
+
.start()
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
### Update Mode
|
|
133
|
+
|
|
134
|
+
```python
|
|
135
|
+
# Only rows that changed since last trigger
|
|
136
|
+
# Use when: Aggregations, want incremental updates
|
|
137
|
+
query = df.groupBy("user_id").count() \
|
|
138
|
+
.writeStream \
|
|
139
|
+
.outputMode("update") \
|
|
140
|
+
.format("console") \
|
|
141
|
+
.start()
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
### Complete Mode
|
|
145
|
+
|
|
146
|
+
```python
|
|
147
|
+
# Entire result table every trigger
|
|
148
|
+
# Use when: Need full aggregation result each time
|
|
149
|
+
# Warning: Can be expensive for large state
|
|
150
|
+
query = df.groupBy("user_id").count() \
|
|
151
|
+
.writeStream \
|
|
152
|
+
.outputMode("complete") \
|
|
153
|
+
.format("console") \
|
|
154
|
+
.start()
|
|
155
|
+
```
|
|
156
|
+
|
|
157
|
+
### Mode Selection Guide
|
|
158
|
+
|
|
159
|
+
| Use Case | Output Mode | Notes |
|
|
160
|
+
|----------|-------------|-------|
|
|
161
|
+
| ETL to files | append | Default, efficient |
|
|
162
|
+
| Windowed aggregations | append | With watermark |
|
|
163
|
+
| Running counts/sums | update | Incremental |
|
|
164
|
+
| Dashboards needing full state | complete | Expensive |
|
|
165
|
+
| Deduplication | append | With dropDuplicates |
|
|
166
|
+
|
|
167
|
+
---
|
|
168
|
+
|
|
169
|
+
## Watermarks and Event Time
|
|
170
|
+
|
|
171
|
+
### Understanding Watermarks
|
|
172
|
+
|
|
173
|
+
Watermarks define how late data can arrive before being dropped. They enable Spark to:
|
|
174
|
+
- Clean up old state (bounded memory)
|
|
175
|
+
- Emit results at appropriate times
|
|
176
|
+
- Handle out-of-order events
|
|
177
|
+
|
|
178
|
+
### Setting Watermarks
|
|
179
|
+
|
|
180
|
+
```python
|
|
181
|
+
from pyspark.sql import functions as F
|
|
182
|
+
|
|
183
|
+
# Define watermark on event time column
|
|
184
|
+
df_with_watermark = df \
|
|
185
|
+
.withWatermark("event_time", "10 minutes")
|
|
186
|
+
|
|
187
|
+
# Watermark threshold: max_event_time - 10 minutes
|
|
188
|
+
# Events older than watermark are dropped
|
|
189
|
+
# State older than watermark is cleaned up
|
|
190
|
+
```
|
|
191
|
+
|
|
192
|
+
### Watermark Guidelines
|
|
193
|
+
|
|
194
|
+
| Scenario | Watermark Duration | Reasoning |
|
|
195
|
+
|----------|-------------------|-----------|
|
|
196
|
+
| Real-time analytics | 1-5 minutes | Low latency, tolerate minimal late data |
|
|
197
|
+
| Standard ETL | 10-30 minutes | Balance latency and late data |
|
|
198
|
+
| Late-arriving data common | 1-24 hours | Accommodate delayed events |
|
|
199
|
+
| Best-effort real-time | 0 minutes | No late data tolerance |
|
|
200
|
+
|
|
201
|
+
### Example with Windowed Aggregation
|
|
202
|
+
|
|
203
|
+
```python
|
|
204
|
+
from pyspark.sql import functions as F
|
|
205
|
+
from pyspark.sql.window import Window
|
|
206
|
+
|
|
207
|
+
# Streaming aggregation with watermark
|
|
208
|
+
result = df \
|
|
209
|
+
.withWatermark("event_time", "10 minutes") \
|
|
210
|
+
.groupBy(
|
|
211
|
+
F.window("event_time", "5 minutes", "1 minute"), # 5-min sliding window, 1-min slide
|
|
212
|
+
"user_id"
|
|
213
|
+
) \
|
|
214
|
+
.agg(
|
|
215
|
+
F.count("*").alias("event_count"),
|
|
216
|
+
F.sum("amount").alias("total_amount")
|
|
217
|
+
)
|
|
218
|
+
|
|
219
|
+
# Output schema includes window struct: window.start, window.end
|
|
220
|
+
query = result \
|
|
221
|
+
.select(
|
|
222
|
+
F.col("window.start").alias("window_start"),
|
|
223
|
+
F.col("window.end").alias("window_end"),
|
|
224
|
+
"user_id",
|
|
225
|
+
"event_count",
|
|
226
|
+
"total_amount"
|
|
227
|
+
) \
|
|
228
|
+
.writeStream \
|
|
229
|
+
.outputMode("append") \
|
|
230
|
+
.format("parquet") \
|
|
231
|
+
.option("path", "s3://bucket/windowed_output/") \
|
|
232
|
+
.option("checkpointLocation", "s3://bucket/checkpoints/") \
|
|
233
|
+
.start()
|
|
234
|
+
```
|
|
235
|
+
|
|
236
|
+
---
|
|
237
|
+
|
|
238
|
+
## Windowed Operations
|
|
239
|
+
|
|
240
|
+
### Tumbling Windows (Non-Overlapping)
|
|
241
|
+
|
|
242
|
+
```python
|
|
243
|
+
from pyspark.sql import functions as F
|
|
244
|
+
|
|
245
|
+
# 5-minute tumbling windows
|
|
246
|
+
result = df \
|
|
247
|
+
.withWatermark("event_time", "10 minutes") \
|
|
248
|
+
.groupBy(
|
|
249
|
+
F.window("event_time", "5 minutes"),
|
|
250
|
+
"category"
|
|
251
|
+
) \
|
|
252
|
+
.agg(F.sum("amount").alias("total"))
|
|
253
|
+
|
|
254
|
+
# Windows: [00:00-00:05), [00:05-00:10), [00:10-00:15), ...
|
|
255
|
+
```
|
|
256
|
+
|
|
257
|
+
### Sliding Windows (Overlapping)
|
|
258
|
+
|
|
259
|
+
```python
|
|
260
|
+
# 10-minute windows, sliding every 2 minutes
|
|
261
|
+
result = df \
|
|
262
|
+
.withWatermark("event_time", "10 minutes") \
|
|
263
|
+
.groupBy(
|
|
264
|
+
F.window("event_time", "10 minutes", "2 minutes"),
|
|
265
|
+
"category"
|
|
266
|
+
) \
|
|
267
|
+
.agg(F.sum("amount").alias("total"))
|
|
268
|
+
|
|
269
|
+
# Windows: [00:00-00:10), [00:02-00:12), [00:04-00:14), ...
|
|
270
|
+
```
|
|
271
|
+
|
|
272
|
+
### Session Windows (Gap-Based)
|
|
273
|
+
|
|
274
|
+
```python
|
|
275
|
+
# Session windows with 5-minute gap threshold
|
|
276
|
+
result = df \
|
|
277
|
+
.withWatermark("event_time", "10 minutes") \
|
|
278
|
+
.groupBy(
|
|
279
|
+
F.session_window("event_time", "5 minutes"), # Spark 3.2+
|
|
280
|
+
"user_id"
|
|
281
|
+
) \
|
|
282
|
+
.agg(
|
|
283
|
+
F.count("*").alias("events_in_session"),
|
|
284
|
+
F.min("event_time").alias("session_start"),
|
|
285
|
+
F.max("event_time").alias("session_end")
|
|
286
|
+
)
|
|
287
|
+
```
|
|
288
|
+
|
|
289
|
+
---
|
|
290
|
+
|
|
291
|
+
## Stateful Operations
|
|
292
|
+
|
|
293
|
+
### Aggregations (Built-in State)
|
|
294
|
+
|
|
295
|
+
```python
|
|
296
|
+
# Running count by key
|
|
297
|
+
running_counts = df \
|
|
298
|
+
.withWatermark("event_time", "1 hour") \
|
|
299
|
+
.groupBy("user_id") \
|
|
300
|
+
.agg(F.count("*").alias("total_events"))
|
|
301
|
+
|
|
302
|
+
# State stored per user_id
|
|
303
|
+
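# NOTE: the watermark only purges state when the grouping includes the event-time column or a window on it; grouping by user_id alone keeps state indefinitely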
|
|
304
|
+
```
|
|
305
|
+
|
|
306
|
+
### Deduplication
|
|
307
|
+
|
|
308
|
+
```python
|
|
309
|
+
# Drop duplicates within watermark window
|
|
310
|
+
deduped = df \
|
|
311
|
+
.withWatermark("event_time", "10 minutes") \
|
|
312
|
+
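.dropDuplicates(["event_id", "event_time"]) # Keep first occurrence; the watermark column must be among the keys for old state to be purged (or use dropDuplicatesWithinWatermark, Spark 3.5+)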
|
|
313
|
+
|
|
314
|
+
# Can also dedupe by multiple columns
|
|
315
|
+
deduped = df \
|
|
316
|
+
.withWatermark("event_time", "10 minutes") \
|
|
317
|
+
.dropDuplicates(["user_id", "event_type", "event_time"])
|
|
318
|
+
```
|
|
319
|
+
|
|
320
|
+
### Custom Stateful Processing (flatMapGroupsWithState)
|
|
321
|
+
|
|
322
|
+
```python
|
|
323
|
+
# PySpark - Custom state using applyInPandasWithState (Spark 3.4+)
|
|
324
|
+
from pyspark.sql.streaming.state import GroupState, GroupStateTimeout
|
|
325
|
+
|
|
326
|
+
def update_session_state(
|
|
327
|
+
key: tuple,
|
|
328
|
+
pdf_iter: Iterator[pd.DataFrame],
|
|
329
|
+
state: GroupState
|
|
330
|
+
) -> Iterator[pd.DataFrame]:
|
|
331
|
+
# Get or initialize state
|
|
332
|
+
if state.exists:
|
|
333
|
+
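session_data = dict(zip(("count", "total"), state.get))  # state.get returns a tuple matching stateStructType
|
|
334
|
+
else:
|
|
335
|
+
session_data = {"count": 0, "total": 0.0}
|
|
336
|
+
|
|
337
|
+
# Process input data
|
|
338
|
+
for pdf in pdf_iter:
|
|
339
|
+
session_data["count"] += len(pdf)
|
|
340
|
+
session_data["total"] += float(pdf["amount"].sum())
|
|
341
|
+
|
|
342
|
+
# Update state
|
|
343
|
+
state.update((session_data["count"], session_data["total"]))  # state.update expects a tuple matching stateStructType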
|
|
344
|
+
|
|
345
|
+
# Optionally set timeout
|
|
346
|
+
state.setTimeoutDuration(10 * 60 * 1000) # 10 minutes
|
|
347
|
+
|
|
348
|
+
# Yield output
|
|
349
|
+
yield pd.DataFrame([{
|
|
350
|
+
"user_id": key[0],
|
|
351
|
+
"event_count": session_data["count"],
|
|
352
|
+
"total_amount": session_data["total"]
|
|
353
|
+
}])
|
|
354
|
+
|
|
355
|
+
# Apply stateful function
|
|
356
|
+
result = df \
|
|
357
|
+
.withWatermark("event_time", "10 minutes") \
|
|
358
|
+
.groupBy("user_id") \
|
|
359
|
+
.applyInPandasWithState(
|
|
360
|
+
update_session_state,
|
|
361
|
+
outputStructType=output_schema,
|
|
362
|
+
stateStructType=state_schema,
|
|
363
|
+
outputMode="update",
|
|
364
|
+
timeoutConf=GroupStateTimeout.ProcessingTimeTimeout
|
|
365
|
+
)
|
|
366
|
+
```
|
|
367
|
+
|
|
368
|
+
```scala
|
|
369
|
+
// Scala flatMapGroupsWithState
|
|
370
|
+
import org.apache.spark.sql.streaming.{GroupState, GroupStateTimeout, OutputMode}
|
|
371
|
+
|
|
372
|
+
case class UserState(count: Long, totalAmount: Double)
|
|
373
|
+
case class UserOutput(userId: String, count: Long, totalAmount: Double)
|
|
374
|
+
|
|
375
|
+
def updateState(
|
|
376
|
+
userId: String,
|
|
377
|
+
events: Iterator[Event],
|
|
378
|
+
state: GroupState[UserState]
|
|
379
|
+
): Iterator[UserOutput] = {
|
|
380
|
+
|
|
381
|
+
val currentState = state.getOption.getOrElse(UserState(0, 0.0))
|
|
382
|
+
|
|
383
|
+
var newCount = currentState.count
|
|
384
|
+
var newTotal = currentState.totalAmount
|
|
385
|
+
|
|
386
|
+
events.foreach { event =>
|
|
387
|
+
newCount += 1
|
|
388
|
+
newTotal += event.amount
|
|
389
|
+
}
|
|
390
|
+
|
|
391
|
+
val newState = UserState(newCount, newTotal)
|
|
392
|
+
state.update(newState)
|
|
393
|
+
state.setTimeoutDuration("10 minutes")
|
|
394
|
+
|
|
395
|
+
Iterator(UserOutput(userId, newCount, newTotal))
|
|
396
|
+
}
|
|
397
|
+
|
|
398
|
+
val result = df
|
|
399
|
+
.withWatermark("event_time", "10 minutes")
|
|
400
|
+
.as[Event]
|
|
401
|
+
.groupByKey(_.userId)
|
|
402
|
+
.flatMapGroupsWithState(
|
|
403
|
+
OutputMode.Update,
|
|
404
|
+
GroupStateTimeout.ProcessingTimeTimeout
|
|
405
|
+
)(updateState)
|
|
406
|
+
```
|
|
407
|
+
|
|
408
|
+
---
|
|
409
|
+
|
|
410
|
+
## Streaming Joins
|
|
411
|
+
|
|
412
|
+
### Stream-Static Join
|
|
413
|
+
|
|
414
|
+
```python
|
|
415
|
+
# Join streaming data with static lookup table
|
|
416
|
+
static_df = spark.read.parquet("s3://bucket/lookup/")
|
|
417
|
+
|
|
418
|
+
# Streaming df joined with static - no watermark needed
|
|
419
|
+
result = streaming_df.join(static_df, "join_key", "left")
|
|
420
|
+
|
|
421
|
+
# Static table can be periodically refreshed
|
|
422
|
+
# Use broadcast for small static tables
|
|
423
|
+
from pyspark.sql.functions import broadcast
|
|
424
|
+
result = streaming_df.join(broadcast(static_df), "join_key")
|
|
425
|
+
```
|
|
426
|
+
|
|
427
|
+
### Stream-Stream Join
|
|
428
|
+
|
|
429
|
+
```python
|
|
430
|
+
# Join two streams - requires watermarks on both
|
|
431
|
+
from pyspark.sql import functions as F
|
|
432
|
+
|
|
433
|
+
stream1 = spark.readStream.format("kafka")...
|
|
434
|
+
stream2 = spark.readStream.format("kafka")...
|
|
435
|
+
|
|
436
|
+
# Both streams need watermarks
|
|
437
|
+
stream1_wm = stream1.withWatermark("event_time", "10 minutes").alias("stream1")
|
|
438
|
+
stream2_wm = stream2.withWatermark("event_time", "10 minutes").alias("stream2")
|
|
439
|
+
|
|
440
|
+
# Inner join with time constraint
|
|
441
|
+
result = stream1_wm.join(
|
|
442
|
+
stream2_wm,
|
|
443
|
+
F.expr("""
|
|
444
|
+
stream1.user_id = stream2.user_id AND
|
|
445
|
+
stream1.event_time >= stream2.event_time AND
|
|
446
|
+
stream1.event_time <= stream2.event_time + INTERVAL 5 MINUTES
|
|
447
|
+
"""),
|
|
448
|
+
"inner"
|
|
449
|
+
)
|
|
450
|
+
|
|
451
|
+
# Left outer join (Spark 2.3+)
|
|
452
|
+
result = stream1_wm.join(
|
|
453
|
+
stream2_wm,
|
|
454
|
+
F.expr("""
|
|
455
|
+
stream1.user_id = stream2.user_id AND
|
|
456
|
+
stream1.event_time >= stream2.event_time - INTERVAL 5 MINUTES AND
|
|
457
|
+
stream1.event_time <= stream2.event_time + INTERVAL 5 MINUTES
|
|
458
|
+
"""),
|
|
459
|
+
"leftOuter"
|
|
460
|
+
)
|
|
461
|
+
```
|
|
462
|
+
|
|
463
|
+
### Join Type Support
|
|
464
|
+
|
|
465
|
+
| Join Type | Stream-Static | Stream-Stream |
|
|
466
|
+
|-----------|---------------|---------------|
|
|
467
|
+
| Inner | Yes | Yes |
|
|
468
|
+
| Left Outer | Yes | Yes (Spark 2.3+) |
|
|
469
|
+
| Right Outer | Only when the stream is the right side | Yes (Spark 2.3+) |
|
|
470
|
+
| Full Outer | Not supported | Yes (Spark 3.1+) |
|
|
471
|
+
| Left Semi | Yes | Yes (Spark 3.1+) |
|
|
472
|
+
| Left Anti | Yes | Not supported |
|
|
473
|
+
|
|
474
|
+
---
|
|
475
|
+
|
|
476
|
+
## Sinks
|
|
477
|
+
|
|
478
|
+
### Kafka Sink
|
|
479
|
+
|
|
480
|
+
```python
|
|
481
|
+
# Write to Kafka
|
|
482
|
+
query = df \
|
|
483
|
+
.select(
|
|
484
|
+
F.col("user_id").alias("key"),
|
|
485
|
+
F.to_json(F.struct("*")).alias("value")
|
|
486
|
+
) \
|
|
487
|
+
.writeStream \
|
|
488
|
+
.format("kafka") \
|
|
489
|
+
.option("kafka.bootstrap.servers", "broker1:9092") \
|
|
490
|
+
.option("topic", "output_topic") \
|
|
491
|
+
.option("checkpointLocation", "s3://bucket/checkpoints/") \
|
|
492
|
+
.start()
|
|
493
|
+
```
|
|
494
|
+
|
|
495
|
+
### File Sink (Parquet, JSON, CSV)
|
|
496
|
+
|
|
497
|
+
```python
|
|
498
|
+
# Parquet sink with partitioning
|
|
499
|
+
query = df.writeStream \
|
|
500
|
+
.format("parquet") \
|
|
501
|
+
.option("path", "s3://bucket/output/") \
|
|
502
|
+
.option("checkpointLocation", "s3://bucket/checkpoints/") \
|
|
503
|
+
.partitionBy("date", "hour") \
|
|
504
|
+
.trigger(processingTime="1 minute") \
|
|
505
|
+
.start()
|
|
506
|
+
|
|
507
|
+
# JSON sink
|
|
508
|
+
query = df.writeStream \
|
|
509
|
+
.format("json") \
|
|
510
|
+
.option("path", "s3://bucket/output/") \
|
|
511
|
+
.option("checkpointLocation", "s3://bucket/checkpoints/") \
|
|
512
|
+
.start()
|
|
513
|
+
```
|
|
514
|
+
|
|
515
|
+
### Delta Lake Sink
|
|
516
|
+
|
|
517
|
+
```python
|
|
518
|
+
# Delta Lake (ACID transactions, schema evolution)
|
|
519
|
+
query = df.writeStream \
|
|
520
|
+
.format("delta") \
|
|
521
|
+
.outputMode("append") \
|
|
522
|
+
.option("path", "s3://bucket/delta_table/") \
|
|
523
|
+
.option("checkpointLocation", "s3://bucket/checkpoints/") \
|
|
524
|
+
.option("mergeSchema", "true") \
|
|
525
|
+
.start()
|
|
526
|
+
|
|
527
|
+
# Upsert with foreachBatch (requires: from delta.tables import DeltaTable)
|
|
528
|
+
def upsert_to_delta(batch_df, batch_id):
|
|
529
|
+
delta_table = DeltaTable.forPath(spark, "s3://bucket/delta_table/")
|
|
530
|
+
delta_table.alias("target").merge(
|
|
531
|
+
batch_df.alias("source"),
|
|
532
|
+
"target.id = source.id"
|
|
533
|
+
).whenMatchedUpdateAll() \
|
|
534
|
+
.whenNotMatchedInsertAll() \
|
|
535
|
+
.execute()
|
|
536
|
+
|
|
537
|
+
query = df.writeStream \
|
|
538
|
+
.foreachBatch(upsert_to_delta) \
|
|
539
|
+
.option("checkpointLocation", "s3://bucket/checkpoints/") \
|
|
540
|
+
.start()
|
|
541
|
+
```
|
|
542
|
+
|
|
543
|
+
### Custom Sink (foreachBatch)
|
|
544
|
+
|
|
545
|
+
```python
|
|
546
|
+
def write_to_database(batch_df, batch_id):
|
|
547
|
+
"""Write each micro-batch to external database."""
|
|
548
|
+
batch_df.write \
|
|
549
|
+
.format("jdbc") \
|
|
550
|
+
.option("url", "jdbc:postgresql://host:5432/db") \
|
|
551
|
+
.option("dbtable", "output_table") \
|
|
552
|
+
.option("user", "user") \
|
|
553
|
+
.option("password", "password") \
|
|
554
|
+
.mode("append") \
|
|
555
|
+
.save()
|
|
556
|
+
|
|
557
|
+
query = df.writeStream \
|
|
558
|
+
.foreachBatch(write_to_database) \
|
|
559
|
+
.option("checkpointLocation", "s3://bucket/checkpoints/") \
|
|
560
|
+
.trigger(processingTime="30 seconds") \
|
|
561
|
+
.start()
|
|
562
|
+
```
|
|
563
|
+
|
|
564
|
+
### foreach (Row-by-Row)
|
|
565
|
+
|
|
566
|
+
```python
|
|
567
|
+
# For custom processing of each row
|
|
568
|
+
class ForeachWriter:
|
|
569
|
+
def open(self, partition_id, epoch_id):
|
|
570
|
+
# Initialize connection
|
|
571
|
+
self.connection = create_connection()
|
|
572
|
+
return True
|
|
573
|
+
|
|
574
|
+
def process(self, row):
|
|
575
|
+
# Process each row
|
|
576
|
+
self.connection.insert(row.asDict())
|
|
577
|
+
|
|
578
|
+
def close(self, error):
|
|
579
|
+
# Clean up
|
|
580
|
+
self.connection.close()
|
|
581
|
+
|
|
582
|
+
query = df.writeStream \
|
|
583
|
+
.foreach(ForeachWriter()) \
|
|
584
|
+
.start()
|
|
585
|
+
```
|
|
586
|
+
|
|
587
|
+
---
|
|
588
|
+
|
|
589
|
+
## Triggers
|
|
590
|
+
|
|
591
|
+
### Available Trigger Types
|
|
592
|
+
|
|
593
|
+
```python
|
|
594
|
+
# Process as fast as possible (default)
|
|
595
|
+
query = df.writeStream.trigger(processingTime="0 seconds").start()
|
|
596
|
+
|
|
597
|
+
# Fixed interval
|
|
598
|
+
query = df.writeStream.trigger(processingTime="1 minute").start()
|
|
599
|
+
|
|
600
|
+
# Once - process all available data in a single batch, then stop (deprecated since Spark 3.4; prefer availableNow)
|
|
601
|
+
query = df.writeStream.trigger(once=True).start()
|
|
602
|
+
|
|
603
|
+
# Available now - process all available data (Spark 3.3+)
|
|
604
|
+
query = df.writeStream.trigger(availableNow=True).start()
|
|
605
|
+
|
|
606
|
+
# Continuous processing (experimental, low latency)
|
|
607
|
+
query = df.writeStream.trigger(continuous="1 second").start()
|
|
608
|
+
```
|
|
609
|
+
|
|
610
|
+
### Trigger Selection Guide
|
|
611
|
+
|
|
612
|
+
| Trigger | Use Case |
|
|
613
|
+
|---------|----------|
|
|
614
|
+
| processingTime="0 seconds" | Maximum throughput |
|
|
615
|
+
| processingTime="N seconds" | Controlled resource usage |
|
|
616
|
+
| once=True | Batch-style processing |
|
|
617
|
+
| availableNow=True | Catch-up processing |
|
|
618
|
+
| continuous="N ms" | Ultra-low latency (experimental) |
|
|
619
|
+
|
|
620
|
+
---
|
|
621
|
+
|
|
622
|
+
## Monitoring and Management
|
|
623
|
+
|
|
624
|
+
### Query Management
|
|
625
|
+
|
|
626
|
+
```python
|
|
627
|
+
# Start query and get handle
|
|
628
|
+
query = df.writeStream.format("console").start()
|
|
629
|
+
|
|
630
|
+
# Query properties
|
|
631
|
+
print(f"Query ID: {query.id}")
|
|
632
|
+
print(f"Run ID: {query.runId}")
|
|
633
|
+
print(f"Name: {query.name}")
|
|
634
|
+
print(f"Is Active: {query.isActive}")
|
|
635
|
+
print(f"Status: {query.status}")
|
|
636
|
+
print(f"Last Progress: {query.lastProgress}")
|
|
637
|
+
print(f"Recent Progress: {query.recentProgress}")
|
|
638
|
+
|
|
639
|
+
# Wait for termination
|
|
640
|
+
query.awaitTermination()
|
|
641
|
+
query.awaitTermination(timeout=60) # With timeout
|
|
642
|
+
|
|
643
|
+
# Stop query
|
|
644
|
+
query.stop()
|
|
645
|
+
|
|
646
|
+
# Get exception if failed
|
|
647
|
+
exception = query.exception()
|
|
648
|
+
```
|
|
649
|
+
|
|
650
|
+
### Progress Monitoring
|
|
651
|
+
|
|
652
|
+
```python
|
|
653
|
+
# Get latest progress
|
|
654
|
+
progress = query.lastProgress
|
|
655
|
+
if progress:
|
|
656
|
+
print(f"Input rows/sec: {progress['inputRowsPerSecond']}")
|
|
657
|
+
print(f"Processed rows/sec: {progress['processedRowsPerSecond']}")
|
|
658
|
+
print(f"Batch ID: {progress['batchId']}")
|
|
659
|
+
print(f"Duration: {progress['batchDuration']} ms")
|
|
660
|
+
print(f"State rows: {progress['stateOperators']}")
|
|
661
|
+
|
|
662
|
+
# Custom progress listener
|
|
663
|
+
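class ProgressListener(StreamingQueryListener):  # PySpark 3.4+: from pyspark.sql.streaming import StreamingQueryListener; onQueryStarted must also be defined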
|
|
664
|
+
def onQueryProgress(self, event):
|
|
665
|
+
print(f"Progress: {event.progress}")
|
|
666
|
+
|
|
667
|
+
def onQueryTerminated(self, event):
|
|
668
|
+
print(f"Terminated: {event.exception}")
|
|
669
|
+
|
|
670
|
+
spark.streams.addListener(ProgressListener())
|
|
671
|
+
```
|
|
672
|
+
|
|
673
|
+
### Checkpointing
|
|
674
|
+
|
|
675
|
+
```python
|
|
676
|
+
# Checkpoint location is required for fault tolerance
|
|
677
|
+
query = df.writeStream \
|
|
678
|
+
.format("parquet") \
|
|
679
|
+
.option("path", "s3://bucket/output/") \
|
|
680
|
+
.option("checkpointLocation", "s3://bucket/checkpoints/query_name/") \
|
|
681
|
+
.start()
|
|
682
|
+
|
|
683
|
+
# Checkpoint contains:
|
|
684
|
+
# - Offsets (what data has been processed)
|
|
685
|
+
# - State (for stateful operations)
|
|
686
|
+
# - Commits (what batches completed)
|
|
687
|
+
|
|
688
|
+
# Recovery: Query restarts from last checkpoint automatically
|
|
689
|
+
# Clean start: Delete checkpoint directory (loses state!)
|
|
690
|
+
```
|
|
691
|
+
|
|
692
|
+
---
|
|
693
|
+
|
|
694
|
+
## Performance Patterns
|
|
695
|
+
|
|
696
|
+
### Optimizing Throughput
|
|
697
|
+
|
|
698
|
+
```python
|
|
699
|
+
# 1. Increase Kafka partitions for parallelism
|
|
700
|
+
# Consumer parallelism = Kafka partitions
|
|
701
|
+
|
|
702
|
+
# 2. Tune maxOffsetsPerTrigger (more data per batch)
|
|
703
|
+
df = spark.readStream \
|
|
704
|
+
.format("kafka") \
|
|
705
|
+
.option("maxOffsetsPerTrigger", 500000) \
|
|
706
|
+
.load()
|
|
707
|
+
|
|
708
|
+
# 3. Optimize shuffle partitions
|
|
709
|
+
spark.conf.set("spark.sql.shuffle.partitions", 100)
|
|
710
|
+
|
|
711
|
+
# 4. Use appropriate trigger interval
|
|
712
|
+
query = df.writeStream \
|
|
713
|
+
.trigger(processingTime="30 seconds") \
|
|
714
|
+
.start()
|
|
715
|
+
|
|
716
|
+
# 5. Enable AQE (helps batch work, e.g. inside foreachBatch; Spark may disable AQE for the streaming plan itself)
|
|
717
|
+
spark.conf.set("spark.sql.adaptive.enabled", "true")
|
|
718
|
+
```
|
|
719
|
+
|
|
720
|
+
### Managing State Size
|
|
721
|
+
|
|
722
|
+
```python
|
|
723
|
+
# 1. Always use watermarks for stateful operations
|
|
724
|
+
df.withWatermark("event_time", "1 hour")
|
|
725
|
+
|
|
726
|
+
# 2. Monitor state size in progress
|
|
727
|
+
progress = query.lastProgress
|
|
728
|
+
for operator in progress["stateOperators"]:
|
|
729
|
+
print(f"State rows: {operator['numRowsTotal']}")
|
|
730
|
+
print(f"Memory used: {operator['memoryUsedBytes']}")
|
|
731
|
+
|
|
732
|
+
# 3. Configure state store
|
|
733
|
+
spark.conf.set("spark.sql.streaming.stateStore.providerClass",
|
|
734
|
+
"org.apache.spark.sql.execution.streaming.state.RocksDBStateStoreProvider")
|
|
735
|
+
# RocksDB handles larger state better than in-memory default
|
|
736
|
+
|
|
737
|
+
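# 4. Skip state schema validation on restart (not a cleanup setting; use with care, incompatible state will no longer be rejected)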
|
|
738
|
+
spark.conf.set("spark.sql.streaming.stateStore.stateSchemaCheck", "false")
|
|
739
|
+
```
|
|
740
|
+
|
|
741
|
+
---
|
|
742
|
+
|
|
743
|
+
## Common Anti-Patterns
|
|
744
|
+
|
|
745
|
+
```python
|
|
746
|
+
# BAD: No watermark with aggregation
|
|
747
|
+
df.groupBy("user_id").count() # Unbounded state growth!
|
|
748
|
+
|
|
749
|
+
# GOOD: Always use watermark
|
|
750
|
+
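df.withWatermark("event_time", "1 hour").groupBy(window("event_time", "10 minutes"), "user_id").count()  # state is only evicted when grouping on an event-time window/column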
|
|
751
|
+
|
|
752
|
+
# BAD: Complete mode with large state
|
|
753
|
+
df.groupBy("user_id").count().writeStream.outputMode("complete") # Outputs entire state
|
|
754
|
+
|
|
755
|
+
# GOOD: Update mode for incremental
|
|
756
|
+
df.groupBy("user_id").count().writeStream.outputMode("update")
|
|
757
|
+
|
|
758
|
+
# BAD: No checkpoint location
|
|
759
|
+
query = df.writeStream.format("console").start() # No fault tolerance!
|
|
760
|
+
|
|
761
|
+
# GOOD: Always specify checkpoint
|
|
762
|
+
query = df.writeStream.format("console") \
|
|
763
|
+
.option("checkpointLocation", "/checkpoints/query") \
|
|
764
|
+
.start()
|
|
765
|
+
|
|
766
|
+
# BAD: foreach for high-throughput
|
|
767
|
+
df.writeStream.foreach(process_row).start() # Row-by-row overhead
|
|
768
|
+
|
|
769
|
+
# GOOD: foreachBatch for batched processing
|
|
770
|
+
df.writeStream.foreachBatch(process_batch).start() # Batch-level efficiency
|
|
771
|
+
```
|
|
772
|
+
|
|
773
|
+
---
|
|
774
|
+
|
|
775
|
+
## Best Practices Summary
|
|
776
|
+
|
|
777
|
+
1. **Always use watermarks** - Prevents unbounded state growth
|
|
778
|
+
2. **Choose appropriate output mode** - Append for ETL, Update for aggregations
|
|
779
|
+
3. **Set checkpoint locations** - Required for fault tolerance
|
|
780
|
+
4. **Use foreachBatch over foreach** - Better performance for custom sinks
|
|
781
|
+
5. **Monitor state size** - Watch for memory growth in progress metrics
|
|
782
|
+
6. **Tune trigger intervals** - Balance latency vs throughput
|
|
783
|
+
7. **Match Kafka partitions to parallelism** - Consumer tasks = Kafka partitions
|
|
784
|
+
8. **Use stream-static joins when possible** - Simpler than stream-stream
|
|
785
|
+
9. **Test with production data rates** - Performance varies with volume
|
|
786
|
+
10. **Enable structured streaming UI** - Detailed metrics in Spark UI
|