@sylix/coworker 2.0.11 → 2.0.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (169) hide show
  1. package/dist/commands/slash/config.d.ts.map +1 -1
  2. package/dist/commands/slash/config.js +22 -4
  3. package/dist/commands/slash/config.js.map +1 -1
  4. package/dist/core/CoWorkerAgent.d.ts.map +1 -1
  5. package/dist/core/CoWorkerAgent.js +6 -3
  6. package/dist/core/CoWorkerAgent.js.map +1 -1
  7. package/dist/skills/defaults/accessibility/screen-reader-testing.md +545 -0
  8. package/dist/skills/defaults/accessibility/wcag-audit-patterns.md +555 -0
  9. package/dist/skills/defaults/ai-ml/rag.md +276 -0
  10. package/dist/skills/defaults/backend-development/api-design-principles.md +528 -0
  11. package/dist/skills/defaults/backend-development/api-design.md +285 -0
  12. package/dist/skills/defaults/backend-development/architecture-patterns.md +494 -0
  13. package/dist/skills/defaults/backend-development/async-python.md +237 -0
  14. package/dist/skills/defaults/backend-development/auth-implementation-patterns.md +638 -0
  15. package/dist/skills/defaults/backend-development/bazel-build-optimization.md +387 -0
  16. package/dist/skills/defaults/backend-development/billing-automation/SKILL.md +566 -0
  17. package/dist/skills/defaults/backend-development/code-review-excellence.md +538 -0
  18. package/dist/skills/defaults/backend-development/cqrs-implementation.md +554 -0
  19. package/dist/skills/defaults/backend-development/database-design.md +305 -0
  20. package/dist/skills/defaults/backend-development/debugging-strategies.md +536 -0
  21. package/dist/skills/defaults/backend-development/e2e-testing-patterns.md +544 -0
  22. package/dist/skills/defaults/backend-development/error-handling-patterns.md +641 -0
  23. package/dist/skills/defaults/backend-development/fastapi-templates.md +559 -0
  24. package/dist/skills/defaults/backend-development/fastapi.md +309 -0
  25. package/dist/skills/defaults/backend-development/git-advanced-workflows.md +405 -0
  26. package/dist/skills/defaults/backend-development/microservices-patterns.md +595 -0
  27. package/dist/skills/defaults/backend-development/microservices.md +284 -0
  28. package/dist/skills/defaults/backend-development/monorepo-management.md +623 -0
  29. package/dist/skills/defaults/backend-development/nodejs-backend-patterns.md +1048 -0
  30. package/dist/skills/defaults/backend-development/nx-workspace-patterns.md +457 -0
  31. package/dist/skills/defaults/backend-development/paypal-integration/SKILL.md +478 -0
  32. package/dist/skills/defaults/backend-development/pci-compliance/SKILL.md +480 -0
  33. package/dist/skills/defaults/backend-development/python-anti-patterns.md +349 -0
  34. package/dist/skills/defaults/backend-development/python-background-jobs.md +364 -0
  35. package/dist/skills/defaults/backend-development/python-code-style.md +360 -0
  36. package/dist/skills/defaults/backend-development/python-configuration.md +368 -0
  37. package/dist/skills/defaults/backend-development/python-design-patterns.md +296 -0
  38. package/dist/skills/defaults/backend-development/python-error-handling.md +323 -0
  39. package/dist/skills/defaults/backend-development/python-packaging.md +887 -0
  40. package/dist/skills/defaults/backend-development/python-performance-optimization.md +874 -0
  41. package/dist/skills/defaults/backend-development/python-project-structure.md +252 -0
  42. package/dist/skills/defaults/backend-development/python-resilience.md +376 -0
  43. package/dist/skills/defaults/backend-development/python-resource-management.md +421 -0
  44. package/dist/skills/defaults/backend-development/python-type-safety.md +428 -0
  45. package/dist/skills/defaults/backend-development/sql-optimization-patterns.md +509 -0
  46. package/dist/skills/defaults/backend-development/stripe-integration/SKILL.md +522 -0
  47. package/dist/skills/defaults/backend-development/turborepo-caching.md +376 -0
  48. package/dist/skills/defaults/blockchain/defi-protocol-templates.md +430 -0
  49. package/dist/skills/defaults/blockchain/nft-standards.md +364 -0
  50. package/dist/skills/defaults/blockchain/solidity-security.md +514 -0
  51. package/dist/skills/defaults/blockchain/web3-testing.md +360 -0
  52. package/dist/skills/defaults/business/competitive-landscape/SKILL.md +527 -0
  53. package/dist/skills/defaults/business/market-sizing-analysis/SKILL.md +451 -0
  54. package/dist/skills/defaults/business/startup-financial-modeling/SKILL.md +494 -0
  55. package/dist/skills/defaults/business/startup-metrics-framework/SKILL.md +564 -0
  56. package/dist/skills/defaults/business/team-composition-analysis.md +437 -0
  57. package/dist/skills/defaults/compliance/employment-contract-templates/SKILL.md +527 -0
  58. package/dist/skills/defaults/compliance/gdpr-data-handling/SKILL.md +630 -0
  59. package/dist/skills/defaults/data-engineering/airflow-dag-patterns.md +436 -0
  60. package/dist/skills/defaults/data-engineering/airflow.md +519 -0
  61. package/dist/skills/defaults/data-engineering/data-quality.md +583 -0
  62. package/dist/skills/defaults/data-engineering/dbt-transformation-patterns.md +482 -0
  63. package/dist/skills/defaults/data-engineering/dbt.md +556 -0
  64. package/dist/skills/defaults/data-engineering/ml-pipeline-workflow/SKILL.md +247 -0
  65. package/dist/skills/defaults/data-engineering/spark-optimization.md +348 -0
  66. package/dist/skills/defaults/data-engineering/spark.md +411 -0
  67. package/dist/skills/defaults/database/postgresql.md +202 -0
  68. package/dist/skills/defaults/debugging/systematic-debugging.md +249 -0
  69. package/dist/skills/defaults/devops/architecture-decision-records.md +448 -0
  70. package/dist/skills/defaults/devops/changelog-automation.md +580 -0
  71. package/dist/skills/defaults/devops/cicd.md +314 -0
  72. package/dist/skills/defaults/devops/cloud.md +263 -0
  73. package/dist/skills/defaults/devops/code-review-excellence.md +299 -0
  74. package/dist/skills/defaults/devops/cost-optimization.md +295 -0
  75. package/dist/skills/defaults/devops/deployment-pipeline-design.md +356 -0
  76. package/dist/skills/defaults/devops/docker.md +281 -0
  77. package/dist/skills/defaults/devops/git-workflows.md +205 -0
  78. package/dist/skills/defaults/devops/github-actions.md +311 -0
  79. package/dist/skills/defaults/devops/gitlab-ci-patterns.md +266 -0
  80. package/dist/skills/defaults/devops/hybrid-cloud-networking.md +241 -0
  81. package/dist/skills/defaults/devops/istio-traffic-management.md +327 -0
  82. package/dist/skills/defaults/devops/kubernetes.md +339 -0
  83. package/dist/skills/defaults/devops/linkerd-patterns.md +311 -0
  84. package/dist/skills/defaults/devops/multi-cloud-architecture.md +181 -0
  85. package/dist/skills/defaults/devops/observability.md +243 -0
  86. package/dist/skills/defaults/devops/openapi-spec-generation.md +1024 -0
  87. package/dist/skills/defaults/devops/postmortem-writing.md +396 -0
  88. package/dist/skills/defaults/devops/prometheus-configuration.md +265 -0
  89. package/dist/skills/defaults/devops/secrets-management.md +341 -0
  90. package/dist/skills/defaults/devops/service-mesh-observability.md +385 -0
  91. package/dist/skills/defaults/devops/terraform-module-library.md +244 -0
  92. package/dist/skills/defaults/finance/backtesting-frameworks/SKILL.md +663 -0
  93. package/dist/skills/defaults/finance/risk-metrics-calculation/SKILL.md +557 -0
  94. package/dist/skills/defaults/frontend/accessibility-compliance.md +420 -0
  95. package/dist/skills/defaults/frontend/design-system-patterns.md +337 -0
  96. package/dist/skills/defaults/frontend/interaction-design.md +327 -0
  97. package/dist/skills/defaults/frontend/javascript.md +311 -0
  98. package/dist/skills/defaults/frontend/modern-javascript-patterns.md +927 -0
  99. package/dist/skills/defaults/frontend/react-native-design.md +440 -0
  100. package/dist/skills/defaults/frontend/react.md +345 -0
  101. package/dist/skills/defaults/frontend/responsive-design.md +472 -0
  102. package/dist/skills/defaults/frontend/tailwind-design-system.md +337 -0
  103. package/dist/skills/defaults/frontend/typescript-advanced-types.md +724 -0
  104. package/dist/skills/defaults/frontend/typescript.md +334 -0
  105. package/dist/skills/defaults/frontend/visual-design-foundations.md +326 -0
  106. package/dist/skills/defaults/frontend/web-component-design.md +279 -0
  107. package/dist/skills/defaults/game-development/godot-gdscript-patterns.md +188 -0
  108. package/dist/skills/defaults/game-development/unity-ecs-patterns.md +594 -0
  109. package/dist/skills/defaults/kubernetes/gitops-workflow.md +285 -0
  110. package/dist/skills/defaults/kubernetes/gitops.md +280 -0
  111. package/dist/skills/defaults/kubernetes/helm-chart-scaffolding.md +553 -0
  112. package/dist/skills/defaults/kubernetes/helm.md +343 -0
  113. package/dist/skills/defaults/kubernetes/k8s-manifest-generator.md +501 -0
  114. package/dist/skills/defaults/kubernetes/k8s-security-policies.md +342 -0
  115. package/dist/skills/defaults/kubernetes/manifests.md +330 -0
  116. package/dist/skills/defaults/kubernetes/security.md +337 -0
  117. package/dist/skills/defaults/llm-application/embedding-strategies.md +608 -0
  118. package/dist/skills/defaults/llm-application/hybrid-search-implementation.md +570 -0
  119. package/dist/skills/defaults/llm-application/hybrid-search.md +570 -0
  120. package/dist/skills/defaults/llm-application/langchain-architecture.md +666 -0
  121. package/dist/skills/defaults/llm-application/langchain.md +259 -0
  122. package/dist/skills/defaults/llm-application/llm-evaluation.md +695 -0
  123. package/dist/skills/defaults/llm-application/prompt-engineering-patterns.md +449 -0
  124. package/dist/skills/defaults/llm-application/prompt-engineering.md +219 -0
  125. package/dist/skills/defaults/llm-application/rag-implementation.md +434 -0
  126. package/dist/skills/defaults/llm-application/similarity-search-patterns.md +560 -0
  127. package/dist/skills/defaults/llm-application/similarity-search.md +560 -0
  128. package/dist/skills/defaults/llm-application/vector-index-tuning.md +523 -0
  129. package/dist/skills/defaults/mobile/mobile-android-design.md +440 -0
  130. package/dist/skills/defaults/mobile/mobile-ios-design.md +266 -0
  131. package/dist/skills/defaults/monitoring/distributed-tracing.md +436 -0
  132. package/dist/skills/defaults/monitoring/grafana-dashboards.md +370 -0
  133. package/dist/skills/defaults/monitoring/prometheus-configuration.md +379 -0
  134. package/dist/skills/defaults/monitoring/slo-implementation.md +323 -0
  135. package/dist/skills/defaults/refactoring/code-refactoring.md +349 -0
  136. package/dist/skills/defaults/security/anti-reversing-techniques/SKILL.md +559 -0
  137. package/dist/skills/defaults/security/auditor.md +168 -0
  138. package/dist/skills/defaults/security/binary-analysis-patterns/SKILL.md +438 -0
  139. package/dist/skills/defaults/security/memory-forensics/SKILL.md +483 -0
  140. package/dist/skills/defaults/security/mtls-configuration.md +349 -0
  141. package/dist/skills/defaults/security/protocol-reverse-engineering/SKILL.md +520 -0
  142. package/dist/skills/defaults/security/sast-configuration.md +182 -0
  143. package/dist/skills/defaults/security/security.md +313 -0
  144. package/dist/skills/defaults/security/stride-analysis.md +273 -0
  145. package/dist/skills/defaults/security/threat-mitigation-mapping.md +290 -0
  146. package/dist/skills/defaults/systems/bash-defensive-patterns/SKILL.md +539 -0
  147. package/dist/skills/defaults/systems/bats-testing-patterns/SKILL.md +631 -0
  148. package/dist/skills/defaults/systems/go-concurrency-patterns.md +657 -0
  149. package/dist/skills/defaults/systems/memory-safety-patterns.md +605 -0
  150. package/dist/skills/defaults/systems/rust-async-patterns.md +519 -0
  151. package/dist/skills/defaults/systems/shellcheck-configuration/SKILL.md +456 -0
  152. package/dist/skills/defaults/team-collaboration/multi-reviewer-patterns.md +126 -0
  153. package/dist/skills/defaults/team-collaboration/parallel-feature-development.md +151 -0
  154. package/dist/skills/defaults/testing/javascript-testing-patterns.md +1021 -0
  155. package/dist/skills/defaults/testing/python-testing-patterns.md +351 -0
  156. package/dist/skills/defaults/testing/testing.md +332 -0
  157. package/dist/skills/defaults/workflows/context-driven-development.md +384 -0
  158. package/dist/skills/defaults/workflows/track-management.md +592 -0
  159. package/dist/skills/defaults/workflows/workflow-patterns.md +622 -0
  160. package/dist/skills/index.d.ts +11 -0
  161. package/dist/skills/index.d.ts.map +1 -0
  162. package/dist/skills/index.js +129 -0
  163. package/dist/skills/index.js.map +1 -0
  164. package/dist/utils/character.js +4 -4
  165. package/dist/utils/character.js.map +1 -1
  166. package/dist/utils/inputbar.d.ts.map +1 -1
  167. package/dist/utils/inputbar.js +7 -0
  168. package/dist/utils/inputbar.js.map +1 -1
  169. package/package.json +1 -1
@@ -0,0 +1,247 @@
1
+ ---
2
+ name: ml-pipeline-workflow
3
+ description: Build end-to-end MLOps pipelines from data preparation through model training, validation, and production deployment. Use when creating ML pipelines, implementing MLOps practices, or automating model training and deployment workflows.
4
+ ---
5
+
6
+ # ML Pipeline Workflow
7
+
8
+ Complete end-to-end MLOps pipeline orchestration from data preparation through model deployment.
9
+
10
+ ## Overview
11
+
12
+ This skill provides comprehensive guidance for building production ML pipelines that handle the full lifecycle: data ingestion → preparation → training → validation → deployment → monitoring.
13
+
14
+ ## When to Use This Skill
15
+
16
+ - Building new ML pipelines from scratch
17
+ - Designing workflow orchestration for ML systems
18
+ - Implementing data → model → deployment automation
19
+ - Setting up reproducible training workflows
20
+ - Creating DAG-based ML orchestration
21
+ - Integrating ML components into production systems
22
+
23
+ ## What This Skill Provides
24
+
25
+ ### Core Capabilities
26
+
27
+ 1. **Pipeline Architecture**
28
+ - End-to-end workflow design
29
+ - DAG orchestration patterns (Airflow, Dagster, Kubeflow)
30
+ - Component dependencies and data flow
31
+ - Error handling and retry strategies
32
+
33
+ 2. **Data Preparation**
34
+ - Data validation and quality checks
35
+ - Feature engineering pipelines
36
+ - Data versioning and lineage
37
+ - Train/validation/test splitting strategies
38
+
39
+ 3. **Model Training**
40
+ - Training job orchestration
41
+ - Hyperparameter management
42
+ - Experiment tracking integration
43
+ - Distributed training patterns
44
+
45
+ 4. **Model Validation**
46
+ - Validation frameworks and metrics
47
+ - A/B testing infrastructure
48
+ - Performance regression detection
49
+ - Model comparison workflows
50
+
51
+ 5. **Deployment Automation**
52
+ - Model serving patterns
53
+ - Canary deployments
54
+ - Blue-green deployment strategies
55
+ - Rollback mechanisms
56
+
57
+ ### Reference Documentation
58
+
59
+ See the `references/` directory for detailed guides:
60
+
61
+ - **data-preparation.md** - Data cleaning, validation, and feature engineering
62
+ - **model-training.md** - Training workflows and best practices
63
+ - **model-validation.md** - Validation strategies and metrics
64
+ - **model-deployment.md** - Deployment patterns and serving architectures
65
+
66
+ ### Assets and Templates
67
+
68
+ The `assets/` directory contains:
69
+
70
+ - **pipeline-dag.yaml.template** - DAG template for workflow orchestration
71
+ - **training-config.yaml** - Training configuration template
72
+ - **validation-checklist.md** - Pre-deployment validation checklist
73
+
74
+ ## Usage Patterns
75
+
76
+ ### Basic Pipeline Setup
77
+
78
+ ```python
79
+ # 1. Define pipeline stages
80
+ stages = [
81
+ "data_ingestion",
82
+ "data_validation",
83
+ "feature_engineering",
84
+ "model_training",
85
+ "model_validation",
86
+ "model_deployment"
87
+ ]
88
+
89
+ # 2. Configure dependencies
90
+ # See assets/pipeline-dag.yaml.template for full example
91
+ ```
92
+
93
+ ### Production Workflow
94
+
95
+ 1. **Data Preparation Phase**
96
+ - Ingest raw data from sources
97
+ - Run data quality checks
98
+ - Apply feature transformations
99
+ - Version processed datasets
100
+
101
+ 2. **Training Phase**
102
+ - Load versioned training data
103
+ - Execute training jobs
104
+ - Track experiments and metrics
105
+ - Save trained models
106
+
107
+ 3. **Validation Phase**
108
+ - Run validation test suite
109
+ - Compare against baseline
110
+ - Generate performance reports
111
+ - Approve for deployment
112
+
113
+ 4. **Deployment Phase**
114
+ - Package model artifacts
115
+ - Deploy to serving infrastructure
116
+ - Configure monitoring
117
+ - Validate production traffic
118
+
119
+ ## Best Practices
120
+
121
+ ### Pipeline Design
122
+
123
+ - **Modularity**: Each stage should be independently testable
124
+ - **Idempotency**: Re-running stages should be safe
125
+ - **Observability**: Log metrics at every stage
126
+ - **Versioning**: Track data, code, and model versions
127
+ - **Failure Handling**: Implement retry logic and alerting
128
+
129
+ ### Data Management
130
+
131
+ - Use data validation libraries (Great Expectations, TensorFlow Data Validation (TFDV))
132
+ - Version datasets with DVC or similar tools
133
+ - Document feature engineering transformations
134
+ - Maintain data lineage tracking
135
+
136
+ ### Model Operations
137
+
138
+ - Separate training and serving infrastructure
139
+ - Use model registries (MLflow, Weights & Biases)
140
+ - Implement gradual rollouts for new models
141
+ - Monitor model performance drift
142
+ - Maintain rollback capabilities
143
+
144
+ ### Deployment Strategies
145
+
146
+ - Start with shadow deployments
147
+ - Use canary releases for validation
148
+ - Implement A/B testing infrastructure
149
+ - Set up automated rollback triggers
150
+ - Monitor latency and throughput
151
+
152
+ ## Integration Points
153
+
154
+ ### Orchestration Tools
155
+
156
+ - **Apache Airflow**: DAG-based workflow orchestration
157
+ - **Dagster**: Asset-based pipeline orchestration
158
+ - **Kubeflow Pipelines**: Kubernetes-native ML workflows
159
+ - **Prefect**: Modern dataflow automation
160
+
161
+ ### Experiment Tracking
162
+
163
+ - MLflow for experiment tracking and model registry
164
+ - Weights & Biases for visualization and collaboration
165
+ - TensorBoard for training metrics
166
+
167
+ ### Deployment Platforms
168
+
169
+ - AWS SageMaker for managed ML infrastructure
170
+ - Google Vertex AI for GCP deployments
171
+ - Azure ML for Azure cloud
172
+ - Kubernetes + KServe for cloud-agnostic serving
173
+
174
+ ## Progressive Disclosure
175
+
176
+ Start with the basics and gradually add complexity:
177
+
178
+ 1. **Level 1**: Simple linear pipeline (data → train → deploy)
179
+ 2. **Level 2**: Add validation and monitoring stages
180
+ 3. **Level 3**: Implement hyperparameter tuning
181
+ 4. **Level 4**: Add A/B testing and gradual rollouts
182
+ 5. **Level 5**: Multi-model pipelines with ensemble strategies
183
+
184
+ ## Common Patterns
185
+
186
+ ### Batch Training Pipeline
187
+
188
+ ```yaml
189
+ # See assets/pipeline-dag.yaml.template
190
+ stages:
191
+ - name: data_preparation
192
+ dependencies: []
193
+ - name: model_training
194
+ dependencies: [data_preparation]
195
+ - name: model_evaluation
196
+ dependencies: [model_training]
197
+ - name: model_deployment
198
+ dependencies: [model_evaluation]
199
+ ```
200
+
201
+ ### Real-time Feature Pipeline
202
+
203
+ ```python
204
+ # Stream processing for real-time features
205
+ # Combined with batch training
206
+ # See references/data-preparation.md
207
+ ```
208
+
209
+ ### Continuous Training
210
+
211
+ ```python
212
+ # Automated retraining on schedule
213
+ # Triggered by data drift detection
214
+ # See references/model-training.md
215
+ ```
216
+
217
+ ## Troubleshooting
218
+
219
+ ### Common Issues
220
+
221
+ - **Pipeline failures**: Check dependencies and data availability
222
+ - **Training instability**: Review hyperparameters and data quality
223
+ - **Deployment issues**: Validate model artifacts and serving config
224
+ - **Performance degradation**: Monitor data drift and model metrics
225
+
226
+ ### Debugging Steps
227
+
228
+ 1. Check pipeline logs for each stage
229
+ 2. Validate input/output data at boundaries
230
+ 3. Test components in isolation
231
+ 4. Review experiment tracking metrics
232
+ 5. Inspect model artifacts and metadata
233
+
234
+ ## Next Steps
235
+
236
+ After setting up your pipeline:
237
+
238
+ 1. Explore **hyperparameter-tuning** skill for optimization
239
+ 2. Learn **experiment-tracking-setup** for MLflow/W&B
240
+ 3. Review **model-deployment-patterns** for serving strategies
241
+ 4. Implement monitoring with observability tools
242
+
243
+ ## Related Skills
244
+
245
+ - **experiment-tracking-setup**: MLflow and Weights & Biases integration
246
+ - **hyperparameter-tuning**: Automated hyperparameter optimization
247
+ - **model-deployment-patterns**: Advanced deployment strategies
@@ -0,0 +1,348 @@
1
+ ---
2
+ name: spark-optimization
3
+ description: Optimize Apache Spark jobs with partitioning, caching, shuffle optimization, and memory tuning
4
+ ---
5
+
6
+ # Apache Spark Optimization
7
+
8
+ Production patterns for optimizing Apache Spark jobs including partitioning strategies, memory management, shuffle optimization, and performance tuning.
9
+
10
+ ## When to Use This Skill
11
+
12
+ - Optimizing slow Spark jobs
13
+ - Tuning memory and executor configuration
14
+ - Implementing efficient partitioning strategies
15
+ - Debugging Spark performance issues
16
+ - Scaling Spark pipelines for large datasets
17
+ - Reducing shuffle and data skew
18
+
19
+ ## Core Concepts
20
+
21
+ ### Spark Execution Model
22
+
23
+ ```
24
+ Driver Program
25
+ ↓
26
+ Job (triggered by action)
27
+ ↓
28
+ Stages (separated by shuffles)
29
+ ↓
30
+ Tasks (one per partition)
31
+ ```
32
+
33
+ ### Key Performance Factors
34
+
35
+ | Factor | Impact | Solution |
36
+ | ----------------- | --------------------- | ----------------------------- |
37
+ | **Shuffle** | Network I/O, disk I/O | Minimize wide transformations |
38
+ | **Data Skew** | Uneven task duration | Salting, broadcast joins |
39
+ | **Serialization** | CPU overhead | Use Kryo, columnar formats |
40
+ | **Memory** | GC pressure, spills | Tune executor memory |
41
+ | **Partitions** | Parallelism | Right-size partitions |
42
+
43
+ ## Quick Start
44
+
45
+ ```python
46
+ from pyspark.sql import SparkSession
47
+ from pyspark.sql import functions as F
48
+
49
+ spark = (SparkSession.builder
50
+ .appName("OptimizedJob")
51
+ .config("spark.sql.adaptive.enabled", "true")
52
+ .config("spark.sql.adaptive.coalescePartitions.enabled", "true")
53
+ .config("spark.sql.adaptive.skewJoin.enabled", "true")
54
+ .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
55
+ .config("spark.sql.shuffle.partitions", "200")
56
+ .getOrCreate())
57
+
58
+ df = (spark.read
59
+ .format("parquet")
60
+ .option("mergeSchema", "false")
61
+ .load("s3://bucket/data/"))
62
+
63
+ result = (df
64
+ .filter(F.col("date") >= "2024-01-01")
65
+ .select("id", "amount", "category")
66
+ .groupBy("category")
67
+ .agg(F.sum("amount").alias("total")))
68
+
69
+ result.write.mode("overwrite").parquet("s3://bucket/output/")
70
+ ```
71
+
72
+ ## Patterns
73
+
74
+ ### Pattern 1: Optimal Partitioning
75
+
76
+ ```python
77
+ def calculate_partitions(data_size_gb: float, partition_size_mb: int = 128) -> int:
78
+ return max(int(data_size_gb * 1024 / partition_size_mb), 1)
79
+
80
+ # Repartition for even distribution
81
+ df_repartitioned = df.repartition(200, "partition_key")
82
+
83
+ # Coalesce to reduce partitions (no shuffle)
84
+ df_coalesced = df.coalesce(100)
85
+
86
+ # Partition pruning with predicate pushdown
87
+ df = (spark.read.parquet("s3://bucket/data/")
88
+ .filter(F.col("date") == "2024-01-01"))
89
+
90
+ # Write with partitioning for future queries
91
+ (df.write
92
+ .partitionBy("year", "month", "day")
93
+ .mode("overwrite")
94
+ .parquet("s3://bucket/partitioned_output/"))
95
+ ```
96
+
97
+ ### Pattern 2: Join Optimization
98
+
99
+ ```python
100
+ from pyspark.sql import functions as F
101
+
102
+ # 1. Broadcast Join - Small table joins
103
+ small_df = spark.read.parquet("s3://bucket/small_table/")
104
+ large_df = spark.read.parquet("s3://bucket/large_table/")
105
+
106
+ result = large_df.join(
107
+ F.broadcast(small_df),
108
+ on="key",
109
+ how="left"
110
+ )
111
+
112
+ # 2. Sort-Merge Join - Default for large tables
113
+ result = large_df1.join(large_df2, on="key", how="inner")
114
+
115
+ # 3. Bucket Join - Pre-sorted, no shuffle at join time
116
+ (df.write
117
+ .bucketBy(200, "customer_id")
118
+ .sortBy("customer_id")
119
+ .mode("overwrite")
120
+ .saveAsTable("bucketed_orders"))
121
+
122
+ orders = spark.table("bucketed_orders")
123
+ customers = spark.table("bucketed_customers")
124
+ result = orders.join(customers, on="customer_id")
125
+
126
+ # 4. Skew Join Handling
127
+ spark.conf.set("spark.sql.adaptive.skewJoin.enabled", "true")
128
+ spark.conf.set("spark.sql.adaptive.skewJoin.skewedPartitionFactor", "5")
129
+
130
+ # Manual salting for severe skew
131
+ def salt_join(df_skewed, df_other, key_col, num_salts=10):
132
+ df_salted = df_skewed.withColumn(
133
+ "salt",
134
+ (F.rand() * num_salts).cast("int")
135
+ ).withColumn(
136
+ "salted_key",
137
+ F.concat(F.col(key_col), F.lit("_"), F.col("salt"))
138
+ )
139
+
140
+ df_exploded = df_other.crossJoin(
141
+ spark.range(num_salts).withColumnRenamed("id", "salt")
142
+ ).withColumn(
143
+ "salted_key",
144
+ F.concat(F.col(key_col), F.lit("_"), F.col("salt"))
145
+ )
146
+
147
+ return df_salted.join(df_exploded, on="salted_key", how="inner")
148
+ ```
149
+
150
+ ### Pattern 3: Caching and Persistence
151
+
152
+ ```python
153
+ from pyspark import StorageLevel
154
+
155
+ # Cache when reusing DataFrame multiple times
156
+ df = spark.read.parquet("s3://bucket/data/")
157
+ df_filtered = df.filter(F.col("status") == "active")
158
+
159
+ # Cache in memory
160
+ df_filtered.cache()
161
+
162
+ # Or with specific storage level
163
+ df_filtered.persist(StorageLevel.MEMORY_AND_DISK)
164
+
165
+ # Force materialization
166
+ df_filtered.count()
167
+
168
+ # Use in multiple actions
169
+ agg1 = df_filtered.groupBy("category").count()
170
+ agg2 = df_filtered.groupBy("region").sum("amount")
171
+
172
+ # Unpersist when done
173
+ df_filtered.unpersist()
174
+
175
+ # Storage levels:
176
+ # MEMORY_ONLY - Fast, but may not fit
177
+ # MEMORY_AND_DISK - Spills to disk if needed (recommended)
178
+ # MEMORY_ONLY_SER - Serialized, less memory, more CPU (Scala/Java API only; PySpark always stores data serialized)
179
+ # DISK_ONLY - When memory is tight
180
+ # OFF_HEAP - Tungsten off-heap memory
181
+
182
+ # Checkpoint for complex lineage
183
+ spark.sparkContext.setCheckpointDir("s3://bucket/checkpoints/")
184
+ df_complex = df_complex.checkpoint()  # checkpoint() returns a new DataFrame with truncated lineage
185
+ ```
186
+
187
+ ### Pattern 4: Memory Tuning
188
+
189
+ ```python
190
+ # Executor memory configuration
191
+ spark = (SparkSession.builder
192
+ .config("spark.executor.memory", "8g")
193
+ .config("spark.executor.memoryOverhead", "2g")
194
+ .config("spark.memory.fraction", "0.6")
195
+ .config("spark.memory.storageFraction", "0.5")
196
+ .config("spark.sql.shuffle.partitions", "200")
197
+ .config("spark.sql.autoBroadcastJoinThreshold", "50MB")
198
+ .config("spark.sql.files.maxPartitionBytes", "128MB")
199
+ .getOrCreate())
200
+
201
+ # Memory breakdown (8GB executor):
202
+ # - spark.memory.fraction = 0.6 (60% of (8GB heap - 300MB reserved) ≈ 4.6GB for execution + storage)
203
+ # - spark.memory.storageFraction = 0.5 (50% of 4.6GB ≈ 2.3GB of cache protected from eviction)
204
+ # - Remaining ≈ 2.3GB for execution (shuffles, joins, sorts)
205
+ # - Remaining 40% ≈ 3.1GB for user data structures and internal metadata
206
+ ```
207
+
208
+ ### Pattern 5: Shuffle Optimization
209
+
210
+ ```python
211
+ # Reduce shuffle data size
212
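+ spark.conf.set("spark.sql.shuffle.partitions", "200")  # must be an integer in open-source Spark ("auto" is Databricks-only); AQE coalesces small partitions at runtime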
213
+ spark.conf.set("spark.shuffle.compress", "true")
214
+ spark.conf.set("spark.shuffle.spill.compress", "true")
215
+
216
+ # Pre-aggregate before shuffle
217
+ df_optimized = (df
218
+ .groupBy("key", "partition_col")
219
+ .agg(F.sum("value").alias("partial_sum"))
220
+ .groupBy("key")
221
+ .agg(F.sum("partial_sum").alias("total")))
222
+
223
+ # Use coalesce instead of repartition when reducing partitions
224
+ df_reduced = df.coalesce(10)
225
+
226
+ # Optimize shuffle with compression
227
+ spark.conf.set("spark.io.compression.codec", "lz4")
228
+ ```
229
+
230
+ ### Pattern 6: Data Format Optimization
231
+
232
+ ```python
233
+ # Parquet optimizations
234
+ (df.write
235
+ .option("compression", "snappy")
236
+ .option("parquet.block.size", 128 * 1024 * 1024)
237
+ .parquet("s3://bucket/output/"))
238
+
239
+ # Column pruning - only read needed columns
240
+ df = (spark.read.parquet("s3://bucket/data/")
241
+ .select("id", "amount", "date"))
242
+
243
+ # Delta Lake optimizations
244
+ (df.write
245
+ .format("delta")
246
+ .option("optimizeWrite", "true")
247
+ .option("autoCompact", "true")
248
+ .mode("overwrite")
249
+ .save("s3://bucket/delta_table/"))
250
+
251
+ # Z-ordering for multi-dimensional queries
252
+ spark.sql("""
253
+ OPTIMIZE delta.`s3://bucket/delta_table/`
254
+ ZORDER BY (customer_id, date)
255
+ """)
256
+ ```
257
+
258
+ ### Pattern 7: Monitoring and Debugging
259
+
260
+ ```python
261
+ # Keep whole-stage code generation enabled (the default); it fuses operators into a single generated function
262
+ spark.conf.set("spark.sql.codegen.wholeStage", "true")
263
+
264
+ # Explain query plan
265
+ df.explain(mode="extended")
266
+
267
+ # Monitor task metrics
268
+ def analyze_stage_metrics(spark):
269
+ status_tracker = spark.sparkContext.statusTracker()
270
+ for stage_id in status_tracker.getActiveStageIds():
271
+ stage_info = status_tracker.getStageInfo(stage_id)
272
+ print(f"Stage {stage_id}:")
273
+ print(f" Tasks: {stage_info.numTasks}")
274
+ print(f" Completed: {stage_info.numCompletedTasks}")
275
+
276
+ # Identify data skew
277
+ def check_partition_skew(df):
278
+ partition_counts = (df
279
+ .withColumn("partition_id", F.spark_partition_id())
280
+ .groupBy("partition_id")
281
+ .count()
282
+ .orderBy(F.desc("count")))
283
+
284
+ partition_counts.show(20)
285
+
286
+ stats = partition_counts.select(
287
+ F.min("count").alias("min"),
288
+ F.max("count").alias("max"),
289
+ F.avg("count").alias("avg"),
290
+ F.stddev("count").alias("stddev")
291
+ ).collect()[0]
292
+
293
+ skew_ratio = stats["max"] / stats["avg"]
294
+ print(f"Skew ratio: {skew_ratio:.2f}x (>2x indicates skew)")
295
+ ```
296
+
297
+ ## Configuration Cheat Sheet
298
+
299
+ ```python
300
+ spark_configs = {
301
+ # Adaptive Query Execution (AQE)
302
+ "spark.sql.adaptive.enabled": "true",
303
+ "spark.sql.adaptive.coalescePartitions.enabled": "true",
304
+ "spark.sql.adaptive.skewJoin.enabled": "true",
305
+
306
+ # Memory
307
+ "spark.executor.memory": "8g",
308
+ "spark.executor.memoryOverhead": "2g",
309
+ "spark.memory.fraction": "0.6",
310
+ "spark.memory.storageFraction": "0.5",
311
+
312
+ # Parallelism
313
+ "spark.sql.shuffle.partitions": "200",
314
+ "spark.default.parallelism": "200",
315
+
316
+ # Serialization
317
+ "spark.serializer": "org.apache.spark.serializer.KryoSerializer",
318
+
319
+ # Compression
320
+ "spark.io.compression.codec": "lz4",
321
+ "spark.shuffle.compress": "true",
322
+
323
+ # Broadcast
324
+ "spark.sql.autoBroadcastJoinThreshold": "50MB",
325
+
326
+ # File handling
327
+ "spark.sql.files.maxPartitionBytes": "128MB",
328
+ "spark.sql.files.openCostInBytes": "4MB",
329
+ }
330
+ ```
331
+
332
+ ## Best Practices
333
+
334
+ ### Do's
335
+
336
+ - **Enable AQE** - Adaptive query execution handles many issues
337
+ - **Use Parquet/Delta** - Columnar formats with compression
338
+ - **Broadcast small tables** - Avoid shuffle for small joins
339
+ - **Monitor Spark UI** - Check for skew, spills, GC
340
+ - **Right-size partitions** - 128MB - 256MB per partition
341
+
342
+ ### Don'ts
343
+
344
+ - **Don't collect large data** - Keep data distributed
345
+ - **Don't use UDFs unnecessarily** - Use built-in functions
346
+ - **Don't over-cache** - Memory is limited
347
+ - **Don't ignore data skew** - It dominates job time
348
+ - **Don't use `.count()` for existence** - Use `.take(1)` or `.isEmpty()`