saltmill-spark 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,268 @@
1
+ Metadata-Version: 2.4
2
+ Name: saltmill-spark
3
+ Version: 0.2.0
4
+ Summary: Efficient large-CSV processing for Apache Spark / Databricks with auto-salting, skew detection, and partition tuning
5
+ License: Apache-2.0
6
+ Project-URL: Homepage, https://github.com/yuvaraj-munirathinam/saltmill
7
+ Project-URL: Repository, https://github.com/yuvaraj-munirathinam/saltmill
8
+ Project-URL: Issues, https://github.com/yuvaraj-munirathinam/saltmill/issues
9
+ Keywords: pyspark,databricks,csv,big-data,partitioning,salting,skew
10
+ Classifier: Development Status :: 4 - Beta
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: Intended Audience :: Science/Research
13
+ Classifier: License :: OSI Approved :: Apache Software License
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
19
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
20
+ Requires-Python: >=3.10
21
+ Description-Content-Type: text/markdown
22
+ Provides-Extra: databricks
23
+ Requires-Dist: databricks-sdk>=0.12; extra == "databricks"
24
+ Provides-Extra: dev
25
+ Requires-Dist: pytest>=8; extra == "dev"
26
+ Requires-Dist: pytest-asyncio; extra == "dev"
27
+ Requires-Dist: pyspark>=3.4; extra == "dev"
28
+ Requires-Dist: delta-spark>=2.4; extra == "dev"
29
+ Requires-Dist: ruff; extra == "dev"
30
+ Requires-Dist: mypy; extra == "dev"
31
+
32
+ # saltmill
33
+
34
+ **Efficient large CSV processing for PySpark and Databricks.**
35
+
36
+ saltmill automatically computes optimal salt buckets, partition counts, and Spark configuration for processing CSV files of any size — from a single API call.
37
+
38
+ ---
39
+
40
+ ## The problem
41
+
42
+ Reading a 500 GB CSV file naively in Spark causes data skew, memory pressure, and slow shuffles. Fixing it requires manually tuning:
43
+
44
+ - Salt bucket count
45
+ - Repartition strategy
46
+ - `spark.sql.shuffle.partitions`
47
+ - `spark.sql.files.maxPartitionBytes`
48
+ - Databricks Delta write optimizations
49
+
50
+ saltmill does all of this automatically, based on file size and cluster parallelism.
51
+
52
+ ## Installation
53
+
54
+ ```bash
55
+ pip install saltmill-spark
56
+ ```
57
+
58
+ > PySpark is a peer dependency — saltmill works with whatever version your cluster runs.
59
+
60
+ ## Quick start
61
+
62
+ ```python
63
+ import saltmill
64
+
65
+ df = saltmill.read(spark, "s3://my-bucket/large.csv")
66
+ ```
67
+
68
+ That's it. saltmill will:
69
+ 1. Detect file size via Hadoop FileSystem
70
+ 2. Auto-compute salt buckets and partition count
71
+ 3. Apply optimized Spark configs (shuffle partitions, maxPartitionBytes, Delta settings on Databricks)
72
+ 4. Infer schema from a 0.1% sample of the first file
73
+ 5. Read and return a well-partitioned DataFrame
74
+
75
+ ## Usage
76
+
77
+ ### Module-level function (simplest)
78
+
79
+ ```python
80
+ import saltmill
81
+
82
+ df = saltmill.read(spark, "s3://bucket/huge.csv")
83
+ ```
84
+
85
+ ### Class-based (more control)
86
+
87
+ ```python
88
+ from saltmill import SaltMill
89
+
90
+ sm = SaltMill(spark, workers=32)
91
+ df = sm.read("s3://bucket/huge.csv")
92
+ ```
93
+
94
+ ### Multiple files
95
+
96
+ ```python
97
+ df = saltmill.read(
98
+ spark,
99
+ ["s3://bucket/2024-01.csv", "s3://bucket/2024-02.csv"],
100
+ hint_size_gb=500,
101
+ )
102
+ ```
103
+
104
+ ### With explicit schema
105
+
106
+ ```python
107
+ df = saltmill.read(
108
+ spark,
109
+ "s3://bucket/sales.csv",
110
+ schema={
111
+ "order_id": "long",
112
+ "region": "string",
113
+ "amount": "double",
114
+ "created_at": "timestamp",
115
+ },
116
+ partition_col="region",
117
+ )
118
+ ```
119
+
120
+ ### With a PySpark StructType
121
+
122
+ ```python
123
+ from pyspark.sql.types import StructType, StructField, LongType, StringType
124
+
125
+ schema = StructType([
126
+ StructField("id", LongType(), True),
127
+ StructField("name", StringType(), True),
128
+ ])
129
+
130
+ df = saltmill.read(spark, "s3://bucket/data.csv", schema=schema)
131
+ ```
132
+
133
+ ### Preview tuning parameters without reading
134
+
135
+ ```python
136
+ sm = SaltMill(spark)
137
+ params = sm.tune("s3://bucket/huge.csv", hint_size_gb=500)
138
+ print(params.summary())
139
+ # saltmill tuning → file: 500.0 GB, workers: 64, salt_buckets: 64,
140
+ # partitions: 640, maxPartitionBytes: 64 MB
141
+ ```
142
+
143
+ ### Write to Delta Lake
144
+
145
+ ```python
146
+ sm = SaltMill(spark)
147
+ df = sm.read("s3://bucket/huge.csv", partition_col="region")
148
+ sm.write_delta(df, "s3://bucket/delta/sales", partition_by="region")
149
+ ```
150
+
151
+ ## How it works
152
+
153
+ ### Salting
154
+
155
+ saltmill assigns each row a salt bucket deterministically, derived from a monotonically increasing row ID:
156
+
157
+ ```python
158
+ df.withColumn("_salt", pmod(monotonically_increasing_id(), salt_buckets))
159
+ .repartition(num_partitions, partition_col, "_salt")
160
+ .drop("_salt")
161
+ ```
162
+
163
+ This spreads the rows of a hot key across multiple partitions, mitigating data skew even when a join or group-by column is highly imbalanced.
164
+
165
+ ### Auto-tuning formula
166
+
167
+ | Input | Rule |
168
+ |---|---|
169
+ | File size | 1 salt bucket per 8 GB, rounded to nearest power of 2 |
170
+ | Salt buckets | Clamped to [8, 512] |
171
+ | Partitions | `salt_buckets × 10`, rounded up to nearest multiple of worker count |
172
+ | maxPartitionBytes | 64 MB (half of Spark's 128 MB default, for smaller input splits) |
173
+
174
+ ### Example: 500 GB file, 64 workers
175
+
176
+ ```
177
+ file_size_gb = 500
178
+ salt_buckets = round_pow2(500 / 8) = round_pow2(62.5) = 64
179
+ num_partitions = 64 × 10 = 640 (already a multiple of 64)
180
+ shuffle_partitions = 640
181
+ maxPartitionBytes = 64 MB
182
+ ```
183
+
184
+ This matches the pattern proven in production:
185
+
186
+ ```python
187
+ # What saltmill does internally
188
+ spark.conf.set("spark.sql.shuffle.partitions", 640)
189
+ spark.conf.set("spark.sql.files.maxPartitionBytes", 64 * 1024 * 1024)
190
+
191
+ dfw = (
192
+ df.withColumn("_salt", pmod(monotonically_increasing_id(), 64))
193
+ .repartition(640, "region", "_salt")
194
+ .drop("_salt")
195
+ )
196
+ ```
197
+
198
+ ### Databricks-specific settings
199
+
200
+ When running on Databricks, saltmill also sets:
201
+
202
+ ```
203
+ spark.databricks.delta.optimizeWrite.enabled = true
204
+ spark.databricks.delta.autoCompact.enabled = true
205
+ ```
206
+
207
+ ## Schema dict shorthand
208
+
209
+ | Alias | Spark type |
210
+ |---|---|
211
+ | `"str"`, `"string"` | StringType |
212
+ | `"int"`, `"integer"` | IntegerType |
213
+ | `"long"`, `"bigint"` | LongType |
214
+ | `"float"` | FloatType |
215
+ | `"double"` | DoubleType |
216
+ | `"bool"`, `"boolean"` | BooleanType |
217
+ | `"date"` | DateType |
218
+ | `"timestamp"` | TimestampType |
219
+ | `"decimal"` | DecimalType(38,10) |
220
+
221
+ Any Spark SQL type string is also accepted directly (e.g. `"decimal(10,2)"`).
222
+
223
+ ## API reference
224
+
225
+ ### `saltmill.read(spark, paths, *, schema, partition_col, workers, salt_buckets, num_partitions, hint_size_gb, delimiter, encoding, null_value, verbose)`
226
+
227
+ Module-level convenience function. See class docs for parameter details.
228
+
229
+ ### `SaltMill(spark, *, workers, verbose)`
230
+
231
+ | Parameter | Type | Default | Description |
232
+ |---|---|---|---|
233
+ | `spark` | SparkSession | required | Active session |
234
+ | `workers` | int | auto-detected | Worker node count |
235
+ | `verbose` | bool | True | Print tuning summary |
236
+
237
+ ### `SaltMill.read(paths, *, schema, partition_col, salt_buckets, num_partitions, hint_size_gb, delimiter, encoding, null_value)`
238
+
239
+ | Parameter | Type | Default | Description |
240
+ |---|---|---|---|
241
+ | `paths` | str or list | required | CSV path(s) |
242
+ | `schema` | StructType / dict / None | None | Schema or auto-infer |
243
+ | `partition_col` | str or list | None | Extra partition key(s) |
244
+ | `salt_buckets` | int | auto | Override salt bucket count |
245
+ | `num_partitions` | int | auto | Override total partitions |
246
+ | `hint_size_gb` | float | None | File size hint (when detection fails) |
247
+ | `delimiter` | str | `","` | CSV separator |
248
+ | `encoding` | str | `"UTF-8"` | File encoding |
249
+ | `null_value` | str | `""` | Null string |
250
+
251
+ ### `SaltMill.tune(paths, *, salt_buckets, num_partitions, hint_size_gb) → TuningParams`
252
+
253
+ Returns computed tuning parameters without reading any data.
254
+
255
+ ### `SaltMill.write_delta(df, path, *, partition_by, mode)`
256
+
257
+ Writes a DataFrame to Delta Lake with Databricks-optimized settings.
258
+
259
+ ## Development
260
+
261
+ ```bash
262
+ pip install -e ".[dev]"
263
+ pytest
264
+ ```
265
+
266
+ ## License
267
+
268
+ Apache 2.0
@@ -0,0 +1,237 @@
1
+ # saltmill
2
+
3
+ **Efficient large CSV processing for PySpark and Databricks.**
4
+
5
+ saltmill automatically computes optimal salt buckets, partition counts, and Spark configuration for processing CSV files of any size — from a single API call.
6
+
7
+ ---
8
+
9
+ ## The problem
10
+
11
+ Reading a 500 GB CSV file naively in Spark causes data skew, memory pressure, and slow shuffles. Fixing it requires manually tuning:
12
+
13
+ - Salt bucket count
14
+ - Repartition strategy
15
+ - `spark.sql.shuffle.partitions`
16
+ - `spark.sql.files.maxPartitionBytes`
17
+ - Databricks Delta write optimizations
18
+
19
+ saltmill does all of this automatically, based on file size and cluster parallelism.
20
+
21
+ ## Installation
22
+
23
+ ```bash
24
+ pip install saltmill-spark
25
+ ```
26
+
27
+ > PySpark is a peer dependency — saltmill works with whatever version your cluster runs.
28
+
29
+ ## Quick start
30
+
31
+ ```python
32
+ import saltmill
33
+
34
+ df = saltmill.read(spark, "s3://my-bucket/large.csv")
35
+ ```
36
+
37
+ That's it. saltmill will:
38
+ 1. Detect file size via Hadoop FileSystem
39
+ 2. Auto-compute salt buckets and partition count
40
+ 3. Apply optimized Spark configs (shuffle partitions, maxPartitionBytes, Delta settings on Databricks)
41
+ 4. Infer schema from a 0.1% sample of the first file
42
+ 5. Read and return a well-partitioned DataFrame
43
+
44
+ ## Usage
45
+
46
+ ### Module-level function (simplest)
47
+
48
+ ```python
49
+ import saltmill
50
+
51
+ df = saltmill.read(spark, "s3://bucket/huge.csv")
52
+ ```
53
+
54
+ ### Class-based (more control)
55
+
56
+ ```python
57
+ from saltmill import SaltMill
58
+
59
+ sm = SaltMill(spark, workers=32)
60
+ df = sm.read("s3://bucket/huge.csv")
61
+ ```
62
+
63
+ ### Multiple files
64
+
65
+ ```python
66
+ df = saltmill.read(
67
+ spark,
68
+ ["s3://bucket/2024-01.csv", "s3://bucket/2024-02.csv"],
69
+ hint_size_gb=500,
70
+ )
71
+ ```
72
+
73
+ ### With explicit schema
74
+
75
+ ```python
76
+ df = saltmill.read(
77
+ spark,
78
+ "s3://bucket/sales.csv",
79
+ schema={
80
+ "order_id": "long",
81
+ "region": "string",
82
+ "amount": "double",
83
+ "created_at": "timestamp",
84
+ },
85
+ partition_col="region",
86
+ )
87
+ ```
88
+
89
+ ### With a PySpark StructType
90
+
91
+ ```python
92
+ from pyspark.sql.types import StructType, StructField, LongType, StringType
93
+
94
+ schema = StructType([
95
+ StructField("id", LongType(), True),
96
+ StructField("name", StringType(), True),
97
+ ])
98
+
99
+ df = saltmill.read(spark, "s3://bucket/data.csv", schema=schema)
100
+ ```
101
+
102
+ ### Preview tuning parameters without reading
103
+
104
+ ```python
105
+ sm = SaltMill(spark)
106
+ params = sm.tune("s3://bucket/huge.csv", hint_size_gb=500)
107
+ print(params.summary())
108
+ # saltmill tuning → file: 500.0 GB, workers: 64, salt_buckets: 64,
109
+ # partitions: 640, maxPartitionBytes: 64 MB
110
+ ```
111
+
112
+ ### Write to Delta Lake
113
+
114
+ ```python
115
+ sm = SaltMill(spark)
116
+ df = sm.read("s3://bucket/huge.csv", partition_col="region")
117
+ sm.write_delta(df, "s3://bucket/delta/sales", partition_by="region")
118
+ ```
119
+
120
+ ## How it works
121
+
122
+ ### Salting
123
+
124
+ saltmill assigns each row a salt bucket deterministically, derived from a monotonically increasing row ID:
125
+
126
+ ```python
127
+ df.withColumn("_salt", pmod(monotonically_increasing_id(), salt_buckets))
128
+ .repartition(num_partitions, partition_col, "_salt")
129
+ .drop("_salt")
130
+ ```
131
+
132
+ This spreads the rows of a hot key across multiple partitions, mitigating data skew even when a join or group-by column is highly imbalanced.
133
+
134
+ ### Auto-tuning formula
135
+
136
+ | Input | Rule |
137
+ |---|---|
138
+ | File size | 1 salt bucket per 8 GB, rounded to nearest power of 2 |
139
+ | Salt buckets | Clamped to [8, 512] |
140
+ | Partitions | `salt_buckets × 10`, rounded up to nearest multiple of worker count |
141
+ | maxPartitionBytes | 64 MB (half of Spark's 128 MB default, for smaller input splits) |
142
+
143
+ ### Example: 500 GB file, 64 workers
144
+
145
+ ```
146
+ file_size_gb = 500
147
+ salt_buckets = round_pow2(500 / 8) = round_pow2(62.5) = 64
148
+ num_partitions = 64 × 10 = 640 (already a multiple of 64)
149
+ shuffle_partitions = 640
150
+ maxPartitionBytes = 64 MB
151
+ ```
152
+
153
+ This matches the pattern proven in production:
154
+
155
+ ```python
156
+ # What saltmill does internally
157
+ spark.conf.set("spark.sql.shuffle.partitions", 640)
158
+ spark.conf.set("spark.sql.files.maxPartitionBytes", 64 * 1024 * 1024)
159
+
160
+ dfw = (
161
+ df.withColumn("_salt", pmod(monotonically_increasing_id(), 64))
162
+ .repartition(640, "region", "_salt")
163
+ .drop("_salt")
164
+ )
165
+ ```
166
+
167
+ ### Databricks-specific settings
168
+
169
+ When running on Databricks, saltmill also sets:
170
+
171
+ ```
172
+ spark.databricks.delta.optimizeWrite.enabled = true
173
+ spark.databricks.delta.autoCompact.enabled = true
174
+ ```
175
+
176
+ ## Schema dict shorthand
177
+
178
+ | Alias | Spark type |
179
+ |---|---|
180
+ | `"str"`, `"string"` | StringType |
181
+ | `"int"`, `"integer"` | IntegerType |
182
+ | `"long"`, `"bigint"` | LongType |
183
+ | `"float"` | FloatType |
184
+ | `"double"` | DoubleType |
185
+ | `"bool"`, `"boolean"` | BooleanType |
186
+ | `"date"` | DateType |
187
+ | `"timestamp"` | TimestampType |
188
+ | `"decimal"` | DecimalType(38,10) |
189
+
190
+ Any Spark SQL type string is also accepted directly (e.g. `"decimal(10,2)"`).
191
+
192
+ ## API reference
193
+
194
+ ### `saltmill.read(spark, paths, *, schema, partition_col, workers, salt_buckets, num_partitions, hint_size_gb, delimiter, encoding, null_value, verbose)`
195
+
196
+ Module-level convenience function. See class docs for parameter details.
197
+
198
+ ### `SaltMill(spark, *, workers, verbose)`
199
+
200
+ | Parameter | Type | Default | Description |
201
+ |---|---|---|---|
202
+ | `spark` | SparkSession | required | Active session |
203
+ | `workers` | int | auto-detected | Worker node count |
204
+ | `verbose` | bool | True | Print tuning summary |
205
+
206
+ ### `SaltMill.read(paths, *, schema, partition_col, salt_buckets, num_partitions, hint_size_gb, delimiter, encoding, null_value)`
207
+
208
+ | Parameter | Type | Default | Description |
209
+ |---|---|---|---|
210
+ | `paths` | str or list | required | CSV path(s) |
211
+ | `schema` | StructType / dict / None | None | Schema or auto-infer |
212
+ | `partition_col` | str or list | None | Extra partition key(s) |
213
+ | `salt_buckets` | int | auto | Override salt bucket count |
214
+ | `num_partitions` | int | auto | Override total partitions |
215
+ | `hint_size_gb` | float | None | File size hint (when detection fails) |
216
+ | `delimiter` | str | `","` | CSV separator |
217
+ | `encoding` | str | `"UTF-8"` | File encoding |
218
+ | `null_value` | str | `""` | Null string |
219
+
220
+ ### `SaltMill.tune(paths, *, salt_buckets, num_partitions, hint_size_gb) → TuningParams`
221
+
222
+ Returns computed tuning parameters without reading any data.
223
+
224
+ ### `SaltMill.write_delta(df, path, *, partition_by, mode)`
225
+
226
+ Writes a DataFrame to Delta Lake with Databricks-optimized settings.
227
+
228
+ ## Development
229
+
230
+ ```bash
231
+ pip install -e ".[dev]"
232
+ pytest
233
+ ```
234
+
235
+ ## License
236
+
237
+ Apache 2.0
@@ -0,0 +1,59 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "saltmill-spark"
7
+ version = "0.2.0"
8
+ description = "Efficient large-CSV processing for Apache Spark / Databricks with auto-salting, skew detection, and partition tuning"
9
+ readme = "README.md"
10
+ license = { text = "Apache-2.0" }
11
+ requires-python = ">=3.10"
12
+ keywords = ["pyspark", "databricks", "csv", "big-data", "partitioning", "salting", "skew"]
13
+ classifiers = [
14
+ "Development Status :: 4 - Beta",
15
+ "Intended Audience :: Developers",
16
+ "Intended Audience :: Science/Research",
17
+ "License :: OSI Approved :: Apache Software License",
18
+ "Programming Language :: Python :: 3",
19
+ "Programming Language :: Python :: 3.10",
20
+ "Programming Language :: Python :: 3.11",
21
+ "Programming Language :: Python :: 3.12",
22
+ "Topic :: Scientific/Engineering :: Information Analysis",
23
+ "Topic :: Software Development :: Libraries :: Python Modules",
24
+ ]
25
+
26
+ # pyspark is a peer dependency — users bring their own cluster version
27
+ dependencies = []
28
+
29
+ [project.optional-dependencies]
30
+ databricks = ["databricks-sdk>=0.12"]
31
+ dev = [
32
+ "pytest>=8",
33
+ "pytest-asyncio",
34
+ "pyspark>=3.4",
35
+ "delta-spark>=2.4",
36
+ "ruff",
37
+ "mypy",
38
+ ]
39
+
40
+ [project.urls]
41
+ Homepage = "https://github.com/yuvaraj-munirathinam/saltmill"
42
+ Repository = "https://github.com/yuvaraj-munirathinam/saltmill"
43
+ Issues = "https://github.com/yuvaraj-munirathinam/saltmill/issues"
44
+
45
+ [tool.setuptools.packages.find]
46
+ where = ["."]
47
+ include = ["saltmill*"]
48
+
49
+ [tool.pytest.ini_options]
50
+ testpaths = ["tests"]
51
+ python_files = ["test_*.py"]
52
+
53
+ [tool.ruff]
54
+ line-length = 100
55
+ target-version = "py310"
56
+
57
+ [tool.mypy]
58
+ strict = true
59
+ ignore_missing_imports = true
@@ -0,0 +1,60 @@
1
+ """
2
+ saltmill — Efficient large-CSV processing for Apache Spark / Databricks.
3
+
4
+ Auto-detects skew, tunes salt buckets, partition keys, and Spark config
5
+ for 500GB+ CSV files.
6
+
7
+ Quick start (simple API)::
8
+
9
+ import saltmill
10
+ df = saltmill.read(spark, "abfss://container@account.dfs.core.windows.net/data/large.csv")
11
+
12
+ Advanced API (full pipeline with write)::
13
+
14
+ from saltmill import SaltmillProcessor, SaltmillConfig
15
+
16
+ result = SaltmillProcessor(SaltmillConfig(
17
+ input_path="abfss://raw@account.dfs.core.windows.net/data/*.csv",
18
+ output_path="abfss://curated@account.dfs.core.windows.net/output/delta/",
19
+ )).process()
20
+ """
21
+
22
+ from saltmill._version import __version__
23
+
24
+ # ── Advanced API ──────────────────────────────────────────────────────────────
25
+ from saltmill.config import CompressionCodec, SaltmillConfig, WriteFormat
26
+ from saltmill.exceptions import (
27
+ CheckpointError,
28
+ ConfigurationError,
29
+ SaltmillError,
30
+ SchemaInferenceError,
31
+ SkewDetectionError,
32
+ UnsupportedPathError,
33
+ )
34
+ from saltmill.models import PartitionPlan, ProcessingResult, SchemaInfo, SkewReport
35
+ from saltmill.processor import SaltmillProcessor
36
+
37
+ # ── Simple backward-compatible API ────────────────────────────────────────────
38
+ from saltmill.compat import SaltMill, read
39
+
40
+ __all__ = [
41
+ # Advanced API
42
+ "SaltmillProcessor",
43
+ "SaltmillConfig",
44
+ "WriteFormat",
45
+ "CompressionCodec",
46
+ "ProcessingResult",
47
+ "PartitionPlan",
48
+ "SchemaInfo",
49
+ "SkewReport",
50
+ "SaltmillError",
51
+ "ConfigurationError",
52
+ "SchemaInferenceError",
53
+ "SkewDetectionError",
54
+ "CheckpointError",
55
+ "UnsupportedPathError",
56
+ # Simple API
57
+ "SaltMill",
58
+ "read",
59
+ "__version__",
60
+ ]
@@ -0,0 +1 @@
1
+ __version__ = "0.2.0"