saltmill-spark 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- saltmill_spark-0.2.0/PKG-INFO +268 -0
- saltmill_spark-0.2.0/README.md +237 -0
- saltmill_spark-0.2.0/pyproject.toml +59 -0
- saltmill_spark-0.2.0/saltmill/__init__.py +60 -0
- saltmill_spark-0.2.0/saltmill/_version.py +1 -0
- saltmill_spark-0.2.0/saltmill/cardinality.py +89 -0
- saltmill_spark-0.2.0/saltmill/checkpoint.py +126 -0
- saltmill_spark-0.2.0/saltmill/compat.py +210 -0
- saltmill_spark-0.2.0/saltmill/config.py +137 -0
- saltmill_spark-0.2.0/saltmill/exceptions.py +25 -0
- saltmill_spark-0.2.0/saltmill/models.py +49 -0
- saltmill_spark-0.2.0/saltmill/processor.py +250 -0
- saltmill_spark-0.2.0/saltmill/progress.py +51 -0
- saltmill_spark-0.2.0/saltmill/py.typed +0 -0
- saltmill_spark-0.2.0/saltmill/reader.py +60 -0
- saltmill_spark-0.2.0/saltmill/salter.py +62 -0
- saltmill_spark-0.2.0/saltmill/schema.py +150 -0
- saltmill_spark-0.2.0/saltmill/skew.py +114 -0
- saltmill_spark-0.2.0/saltmill/spark_conf.py +69 -0
- saltmill_spark-0.2.0/saltmill/writer.py +65 -0
- saltmill_spark-0.2.0/saltmill_spark.egg-info/PKG-INFO +268 -0
- saltmill_spark-0.2.0/saltmill_spark.egg-info/SOURCES.txt +27 -0
- saltmill_spark-0.2.0/saltmill_spark.egg-info/dependency_links.txt +1 -0
- saltmill_spark-0.2.0/saltmill_spark.egg-info/requires.txt +11 -0
- saltmill_spark-0.2.0/saltmill_spark.egg-info/top_level.txt +1 -0
- saltmill_spark-0.2.0/setup.cfg +4 -0
- saltmill_spark-0.2.0/tests/test_config.py +86 -0
- saltmill_spark-0.2.0/tests/test_schema.py +28 -0
- saltmill_spark-0.2.0/tests/test_skew.py +70 -0
|
@@ -0,0 +1,268 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: saltmill-spark
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: Efficient large-CSV processing for Apache Spark / Databricks with auto-salting, skew detection, and partition tuning
|
|
5
|
+
License: Apache-2.0
|
|
6
|
+
Project-URL: Homepage, https://github.com/yuvaraj-munirathinam/saltmill
|
|
7
|
+
Project-URL: Repository, https://github.com/yuvaraj-munirathinam/saltmill
|
|
8
|
+
Project-URL: Issues, https://github.com/yuvaraj-munirathinam/saltmill/issues
|
|
9
|
+
Keywords: pyspark,databricks,csv,big-data,partitioning,salting,skew
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: Intended Audience :: Science/Research
|
|
13
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
19
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
20
|
+
Requires-Python: >=3.10
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
Provides-Extra: databricks
|
|
23
|
+
Requires-Dist: databricks-sdk>=0.12; extra == "databricks"
|
|
24
|
+
Provides-Extra: dev
|
|
25
|
+
Requires-Dist: pytest>=8; extra == "dev"
|
|
26
|
+
Requires-Dist: pytest-asyncio; extra == "dev"
|
|
27
|
+
Requires-Dist: pyspark>=3.4; extra == "dev"
|
|
28
|
+
Requires-Dist: delta-spark>=2.4; extra == "dev"
|
|
29
|
+
Requires-Dist: ruff; extra == "dev"
|
|
30
|
+
Requires-Dist: mypy; extra == "dev"
|
|
31
|
+
|
|
32
|
+
# saltmill
|
|
33
|
+
|
|
34
|
+
**Efficient large CSV processing for PySpark and Databricks.**
|
|
35
|
+
|
|
36
|
+
saltmill automatically computes optimal salt buckets, partition counts, and Spark configuration for processing CSV files of any size — from a single API call.
|
|
37
|
+
|
|
38
|
+
---
|
|
39
|
+
|
|
40
|
+
## The problem
|
|
41
|
+
|
|
42
|
+
Reading a 500 GB CSV file naively in Spark causes data skew, memory pressure, and slow shuffles. Fixing it requires manually tuning:
|
|
43
|
+
|
|
44
|
+
- Salt bucket count
|
|
45
|
+
- Repartition strategy
|
|
46
|
+
- `spark.sql.shuffle.partitions`
|
|
47
|
+
- `spark.sql.files.maxPartitionBytes`
|
|
48
|
+
- Databricks Delta write optimizations
|
|
49
|
+
|
|
50
|
+
saltmill does all of this automatically, based on file size and cluster parallelism.
|
|
51
|
+
|
|
52
|
+
## Installation
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
pip install saltmill-spark
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
> PySpark is a peer dependency — saltmill works with whatever version your cluster runs.
|
|
59
|
+
|
|
60
|
+
## Quick start
|
|
61
|
+
|
|
62
|
+
```python
|
|
63
|
+
import saltmill
|
|
64
|
+
|
|
65
|
+
df = saltmill.read(spark, "s3://my-bucket/large.csv")
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
That's it. saltmill will:
|
|
69
|
+
1. Detect file size via Hadoop FileSystem
|
|
70
|
+
2. Auto-compute salt buckets and partition count
|
|
71
|
+
3. Apply optimized Spark configs (shuffle partitions, maxPartitionBytes, Delta settings on Databricks)
|
|
72
|
+
4. Infer schema from a 0.1% sample of the first file
|
|
73
|
+
5. Read and return a well-partitioned DataFrame
|
|
74
|
+
|
|
75
|
+
## Usage
|
|
76
|
+
|
|
77
|
+
### Module-level function (simplest)
|
|
78
|
+
|
|
79
|
+
```python
|
|
80
|
+
import saltmill
|
|
81
|
+
|
|
82
|
+
df = saltmill.read(spark, "s3://bucket/huge.csv")
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
### Class-based (more control)
|
|
86
|
+
|
|
87
|
+
```python
|
|
88
|
+
from saltmill import SaltMill
|
|
89
|
+
|
|
90
|
+
sm = SaltMill(spark, workers=32)
|
|
91
|
+
df = sm.read("s3://bucket/huge.csv")
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
### Multiple files
|
|
95
|
+
|
|
96
|
+
```python
|
|
97
|
+
df = saltmill.read(
|
|
98
|
+
spark,
|
|
99
|
+
["s3://bucket/2024-01.csv", "s3://bucket/2024-02.csv"],
|
|
100
|
+
hint_size_gb=500,
|
|
101
|
+
)
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
### With explicit schema
|
|
105
|
+
|
|
106
|
+
```python
|
|
107
|
+
df = saltmill.read(
|
|
108
|
+
spark,
|
|
109
|
+
"s3://bucket/sales.csv",
|
|
110
|
+
schema={
|
|
111
|
+
"order_id": "long",
|
|
112
|
+
"region": "string",
|
|
113
|
+
"amount": "double",
|
|
114
|
+
"created_at": "timestamp",
|
|
115
|
+
},
|
|
116
|
+
partition_col="region",
|
|
117
|
+
)
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
### With a PySpark StructType
|
|
121
|
+
|
|
122
|
+
```python
|
|
123
|
+
from pyspark.sql.types import StructType, StructField, LongType, StringType
|
|
124
|
+
|
|
125
|
+
schema = StructType([
|
|
126
|
+
StructField("id", LongType(), True),
|
|
127
|
+
StructField("name", StringType(), True),
|
|
128
|
+
])
|
|
129
|
+
|
|
130
|
+
df = saltmill.read(spark, "s3://bucket/data.csv", schema=schema)
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
### Preview tuning parameters without reading
|
|
134
|
+
|
|
135
|
+
```python
|
|
136
|
+
sm = SaltMill(spark)
|
|
137
|
+
params = sm.tune("s3://bucket/huge.csv", hint_size_gb=500)
|
|
138
|
+
print(params.summary())
|
|
139
|
+
# saltmill tuning → file: 500.0 GB, workers: 64, salt_buckets: 64,
|
|
140
|
+
# partitions: 640, maxPartitionBytes: 64 MB
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
### Write to Delta Lake
|
|
144
|
+
|
|
145
|
+
```python
|
|
146
|
+
sm = SaltMill(spark)
|
|
147
|
+
df = sm.read("s3://bucket/huge.csv", partition_col="region")
|
|
148
|
+
sm.write_delta(df, "s3://bucket/delta/sales", partition_by="region")
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
## How it works
|
|
152
|
+
|
|
153
|
+
### Salting
|
|
154
|
+
|
|
155
|
+
saltmill assigns each row a salt bucket derived from its row ID (deterministic, not random) using:
|
|
156
|
+
|
|
157
|
+
```python
|
|
158
|
+
df.withColumn("_salt", pmod(monotonically_increasing_id(), salt_buckets))
|
|
159
|
+
.repartition(num_partitions, partition_col, "_salt")
|
|
160
|
+
.drop("_salt")
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
This breaks data skew even when a join or group-by column is highly imbalanced.
|
|
164
|
+
|
|
165
|
+
### Auto-tuning formula
|
|
166
|
+
|
|
167
|
+
| Input | Rule |
|
|
168
|
+
|---|---|
|
|
169
|
+
| File size | 1 salt bucket per 8 GB, rounded to nearest power of 2 |
|
|
170
|
+
| Salt buckets | Clamped to [8, 512] |
|
|
171
|
+
| Partitions | `salt_buckets × 10`, rounded up to nearest multiple of worker count |
|
|
172
|
+
| maxPartitionBytes | 64 MB (smaller than Spark's 128 MB default, yielding more input splits) |
|
|
173
|
+
|
|
174
|
+
### Example: 500 GB file, 64 workers
|
|
175
|
+
|
|
176
|
+
```
|
|
177
|
+
file_size_gb = 500
|
|
178
|
+
salt_buckets = round_pow2(500 / 8) = round_pow2(62.5) = 64
|
|
179
|
+
num_partitions = 64 × 10 = 640 (already a multiple of 64)
|
|
180
|
+
shuffle_partitions = 640
|
|
181
|
+
maxPartitionBytes = 64 MB
|
|
182
|
+
```
|
|
183
|
+
|
|
184
|
+
This matches the pattern proven in production:
|
|
185
|
+
|
|
186
|
+
```python
|
|
187
|
+
# What saltmill does internally
|
|
188
|
+
spark.conf.set("spark.sql.shuffle.partitions", 640)
|
|
189
|
+
spark.conf.set("spark.sql.files.maxPartitionBytes", 64 * 1024 * 1024)
|
|
190
|
+
|
|
191
|
+
dfw = (
|
|
192
|
+
df.withColumn("_salt", pmod(monotonically_increasing_id(), 64))
|
|
193
|
+
.repartition(640, "region", "_salt")
|
|
194
|
+
.drop("_salt")
|
|
195
|
+
)
|
|
196
|
+
```
|
|
197
|
+
|
|
198
|
+
### Databricks-specific settings
|
|
199
|
+
|
|
200
|
+
When running on Databricks, saltmill also sets:
|
|
201
|
+
|
|
202
|
+
```
|
|
203
|
+
spark.databricks.delta.optimizeWrite.enabled = true
|
|
204
|
+
spark.databricks.delta.autoCompact.enabled = true
|
|
205
|
+
```
|
|
206
|
+
|
|
207
|
+
## Schema dict shorthand
|
|
208
|
+
|
|
209
|
+
| Alias | Spark type |
|
|
210
|
+
|---|---|
|
|
211
|
+
| `"str"`, `"string"` | StringType |
|
|
212
|
+
| `"int"`, `"integer"` | IntegerType |
|
|
213
|
+
| `"long"`, `"bigint"` | LongType |
|
|
214
|
+
| `"float"` | FloatType |
|
|
215
|
+
| `"double"` | DoubleType |
|
|
216
|
+
| `"bool"`, `"boolean"` | BooleanType |
|
|
217
|
+
| `"date"` | DateType |
|
|
218
|
+
| `"timestamp"` | TimestampType |
|
|
219
|
+
| `"decimal"` | DecimalType(38,10) |
|
|
220
|
+
|
|
221
|
+
Any Spark SQL type string is also accepted directly (e.g. `"decimal(10,2)"`).
|
|
222
|
+
|
|
223
|
+
## API reference
|
|
224
|
+
|
|
225
|
+
### `saltmill.read(spark, paths, *, schema, partition_col, workers, salt_buckets, num_partitions, hint_size_gb, delimiter, encoding, null_value, verbose)`
|
|
226
|
+
|
|
227
|
+
Module-level convenience function. See class docs for parameter details.
|
|
228
|
+
|
|
229
|
+
### `SaltMill(spark, *, workers, verbose)`
|
|
230
|
+
|
|
231
|
+
| Parameter | Type | Default | Description |
|
|
232
|
+
|---|---|---|---|
|
|
233
|
+
| `spark` | SparkSession | required | Active session |
|
|
234
|
+
| `workers` | int | auto-detected | Worker node count |
|
|
235
|
+
| `verbose` | bool | True | Print tuning summary |
|
|
236
|
+
|
|
237
|
+
### `SaltMill.read(paths, *, schema, partition_col, salt_buckets, num_partitions, hint_size_gb, delimiter, encoding, null_value)`
|
|
238
|
+
|
|
239
|
+
| Parameter | Type | Default | Description |
|
|
240
|
+
|---|---|---|---|
|
|
241
|
+
| `paths` | str or list | required | CSV path(s) |
|
|
242
|
+
| `schema` | StructType / dict / None | None | Schema or auto-infer |
|
|
243
|
+
| `partition_col` | str or list | None | Extra partition key(s) |
|
|
244
|
+
| `salt_buckets` | int | auto | Override salt bucket count |
|
|
245
|
+
| `num_partitions` | int | auto | Override total partitions |
|
|
246
|
+
| `hint_size_gb` | float | None | File size hint (when detection fails) |
|
|
247
|
+
| `delimiter` | str | `","` | CSV separator |
|
|
248
|
+
| `encoding` | str | `"UTF-8"` | File encoding |
|
|
249
|
+
| `null_value` | str | `""` | Null string |
|
|
250
|
+
|
|
251
|
+
### `SaltMill.tune(paths, *, salt_buckets, num_partitions, hint_size_gb) → TuningParams`
|
|
252
|
+
|
|
253
|
+
Returns computed tuning parameters without reading any data.
|
|
254
|
+
|
|
255
|
+
### `SaltMill.write_delta(df, path, *, partition_by, mode)`
|
|
256
|
+
|
|
257
|
+
Writes a DataFrame to Delta Lake with Databricks-optimized settings.
|
|
258
|
+
|
|
259
|
+
## Development
|
|
260
|
+
|
|
261
|
+
```bash
|
|
262
|
+
pip install -e ".[dev]"
|
|
263
|
+
pytest
|
|
264
|
+
```
|
|
265
|
+
|
|
266
|
+
## License
|
|
267
|
+
|
|
268
|
+
Apache 2.0
|
|
@@ -0,0 +1,237 @@
|
|
|
1
|
+
# saltmill
|
|
2
|
+
|
|
3
|
+
**Efficient large CSV processing for PySpark and Databricks.**
|
|
4
|
+
|
|
5
|
+
saltmill automatically computes optimal salt buckets, partition counts, and Spark configuration for processing CSV files of any size — from a single API call.
|
|
6
|
+
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
## The problem
|
|
10
|
+
|
|
11
|
+
Reading a 500 GB CSV file naively in Spark causes data skew, memory pressure, and slow shuffles. Fixing it requires manually tuning:
|
|
12
|
+
|
|
13
|
+
- Salt bucket count
|
|
14
|
+
- Repartition strategy
|
|
15
|
+
- `spark.sql.shuffle.partitions`
|
|
16
|
+
- `spark.sql.files.maxPartitionBytes`
|
|
17
|
+
- Databricks Delta write optimizations
|
|
18
|
+
|
|
19
|
+
saltmill does all of this automatically, based on file size and cluster parallelism.
|
|
20
|
+
|
|
21
|
+
## Installation
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
pip install saltmill-spark
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
> PySpark is a peer dependency — saltmill works with whatever version your cluster runs.
|
|
28
|
+
|
|
29
|
+
## Quick start
|
|
30
|
+
|
|
31
|
+
```python
|
|
32
|
+
import saltmill
|
|
33
|
+
|
|
34
|
+
df = saltmill.read(spark, "s3://my-bucket/large.csv")
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
That's it. saltmill will:
|
|
38
|
+
1. Detect file size via Hadoop FileSystem
|
|
39
|
+
2. Auto-compute salt buckets and partition count
|
|
40
|
+
3. Apply optimized Spark configs (shuffle partitions, maxPartitionBytes, Delta settings on Databricks)
|
|
41
|
+
4. Infer schema from a 0.1% sample of the first file
|
|
42
|
+
5. Read and return a well-partitioned DataFrame
|
|
43
|
+
|
|
44
|
+
## Usage
|
|
45
|
+
|
|
46
|
+
### Module-level function (simplest)
|
|
47
|
+
|
|
48
|
+
```python
|
|
49
|
+
import saltmill
|
|
50
|
+
|
|
51
|
+
df = saltmill.read(spark, "s3://bucket/huge.csv")
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
### Class-based (more control)
|
|
55
|
+
|
|
56
|
+
```python
|
|
57
|
+
from saltmill import SaltMill
|
|
58
|
+
|
|
59
|
+
sm = SaltMill(spark, workers=32)
|
|
60
|
+
df = sm.read("s3://bucket/huge.csv")
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
### Multiple files
|
|
64
|
+
|
|
65
|
+
```python
|
|
66
|
+
df = saltmill.read(
|
|
67
|
+
spark,
|
|
68
|
+
["s3://bucket/2024-01.csv", "s3://bucket/2024-02.csv"],
|
|
69
|
+
hint_size_gb=500,
|
|
70
|
+
)
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
### With explicit schema
|
|
74
|
+
|
|
75
|
+
```python
|
|
76
|
+
df = saltmill.read(
|
|
77
|
+
spark,
|
|
78
|
+
"s3://bucket/sales.csv",
|
|
79
|
+
schema={
|
|
80
|
+
"order_id": "long",
|
|
81
|
+
"region": "string",
|
|
82
|
+
"amount": "double",
|
|
83
|
+
"created_at": "timestamp",
|
|
84
|
+
},
|
|
85
|
+
partition_col="region",
|
|
86
|
+
)
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
### With a PySpark StructType
|
|
90
|
+
|
|
91
|
+
```python
|
|
92
|
+
from pyspark.sql.types import StructType, StructField, LongType, StringType
|
|
93
|
+
|
|
94
|
+
schema = StructType([
|
|
95
|
+
StructField("id", LongType(), True),
|
|
96
|
+
StructField("name", StringType(), True),
|
|
97
|
+
])
|
|
98
|
+
|
|
99
|
+
df = saltmill.read(spark, "s3://bucket/data.csv", schema=schema)
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
### Preview tuning parameters without reading
|
|
103
|
+
|
|
104
|
+
```python
|
|
105
|
+
sm = SaltMill(spark)
|
|
106
|
+
params = sm.tune("s3://bucket/huge.csv", hint_size_gb=500)
|
|
107
|
+
print(params.summary())
|
|
108
|
+
# saltmill tuning → file: 500.0 GB, workers: 64, salt_buckets: 64,
|
|
109
|
+
# partitions: 640, maxPartitionBytes: 64 MB
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
### Write to Delta Lake
|
|
113
|
+
|
|
114
|
+
```python
|
|
115
|
+
sm = SaltMill(spark)
|
|
116
|
+
df = sm.read("s3://bucket/huge.csv", partition_col="region")
|
|
117
|
+
sm.write_delta(df, "s3://bucket/delta/sales", partition_by="region")
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
## How it works
|
|
121
|
+
|
|
122
|
+
### Salting
|
|
123
|
+
|
|
124
|
+
saltmill assigns each row a salt bucket derived from its row ID (deterministic, not random) using:
|
|
125
|
+
|
|
126
|
+
```python
|
|
127
|
+
df.withColumn("_salt", pmod(monotonically_increasing_id(), salt_buckets))
|
|
128
|
+
.repartition(num_partitions, partition_col, "_salt")
|
|
129
|
+
.drop("_salt")
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
This breaks data skew even when a join or group-by column is highly imbalanced.
|
|
133
|
+
|
|
134
|
+
### Auto-tuning formula
|
|
135
|
+
|
|
136
|
+
| Input | Rule |
|
|
137
|
+
|---|---|
|
|
138
|
+
| File size | 1 salt bucket per 8 GB, rounded to nearest power of 2 |
|
|
139
|
+
| Salt buckets | Clamped to [8, 512] |
|
|
140
|
+
| Partitions | `salt_buckets × 10`, rounded up to nearest multiple of worker count |
|
|
141
|
+
| maxPartitionBytes | 64 MB (smaller than Spark's 128 MB default, yielding more input splits) |
|
|
142
|
+
|
|
143
|
+
### Example: 500 GB file, 64 workers
|
|
144
|
+
|
|
145
|
+
```
|
|
146
|
+
file_size_gb = 500
|
|
147
|
+
salt_buckets = round_pow2(500 / 8) = round_pow2(62.5) = 64
|
|
148
|
+
num_partitions = 64 × 10 = 640 (already a multiple of 64)
|
|
149
|
+
shuffle_partitions = 640
|
|
150
|
+
maxPartitionBytes = 64 MB
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
This matches the pattern proven in production:
|
|
154
|
+
|
|
155
|
+
```python
|
|
156
|
+
# What saltmill does internally
|
|
157
|
+
spark.conf.set("spark.sql.shuffle.partitions", 640)
|
|
158
|
+
spark.conf.set("spark.sql.files.maxPartitionBytes", 64 * 1024 * 1024)
|
|
159
|
+
|
|
160
|
+
dfw = (
|
|
161
|
+
df.withColumn("_salt", pmod(monotonically_increasing_id(), 64))
|
|
162
|
+
.repartition(640, "region", "_salt")
|
|
163
|
+
.drop("_salt")
|
|
164
|
+
)
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
### Databricks-specific settings
|
|
168
|
+
|
|
169
|
+
When running on Databricks, saltmill also sets:
|
|
170
|
+
|
|
171
|
+
```
|
|
172
|
+
spark.databricks.delta.optimizeWrite.enabled = true
|
|
173
|
+
spark.databricks.delta.autoCompact.enabled = true
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
## Schema dict shorthand
|
|
177
|
+
|
|
178
|
+
| Alias | Spark type |
|
|
179
|
+
|---|---|
|
|
180
|
+
| `"str"`, `"string"` | StringType |
|
|
181
|
+
| `"int"`, `"integer"` | IntegerType |
|
|
182
|
+
| `"long"`, `"bigint"` | LongType |
|
|
183
|
+
| `"float"` | FloatType |
|
|
184
|
+
| `"double"` | DoubleType |
|
|
185
|
+
| `"bool"`, `"boolean"` | BooleanType |
|
|
186
|
+
| `"date"` | DateType |
|
|
187
|
+
| `"timestamp"` | TimestampType |
|
|
188
|
+
| `"decimal"` | DecimalType(38,10) |
|
|
189
|
+
|
|
190
|
+
Any Spark SQL type string is also accepted directly (e.g. `"decimal(10,2)"`).
|
|
191
|
+
|
|
192
|
+
## API reference
|
|
193
|
+
|
|
194
|
+
### `saltmill.read(spark, paths, *, schema, partition_col, workers, salt_buckets, num_partitions, hint_size_gb, delimiter, encoding, null_value, verbose)`
|
|
195
|
+
|
|
196
|
+
Module-level convenience function. See class docs for parameter details.
|
|
197
|
+
|
|
198
|
+
### `SaltMill(spark, *, workers, verbose)`
|
|
199
|
+
|
|
200
|
+
| Parameter | Type | Default | Description |
|
|
201
|
+
|---|---|---|---|
|
|
202
|
+
| `spark` | SparkSession | required | Active session |
|
|
203
|
+
| `workers` | int | auto-detected | Worker node count |
|
|
204
|
+
| `verbose` | bool | True | Print tuning summary |
|
|
205
|
+
|
|
206
|
+
### `SaltMill.read(paths, *, schema, partition_col, salt_buckets, num_partitions, hint_size_gb, delimiter, encoding, null_value)`
|
|
207
|
+
|
|
208
|
+
| Parameter | Type | Default | Description |
|
|
209
|
+
|---|---|---|---|
|
|
210
|
+
| `paths` | str or list | required | CSV path(s) |
|
|
211
|
+
| `schema` | StructType / dict / None | None | Schema or auto-infer |
|
|
212
|
+
| `partition_col` | str or list | None | Extra partition key(s) |
|
|
213
|
+
| `salt_buckets` | int | auto | Override salt bucket count |
|
|
214
|
+
| `num_partitions` | int | auto | Override total partitions |
|
|
215
|
+
| `hint_size_gb` | float | None | File size hint (when detection fails) |
|
|
216
|
+
| `delimiter` | str | `","` | CSV separator |
|
|
217
|
+
| `encoding` | str | `"UTF-8"` | File encoding |
|
|
218
|
+
| `null_value` | str | `""` | Null string |
|
|
219
|
+
|
|
220
|
+
### `SaltMill.tune(paths, *, salt_buckets, num_partitions, hint_size_gb) → TuningParams`
|
|
221
|
+
|
|
222
|
+
Returns computed tuning parameters without reading any data.
|
|
223
|
+
|
|
224
|
+
### `SaltMill.write_delta(df, path, *, partition_by, mode)`
|
|
225
|
+
|
|
226
|
+
Writes a DataFrame to Delta Lake with Databricks-optimized settings.
|
|
227
|
+
|
|
228
|
+
## Development
|
|
229
|
+
|
|
230
|
+
```bash
|
|
231
|
+
pip install -e ".[dev]"
|
|
232
|
+
pytest
|
|
233
|
+
```
|
|
234
|
+
|
|
235
|
+
## License
|
|
236
|
+
|
|
237
|
+
Apache 2.0
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "saltmill-spark"
|
|
7
|
+
version = "0.2.0"
|
|
8
|
+
description = "Efficient large-CSV processing for Apache Spark / Databricks with auto-salting, skew detection, and partition tuning"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = { text = "Apache-2.0" }
|
|
11
|
+
requires-python = ">=3.10"
|
|
12
|
+
keywords = ["pyspark", "databricks", "csv", "big-data", "partitioning", "salting", "skew"]
|
|
13
|
+
classifiers = [
|
|
14
|
+
"Development Status :: 4 - Beta",
|
|
15
|
+
"Intended Audience :: Developers",
|
|
16
|
+
"Intended Audience :: Science/Research",
|
|
17
|
+
"License :: OSI Approved :: Apache Software License",
|
|
18
|
+
"Programming Language :: Python :: 3",
|
|
19
|
+
"Programming Language :: Python :: 3.10",
|
|
20
|
+
"Programming Language :: Python :: 3.11",
|
|
21
|
+
"Programming Language :: Python :: 3.12",
|
|
22
|
+
"Topic :: Scientific/Engineering :: Information Analysis",
|
|
23
|
+
"Topic :: Software Development :: Libraries :: Python Modules",
|
|
24
|
+
]
|
|
25
|
+
|
|
26
|
+
# pyspark is a peer dependency — users bring their own cluster version
|
|
27
|
+
dependencies = []
|
|
28
|
+
|
|
29
|
+
[project.optional-dependencies]
|
|
30
|
+
databricks = ["databricks-sdk>=0.12"]
|
|
31
|
+
dev = [
|
|
32
|
+
"pytest>=8",
|
|
33
|
+
"pytest-asyncio",
|
|
34
|
+
"pyspark>=3.4",
|
|
35
|
+
"delta-spark>=2.4",
|
|
36
|
+
"ruff",
|
|
37
|
+
"mypy",
|
|
38
|
+
]
|
|
39
|
+
|
|
40
|
+
[project.urls]
|
|
41
|
+
Homepage = "https://github.com/yuvaraj-munirathinam/saltmill"
|
|
42
|
+
Repository = "https://github.com/yuvaraj-munirathinam/saltmill"
|
|
43
|
+
Issues = "https://github.com/yuvaraj-munirathinam/saltmill/issues"
|
|
44
|
+
|
|
45
|
+
[tool.setuptools.packages.find]
|
|
46
|
+
where = ["."]
|
|
47
|
+
include = ["saltmill*"]
|
|
48
|
+
|
|
49
|
+
[tool.pytest.ini_options]
|
|
50
|
+
testpaths = ["tests"]
|
|
51
|
+
python_files = ["test_*.py"]
|
|
52
|
+
|
|
53
|
+
[tool.ruff]
|
|
54
|
+
line-length = 100
|
|
55
|
+
target-version = "py310"
|
|
56
|
+
|
|
57
|
+
[tool.mypy]
|
|
58
|
+
strict = true
|
|
59
|
+
ignore_missing_imports = true
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
"""
|
|
2
|
+
saltmill — Efficient large-CSV processing for Apache Spark / Databricks.
|
|
3
|
+
|
|
4
|
+
Auto-detects skew, tunes salt buckets, partition keys, and Spark config
|
|
5
|
+
for 500GB+ CSV files.
|
|
6
|
+
|
|
7
|
+
Quick start (simple API)::
|
|
8
|
+
|
|
9
|
+
import saltmill
|
|
10
|
+
df = saltmill.read(spark, "abfss://container@account.dfs.core.windows.net/data/large.csv")
|
|
11
|
+
|
|
12
|
+
Advanced API (full pipeline with write)::
|
|
13
|
+
|
|
14
|
+
from saltmill import SaltmillProcessor, SaltmillConfig
|
|
15
|
+
|
|
16
|
+
result = SaltmillProcessor(SaltmillConfig(
|
|
17
|
+
input_path="abfss://raw@account.dfs.core.windows.net/data/*.csv",
|
|
18
|
+
output_path="abfss://curated@account.dfs.core.windows.net/output/delta/",
|
|
19
|
+
)).process()
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
from saltmill._version import __version__
|
|
23
|
+
|
|
24
|
+
# ── Advanced API ──────────────────────────────────────────────────────────────
|
|
25
|
+
from saltmill.config import CompressionCodec, SaltmillConfig, WriteFormat
|
|
26
|
+
from saltmill.exceptions import (
|
|
27
|
+
CheckpointError,
|
|
28
|
+
ConfigurationError,
|
|
29
|
+
SaltmillError,
|
|
30
|
+
SchemaInferenceError,
|
|
31
|
+
SkewDetectionError,
|
|
32
|
+
UnsupportedPathError,
|
|
33
|
+
)
|
|
34
|
+
from saltmill.models import PartitionPlan, ProcessingResult, SchemaInfo, SkewReport
|
|
35
|
+
from saltmill.processor import SaltmillProcessor
|
|
36
|
+
|
|
37
|
+
# ── Simple backward-compatible API ────────────────────────────────────────────
|
|
38
|
+
from saltmill.compat import SaltMill, read
|
|
39
|
+
|
|
40
|
+
__all__ = [
|
|
41
|
+
# Advanced API
|
|
42
|
+
"SaltmillProcessor",
|
|
43
|
+
"SaltmillConfig",
|
|
44
|
+
"WriteFormat",
|
|
45
|
+
"CompressionCodec",
|
|
46
|
+
"ProcessingResult",
|
|
47
|
+
"PartitionPlan",
|
|
48
|
+
"SchemaInfo",
|
|
49
|
+
"SkewReport",
|
|
50
|
+
"SaltmillError",
|
|
51
|
+
"ConfigurationError",
|
|
52
|
+
"SchemaInferenceError",
|
|
53
|
+
"SkewDetectionError",
|
|
54
|
+
"CheckpointError",
|
|
55
|
+
"UnsupportedPathError",
|
|
56
|
+
# Simple API
|
|
57
|
+
"SaltMill",
|
|
58
|
+
"read",
|
|
59
|
+
"__version__",
|
|
60
|
+
]
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.2.0"
|