databricks4py 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- databricks4py-0.2.0/LICENSE +21 -0
- databricks4py-0.2.0/PKG-INFO +589 -0
- databricks4py-0.2.0/README.md +556 -0
- databricks4py-0.2.0/pyproject.toml +78 -0
- databricks4py-0.2.0/setup.cfg +4 -0
- databricks4py-0.2.0/src/databricks4py/__init__.py +56 -0
- databricks4py-0.2.0/src/databricks4py/catalog.py +65 -0
- databricks4py-0.2.0/src/databricks4py/config/__init__.py +6 -0
- databricks4py-0.2.0/src/databricks4py/config/base.py +119 -0
- databricks4py-0.2.0/src/databricks4py/config/unity.py +72 -0
- databricks4py-0.2.0/src/databricks4py/filters/__init__.py +17 -0
- databricks4py-0.2.0/src/databricks4py/filters/base.py +154 -0
- databricks4py-0.2.0/src/databricks4py/io/__init__.py +40 -0
- databricks4py-0.2.0/src/databricks4py/io/checkpoint.py +98 -0
- databricks4py-0.2.0/src/databricks4py/io/dbfs.py +91 -0
- databricks4py-0.2.0/src/databricks4py/io/delta.py +564 -0
- databricks4py-0.2.0/src/databricks4py/io/merge.py +176 -0
- databricks4py-0.2.0/src/databricks4py/io/streaming.py +281 -0
- databricks4py-0.2.0/src/databricks4py/logging.py +39 -0
- databricks4py-0.2.0/src/databricks4py/metrics/__init__.py +22 -0
- databricks4py-0.2.0/src/databricks4py/metrics/base.py +66 -0
- databricks4py-0.2.0/src/databricks4py/metrics/delta_sink.py +75 -0
- databricks4py-0.2.0/src/databricks4py/metrics/logging_sink.py +20 -0
- databricks4py-0.2.0/src/databricks4py/migrations/__init__.py +27 -0
- databricks4py-0.2.0/src/databricks4py/migrations/alter.py +114 -0
- databricks4py-0.2.0/src/databricks4py/migrations/runner.py +241 -0
- databricks4py-0.2.0/src/databricks4py/migrations/schema_diff.py +136 -0
- databricks4py-0.2.0/src/databricks4py/migrations/validators.py +195 -0
- databricks4py-0.2.0/src/databricks4py/observability/__init__.py +24 -0
- databricks4py-0.2.0/src/databricks4py/observability/_utils.py +24 -0
- databricks4py-0.2.0/src/databricks4py/observability/batch_context.py +134 -0
- databricks4py-0.2.0/src/databricks4py/observability/health.py +223 -0
- databricks4py-0.2.0/src/databricks4py/observability/query_listener.py +236 -0
- databricks4py-0.2.0/src/databricks4py/py.typed +0 -0
- databricks4py-0.2.0/src/databricks4py/quality/__init__.py +26 -0
- databricks4py-0.2.0/src/databricks4py/quality/base.py +54 -0
- databricks4py-0.2.0/src/databricks4py/quality/expectations.py +184 -0
- databricks4py-0.2.0/src/databricks4py/quality/gate.py +90 -0
- databricks4py-0.2.0/src/databricks4py/retry.py +102 -0
- databricks4py-0.2.0/src/databricks4py/secrets.py +69 -0
- databricks4py-0.2.0/src/databricks4py/spark_session.py +68 -0
- databricks4py-0.2.0/src/databricks4py/testing/__init__.py +35 -0
- databricks4py-0.2.0/src/databricks4py/testing/assertions.py +111 -0
- databricks4py-0.2.0/src/databricks4py/testing/builders.py +127 -0
- databricks4py-0.2.0/src/databricks4py/testing/fixtures.py +134 -0
- databricks4py-0.2.0/src/databricks4py/testing/mocks.py +106 -0
- databricks4py-0.2.0/src/databricks4py/testing/temp_table.py +73 -0
- databricks4py-0.2.0/src/databricks4py/workflow.py +219 -0
- databricks4py-0.2.0/src/databricks4py.egg-info/PKG-INFO +589 -0
- databricks4py-0.2.0/src/databricks4py.egg-info/SOURCES.txt +87 -0
- databricks4py-0.2.0/src/databricks4py.egg-info/dependency_links.txt +1 -0
- databricks4py-0.2.0/src/databricks4py.egg-info/requires.txt +8 -0
- databricks4py-0.2.0/src/databricks4py.egg-info/top_level.txt +1 -0
- databricks4py-0.2.0/tests/test_assertions.py +68 -0
- databricks4py-0.2.0/tests/test_batch_context.py +129 -0
- databricks4py-0.2.0/tests/test_builders.py +59 -0
- databricks4py-0.2.0/tests/test_catalog.py +45 -0
- databricks4py-0.2.0/tests/test_checkpoint.py +100 -0
- databricks4py-0.2.0/tests/test_config.py +137 -0
- databricks4py-0.2.0/tests/test_dbfs.py +99 -0
- databricks4py-0.2.0/tests/test_delta.py +444 -0
- databricks4py-0.2.0/tests/test_delta_advanced_integration.py +207 -0
- databricks4py-0.2.0/tests/test_delta_metrics_sink_integration.py +103 -0
- databricks4py-0.2.0/tests/test_filters.py +119 -0
- databricks4py-0.2.0/tests/test_health.py +168 -0
- databricks4py-0.2.0/tests/test_logging.py +39 -0
- databricks4py-0.2.0/tests/test_merge.py +154 -0
- databricks4py-0.2.0/tests/test_metrics.py +136 -0
- databricks4py-0.2.0/tests/test_migration_runner.py +241 -0
- databricks4py-0.2.0/tests/test_migration_runner_integration.py +196 -0
- databricks4py-0.2.0/tests/test_migrations.py +218 -0
- databricks4py-0.2.0/tests/test_mocks.py +53 -0
- databricks4py-0.2.0/tests/test_observability_integration.py +185 -0
- databricks4py-0.2.0/tests/test_quality.py +119 -0
- databricks4py-0.2.0/tests/test_quality_integration.py +207 -0
- databricks4py-0.2.0/tests/test_query_listener.py +147 -0
- databricks4py-0.2.0/tests/test_retry.py +128 -0
- databricks4py-0.2.0/tests/test_schema_diff.py +178 -0
- databricks4py-0.2.0/tests/test_secrets.py +44 -0
- databricks4py-0.2.0/tests/test_spark_session.py +45 -0
- databricks4py-0.2.0/tests/test_streaming.py +337 -0
- databricks4py-0.2.0/tests/test_streaming_dlq.py +177 -0
- databricks4py-0.2.0/tests/test_streaming_integration.py +308 -0
- databricks4py-0.2.0/tests/test_table_alter.py +111 -0
- databricks4py-0.2.0/tests/test_table_alter_integration.py +137 -0
- databricks4py-0.2.0/tests/test_temp_table.py +50 -0
- databricks4py-0.2.0/tests/test_workflow.py +71 -0
- databricks4py-0.2.0/tests/test_workflow_e2e_integration.py +214 -0
- databricks4py-0.2.0/tests/test_workflow_v2.py +132 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 kirankbs
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,589 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: databricks4py
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: Spark, Delta Lake, and Databricks utility library for Python
|
|
5
|
+
Author: kirankbs
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/kirankbs/databricks4py
|
|
8
|
+
Project-URL: Repository, https://github.com/kirankbs/databricks4py
|
|
9
|
+
Project-URL: Changelog, https://github.com/kirankbs/databricks4py/blob/main/CHANGELOG.md
|
|
10
|
+
Project-URL: Issues, https://github.com/kirankbs/databricks4py/issues
|
|
11
|
+
Keywords: spark,pyspark,delta-lake,databricks,etl,data-engineering
|
|
12
|
+
Classifier: Development Status :: 3 - Alpha
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Classifier: Topic :: Software Development :: Libraries
|
|
20
|
+
Classifier: Topic :: Database
|
|
21
|
+
Classifier: Operating System :: OS Independent
|
|
22
|
+
Requires-Python: >=3.10
|
|
23
|
+
Description-Content-Type: text/markdown
|
|
24
|
+
License-File: LICENSE
|
|
25
|
+
Requires-Dist: pyspark>=3.4
|
|
26
|
+
Requires-Dist: delta-spark>=2.4
|
|
27
|
+
Provides-Extra: dev
|
|
28
|
+
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
29
|
+
Requires-Dist: pytest-timeout>=2.0; extra == "dev"
|
|
30
|
+
Requires-Dist: ruff>=0.4; extra == "dev"
|
|
31
|
+
Requires-Dist: mypy>=1.10; extra == "dev"
|
|
32
|
+
Dynamic: license-file
|
|
33
|
+
|
|
34
|
+
# databricks4py
|
|
35
|
+
|
|
36
|
+
[](https://pypi.org/project/databricks4py/)
|
|
37
|
+
[](https://pypi.org/project/databricks4py/)
|
|
38
|
+
[](https://github.com/kirankbs/databricks4py/actions)
|
|
39
|
+
[](LICENSE)
|
|
40
|
+
|
|
41
|
+
Spark, Delta Lake, and Databricks utility library for Python.
|
|
42
|
+
|
|
43
|
+
The patterns you keep re-implementing across PySpark jobs — Delta table management, streaming foreachBatch wiring, schema migrations, data quality checks — packaged as a library. Runs on Databricks and locally with open-source Spark + Delta Lake.
|
|
44
|
+
|
|
45
|
+
## Install
|
|
46
|
+
|
|
47
|
+
```bash
|
|
48
|
+
pip install databricks4py
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
**Requirements:** Python >= 3.10, Java 17+ (for PySpark). Note that `pyspark.dbutils` is only available on the Databricks Runtime.
|
|
52
|
+
|
|
53
|
+
## Core modules
|
|
54
|
+
|
|
55
|
+
### Delta table management
|
|
56
|
+
|
|
57
|
+
Wraps the delta-spark API into a single object that handles table creation, reads, writes, metadata, and schema validation.
|
|
58
|
+
|
|
59
|
+
```python
|
|
60
|
+
from databricks4py.io import DeltaTable, GeneratedColumn
|
|
61
|
+
|
|
62
|
+
# Create a table with generated columns and partitioning
|
|
63
|
+
table = DeltaTable(
|
|
64
|
+
table_name="catalog.schema.events",
|
|
65
|
+
schema={"id": "int", "name": "string", "event_ts": "timestamp", "event_date": "date"},
|
|
66
|
+
location="/data/events",
|
|
67
|
+
partition_by="event_date",
|
|
68
|
+
generated_columns=[GeneratedColumn("event_date", "DATE", "CAST(event_ts AS DATE)")],
|
|
69
|
+
)
|
|
70
|
+
|
|
71
|
+
df = table.dataframe() # Read
|
|
72
|
+
table.write(df, mode="append") # Write (schema_check=True by default)
|
|
73
|
+
table.detail() # Metadata DataFrame
|
|
74
|
+
table.partition_columns() # ["event_date"]
|
|
75
|
+
table.size_in_bytes() # Physical size in bytes
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
**Convenience wrappers:**
|
|
79
|
+
|
|
80
|
+
```python
|
|
81
|
+
from databricks4py.io import DeltaTableAppender, DeltaTableOverwriter
|
|
82
|
+
|
|
83
|
+
appender = DeltaTableAppender("target", schema=my_schema)
|
|
84
|
+
appender.append(df)
|
|
85
|
+
|
|
86
|
+
overwriter = DeltaTableOverwriter("target", schema=my_schema)
|
|
87
|
+
overwriter.overwrite(df)
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
**Table maintenance:**
|
|
91
|
+
|
|
92
|
+
```python
|
|
93
|
+
from databricks4py.io.delta import optimize_table, vacuum_table
|
|
94
|
+
|
|
95
|
+
optimize_table("catalog.schema.events", z_order_by=["id"])
|
|
96
|
+
vacuum_table("catalog.schema.events", retention_hours=168)
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
### Merge operations
|
|
100
|
+
|
|
101
|
+
Fluent builder for Delta `MERGE INTO`. Chain conditions, execute, get back row counts.
|
|
102
|
+
|
|
103
|
+
```python
|
|
104
|
+
from databricks4py.io import MergeBuilder
|
|
105
|
+
|
|
106
|
+
result = (
|
|
107
|
+
MergeBuilder("catalog.schema.target", source_df, spark)
|
|
108
|
+
.on("id", "date") # Join keys
|
|
109
|
+
.when_matched_update(["name", "value"]) # Update specific columns
|
|
110
|
+
.when_not_matched_insert() # Insert new rows
|
|
111
|
+
.when_not_matched_by_source_delete() # Remove stale rows
|
|
112
|
+
.execute()
|
|
113
|
+
)
|
|
114
|
+
|
|
115
|
+
print(f"Inserted: {result.rows_inserted}, Updated: {result.rows_updated}")
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
Custom join conditions:
|
|
119
|
+
|
|
120
|
+
```python
|
|
121
|
+
result = (
|
|
122
|
+
MergeBuilder("target", source_df, spark)
|
|
123
|
+
.on_condition("target.id = source.id AND target.region = source.region")
|
|
124
|
+
.when_matched_update()
|
|
125
|
+
.when_not_matched_insert()
|
|
126
|
+
.execute()
|
|
127
|
+
)
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
SCD Type 2 is built into DeltaTable:
|
|
131
|
+
|
|
132
|
+
```python
|
|
133
|
+
table.scd_type2(source_df, keys=["id"])
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
### Streaming
|
|
137
|
+
|
|
138
|
+
Subclass `StreamingTableReader`, implement `process_batch`, and the base class handles the `foreachBatch` wiring, empty-batch skipping, row filtering, metrics, and DLQ routing.
|
|
139
|
+
|
|
140
|
+
```python
|
|
141
|
+
from databricks4py.io import StreamingTableReader, StreamingTriggerOptions
|
|
142
|
+
|
|
143
|
+
class EventProcessor(StreamingTableReader):
|
|
144
|
+
def process_batch(self, df, batch_id):
|
|
145
|
+
clean = df.where("status IS NOT NULL")
|
|
146
|
+
clean.write.format("delta").mode("append").saveAsTable("output")
|
|
147
|
+
|
|
148
|
+
reader = EventProcessor(
|
|
149
|
+
source_table="catalog.schema.raw_events",
|
|
150
|
+
trigger=StreamingTriggerOptions.PROCESSING_TIME_1M,
|
|
151
|
+
checkpoint_location="/checkpoints/events",
|
|
152
|
+
dead_letter_table="catalog.schema.dlq", # Failed batches go here
|
|
153
|
+
)
|
|
154
|
+
query = reader.start()
|
|
155
|
+
reader.stop(timeout_seconds=30) # Graceful shutdown
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
If `dead_letter_table` is set and `process_batch` raises, the batch DataFrame is written to the DLQ table with `_dlq_error_message`, `_dlq_error_timestamp`, and `_dlq_batch_id` columns appended. The stream keeps running.
|
|
159
|
+
|
|
160
|
+
**Checkpoint management:**
|
|
161
|
+
|
|
162
|
+
```python
|
|
163
|
+
from databricks4py.io import CheckpointManager
|
|
164
|
+
|
|
165
|
+
mgr = CheckpointManager(base_path="/checkpoints")
|
|
166
|
+
reader = EventProcessor(
|
|
167
|
+
source_table="input",
|
|
168
|
+
checkpoint_manager=mgr, # Auto-generates checkpoint path
|
|
169
|
+
)
|
|
170
|
+
```
|
|
171
|
+
|
|
172
|
+
### Migration framework
|
|
173
|
+
|
|
174
|
+
Ordered migration runner, same idea as Flyway or Alembic but for Delta tables. Steps are Python callables, history is tracked in a Delta table, and each version runs exactly once.
|
|
175
|
+
|
|
176
|
+
```python
|
|
177
|
+
from databricks4py.migrations import MigrationRunner, MigrationStep
|
|
178
|
+
|
|
179
|
+
def add_audit_columns(spark):
|
|
180
|
+
spark.sql("ALTER TABLE catalog.schema.events ADD COLUMNS (created_at TIMESTAMP)")
|
|
181
|
+
|
|
182
|
+
def backfill_defaults(spark):
|
|
183
|
+
spark.sql("UPDATE catalog.schema.events SET created_at = current_timestamp()")
|
|
184
|
+
|
|
185
|
+
runner = MigrationRunner(history_table="catalog.schema._migrations")
|
|
186
|
+
runner.register(
|
|
187
|
+
MigrationStep(version="V001", description="Add audit columns", up=add_audit_columns),
|
|
188
|
+
MigrationStep(version="V002", description="Backfill defaults", up=backfill_defaults),
|
|
189
|
+
)
|
|
190
|
+
|
|
191
|
+
# Check what needs to run
|
|
192
|
+
pending = runner.pending() # [MigrationStep(V002, ...)]
|
|
193
|
+
|
|
194
|
+
# Execute (idempotent — already-applied steps are skipped)
|
|
195
|
+
result = runner.run()
|
|
196
|
+
print(result.applied) # ["V002"]
|
|
197
|
+
print(result.skipped) # ["V001"]
|
|
198
|
+
|
|
199
|
+
# Dry run (logs what would run, no side effects)
|
|
200
|
+
result = runner.run(dry_run=True)
|
|
201
|
+
```
|
|
202
|
+
|
|
203
|
+
**Pre/post validation per step:**
|
|
204
|
+
|
|
205
|
+
```python
|
|
206
|
+
MigrationStep(
|
|
207
|
+
version="V003",
|
|
208
|
+
description="Rename column",
|
|
209
|
+
up=lambda spark: spark.sql("ALTER TABLE t RENAME COLUMN old TO new"),
|
|
210
|
+
pre_validate=lambda spark: spark.catalog.tableExists("t"),
|
|
211
|
+
post_validate=lambda spark: "new" in spark.read.table("t").columns,
|
|
212
|
+
)
|
|
213
|
+
```
|
|
214
|
+
|
|
215
|
+
### Table alter (fluent DDL)
|
|
216
|
+
|
|
217
|
+
Queue up `ALTER TABLE` operations and apply them in one go.
|
|
218
|
+
|
|
219
|
+
```python
|
|
220
|
+
from databricks4py.migrations import TableAlter
|
|
221
|
+
|
|
222
|
+
TableAlter("catalog.schema.events") \
|
|
223
|
+
.add_column("region", "STRING", comment="ISO-3166 region code") \
|
|
224
|
+
.set_property("delta.enableChangeDataFeed", "true") \
|
|
225
|
+
.apply()
|
|
226
|
+
```
|
|
227
|
+
|
|
228
|
+
Rename and drop require Delta column mapping:
|
|
229
|
+
|
|
230
|
+
```python
|
|
231
|
+
TableAlter("catalog.schema.events") \
|
|
232
|
+
.set_property("delta.columnMapping.mode", "name") \
|
|
233
|
+
.set_property("delta.minReaderVersion", "2") \
|
|
234
|
+
.set_property("delta.minWriterVersion", "5") \
|
|
235
|
+
.apply()
|
|
236
|
+
|
|
237
|
+
TableAlter("catalog.schema.events") \
|
|
238
|
+
.rename_column("old_name", "new_name") \
|
|
239
|
+
.drop_column("deprecated_col") \
|
|
240
|
+
.apply()
|
|
241
|
+
```
|
|
242
|
+
|
|
243
|
+
### Schema diff
|
|
244
|
+
|
|
245
|
+
Compare two schemas (or a live table vs. an incoming DataFrame) and get back a list of column-level changes with severity.
|
|
246
|
+
|
|
247
|
+
```python
|
|
248
|
+
from databricks4py.migrations import SchemaDiff
|
|
249
|
+
|
|
250
|
+
diff = SchemaDiff.from_tables("catalog.schema.events", new_df)
|
|
251
|
+
for change in diff.changes():
|
|
252
|
+
print(f"{change.column}: {change.change_type} [{change.severity}]")
|
|
253
|
+
|
|
254
|
+
if diff.has_breaking_changes():
|
|
255
|
+
raise RuntimeError(diff.summary())
|
|
256
|
+
```
|
|
257
|
+
|
|
258
|
+
### Table validation
|
|
259
|
+
|
|
260
|
+
Check that a Delta table matches expected columns, partitions, and location before or after a migration.
|
|
261
|
+
|
|
262
|
+
```python
|
|
263
|
+
from databricks4py.migrations import TableValidator
|
|
264
|
+
|
|
265
|
+
validator = TableValidator(
|
|
266
|
+
table_name="catalog.schema.events",
|
|
267
|
+
expected_columns=["id", "name", "event_date"],
|
|
268
|
+
expected_partition_columns=["event_date"],
|
|
269
|
+
expected_location_contains="/data/events",
|
|
270
|
+
)
|
|
271
|
+
result = validator.validate()
|
|
272
|
+
if not result.is_valid:
|
|
273
|
+
print(result.errors) # ["Missing required columns: ['event_date']"]
|
|
274
|
+
print(result.warnings) # ["Unexpected extra columns: ['debug_flag']"]
|
|
275
|
+
result.raise_if_invalid("catalog.schema.events") # Raises MigrationError
|
|
276
|
+
```
|
|
277
|
+
|
|
278
|
+
### Data quality
|
|
279
|
+
|
|
280
|
+
Row-level expectations you can run individually or bundle into a gate that raises, warns, or quarantines bad rows.
|
|
281
|
+
|
|
282
|
+
```python
|
|
283
|
+
from databricks4py.quality.expectations import NotNull, InRange, Unique, RowCount, MatchesRegex
|
|
284
|
+
from databricks4py.quality.gate import QualityGate
|
|
285
|
+
|
|
286
|
+
# Individual expectations
|
|
287
|
+
result = NotNull("id", "name").validate(df)
|
|
288
|
+
result = InRange("score", min_val=0, max_val=100).validate(df)
|
|
289
|
+
result = Unique("id").validate(df)
|
|
290
|
+
result = RowCount(min_count=1, max_count=1_000_000).validate(df)
|
|
291
|
+
result = MatchesRegex("email", r"^.+@.+\..+$").validate(df)
|
|
292
|
+
# result.passed, result.failing_rows, result.total_rows
|
|
293
|
+
|
|
294
|
+
# Quality gate — enforce multiple expectations
|
|
295
|
+
gate = QualityGate(
|
|
296
|
+
NotNull("id"),
|
|
297
|
+
InRange("score", min_val=0, max_val=100),
|
|
298
|
+
on_fail="raise", # or "warn" or "quarantine"
|
|
299
|
+
)
|
|
300
|
+
clean_df = gate.enforce(df) # Raises QualityError if checks fail
|
|
301
|
+
```
|
|
302
|
+
|
|
303
|
+
**Quarantine mode** splits bad rows and routes them to a handler:
|
|
304
|
+
|
|
305
|
+
```python
|
|
306
|
+
gate = QualityGate(
|
|
307
|
+
NotNull("id"),
|
|
308
|
+
on_fail="quarantine",
|
|
309
|
+
quarantine_handler=lambda bad_df: bad_df.write.saveAsTable("quarantine_table"),
|
|
310
|
+
)
|
|
311
|
+
clean_df = gate.enforce(df) # Returns only clean rows
|
|
312
|
+
```
|
|
313
|
+
|
|
314
|
+
### Filter pipeline
|
|
315
|
+
|
|
316
|
+
Chain DataFrame transformations. Each filter is a callable that takes and returns a DataFrame.
|
|
317
|
+
|
|
318
|
+
```python
|
|
319
|
+
from databricks4py.filters import FilterPipeline, DropDuplicates, WhereFilter, ColumnFilter
|
|
320
|
+
|
|
321
|
+
pipeline = FilterPipeline([
|
|
322
|
+
DropDuplicates(subset=["id"]),
|
|
323
|
+
WhereFilter("status = 'active'"),
|
|
324
|
+
ColumnFilter(columns=["id", "name", "status"]),
|
|
325
|
+
])
|
|
326
|
+
clean_df = pipeline(raw_df)
|
|
327
|
+
```
|
|
328
|
+
|
|
329
|
+
### Workflow
|
|
330
|
+
|
|
331
|
+
Base class for Databricks jobs. Handles SparkSession init, config application, lifecycle metrics (`job_start`/`job_complete`/`job_failed`), quality gates, and optional retry.
|
|
332
|
+
|
|
333
|
+
```python
|
|
334
|
+
from databricks4py import Workflow
|
|
335
|
+
from databricks4py.quality.expectations import NotNull
|
|
336
|
+
from databricks4py.quality.gate import QualityGate
|
|
337
|
+
|
|
338
|
+
class MyETL(Workflow):
|
|
339
|
+
def run(self):
|
|
340
|
+
source = self.spark.read.table("raw_events")
|
|
341
|
+
|
|
342
|
+
# Quality check with metrics emission
|
|
343
|
+
gate = QualityGate(NotNull("id", "event_ts"), on_fail="raise")
|
|
344
|
+
clean = self.quality_check(source, gate, table_name="raw_events")
|
|
345
|
+
|
|
346
|
+
clean.write.format("delta").mode("append").saveAsTable("clean_events")
|
|
347
|
+
self.emit_metric("write_complete", row_count=clean.count())
|
|
348
|
+
|
|
349
|
+
# With config and metrics
|
|
350
|
+
from databricks4py.config import JobConfig
|
|
351
|
+
from databricks4py.metrics import DeltaMetricsSink
|
|
352
|
+
|
|
353
|
+
config = JobConfig(tables={"source": "raw_events"}, spark_configs={"spark.sql.shuffle.partitions": "8"})
|
|
354
|
+
sink = DeltaMetricsSink("catalog.schema.job_metrics")
|
|
355
|
+
MyETL(config=config, metrics=sink).execute()
|
|
356
|
+
```
|
|
357
|
+
|
|
358
|
+
### Configuration
|
|
359
|
+
|
|
360
|
+
Environment auto-detected from Databricks widgets or `ENV`/`ENVIRONMENT` env vars, defaults to DEV.
|
|
361
|
+
|
|
362
|
+
```python
|
|
363
|
+
from databricks4py.config import JobConfig, Environment
|
|
364
|
+
|
|
365
|
+
config = JobConfig(
|
|
366
|
+
tables={"events": "catalog.bronze.events", "users": "catalog.silver.users"},
|
|
367
|
+
secret_scope="my-scope",
|
|
368
|
+
spark_configs={"spark.sql.shuffle.partitions": "8"},
|
|
369
|
+
)
|
|
370
|
+
config.env # Environment.DEV (auto-detected)
|
|
371
|
+
config.table("events") # "catalog.bronze.events"
|
|
372
|
+
```
|
|
373
|
+
|
|
374
|
+
**Unity Catalog** — environment-aware catalog resolution:
|
|
375
|
+
|
|
376
|
+
```python
|
|
377
|
+
from databricks4py.config import UnityConfig
|
|
378
|
+
|
|
379
|
+
config = UnityConfig(catalog_prefix="myapp", schemas=["bronze", "silver"])
|
|
380
|
+
config.catalog # "myapp_dev" (or myapp_prod in production)
|
|
381
|
+
config.table("bronze.events") # "myapp_dev.bronze.events"
|
|
382
|
+
```
|
|
383
|
+
|
|
384
|
+
### Metrics
|
|
385
|
+
|
|
386
|
+
Buffer events and write them to a Delta table, log them as JSON, or both.
|
|
387
|
+
|
|
388
|
+
```python
|
|
389
|
+
from databricks4py.metrics import DeltaMetricsSink, LoggingMetricsSink, CompositeMetricsSink, MetricEvent
|
|
390
|
+
|
|
391
|
+
# Write metrics to a Delta table
|
|
392
|
+
delta_sink = DeltaMetricsSink("catalog.schema.metrics", buffer_size=50)
|
|
393
|
+
|
|
394
|
+
# Log metrics as JSON
|
|
395
|
+
log_sink = LoggingMetricsSink()
|
|
396
|
+
|
|
397
|
+
# Fan out to multiple sinks
|
|
398
|
+
sink = CompositeMetricsSink(delta_sink, log_sink)
|
|
399
|
+
sink.emit(MetricEvent(job_name="etl", event_type="batch_complete", timestamp=now, row_count=1000))
|
|
400
|
+
sink.flush()
|
|
401
|
+
```
|
|
402
|
+
|
|
403
|
+
### Observability
|
|
404
|
+
|
|
405
|
+
**Structured batch logging** — JSON log records per batch with correlation IDs, queryable in any log aggregation system.
|
|
406
|
+
|
|
407
|
+
```python
|
|
408
|
+
from databricks4py.observability import BatchContext, BatchLogger
|
|
409
|
+
|
|
410
|
+
logger = BatchLogger(extra_fields={"pipeline": "events", "env": "prod"})
|
|
411
|
+
|
|
412
|
+
# Inside your StreamingTableReader.process_batch:
|
|
413
|
+
ctx = BatchContext.create(batch_id=batch_id, source_table="catalog.schema.events")
|
|
414
|
+
logger.batch_start(ctx)
|
|
415
|
+
# ... process ...
|
|
416
|
+
logger.batch_complete(ctx, row_count=df.count(), duration_ms=ctx.elapsed_ms())
|
|
417
|
+
# On error:
|
|
418
|
+
logger.batch_error(ctx, error=str(exc))
|
|
419
|
+
```
|
|
420
|
+
|
|
421
|
+
Each log line is single-line JSON: `{"event": "batch_complete", "batch_id": 42, "correlation_id": "a1b2c3d4e5f6", "row_count": 1000, ...}`
|
|
422
|
+
|
|
423
|
+
**Query progress listener** — wraps PySpark 3.4+ `StreamingQueryListener` to collect progress snapshots and route them to a MetricsSink.
|
|
424
|
+
|
|
425
|
+
```python
|
|
426
|
+
from databricks4py.observability import QueryProgressObserver
|
|
427
|
+
|
|
428
|
+
observer = QueryProgressObserver(metrics_sink=my_sink, query_name_filter="events_processor")
|
|
429
|
+
observer.attach()
|
|
430
|
+
|
|
431
|
+
# After the stream runs:
|
|
432
|
+
latest = observer.latest_progress()
|
|
433
|
+
print(f"Batch {latest.batch_id}: {latest.processed_rows_per_second} rows/sec")
|
|
434
|
+
|
|
435
|
+
history = observer.history(limit=10)
|
|
436
|
+
observer.detach()
|
|
437
|
+
```
|
|
438
|
+
|
|
439
|
+
**Health checks** — poll a streaming query for stuck detection, slow batches, and low throughput.
|
|
440
|
+
|
|
441
|
+
```python
|
|
442
|
+
from databricks4py.observability import StreamingHealthCheck, HealthStatus
|
|
443
|
+
|
|
444
|
+
check = StreamingHealthCheck(
|
|
445
|
+
query,
|
|
446
|
+
max_batch_duration_ms=60_000, # DEGRADED if batch > 60s
|
|
447
|
+
min_processing_rate=100.0, # DEGRADED if < 100 rows/sec
|
|
448
|
+
stale_timeout_seconds=300, # UNHEALTHY if no progress for 5min
|
|
449
|
+
)
|
|
450
|
+
result = check.evaluate()
|
|
451
|
+
if result.status != HealthStatus.HEALTHY:
|
|
452
|
+
print(result.summary())
|
|
453
|
+
```
|
|
454
|
+
|
|
455
|
+
### Retry
|
|
456
|
+
|
|
457
|
+
```python
|
|
458
|
+
from databricks4py.retry import retry, RetryConfig
|
|
459
|
+
|
|
460
|
+
@retry(RetryConfig(max_attempts=5, base_delay_seconds=2.0, backoff_factor=3.0))
|
|
461
|
+
def fetch_from_api():
|
|
462
|
+
return requests.get(url).json()
|
|
463
|
+
```
|
|
464
|
+
|
|
465
|
+
### Testing utilities
|
|
466
|
+
|
|
467
|
+
Session-scoped SparkSession (one JVM per test run), function-scoped cleanup, and helpers for building test data.
|
|
468
|
+
|
|
469
|
+
```python
|
|
470
|
+
# conftest.py — register fixtures
|
|
471
|
+
from databricks4py.testing.fixtures import * # noqa: F401,F403
|
|
472
|
+
```
|
|
473
|
+
|
|
474
|
+
**DataFrameBuilder** — fluent test data construction:
|
|
475
|
+
|
|
476
|
+
```python
|
|
477
|
+
def test_my_transform(spark_session_function):
|
|
478
|
+
df = (
|
|
479
|
+
DataFrameBuilder(spark_session_function)
|
|
480
|
+
.with_columns({"id": "int", "name": "string", "score": "int"})
|
|
481
|
+
.with_rows((1, "Alice", 95), (2, "Bob", 80))
|
|
482
|
+
.build()
|
|
483
|
+
)
|
|
484
|
+
assert df.count() == 2
|
|
485
|
+
```
|
|
486
|
+
|
|
487
|
+
**TempDeltaTable** — ephemeral Delta tables for test isolation:
|
|
488
|
+
|
|
489
|
+
```python
|
|
490
|
+
def test_merge(spark_session_function, tmp_path):
|
|
491
|
+
with TempDeltaTable(spark_session_function, schema={"id": "int"}, data=[(1,), (2,)]) as table:
|
|
492
|
+
assert table.dataframe().count() == 2
|
|
493
|
+
# Table is auto-dropped after the context exits
|
|
494
|
+
```
|
|
495
|
+
|
|
496
|
+
**Assertions:**
|
|
497
|
+
|
|
498
|
+
```python
|
|
499
|
+
from databricks4py.testing.assertions import assert_frame_equal, assert_schema_equal
|
|
500
|
+
|
|
501
|
+
assert_frame_equal(actual_df, expected_df, check_order=False)
|
|
502
|
+
assert_schema_equal(actual_df.schema, expected_schema, check_nullable=False)
|
|
503
|
+
```
|
|
504
|
+
|
|
505
|
+
**Mock Databricks utilities:**
|
|
506
|
+
|
|
507
|
+
```python
|
|
508
|
+
from databricks4py.testing.mocks import MockDBUtils, MockDBUtilsModule
|
|
509
|
+
|
|
510
|
+
mock = MockDBUtils()
|
|
511
|
+
mock.secrets.put("scope", "key", "secret-value")
|
|
512
|
+
assert mock.secrets.get("scope", "key") == "secret-value"
|
|
513
|
+
```
|
|
514
|
+
|
|
515
|
+
## Project structure
|
|
516
|
+
|
|
517
|
+
```
|
|
518
|
+
src/databricks4py/
|
|
519
|
+
├── __init__.py # Top-level exports
|
|
520
|
+
├── spark_session.py # get_active(), active_fallback(), get_or_create_local_session()
|
|
521
|
+
├── catalog.py # CatalogSchema for schema-qualified table access
|
|
522
|
+
├── logging.py # configure_logging(), get_logger()
|
|
523
|
+
├── secrets.py # SecretFetcher with injectable dbutils
|
|
524
|
+
├── retry.py # retry() decorator with exponential backoff
|
|
525
|
+
├── workflow.py # Workflow ABC for Databricks job entry points
|
|
526
|
+
├── config/
|
|
527
|
+
│ ├── base.py # JobConfig, Environment
|
|
528
|
+
│ └── unity.py # UnityConfig (Unity Catalog-aware)
|
|
529
|
+
├── io/
|
|
530
|
+
│ ├── delta.py # DeltaTable, Appender, Overwriter, optimize, vacuum
|
|
531
|
+
│ ├── merge.py # MergeBuilder, MergeResult
|
|
532
|
+
│ ├── streaming.py # StreamingTableReader, StreamingTriggerOptions
|
|
533
|
+
│ ├── checkpoint.py # CheckpointManager, CheckpointInfo
|
|
534
|
+
│ └── dbfs.py # DBFS file operations (Databricks only)
|
|
535
|
+
├── filters/
|
|
536
|
+
│ └── base.py # Filter, FilterPipeline, DropDuplicates, WhereFilter, ColumnFilter
|
|
537
|
+
├── migrations/
|
|
538
|
+
│ ├── runner.py # MigrationRunner, MigrationStep, MigrationRunResult
|
|
539
|
+
│ ├── alter.py # TableAlter (fluent DDL builder)
|
|
540
|
+
│ ├── validators.py # TableValidator, ValidationResult, MigrationError
|
|
541
|
+
│ └── schema_diff.py # SchemaDiff, ColumnChange, SchemaEvolutionError
|
|
542
|
+
├── quality/
|
|
543
|
+
│ ├── base.py # Expectation, ExpectationResult, QualityReport
|
|
544
|
+
│ ├── expectations.py # NotNull, InRange, Unique, RowCount, MatchesRegex, ColumnExists
|
|
545
|
+
│ └── gate.py # QualityGate, QualityError
|
|
546
|
+
├── metrics/
|
|
547
|
+
│ ├── base.py # MetricEvent, MetricsSink, CompositeMetricsSink
|
|
548
|
+
│ ├── delta_sink.py # DeltaMetricsSink (buffered Delta table writer)
|
|
549
|
+
│ └── logging_sink.py # LoggingMetricsSink (JSON to logger)
|
|
550
|
+
├── observability/
|
|
551
|
+
│ ├── batch_context.py # BatchContext, BatchLogger (structured per-batch JSON logging)
|
|
552
|
+
│ ├── query_listener.py # QueryProgressObserver (StreamingQueryListener wrapper)
|
|
553
|
+
│ └── health.py # StreamingHealthCheck, HealthStatus, HealthResult
|
|
554
|
+
└── testing/
|
|
555
|
+
├── fixtures.py # spark_session, spark_session_function, df_builder, temp_delta
|
|
556
|
+
├── builders.py # DataFrameBuilder (fluent test data)
|
|
557
|
+
├── temp_table.py # TempDeltaTable (auto-cleanup context manager)
|
|
558
|
+
├── assertions.py # assert_frame_equal, assert_schema_equal
|
|
559
|
+
└── mocks.py # MockDBUtils, MockDBUtilsModule
|
|
560
|
+
```
|
|
561
|
+
|
|
562
|
+
## Development setup
|
|
563
|
+
|
|
564
|
+
```bash
|
|
565
|
+
git clone https://github.com/kirankbs/databricks4py.git
|
|
566
|
+
cd databricks4py
|
|
567
|
+
pip install -e ".[dev]"
|
|
568
|
+
|
|
569
|
+
# Lint
|
|
570
|
+
ruff check src/ tests/ docs/
|
|
571
|
+
ruff format --check src/ tests/ docs/
|
|
572
|
+
|
|
573
|
+
# Tests
|
|
574
|
+
pytest -m no_pyspark --timeout=30 # Fast, no Spark/Java
|
|
575
|
+
pytest -m "integration or unit" --timeout=120 # Integration (requires Java 17+)
|
|
576
|
+
pytest -v --timeout=120 # Everything
|
|
577
|
+
```
|
|
578
|
+
|
|
579
|
+
## Compatibility matrix
|
|
580
|
+
|
|
581
|
+
| PySpark | delta-spark | Python |
|
|
582
|
+
|---------|-------------|--------|
|
|
583
|
+
| 3.5.x | 3.2.x | >= 3.10 |
|
|
584
|
+
| 3.4.x | 2.4.x | >= 3.10 |
|
|
585
|
+
| 4.x | 4.x | >= 3.10 |
|
|
586
|
+
|
|
587
|
+
## License
|
|
588
|
+
|
|
589
|
+
MIT
|