databricks4py 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89) hide show
  1. databricks4py-0.2.0/LICENSE +21 -0
  2. databricks4py-0.2.0/PKG-INFO +589 -0
  3. databricks4py-0.2.0/README.md +556 -0
  4. databricks4py-0.2.0/pyproject.toml +78 -0
  5. databricks4py-0.2.0/setup.cfg +4 -0
  6. databricks4py-0.2.0/src/databricks4py/__init__.py +56 -0
  7. databricks4py-0.2.0/src/databricks4py/catalog.py +65 -0
  8. databricks4py-0.2.0/src/databricks4py/config/__init__.py +6 -0
  9. databricks4py-0.2.0/src/databricks4py/config/base.py +119 -0
  10. databricks4py-0.2.0/src/databricks4py/config/unity.py +72 -0
  11. databricks4py-0.2.0/src/databricks4py/filters/__init__.py +17 -0
  12. databricks4py-0.2.0/src/databricks4py/filters/base.py +154 -0
  13. databricks4py-0.2.0/src/databricks4py/io/__init__.py +40 -0
  14. databricks4py-0.2.0/src/databricks4py/io/checkpoint.py +98 -0
  15. databricks4py-0.2.0/src/databricks4py/io/dbfs.py +91 -0
  16. databricks4py-0.2.0/src/databricks4py/io/delta.py +564 -0
  17. databricks4py-0.2.0/src/databricks4py/io/merge.py +176 -0
  18. databricks4py-0.2.0/src/databricks4py/io/streaming.py +281 -0
  19. databricks4py-0.2.0/src/databricks4py/logging.py +39 -0
  20. databricks4py-0.2.0/src/databricks4py/metrics/__init__.py +22 -0
  21. databricks4py-0.2.0/src/databricks4py/metrics/base.py +66 -0
  22. databricks4py-0.2.0/src/databricks4py/metrics/delta_sink.py +75 -0
  23. databricks4py-0.2.0/src/databricks4py/metrics/logging_sink.py +20 -0
  24. databricks4py-0.2.0/src/databricks4py/migrations/__init__.py +27 -0
  25. databricks4py-0.2.0/src/databricks4py/migrations/alter.py +114 -0
  26. databricks4py-0.2.0/src/databricks4py/migrations/runner.py +241 -0
  27. databricks4py-0.2.0/src/databricks4py/migrations/schema_diff.py +136 -0
  28. databricks4py-0.2.0/src/databricks4py/migrations/validators.py +195 -0
  29. databricks4py-0.2.0/src/databricks4py/observability/__init__.py +24 -0
  30. databricks4py-0.2.0/src/databricks4py/observability/_utils.py +24 -0
  31. databricks4py-0.2.0/src/databricks4py/observability/batch_context.py +134 -0
  32. databricks4py-0.2.0/src/databricks4py/observability/health.py +223 -0
  33. databricks4py-0.2.0/src/databricks4py/observability/query_listener.py +236 -0
  34. databricks4py-0.2.0/src/databricks4py/py.typed +0 -0
  35. databricks4py-0.2.0/src/databricks4py/quality/__init__.py +26 -0
  36. databricks4py-0.2.0/src/databricks4py/quality/base.py +54 -0
  37. databricks4py-0.2.0/src/databricks4py/quality/expectations.py +184 -0
  38. databricks4py-0.2.0/src/databricks4py/quality/gate.py +90 -0
  39. databricks4py-0.2.0/src/databricks4py/retry.py +102 -0
  40. databricks4py-0.2.0/src/databricks4py/secrets.py +69 -0
  41. databricks4py-0.2.0/src/databricks4py/spark_session.py +68 -0
  42. databricks4py-0.2.0/src/databricks4py/testing/__init__.py +35 -0
  43. databricks4py-0.2.0/src/databricks4py/testing/assertions.py +111 -0
  44. databricks4py-0.2.0/src/databricks4py/testing/builders.py +127 -0
  45. databricks4py-0.2.0/src/databricks4py/testing/fixtures.py +134 -0
  46. databricks4py-0.2.0/src/databricks4py/testing/mocks.py +106 -0
  47. databricks4py-0.2.0/src/databricks4py/testing/temp_table.py +73 -0
  48. databricks4py-0.2.0/src/databricks4py/workflow.py +219 -0
  49. databricks4py-0.2.0/src/databricks4py.egg-info/PKG-INFO +589 -0
  50. databricks4py-0.2.0/src/databricks4py.egg-info/SOURCES.txt +87 -0
  51. databricks4py-0.2.0/src/databricks4py.egg-info/dependency_links.txt +1 -0
  52. databricks4py-0.2.0/src/databricks4py.egg-info/requires.txt +8 -0
  53. databricks4py-0.2.0/src/databricks4py.egg-info/top_level.txt +1 -0
  54. databricks4py-0.2.0/tests/test_assertions.py +68 -0
  55. databricks4py-0.2.0/tests/test_batch_context.py +129 -0
  56. databricks4py-0.2.0/tests/test_builders.py +59 -0
  57. databricks4py-0.2.0/tests/test_catalog.py +45 -0
  58. databricks4py-0.2.0/tests/test_checkpoint.py +100 -0
  59. databricks4py-0.2.0/tests/test_config.py +137 -0
  60. databricks4py-0.2.0/tests/test_dbfs.py +99 -0
  61. databricks4py-0.2.0/tests/test_delta.py +444 -0
  62. databricks4py-0.2.0/tests/test_delta_advanced_integration.py +207 -0
  63. databricks4py-0.2.0/tests/test_delta_metrics_sink_integration.py +103 -0
  64. databricks4py-0.2.0/tests/test_filters.py +119 -0
  65. databricks4py-0.2.0/tests/test_health.py +168 -0
  66. databricks4py-0.2.0/tests/test_logging.py +39 -0
  67. databricks4py-0.2.0/tests/test_merge.py +154 -0
  68. databricks4py-0.2.0/tests/test_metrics.py +136 -0
  69. databricks4py-0.2.0/tests/test_migration_runner.py +241 -0
  70. databricks4py-0.2.0/tests/test_migration_runner_integration.py +196 -0
  71. databricks4py-0.2.0/tests/test_migrations.py +218 -0
  72. databricks4py-0.2.0/tests/test_mocks.py +53 -0
  73. databricks4py-0.2.0/tests/test_observability_integration.py +185 -0
  74. databricks4py-0.2.0/tests/test_quality.py +119 -0
  75. databricks4py-0.2.0/tests/test_quality_integration.py +207 -0
  76. databricks4py-0.2.0/tests/test_query_listener.py +147 -0
  77. databricks4py-0.2.0/tests/test_retry.py +128 -0
  78. databricks4py-0.2.0/tests/test_schema_diff.py +178 -0
  79. databricks4py-0.2.0/tests/test_secrets.py +44 -0
  80. databricks4py-0.2.0/tests/test_spark_session.py +45 -0
  81. databricks4py-0.2.0/tests/test_streaming.py +337 -0
  82. databricks4py-0.2.0/tests/test_streaming_dlq.py +177 -0
  83. databricks4py-0.2.0/tests/test_streaming_integration.py +308 -0
  84. databricks4py-0.2.0/tests/test_table_alter.py +111 -0
  85. databricks4py-0.2.0/tests/test_table_alter_integration.py +137 -0
  86. databricks4py-0.2.0/tests/test_temp_table.py +50 -0
  87. databricks4py-0.2.0/tests/test_workflow.py +71 -0
  88. databricks4py-0.2.0/tests/test_workflow_e2e_integration.py +214 -0
  89. databricks4py-0.2.0/tests/test_workflow_v2.py +132 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 kirankbs
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,589 @@
1
+ Metadata-Version: 2.4
2
+ Name: databricks4py
3
+ Version: 0.2.0
4
+ Summary: Spark, Delta Lake, and Databricks utility library for Python
5
+ Author: kirankbs
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/kirankbs/databricks4py
8
+ Project-URL: Repository, https://github.com/kirankbs/databricks4py
9
+ Project-URL: Changelog, https://github.com/kirankbs/databricks4py/blob/main/CHANGELOG.md
10
+ Project-URL: Issues, https://github.com/kirankbs/databricks4py/issues
11
+ Keywords: spark,pyspark,delta-lake,databricks,etl,data-engineering
12
+ Classifier: Development Status :: 3 - Alpha
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Topic :: Software Development :: Libraries
20
+ Classifier: Topic :: Database
21
+ Classifier: Operating System :: OS Independent
22
+ Requires-Python: >=3.10
23
+ Description-Content-Type: text/markdown
24
+ License-File: LICENSE
25
+ Requires-Dist: pyspark>=3.4
26
+ Requires-Dist: delta-spark>=2.4
27
+ Provides-Extra: dev
28
+ Requires-Dist: pytest>=8.0; extra == "dev"
29
+ Requires-Dist: pytest-timeout>=2.0; extra == "dev"
30
+ Requires-Dist: ruff>=0.4; extra == "dev"
31
+ Requires-Dist: mypy>=1.10; extra == "dev"
32
+ Dynamic: license-file
33
+
34
+ # databricks4py
35
+
36
+ [![PyPI](https://img.shields.io/pypi/v/databricks4py)](https://pypi.org/project/databricks4py/)
37
+ [![Python](https://img.shields.io/pypi/pyversions/databricks4py)](https://pypi.org/project/databricks4py/)
38
+ [![CI](https://github.com/kirankbs/databricks4py/actions/workflows/ci.yml/badge.svg)](https://github.com/kirankbs/databricks4py/actions)
39
+ [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](LICENSE)
40
+
41
+ Spark, Delta Lake, and Databricks utility library for Python.
42
+
43
+ The patterns you keep re-implementing across PySpark jobs — Delta table management, streaming foreachBatch wiring, schema migrations, data quality checks — packaged as a library. Runs on Databricks and locally with open-source Spark + Delta Lake.
44
+
45
+ ## Install
46
+
47
+ ```bash
48
+ pip install databricks4py
49
+ ```
50
+
51
+ **Requirements:** Python >= 3.10, Java 17+ (for PySpark). `pyspark.dbutils` is only available on the Databricks Runtime.
52
+
53
+ ## Core modules
54
+
55
+ ### Delta table management
56
+
57
+ Wraps the delta-spark API into a single object that handles table creation, reads, writes, metadata, and schema validation.
58
+
59
+ ```python
60
+ from databricks4py.io import DeltaTable, GeneratedColumn
61
+
62
+ # Create a table with generated columns and partitioning
63
+ table = DeltaTable(
64
+ table_name="catalog.schema.events",
65
+ schema={"id": "int", "name": "string", "event_ts": "timestamp", "event_date": "date"},
66
+ location="/data/events",
67
+ partition_by="event_date",
68
+ generated_columns=[GeneratedColumn("event_date", "DATE", "CAST(event_ts AS DATE)")],
69
+ )
70
+
71
+ df = table.dataframe() # Read
72
+ table.write(df, mode="append") # Write (schema_check=True by default)
73
+ table.detail() # Metadata DataFrame
74
+ table.partition_columns() # ["event_date"]
75
+ table.size_in_bytes() # Physical size in bytes
76
+ ```
77
+
78
+ **Convenience wrappers:**
79
+
80
+ ```python
81
+ from databricks4py.io import DeltaTableAppender, DeltaTableOverwriter
82
+
83
+ appender = DeltaTableAppender("target", schema=my_schema)
84
+ appender.append(df)
85
+
86
+ overwriter = DeltaTableOverwriter("target", schema=my_schema)
87
+ overwriter.overwrite(df)
88
+ ```
89
+
90
+ **Table maintenance:**
91
+
92
+ ```python
93
+ from databricks4py.io.delta import optimize_table, vacuum_table
94
+
95
+ optimize_table("catalog.schema.events", z_order_by=["id"])
96
+ vacuum_table("catalog.schema.events", retention_hours=168)
97
+ ```
98
+
99
+ ### Merge operations
100
+
101
+ Fluent builder for Delta `MERGE INTO`. Chain conditions, execute, get back row counts.
102
+
103
+ ```python
104
+ from databricks4py.io import MergeBuilder
105
+
106
+ result = (
107
+ MergeBuilder("catalog.schema.target", source_df, spark)
108
+ .on("id", "date") # Join keys
109
+ .when_matched_update(["name", "value"]) # Update specific columns
110
+ .when_not_matched_insert() # Insert new rows
111
+ .when_not_matched_by_source_delete() # Remove stale rows
112
+ .execute()
113
+ )
114
+
115
+ print(f"Inserted: {result.rows_inserted}, Updated: {result.rows_updated}")
116
+ ```
117
+
118
+ Custom join conditions:
119
+
120
+ ```python
121
+ result = (
122
+ MergeBuilder("target", source_df, spark)
123
+ .on_condition("target.id = source.id AND target.region = source.region")
124
+ .when_matched_update()
125
+ .when_not_matched_insert()
126
+ .execute()
127
+ )
128
+ ```
129
+
130
+ SCD Type 2 is built into DeltaTable:
131
+
132
+ ```python
133
+ table.scd_type2(source_df, keys=["id"])
134
+ ```
135
+
136
+ ### Streaming
137
+
138
+ Subclass `StreamingTableReader`, implement `process_batch`, and the base class handles the `foreachBatch` wiring, empty-batch skipping, row filtering, metrics, and DLQ routing.
139
+
140
+ ```python
141
+ from databricks4py.io import StreamingTableReader, StreamingTriggerOptions
142
+
143
+ class EventProcessor(StreamingTableReader):
144
+ def process_batch(self, df, batch_id):
145
+ clean = df.where("status IS NOT NULL")
146
+ clean.write.format("delta").mode("append").saveAsTable("output")
147
+
148
+ reader = EventProcessor(
149
+ source_table="catalog.schema.raw_events",
150
+ trigger=StreamingTriggerOptions.PROCESSING_TIME_1M,
151
+ checkpoint_location="/checkpoints/events",
152
+ dead_letter_table="catalog.schema.dlq", # Failed batches go here
153
+ )
154
+ query = reader.start()
155
+ reader.stop(timeout_seconds=30) # Graceful shutdown
156
+ ```
157
+
158
+ If `dead_letter_table` is set and `process_batch` raises, the batch DataFrame is written to the DLQ table with `_dlq_error_message`, `_dlq_error_timestamp`, and `_dlq_batch_id` columns appended. The stream keeps running.
159
+
160
+ **Checkpoint management:**
161
+
162
+ ```python
163
+ from databricks4py.io import CheckpointManager
164
+
165
+ mgr = CheckpointManager(base_path="/checkpoints")
166
+ reader = EventProcessor(
167
+ source_table="input",
168
+ checkpoint_manager=mgr, # Auto-generates checkpoint path
169
+ )
170
+ ```
171
+
172
+ ### Migration framework
173
+
174
+ Ordered migration runner, same idea as Flyway or Alembic but for Delta tables. Steps are Python callables, history is tracked in a Delta table, and each version runs exactly once.
175
+
176
+ ```python
177
+ from databricks4py.migrations import MigrationRunner, MigrationStep
178
+
179
+ def add_audit_columns(spark):
180
+ spark.sql("ALTER TABLE catalog.schema.events ADD COLUMNS (created_at TIMESTAMP)")
181
+
182
+ def backfill_defaults(spark):
183
+ spark.sql("UPDATE catalog.schema.events SET created_at = current_timestamp()")
184
+
185
+ runner = MigrationRunner(history_table="catalog.schema._migrations")
186
+ runner.register(
187
+ MigrationStep(version="V001", description="Add audit columns", up=add_audit_columns),
188
+ MigrationStep(version="V002", description="Backfill defaults", up=backfill_defaults),
189
+ )
190
+
191
+ # Check what needs to run
192
+ pending = runner.pending() # [MigrationStep(V002, ...)]
193
+
194
+ # Execute (idempotent — already-applied steps are skipped)
195
+ result = runner.run()
196
+ print(result.applied) # ["V002"]
197
+ print(result.skipped) # ["V001"]
198
+
199
+ # Dry run (logs what would run, no side effects)
200
+ result = runner.run(dry_run=True)
201
+ ```
202
+
203
+ **Pre/post validation per step:**
204
+
205
+ ```python
206
+ MigrationStep(
207
+ version="V003",
208
+ description="Rename column",
209
+ up=lambda spark: spark.sql("ALTER TABLE t RENAME COLUMN old TO new"),
210
+ pre_validate=lambda spark: spark.catalog.tableExists("t"),
211
+ post_validate=lambda spark: "new" in spark.read.table("t").columns,
212
+ )
213
+ ```
214
+
215
+ ### Table alter (fluent DDL)
216
+
217
+ Queue up `ALTER TABLE` operations and apply them in one go.
218
+
219
+ ```python
220
+ from databricks4py.migrations import TableAlter
221
+
222
+ TableAlter("catalog.schema.events") \
223
+ .add_column("region", "STRING", comment="ISO-3166 region code") \
224
+ .set_property("delta.enableChangeDataFeed", "true") \
225
+ .apply()
226
+ ```
227
+
228
+ Rename and drop require Delta column mapping:
229
+
230
+ ```python
231
+ TableAlter("catalog.schema.events") \
232
+ .set_property("delta.columnMapping.mode", "name") \
233
+ .set_property("delta.minReaderVersion", "2") \
234
+ .set_property("delta.minWriterVersion", "5") \
235
+ .apply()
236
+
237
+ TableAlter("catalog.schema.events") \
238
+ .rename_column("old_name", "new_name") \
239
+ .drop_column("deprecated_col") \
240
+ .apply()
241
+ ```
242
+
243
+ ### Schema diff
244
+
245
+ Compare two schemas (or a live table vs. an incoming DataFrame) and get back a list of column-level changes with severity.
246
+
247
+ ```python
248
+ from databricks4py.migrations import SchemaDiff
249
+
250
+ diff = SchemaDiff.from_tables("catalog.schema.events", new_df)
251
+ for change in diff.changes():
252
+ print(f"{change.column}: {change.change_type} [{change.severity}]")
253
+
254
+ if diff.has_breaking_changes():
255
+ raise RuntimeError(diff.summary())
256
+ ```
257
+
258
+ ### Table validation
259
+
260
+ Check that a Delta table matches expected columns, partitions, and location before or after a migration.
261
+
262
+ ```python
263
+ from databricks4py.migrations import TableValidator
264
+
265
+ validator = TableValidator(
266
+ table_name="catalog.schema.events",
267
+ expected_columns=["id", "name", "event_date"],
268
+ expected_partition_columns=["event_date"],
269
+ expected_location_contains="/data/events",
270
+ )
271
+ result = validator.validate()
272
+ if not result.is_valid:
273
+ print(result.errors) # ["Missing required columns: ['event_date']"]
274
+ print(result.warnings) # ["Unexpected extra columns: ['debug_flag']"]
275
+ result.raise_if_invalid("catalog.schema.events") # Raises MigrationError
276
+ ```
277
+
278
+ ### Data quality
279
+
280
+ Row-level expectations you can run individually or bundle into a gate that raises, warns, or quarantines bad rows.
281
+
282
+ ```python
283
+ from databricks4py.quality.expectations import NotNull, InRange, Unique, RowCount, MatchesRegex
284
+ from databricks4py.quality.gate import QualityGate
285
+
286
+ # Individual expectations
287
+ result = NotNull("id", "name").validate(df)
288
+ result = InRange("score", min_val=0, max_val=100).validate(df)
289
+ result = Unique("id").validate(df)
290
+ result = RowCount(min_count=1, max_count=1_000_000).validate(df)
291
+ result = MatchesRegex("email", r"^.+@.+\..+$").validate(df)
292
+ # result.passed, result.failing_rows, result.total_rows
293
+
294
+ # Quality gate — enforce multiple expectations
295
+ gate = QualityGate(
296
+ NotNull("id"),
297
+ InRange("score", min_val=0, max_val=100),
298
+ on_fail="raise", # or "warn" or "quarantine"
299
+ )
300
+ clean_df = gate.enforce(df) # Raises QualityError if checks fail
301
+ ```
302
+
303
+ **Quarantine mode** splits bad rows and routes them to a handler:
304
+
305
+ ```python
306
+ gate = QualityGate(
307
+ NotNull("id"),
308
+ on_fail="quarantine",
309
+ quarantine_handler=lambda bad_df: bad_df.write.saveAsTable("quarantine_table"),
310
+ )
311
+ clean_df = gate.enforce(df) # Returns only clean rows
312
+ ```
313
+
314
+ ### Filter pipeline
315
+
316
+ Chain DataFrame transformations. Each filter is a callable that takes and returns a DataFrame.
317
+
318
+ ```python
319
+ from databricks4py.filters import FilterPipeline, DropDuplicates, WhereFilter, ColumnFilter
320
+
321
+ pipeline = FilterPipeline([
322
+ DropDuplicates(subset=["id"]),
323
+ WhereFilter("status = 'active'"),
324
+ ColumnFilter(columns=["id", "name", "status"]),
325
+ ])
326
+ clean_df = pipeline(raw_df)
327
+ ```
328
+
329
+ ### Workflow
330
+
331
+ Base class for Databricks jobs. Handles SparkSession init, config application, lifecycle metrics (`job_start`/`job_complete`/`job_failed`), quality gates, and optional retry.
332
+
333
+ ```python
334
+ from databricks4py import Workflow
335
+ from databricks4py.quality.expectations import NotNull
336
+ from databricks4py.quality.gate import QualityGate
337
+
338
+ class MyETL(Workflow):
339
+ def run(self):
340
+ source = self.spark.read.table("raw_events")
341
+
342
+ # Quality check with metrics emission
343
+ gate = QualityGate(NotNull("id", "event_ts"), on_fail="raise")
344
+ clean = self.quality_check(source, gate, table_name="raw_events")
345
+
346
+ clean.write.format("delta").mode("append").saveAsTable("clean_events")
347
+ self.emit_metric("write_complete", row_count=clean.count())
348
+
349
+ # With config and metrics
350
+ from databricks4py.config import JobConfig
351
+ from databricks4py.metrics import DeltaMetricsSink
352
+
353
+ config = JobConfig(tables={"source": "raw_events"}, spark_configs={"spark.sql.shuffle.partitions": "8"})
354
+ sink = DeltaMetricsSink("catalog.schema.job_metrics")
355
+ MyETL(config=config, metrics=sink).execute()
356
+ ```
357
+
358
+ ### Configuration
359
+
360
+ The environment is auto-detected from Databricks widgets or the `ENV`/`ENVIRONMENT` environment variables, and defaults to DEV.
361
+
362
+ ```python
363
+ from databricks4py.config import JobConfig, Environment
364
+
365
+ config = JobConfig(
366
+ tables={"events": "catalog.bronze.events", "users": "catalog.silver.users"},
367
+ secret_scope="my-scope",
368
+ spark_configs={"spark.sql.shuffle.partitions": "8"},
369
+ )
370
+ config.env # Environment.DEV (auto-detected)
371
+ config.table("events") # "catalog.bronze.events"
372
+ ```
373
+
374
+ **Unity Catalog** — environment-aware catalog resolution:
375
+
376
+ ```python
377
+ from databricks4py.config import UnityConfig
378
+
379
+ config = UnityConfig(catalog_prefix="myapp", schemas=["bronze", "silver"])
380
+ config.catalog # "myapp_dev" (or myapp_prod in production)
381
+ config.table("bronze.events") # "myapp_dev.bronze.events"
382
+ ```
383
+
384
+ ### Metrics
385
+
386
+ Buffer events and write them to a Delta table, log them as JSON, or both.
387
+
388
+ ```python
389
+ from databricks4py.metrics import DeltaMetricsSink, LoggingMetricsSink, CompositeMetricsSink, MetricEvent
390
+
391
+ # Write metrics to a Delta table
392
+ delta_sink = DeltaMetricsSink("catalog.schema.metrics", buffer_size=50)
393
+
394
+ # Log metrics as JSON
395
+ log_sink = LoggingMetricsSink()
396
+
397
+ # Fan out to multiple sinks
398
+ sink = CompositeMetricsSink(delta_sink, log_sink)
399
+ sink.emit(MetricEvent(job_name="etl", event_type="batch_complete", timestamp=now, row_count=1000))
400
+ sink.flush()
401
+ ```
402
+
403
+ ### Observability
404
+
405
+ **Structured batch logging** — JSON log records per batch with correlation IDs, queryable in any log aggregation system.
406
+
407
+ ```python
408
+ from databricks4py.observability import BatchContext, BatchLogger
409
+
410
+ logger = BatchLogger(extra_fields={"pipeline": "events", "env": "prod"})
411
+
412
+ # Inside your StreamingTableReader.process_batch:
413
+ ctx = BatchContext.create(batch_id=batch_id, source_table="catalog.schema.events")
414
+ logger.batch_start(ctx)
415
+ # ... process ...
416
+ logger.batch_complete(ctx, row_count=df.count(), duration_ms=ctx.elapsed_ms())
417
+ # On error:
418
+ logger.batch_error(ctx, error=str(exc))
419
+ ```
420
+
421
+ Each log line is single-line JSON: `{"event": "batch_complete", "batch_id": 42, "correlation_id": "a1b2c3d4e5f6", "row_count": 1000, ...}`
422
+
423
+ **Query progress listener** — wraps PySpark 3.4+ `StreamingQueryListener` to collect progress snapshots and route them to a MetricsSink.
424
+
425
+ ```python
426
+ from databricks4py.observability import QueryProgressObserver
427
+
428
+ observer = QueryProgressObserver(metrics_sink=my_sink, query_name_filter="events_processor")
429
+ observer.attach()
430
+
431
+ # After the stream runs:
432
+ latest = observer.latest_progress()
433
+ print(f"Batch {latest.batch_id}: {latest.processed_rows_per_second} rows/sec")
434
+
435
+ history = observer.history(limit=10)
436
+ observer.detach()
437
+ ```
438
+
439
+ **Health checks** — poll a streaming query for stuck detection, slow batches, and low throughput.
440
+
441
+ ```python
442
+ from databricks4py.observability import StreamingHealthCheck, HealthStatus
443
+
444
+ check = StreamingHealthCheck(
445
+ query,
446
+ max_batch_duration_ms=60_000, # DEGRADED if batch > 60s
447
+ min_processing_rate=100.0, # DEGRADED if < 100 rows/sec
448
+ stale_timeout_seconds=300, # UNHEALTHY if no progress for 5min
449
+ )
450
+ result = check.evaluate()
451
+ if result.status != HealthStatus.HEALTHY:
452
+ print(result.summary())
453
+ ```
454
+
455
+ ### Retry
456
+
457
+ ```python
458
+ from databricks4py.retry import retry, RetryConfig
459
+
460
+ @retry(RetryConfig(max_attempts=5, base_delay_seconds=2.0, backoff_factor=3.0))
461
+ def fetch_from_api():
462
+ return requests.get(url).json()
463
+ ```
464
+
465
+ ### Testing utilities
466
+
467
+ Session-scoped SparkSession (one JVM per test run), function-scoped cleanup, and helpers for building test data.
468
+
469
+ ```python
470
+ # conftest.py — register fixtures
471
+ from databricks4py.testing.fixtures import * # noqa: F401,F403
472
+ ```
473
+
474
+ **DataFrameBuilder** — fluent test data construction:
475
+
476
+ ```python
477
+ def test_my_transform(spark_session_function):
478
+ df = (
479
+ DataFrameBuilder(spark_session_function)
480
+ .with_columns({"id": "int", "name": "string", "score": "int"})
481
+ .with_rows((1, "Alice", 95), (2, "Bob", 80))
482
+ .build()
483
+ )
484
+ assert df.count() == 2
485
+ ```
486
+
487
+ **TempDeltaTable** — ephemeral Delta tables for test isolation:
488
+
489
+ ```python
490
+ def test_merge(spark_session_function, tmp_path):
491
+ with TempDeltaTable(spark_session_function, schema={"id": "int"}, data=[(1,), (2,)]) as table:
492
+ assert table.dataframe().count() == 2
493
+ # Table is auto-dropped after the context exits
494
+ ```
495
+
496
+ **Assertions:**
497
+
498
+ ```python
499
+ from databricks4py.testing.assertions import assert_frame_equal, assert_schema_equal
500
+
501
+ assert_frame_equal(actual_df, expected_df, check_order=False)
502
+ assert_schema_equal(actual_df.schema, expected_schema, check_nullable=False)
503
+ ```
504
+
505
+ **Mock Databricks utilities:**
506
+
507
+ ```python
508
+ from databricks4py.testing.mocks import MockDBUtils, MockDBUtilsModule
509
+
510
+ mock = MockDBUtils()
511
+ mock.secrets.put("scope", "key", "secret-value")
512
+ assert mock.secrets.get("scope", "key") == "secret-value"
513
+ ```
514
+
515
+ ## Project structure
516
+
517
+ ```
518
+ src/databricks4py/
519
+ ├── __init__.py # Top-level exports
520
+ ├── spark_session.py # get_active(), active_fallback(), get_or_create_local_session()
521
+ ├── catalog.py # CatalogSchema for schema-qualified table access
522
+ ├── logging.py # configure_logging(), get_logger()
523
+ ├── secrets.py # SecretFetcher with injectable dbutils
524
+ ├── retry.py # retry() decorator with exponential backoff
525
+ ├── workflow.py # Workflow ABC for Databricks job entry points
526
+ ├── config/
527
+ │ ├── base.py # JobConfig, Environment
528
+ │ └── unity.py # UnityConfig (Unity Catalog-aware)
529
+ ├── io/
530
+ │ ├── delta.py # DeltaTable, Appender, Overwriter, optimize, vacuum
531
+ │ ├── merge.py # MergeBuilder, MergeResult
532
+ │ ├── streaming.py # StreamingTableReader, StreamingTriggerOptions
533
+ │ ├── checkpoint.py # CheckpointManager, CheckpointInfo
534
+ │ └── dbfs.py # DBFS file operations (Databricks only)
535
+ ├── filters/
536
+ │ └── base.py # Filter, FilterPipeline, DropDuplicates, WhereFilter, ColumnFilter
537
+ ├── migrations/
538
+ │ ├── runner.py # MigrationRunner, MigrationStep, MigrationRunResult
539
+ │ ├── alter.py # TableAlter (fluent DDL builder)
540
+ │ ├── validators.py # TableValidator, ValidationResult, MigrationError
541
+ │ └── schema_diff.py # SchemaDiff, ColumnChange, SchemaEvolutionError
542
+ ├── quality/
543
+ │ ├── base.py # Expectation, ExpectationResult, QualityReport
544
+ │ ├── expectations.py # NotNull, InRange, Unique, RowCount, MatchesRegex, ColumnExists
545
+ │ └── gate.py # QualityGate, QualityError
546
+ ├── metrics/
547
+ │ ├── base.py # MetricEvent, MetricsSink, CompositeMetricsSink
548
+ │ ├── delta_sink.py # DeltaMetricsSink (buffered Delta table writer)
549
+ │ └── logging_sink.py # LoggingMetricsSink (JSON to logger)
550
+ ├── observability/
551
+ │ ├── batch_context.py # BatchContext, BatchLogger (structured per-batch JSON logging)
552
+ │ ├── query_listener.py # QueryProgressObserver (StreamingQueryListener wrapper)
553
+ │ └── health.py # StreamingHealthCheck, HealthStatus, HealthResult
554
+ └── testing/
555
+ ├── fixtures.py # spark_session, spark_session_function, df_builder, temp_delta
556
+ ├── builders.py # DataFrameBuilder (fluent test data)
557
+ ├── temp_table.py # TempDeltaTable (auto-cleanup context manager)
558
+ ├── assertions.py # assert_frame_equal, assert_schema_equal
559
+ └── mocks.py # MockDBUtils, MockDBUtilsModule
560
+ ```
561
+
562
+ ## Development setup
563
+
564
+ ```bash
565
+ git clone https://github.com/kirankbs/databricks4py.git
566
+ cd databricks4py
567
+ pip install -e ".[dev]"
568
+
569
+ # Lint
570
+ ruff check src/ tests/ docs/
571
+ ruff format --check src/ tests/ docs/
572
+
573
+ # Tests
574
+ pytest -m no_pyspark --timeout=30 # Fast, no Spark/Java
575
+ pytest -m "integration or unit" --timeout=120 # Integration (requires Java 17+)
576
+ pytest -v --timeout=120 # Everything
577
+ ```
578
+
579
+ ## Compatibility matrix
580
+
581
+ | PySpark | delta-spark | Python |
582
+ |---------|-------------|--------|
583
+ | 3.5.x | 3.2.x | >= 3.10 |
584
+ | 3.4.x | 2.4.x | >= 3.10 |
585
+ | 4.x | 4.x | >= 3.10 |
586
+
587
+ ## License
588
+
589
+ MIT