phlo-delta 0.1.0__tar.gz

@@ -0,0 +1,18 @@
+ Metadata-Version: 2.4
+ Name: phlo-delta
+ Version: 0.1.0
+ Summary: Delta Lake table-store capability plugin for Phlo
+ Author-email: Phlo Team <team@phlo.dev>
+ License: MIT
+ Requires-Python: >=3.11
+ Description-Content-Type: text/plain
+ Requires-Dist: phlo>=0.1.0
+ Requires-Dist: pandera>=0.26.1
+ Requires-Dist: deltalake>=0.25.0
+ Provides-Extra: minio
+ Requires-Dist: phlo-minio>=0.1.0; extra == "minio"
+ Provides-Extra: dev
+ Requires-Dist: pytest>=7.0; extra == "dev"
+ Requires-Dist: ruff>=0.1.0; extra == "dev"
+
+ Delta Lake table-store capability plugin for Phlo.
@@ -0,0 +1,62 @@
+ # phlo-delta
+
+ Delta Lake table-store integration for Phlo.
+
+ ## Description
+
+ Provides Delta Lake table-store resources using the `deltalake` (delta-rs) Python library. Enables ACID transactions, schema evolution, and time travel on the data lakehouse.
+
+ ## Installation
+
+ ```bash
+ pip install phlo-delta
+ # or
+ phlo plugin install delta
+ ```
+
+ ## Configuration
+
+ | Variable                       | Required | Default                     | Description                   |
+ | ------------------------------ | -------- | --------------------------- | ----------------------------- |
+ | `DELTA_WAREHOUSE_PATH`         | Yes      | `s3://lake/warehouse/delta` | S3 path for Delta tables      |
+ | `DELTA_STAGING_PATH`           | No       | `s3://lake/stage`           | S3 path for staging           |
+ | `DELTA_DEFAULT_NAMESPACE`      | No       | `raw`                       | Default namespace/schema      |
+ | `DELTA_S3_ENDPOINT`            | No       | `http://minio:10001`        | S3 endpoint URL for Delta I/O |
+ | `DELTA_S3_ALLOW_UNSAFE_RENAME` | No       | `true`                      | Allow unsafe rename for S3    |
+
+ > **S3 Access**: Configure AWS credentials via `~/.aws/credentials` or `AWS_ACCESS_KEY_ID`/`AWS_SECRET_ACCESS_KEY` env vars. When using MinIO, these are set automatically.
+
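These env vars ultimately resolve to a delta-rs `storage_options` mapping. A minimal sketch of a hand-built equivalent, assuming delta-rs's standard S3 option keys; whether `DeltaSettings` emits exactly these keys is an assumption, so prefer `get_settings().get_storage_options()` in real code:

```python
import os

# Hypothetical hand-built equivalent of get_settings().get_storage_options().
# Key names follow delta-rs / object_store S3 conventions.
storage_options = {
    "AWS_ACCESS_KEY_ID": os.environ["AWS_ACCESS_KEY_ID"],
    "AWS_SECRET_ACCESS_KEY": os.environ["AWS_SECRET_ACCESS_KEY"],
    "AWS_ENDPOINT_URL": os.environ.get("DELTA_S3_ENDPOINT", "http://minio:10001"),
    "AWS_ALLOW_HTTP": "true",              # the default MinIO endpoint is plain HTTP
    "AWS_S3_ALLOW_UNSAFE_RENAME": "true",  # mirrors DELTA_S3_ALLOW_UNSAFE_RENAME
}
```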
+ ## Auto-Configuration
+
+ Works out-of-the-box when MinIO is running:
+
+ | Feature                    | How It Works                                                |
+ | -------------------------- | ----------------------------------------------------------- |
+ | **Resource Provider**      | `DeltaResource` published as runtime resource `table_store` |
+ | **Table Store Capability** | Registers `table_store:delta` capability                    |
+ | **Schema Migration**       | Registers `schema_migrator:delta` capability                |
+
+ ## Usage
+
+ ### Resource Usage
+
+ ```python
+ from phlo_delta.resource import DeltaResource
+
+ delta = DeltaResource()
+ dt = delta.get_table("bronze.users")
+ df = dt.to_pandas()
+ ```
+
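The plugin advertises time travel, and the `DeltaTable` handle returned above exposes it directly. A short sketch reading an older version via the `deltalake` API (version `3` is a placeholder):

```python
from deltalake import DeltaTable

from phlo_delta.resource import DeltaResource
from phlo_delta.settings import get_settings

delta = DeltaResource()
opts = get_settings().get_storage_options()

# Load the table as of an earlier version (Delta's form of time travel).
dt_v3 = DeltaTable(delta.table_uri("bronze.users"), version=3, storage_options=opts)
print(dt_v3.version())         # -> 3
print(dt_v3.history(limit=5))  # recent commit metadata, newest first
```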
+ ### Direct Usage
+
+ ```python
+ from phlo_delta.settings import get_settings
+
+ opts = get_settings().get_storage_options()
+ # Use opts with deltalake
+ ```
+
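Completing that comment, a sketch of passing the resolved options to `deltalake` for both reads and writes (`bronze.users` is a placeholder table):

```python
import pandas as pd
from deltalake import DeltaTable, write_deltalake

from phlo_delta.resource import DeltaResource
from phlo_delta.settings import get_settings

opts = get_settings().get_storage_options()
uri = DeltaResource().table_uri("bronze.users")

dt = DeltaTable(uri, storage_options=opts)  # read path
write_deltalake(                            # write path
    uri,
    pd.DataFrame({"id": [1]}),
    mode="append",
    storage_options=opts,
)
```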
+ ## Entry Points
+
+ - `phlo.plugins.resources` — Provides `DeltaResourceProvider`
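A hedged sketch of how a host process can discover the provider through this entry point using standard `importlib.metadata`; the actual loading logic inside `phlo` may differ:

```python
from importlib.metadata import entry_points

# Find every plugin registered under the phlo.plugins.resources group.
for ep in entry_points(group="phlo.plugins.resources"):
    provider_cls = ep.load()  # e.g. phlo_delta.plugin:DeltaResourceProvider
    provider = provider_cls()
    print(ep.name, provider.metadata.name, provider.metadata.version)
```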
@@ -0,0 +1,41 @@
1
+ [build-system]
2
+ requires = ["setuptools>=45", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "phlo-delta"
7
+ version = "0.1.0"
8
+ description = "Delta Lake table-store capability plugin for Phlo"
9
+ readme = {text = "Delta Lake table-store capability plugin for Phlo.", content-type = "text/plain"}
10
+ requires-python = ">=3.11"
11
+ authors = [
12
+ {name = "Phlo Team", email = "team@phlo.dev"},
13
+ ]
14
+ license = {text = "MIT"}
15
+ dependencies = [
16
+ "phlo>=0.1.0",
17
+ "pandera>=0.26.1",
18
+ "deltalake>=0.25.0",
19
+ ]
20
+
21
+ [project.optional-dependencies]
22
+ minio = [
23
+ "phlo-minio>=0.1.0",
24
+ ]
25
+ dev = [
26
+ "pytest>=7.0",
27
+ "ruff>=0.1.0",
28
+ ]
29
+
30
+ [project.entry-points."phlo.plugins.resources"]
31
+ delta = "phlo_delta.plugin:DeltaResourceProvider"
32
+
33
+ [tool.setuptools]
34
+ package-dir = {"" = "src"}
35
+
36
+ [tool.setuptools.packages.find]
37
+ where = ["src"]
38
+
39
+ [tool.ruff]
40
+ line-length = 100
41
+ target-version = "py311"
@@ -0,0 +1,4 @@
+ [egg_info]
+ tag_build =
+ tag_date = 0
+
@@ -0,0 +1,37 @@
+ from phlo_delta.plugin import DeltaResourceProvider
+ from phlo_delta.resource import DeltaResource
+ from phlo_delta.schema_conversion import SchemaConversionError, pandera_to_delta
+ from phlo_delta.schema_migrator import DeltaSchemaMigrator
+ from phlo_delta.settings import DeltaSettings, get_settings
+ from phlo_delta.tables import (
+     append_to_table,
+     delete_rows_from_table,
+     ensure_table,
+     expire_snapshots,
+     get_table_stats,
+     list_table_versions,
+     merge_to_table,
+     overwrite_table,
+     remove_orphan_files,
+     rollback_table_to_version,
+ )
+
+ __all__ = [
+     "DeltaResource",
+     "DeltaResourceProvider",
+     "DeltaSchemaMigrator",
+     "DeltaSettings",
+     "SchemaConversionError",
+     "append_to_table",
+     "delete_rows_from_table",
+     "ensure_table",
+     "expire_snapshots",
+     "get_settings",
+     "get_table_stats",
+     "list_table_versions",
+     "merge_to_table",
+     "overwrite_table",
+     "pandera_to_delta",
+     "remove_orphan_files",
+     "rollback_table_to_version",
+ ]
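The package therefore exposes a flat public API. A sketch of typical imports, using keyword names visible at the call sites in the resource module below (table name and path are placeholders):

```python
import pyarrow as pa

from phlo_delta import ensure_table, list_table_versions, merge_to_table

schema = pa.schema([("id", pa.int64()), ("status", pa.string())])
ensure_table(table_name="bronze.users", schema=schema, partition_columns=None)
merge_to_table(
    table_name="bronze.users",
    data_path="s3://lake/stage/users.parquet",
    unique_key="id",
)
print(list_table_versions(table_name="bronze.users", limit=5))
```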
@@ -0,0 +1,69 @@
+ from __future__ import annotations
+
+ from phlo.capabilities import CapabilitySupport, ResourceSpec, SchemaMigrationSpec, TableStoreSpec
+ from phlo.plugins.base import PluginMetadata, ResourceProviderPlugin
+
+ from phlo_delta.resource import DeltaResource
+ from phlo_delta.schema_migrator import DeltaSchemaMigrator
+
+
+ class DeltaResourceProvider(ResourceProviderPlugin):
+     """Resource provider plugin for Delta Lake access."""
+
+     @property
+     def metadata(self) -> PluginMetadata:
+         """Get plugin metadata.
+
+         Returns:
+             PluginMetadata: Metadata for the Delta Lake resource plugin.
+         """
+         return PluginMetadata(
+             name="delta",
+             version="0.1.0",
+             description="Delta Lake table-store resource for Phlo",
+             support=CapabilitySupport(
+                 supports_snapshots=True,
+                 supports_schema_evolution=True,
+                 supports_time_travel=True,
+             ),
+         )
+
+     def get_resources(self) -> list[ResourceSpec]:
+         """Get resource specs exposed by this plugin.
+
+         Returns:
+             list[ResourceSpec]: Delta resource specifications.
+         """
+         return [ResourceSpec(name="table_store", resource=DeltaResource())]
+
+     def get_table_stores(self) -> list[TableStoreSpec]:
+         """Get table-store capability specs exposed by this plugin.
+
+         Returns:
+             list[TableStoreSpec]: Delta table-store capability specifications.
+         """
+         return [
+             TableStoreSpec(
+                 name="delta",
+                 provider=DeltaResource(),
+                 support=CapabilitySupport(
+                     supports_snapshots=True,
+                     supports_schema_evolution=True,
+                     supports_time_travel=True,
+                 ),
+             )
+         ]
+
+     def get_schema_migrators(self) -> list[SchemaMigrationSpec]:
+         """Get schema-migrator capability specs exposed by this plugin.
+
+         Returns:
+             list[SchemaMigrationSpec]: Delta schema migrator specifications.
+         """
+         return [
+             SchemaMigrationSpec(
+                 name="delta",
+                 provider=DeltaSchemaMigrator(),
+                 support=CapabilitySupport(supports_schema_evolution=True),
+             )
+         ]
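A short sketch exercising the provider directly; every name comes from the module above:

```python
from phlo_delta.plugin import DeltaResourceProvider

provider = DeltaResourceProvider()
print(provider.metadata.name, provider.metadata.version)  # delta 0.1.0

# One runtime resource named "table_store" ...
[resource_spec] = provider.get_resources()
print(resource_spec.name)

# ... plus one table-store capability and one schema migrator, both named "delta".
print(provider.get_table_stores()[0].name)
print(provider.get_schema_migrators()[0].name)
```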
@@ -0,0 +1,429 @@
+ from __future__ import annotations
+
+ import importlib
+ from collections.abc import Sequence
+ from dataclasses import dataclass
+ from typing import Any, cast
+
+ import pyarrow as pa
+ from pandera.pandas import DataFrameModel
+
+ from phlo.exceptions import PhloConfigError
+ from phlo.logging import get_logger
+ from phlo_delta.settings import get_settings
+ from phlo_delta.tables import (
+     append_to_table,
+     delete_rows_from_table,
+     ensure_table,
+     list_table_versions,
+     merge_to_table,
+     overwrite_table,
+     remove_orphan_files,
+     rollback_table_to_version,
+ )
+
+ logger = get_logger(__name__)
+
+
+ def _load_delta_table() -> type[Any]:
+     """Load the optional DeltaTable runtime only when needed."""
+     return cast(Any, importlib.import_module("deltalake")).DeltaTable
+
+
+ def _resolve_delta_ref(override_ref: str | None) -> None:
+     """Validate the requested override ref for Delta operations.
+
+     Delta tables in Phlo are not branch-aware. Accept the default ``main`` ref
+     for table-store interface compatibility and reject any branch-like override.
+     """
+     if override_ref in (None, "", "main"):
+         return
+     raise PhloConfigError(
+         message=f"Delta table_store does not support override_ref={override_ref!r}",
+         suggestions=[
+             "Use the default main ref when writing to Delta tables",
+             "Use phlo-iceberg if you need Nessie branch-aware table writes",
+         ],
+     )
+
+
+ def _partition_columns_from_spec(
+     partition_spec: Sequence[tuple[str, str] | str] | None,
+ ) -> list[str] | None:
+     """Convert shared partition_spec tuples into Delta partition columns.
+
+     Delta Lake only supports identity partitioning here, so transforms such as
+     ``day`` or ``bucket`` must be rejected explicitly.
+     """
+     if not partition_spec:
+         return None
+
+     partition_columns: list[str] = []
+     for entry in partition_spec:
+         if isinstance(entry, str):
+             partition_columns.append(entry)
+             continue
+
+         if not isinstance(entry, (tuple, list)) or len(entry) != 2:
+             raise PhloConfigError(
+                 message="Delta partition_spec entries must be column names or (column, transform) pairs",
+                 suggestions=[
+                     "Use partition_spec=[('column', 'identity')] for Delta tables",
+                     "Or omit partition_spec entirely for unpartitioned Delta tables",
+                 ],
+             )
+
+         source_name, transform_name = entry
+         if not isinstance(source_name, str) or not isinstance(transform_name, str):
+             raise PhloConfigError(
+                 message="Delta partition_spec entries must contain string column and transform names",
+                 suggestions=[
+                     "Use partition_spec=[('column', 'identity')] for Delta tables",
+                 ],
+             )
+         if transform_name != "identity":
+             raise PhloConfigError(
+                 message=f"Delta table_store only supports identity partition transforms, got {transform_name!r}",
+                 suggestions=[
+                     "Use partition_spec=[('column', 'identity')] with Delta",
+                     "Use phlo-iceberg for transform-based partitioning like day/month/bucket",
+                 ],
+             )
+         partition_columns.append(source_name)
+
+     return partition_columns
+
+
+ @dataclass
+ class DeltaResource:
+     """Resource wrapper for Delta Lake table storage."""
+
+     def table_uri(self, table_name: str) -> str:
+         """Construct the full S3 path for a Delta table.
+
+         Args:
+             table_name: Fully qualified table name (namespace.table).
+
+         Returns:
+             str: S3 URI for the Delta table.
+         """
+         from phlo_delta.tables import _resolve_table_uri
+
+         return _resolve_table_uri(table_name)
+
+     def get_table(self, table_name: str) -> Any:
+         """Return a DeltaTable handle for the given table.
+
+         Args:
+             table_name: Fully qualified table name (namespace.table).
+
+         Returns:
+             DeltaTable: Configured Delta table instance.
+         """
+         from deltalake import DeltaTable
+
+         from phlo_delta.tables import _resolve_table_uri
+
+         table_uri = _resolve_table_uri(table_name)
+         opts = get_settings().get_storage_options()
+         return DeltaTable(table_uri, storage_options=opts)
+
+     def schema_from_validation_schema(
+         self, validation_schema: type[DataFrameModel] | type[Any]
+     ) -> pa.Schema:
+         """Build a PyArrow schema from a validation model for ingestion flows."""
+         from phlo_delta.schema_conversion import pandera_to_delta
+
+         return pandera_to_delta(validation_schema)
+
+     def ensure_table(
+         self,
+         table_name: str,
+         schema: pa.Schema,
+         partition_spec: Sequence[tuple[str, str] | str] | None = None,
+         override_ref: str | None = None,
+     ) -> Any:
+         """Ensure a table exists and return its handle.
+
+         Args:
+             table_name: Fully qualified table name (namespace.table).
+             schema: PyArrow table schema.
+             partition_spec: Optional shared partition specification.
+             override_ref: Optional branch override for interface compatibility.
+
+         Returns:
+             DeltaTable: Existing or newly created Delta table.
+         """
+         _resolve_delta_ref(override_ref)
+         return ensure_table(
+             table_name=table_name,
+             schema=schema,
+             partition_columns=_partition_columns_from_spec(partition_spec),
+         )
+
+     def append_parquet(
+         self,
+         table_name: str,
+         data_path: str,
+         override_ref: str | None = None,
+     ) -> dict[str, int]:
+         """Append parquet data into a Delta table.
+
+         Args:
+             table_name: Fully qualified table name (namespace.table).
+             data_path: Path to parquet input data.
+             override_ref: Optional branch override for interface compatibility.
+
+         Returns:
+             dict[str, int]: Write statistics from the append operation.
+         """
+         _resolve_delta_ref(override_ref)
+         logger.info(
+             "delta_resource_append_requested",
+             table_name=table_name,
+             source=data_path,
+         )
+         try:
+             result = append_to_table(table_name=table_name, data_path=data_path)
+         except Exception as exc:
+             logger.error(
+                 "delta_resource_append_failed",
+                 table_name=table_name,
+                 source=data_path,
+                 error_type=type(exc).__name__,
+                 exc_info=True,
+             )
+             raise
+         logger.info(
+             "delta_resource_append_completed",
+             table_name=table_name,
+             source=data_path,
+             rows_inserted=result.get("rows_inserted", 0),
+             rows_deleted=result.get("rows_deleted", 0),
+         )
+         return result
+
+     def merge_parquet(
+         self,
+         table_name: str,
+         data_path: str,
+         unique_key: str,
+         override_ref: str | None = None,
+     ) -> dict[str, int]:
+         """Merge parquet data into a Delta table using a unique key.
+
+         Args:
+             table_name: Fully qualified table name (namespace.table).
+             data_path: Path to parquet input data.
+             unique_key: Column used to match existing rows.
+             override_ref: Optional branch override for interface compatibility.
+
+         Returns:
+             dict[str, int]: Write statistics from the merge operation.
+         """
+         _resolve_delta_ref(override_ref)
+         logger.info(
+             "delta_resource_merge_requested",
+             table_name=table_name,
+             source=data_path,
+             unique_key=unique_key,
+         )
+         try:
+             result = merge_to_table(
+                 table_name=table_name,
+                 data_path=data_path,
+                 unique_key=unique_key,
+             )
+         except Exception as exc:
+             logger.error(
+                 "delta_resource_merge_failed",
+                 table_name=table_name,
+                 source=data_path,
+                 unique_key=unique_key,
+                 error_type=type(exc).__name__,
+                 exc_info=True,
+             )
+             raise
+         logger.info(
+             "delta_resource_merge_completed",
+             table_name=table_name,
+             source=data_path,
+             unique_key=unique_key,
+             rows_inserted=result.get("rows_inserted", 0),
+             rows_deleted=result.get("rows_deleted", 0),
+         )
+         return result
+
+     def overwrite_parquet(
+         self,
+         *,
+         table_name: str,
+         data_path: str,
+         override_ref: str | None = None,
+     ) -> dict[str, int]:
+         """Overwrite a Delta table with staged parquet data.
+
+         Args:
+             table_name: Fully qualified table name (namespace.table).
+             data_path: Path to parquet input data.
+             override_ref: Optional branch override for interface compatibility.
+
+         Returns:
+             dict[str, int]: Write statistics from the overwrite operation.
+         """
+         _resolve_delta_ref(override_ref)
+         logger.info(
+             "delta_resource_overwrite_requested",
+             table_name=table_name,
+             source=data_path,
+         )
+         try:
+             result = overwrite_table(table_name=table_name, data_path=data_path)
+         except Exception as exc:
+             logger.error(
+                 "delta_resource_overwrite_failed",
+                 table_name=table_name,
+                 source=data_path,
+                 error_type=type(exc).__name__,
+                 exc_info=True,
+             )
+             raise
+         logger.info(
+             "delta_resource_overwrite_completed",
+             table_name=table_name,
+             source=data_path,
+             rows_inserted=result.get("rows_inserted", 0),
+             rows_deleted=result.get("rows_deleted", 0),
+         )
+         return result
+
+     def delete_rows(
+         self,
+         *,
+         table_name: str,
+         predicate: str,
+         override_ref: str | None = None,
+     ) -> dict[str, int]:
+         """Delete rows matching a predicate expression.
+
+         Args:
+             table_name: Fully qualified table name (namespace.table).
+             predicate: Filter expression (e.g. ``"status = 'inactive'"``).
+             override_ref: Optional branch override for interface compatibility.
+
+         Returns:
+             dict[str, int]: Delete statistics (rows_deleted is -1 as Delta
+                 does not return a count from predicate deletes).
+         """
+         _resolve_delta_ref(override_ref)
+         logger.info(
+             "delta_resource_delete_rows_requested",
+             table_name=table_name,
+             predicate=predicate,
+         )
+         try:
+             result = delete_rows_from_table(table_name=table_name, predicate=predicate)
+         except Exception as exc:
+             logger.error(
+                 "delta_resource_delete_rows_failed",
+                 table_name=table_name,
+                 predicate=predicate,
+                 error_type=type(exc).__name__,
+                 exc_info=True,
+             )
+             raise
+         logger.info(
+             "delta_resource_delete_rows_completed",
+             table_name=table_name,
+             predicate=predicate,
+         )
+         return result
+
+     def compact(self, *, table_name: str) -> dict[str, object]:
+         """Compact small files in a table using Delta OPTIMIZE.
+
+         Args:
+             table_name: Fully qualified table name (namespace.table).
+
+         Returns:
+             dict[str, object]: Compaction results.
+         """
+         from phlo_delta.tables import _resolve_table_uri
+
+         table_uri = _resolve_table_uri(table_name)
+         opts = get_settings().get_storage_options()
+         delta_table_cls = _load_delta_table()
+
+         logger.info("delta_resource_compact_requested", table_name=table_name)
+         dt = delta_table_cls(table_uri, storage_options=opts)
+         result = dt.optimize.compact()
+         logger.info("delta_resource_compact_completed", table_name=table_name)
+         return {"compaction": result}
+
+     def list_snapshots(self, *, table_name: str, limit: int = 10) -> list[dict]:
+         """List recent table versions (Delta equivalent of snapshots).
+
+         Args:
+             table_name: Fully qualified table name (namespace.table).
+             limit: Maximum number of versions to return.
+
+         Returns:
+             list[dict]: Version metadata dicts, most recent first.
+         """
+         return list_table_versions(table_name=table_name, limit=limit)
+
+     def rollback_to_snapshot(self, *, table_name: str, snapshot_id: int | str) -> dict:
+         """Roll back a table to a previous version.
+
+         Args:
+             table_name: Fully qualified table name (namespace.table).
+             snapshot_id: Target version number.
+
+         Returns:
+             dict: Contains ``rolled_back_to`` version.
+         """
+         logger.info(
+             "delta_resource_rollback_requested",
+             table_name=table_name,
+             version=snapshot_id,
+         )
+         try:
+             result = rollback_table_to_version(table_name=table_name, version=int(snapshot_id))
+         except Exception as exc:
+             logger.error(
+                 "delta_resource_rollback_failed",
+                 table_name=table_name,
+                 version=snapshot_id,
+                 error_type=type(exc).__name__,
+                 exc_info=True,
+             )
+             raise
+         logger.info(
+             "delta_resource_rollback_completed",
+             table_name=table_name,
+             version=snapshot_id,
+         )
+         return result
+
+     def vacuum(self, *, table_name: str, retain_hours: int = 168) -> dict:
+         """Remove old files via Delta vacuum.
+
+         Args:
+             table_name: Fully qualified table name (namespace.table).
+             retain_hours: Retention period in hours (default 168 = 7 days).
+
+         Returns:
+             dict: Vacuum results.
+         """
+         logger.info(
+             "delta_resource_vacuum_requested",
+             table_name=table_name,
+             retain_hours=retain_hours,
+         )
+         result = remove_orphan_files(table_name=table_name, retain_hours=retain_hours)
+         logger.info(
+             "delta_resource_vacuum_completed",
+             table_name=table_name,
+             files_removed=result["files_removed"],
+         )
+         return result
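Putting the resource surface together, a hedged end-to-end sketch (table name and parquet path are placeholders; reachable object storage with credentials configured is assumed):

```python
import pyarrow as pa

from phlo_delta.resource import DeltaResource

delta = DeltaResource()
schema = pa.schema([("event_id", pa.string()), ("day", pa.date32())])

# Identity partitioning is the only transform the Delta table_store accepts.
delta.ensure_table("bronze.events", schema, partition_spec=[("day", "identity")])

# Append staged parquet, then inspect versions and clean up old files.
stats = delta.append_parquet("bronze.events", "s3://lake/stage/events.parquet")
print(stats.get("rows_inserted", 0))

for version in delta.list_snapshots(table_name="bronze.events", limit=5):
    print(version)

delta.vacuum(table_name="bronze.events", retain_hours=168)
```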