phlo-delta 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- phlo_delta-0.1.0/PKG-INFO +18 -0
- phlo_delta-0.1.0/README.md +62 -0
- phlo_delta-0.1.0/pyproject.toml +41 -0
- phlo_delta-0.1.0/setup.cfg +4 -0
- phlo_delta-0.1.0/src/phlo_delta/__init__.py +37 -0
- phlo_delta-0.1.0/src/phlo_delta/plugin.py +69 -0
- phlo_delta-0.1.0/src/phlo_delta/resource.py +429 -0
- phlo_delta-0.1.0/src/phlo_delta/schema_conversion.py +209 -0
- phlo_delta-0.1.0/src/phlo_delta/schema_migrator.py +325 -0
- phlo_delta-0.1.0/src/phlo_delta/settings.py +75 -0
- phlo_delta-0.1.0/src/phlo_delta/tables.py +569 -0
- phlo_delta-0.1.0/src/phlo_delta.egg-info/PKG-INFO +18 -0
- phlo_delta-0.1.0/src/phlo_delta.egg-info/SOURCES.txt +19 -0
- phlo_delta-0.1.0/src/phlo_delta.egg-info/dependency_links.txt +1 -0
- phlo_delta-0.1.0/src/phlo_delta.egg-info/entry_points.txt +2 -0
- phlo_delta-0.1.0/src/phlo_delta.egg-info/requires.txt +10 -0
- phlo_delta-0.1.0/src/phlo_delta.egg-info/top_level.txt +1 -0
- phlo_delta-0.1.0/tests/test_delta_plugin.py +45 -0
- phlo_delta-0.1.0/tests/test_delta_resource.py +77 -0
- phlo_delta-0.1.0/tests/test_delta_settings.py +18 -0
- phlo_delta-0.1.0/tests/test_integration_delta_trino.py +243 -0
|
@@ -0,0 +1,18 @@
Metadata-Version: 2.4
Name: phlo-delta
Version: 0.1.0
Summary: Delta Lake table-store capability plugin for Phlo
Author-email: Phlo Team <team@phlo.dev>
License: MIT
Requires-Python: >=3.11
Description-Content-Type: text/plain
Requires-Dist: phlo>=0.1.0
Requires-Dist: pandera>=0.26.1
Requires-Dist: deltalake>=0.25.0
Provides-Extra: minio
Requires-Dist: phlo-minio>=0.1.0; extra == "minio"
Provides-Extra: dev
Requires-Dist: pytest>=7.0; extra == "dev"
Requires-Dist: ruff>=0.1.0; extra == "dev"

Delta Lake table-store capability plugin for Phlo.
@@ -0,0 +1,62 @@
# phlo-delta

Delta Lake table-store integration for Phlo.

## Description

Provides Delta Lake table-store resources using the `deltalake` (delta-rs) Python library. Enables ACID transactions, schema evolution, and time travel on the data lakehouse.

## Installation

```bash
pip install phlo-delta
# or
phlo plugin install delta
```

## Configuration

| Variable                       | Required | Default                     | Description                   |
| ------------------------------ | -------- | --------------------------- | ----------------------------- |
| `DELTA_WAREHOUSE_PATH`         | Yes      | `s3://lake/warehouse/delta` | S3 path for Delta tables      |
| `DELTA_STAGING_PATH`           | No       | `s3://lake/stage`           | S3 path for staging           |
| `DELTA_DEFAULT_NAMESPACE`      | No       | `raw`                       | Default namespace/schema      |
| `DELTA_S3_ENDPOINT`            | No       | `http://minio:10001`        | S3 endpoint URL for Delta I/O |
| `DELTA_S3_ALLOW_UNSAFE_RENAME` | No       | `true`                      | Allow unsafe rename for S3    |

> **S3 Access**: Configure AWS credentials via `~/.aws/credentials` or `AWS_ACCESS_KEY_ID`/`AWS_SECRET_ACCESS_KEY` env vars. When using MinIO, these are set automatically.

## Auto-Configuration

Works out-of-the-box when MinIO is running:

| Feature                    | How It Works                                                |
| -------------------------- | ----------------------------------------------------------- |
| **Resource Provider**      | `DeltaResource` published as runtime resource `table_store` |
| **Table Store Capability** | Registers `table_store:delta` capability                    |
| **Schema Migration**       | Registers `schema_migrator:delta` capability                |

## Usage

### Resource Usage

```python
from phlo_delta.resource import DeltaResource

delta = DeltaResource()
dt = delta.get_table("bronze.users")
df = dt.to_pandas()
```

### Direct Usage

```python
from phlo_delta.settings import get_settings

opts = get_settings().get_storage_options()
# Use opts with deltalake
```

## Entry Points

- `phlo.plugins.resources` — Provides `DeltaResourceProvider`
@@ -0,0 +1,41 @@
[build-system]
requires = ["setuptools>=45", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "phlo-delta"
version = "0.1.0"
description = "Delta Lake table-store capability plugin for Phlo"
readme = {text = "Delta Lake table-store capability plugin for Phlo.", content-type = "text/plain"}
requires-python = ">=3.11"
authors = [
    {name = "Phlo Team", email = "team@phlo.dev"},
]
license = {text = "MIT"}
dependencies = [
    "phlo>=0.1.0",
    "pandera>=0.26.1",
    "deltalake>=0.25.0",
]

[project.optional-dependencies]
minio = [
    "phlo-minio>=0.1.0",
]
dev = [
    "pytest>=7.0",
    "ruff>=0.1.0",
]

[project.entry-points."phlo.plugins.resources"]
delta = "phlo_delta.plugin:DeltaResourceProvider"

[tool.setuptools]
package-dir = {"" = "src"}

[tool.setuptools.packages.find]
where = ["src"]

[tool.ruff]
line-length = 100
target-version = "py311"
@@ -0,0 +1,37 @@
|
|
|
1
|
+
"""Public API for the phlo-delta plugin.

Re-exports the resource provider, resource wrapper, schema-conversion and
schema-migration utilities, settings accessors, and table operation helpers
so callers can import everything directly from ``phlo_delta``.
"""

from phlo_delta.plugin import DeltaResourceProvider
from phlo_delta.resource import DeltaResource
from phlo_delta.schema_conversion import SchemaConversionError, pandera_to_delta
from phlo_delta.schema_migrator import DeltaSchemaMigrator
from phlo_delta.settings import DeltaSettings, get_settings
from phlo_delta.tables import (
    append_to_table,
    delete_rows_from_table,
    ensure_table,
    expire_snapshots,
    get_table_stats,
    list_table_versions,
    merge_to_table,
    overwrite_table,
    remove_orphan_files,
    rollback_table_to_version,
)

# Explicit public API surface of the package (alphabetical).
__all__ = [
    "DeltaResource",
    "DeltaResourceProvider",
    "DeltaSchemaMigrator",
    "DeltaSettings",
    "SchemaConversionError",
    "append_to_table",
    "delete_rows_from_table",
    "ensure_table",
    "expire_snapshots",
    "get_settings",
    "get_table_stats",
    "list_table_versions",
    "merge_to_table",
    "overwrite_table",
    "pandera_to_delta",
    "remove_orphan_files",
    "rollback_table_to_version",
]
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from phlo.capabilities import CapabilitySupport, ResourceSpec, SchemaMigrationSpec, TableStoreSpec
|
|
4
|
+
from phlo.plugins.base import PluginMetadata, ResourceProviderPlugin
|
|
5
|
+
|
|
6
|
+
from phlo_delta.resource import DeltaResource
|
|
7
|
+
from phlo_delta.schema_migrator import DeltaSchemaMigrator
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class DeltaResourceProvider(ResourceProviderPlugin):
    """Resource provider plugin for Delta Lake access."""

    @property
    def metadata(self) -> PluginMetadata:
        """Get plugin metadata.

        Returns:
            PluginMetadata: Metadata for the Delta Lake resource plugin.
        """
        capability_support = CapabilitySupport(
            supports_snapshots=True,
            supports_schema_evolution=True,
            supports_time_travel=True,
        )
        return PluginMetadata(
            name="delta",
            version="0.1.0",
            description="Delta Lake table-store resource for Phlo",
            support=capability_support,
        )

    def get_resources(self) -> list[ResourceSpec]:
        """Get resource specs exposed by this plugin.

        Returns:
            list[ResourceSpec]: Delta resource specifications.
        """
        spec = ResourceSpec(name="table_store", resource=DeltaResource())
        return [spec]

    def get_table_stores(self) -> list[TableStoreSpec]:
        """Get table-store capability specs exposed by this plugin.

        Returns:
            list[TableStoreSpec]: Delta table-store capability specifications.
        """
        store_support = CapabilitySupport(
            supports_snapshots=True,
            supports_schema_evolution=True,
            supports_time_travel=True,
        )
        store_spec = TableStoreSpec(
            name="delta",
            provider=DeltaResource(),
            support=store_support,
        )
        return [store_spec]

    def get_schema_migrators(self) -> list[SchemaMigrationSpec]:
        """Get schema-migrator capability specs exposed by this plugin.

        Returns:
            list[SchemaMigrationSpec]: Delta schema migrator specifications.
        """
        migrator_spec = SchemaMigrationSpec(
            name="delta",
            provider=DeltaSchemaMigrator(),
            support=CapabilitySupport(supports_schema_evolution=True),
        )
        return [migrator_spec]
|
@@ -0,0 +1,429 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import importlib
|
|
4
|
+
from collections.abc import Sequence
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from typing import Any, cast
|
|
7
|
+
|
|
8
|
+
import pyarrow as pa
|
|
9
|
+
from pandera.pandas import DataFrameModel
|
|
10
|
+
|
|
11
|
+
from phlo.exceptions import PhloConfigError
|
|
12
|
+
from phlo.logging import get_logger
|
|
13
|
+
from phlo_delta.settings import get_settings
|
|
14
|
+
from phlo_delta.tables import (
|
|
15
|
+
append_to_table,
|
|
16
|
+
delete_rows_from_table,
|
|
17
|
+
ensure_table,
|
|
18
|
+
list_table_versions,
|
|
19
|
+
merge_to_table,
|
|
20
|
+
overwrite_table,
|
|
21
|
+
remove_orphan_files,
|
|
22
|
+
rollback_table_to_version,
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
logger = get_logger(__name__)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _load_delta_table() -> type[Any]:
    """Load the optional DeltaTable runtime only when needed."""
    # Deferred import keeps deltalake an optional dependency at import time.
    deltalake_module = importlib.import_module("deltalake")
    return cast(Any, deltalake_module).DeltaTable
|
|
32
|
+
|
|
33
|
+
def _resolve_delta_ref(override_ref: str | None) -> None:
|
|
34
|
+
"""Validate the requested override ref for Delta operations.
|
|
35
|
+
|
|
36
|
+
Delta tables in Phlo are not branch-aware. Accept the default ``main`` ref
|
|
37
|
+
for table-store interface compatibility and reject any branch-like override.
|
|
38
|
+
"""
|
|
39
|
+
if override_ref in (None, "", "main"):
|
|
40
|
+
return
|
|
41
|
+
raise PhloConfigError(
|
|
42
|
+
message=f"Delta table_store does not support override_ref={override_ref!r}",
|
|
43
|
+
suggestions=[
|
|
44
|
+
"Use the default main ref when writing to Delta tables",
|
|
45
|
+
"Use phlo-iceberg if you need Nessie branch-aware table writes",
|
|
46
|
+
],
|
|
47
|
+
)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def _partition_columns_from_spec(
|
|
51
|
+
partition_spec: Sequence[tuple[str, str] | str] | None,
|
|
52
|
+
) -> list[str] | None:
|
|
53
|
+
"""Convert shared partition_spec tuples into Delta partition columns.
|
|
54
|
+
|
|
55
|
+
Delta Lake only supports identity partitioning here, so transforms such as
|
|
56
|
+
``day`` or ``bucket`` must be rejected explicitly.
|
|
57
|
+
"""
|
|
58
|
+
if not partition_spec:
|
|
59
|
+
return None
|
|
60
|
+
|
|
61
|
+
partition_columns: list[str] = []
|
|
62
|
+
for entry in partition_spec:
|
|
63
|
+
if isinstance(entry, str):
|
|
64
|
+
partition_columns.append(entry)
|
|
65
|
+
continue
|
|
66
|
+
|
|
67
|
+
if not isinstance(entry, (tuple, list)) or len(entry) != 2:
|
|
68
|
+
raise PhloConfigError(
|
|
69
|
+
message="Delta partition_spec entries must be column names or (column, transform) pairs",
|
|
70
|
+
suggestions=[
|
|
71
|
+
"Use partition_spec=[('column', 'identity')] for Delta tables",
|
|
72
|
+
"Or omit partition_spec entirely for unpartitioned Delta tables",
|
|
73
|
+
],
|
|
74
|
+
)
|
|
75
|
+
|
|
76
|
+
source_name, transform_name = entry
|
|
77
|
+
if not isinstance(source_name, str) or not isinstance(transform_name, str):
|
|
78
|
+
raise PhloConfigError(
|
|
79
|
+
message="Delta partition_spec entries must contain string column and transform names",
|
|
80
|
+
suggestions=[
|
|
81
|
+
"Use partition_spec=[('column', 'identity')] for Delta tables",
|
|
82
|
+
],
|
|
83
|
+
)
|
|
84
|
+
if transform_name != "identity":
|
|
85
|
+
raise PhloConfigError(
|
|
86
|
+
message=f"Delta table_store only supports identity partition transforms, got {transform_name!r}",
|
|
87
|
+
suggestions=[
|
|
88
|
+
"Use partition_spec=[('column', 'identity')] with Delta",
|
|
89
|
+
"Use phlo-iceberg for transform-based partitioning like day/month/bucket",
|
|
90
|
+
],
|
|
91
|
+
)
|
|
92
|
+
partition_columns.append(source_name)
|
|
93
|
+
|
|
94
|
+
return partition_columns
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
@dataclass
class DeltaResource:
    """Resource wrapper for Delta Lake table storage.

    Implements the Phlo table-store interface on top of delta-rs tables.
    Tables are addressed by fully qualified ``namespace.table`` names and
    resolved to S3 URIs via the plugin settings.
    """

    def table_uri(self, table_name: str) -> str:
        """Construct the full S3 path for a Delta table.

        Args:
            table_name: Fully qualified table name (namespace.table).

        Returns:
            str: S3 URI for the Delta table.
        """
        # Local import: _resolve_table_uri is a private helper of tables.
        from phlo_delta.tables import _resolve_table_uri

        return _resolve_table_uri(table_name)

    def get_table(self, table_name: str) -> Any:
        """Return a DeltaTable handle for the given table.

        Args:
            table_name: Fully qualified table name (namespace.table).

        Returns:
            DeltaTable: Configured Delta table instance.
        """
        from phlo_delta.tables import _resolve_table_uri

        # Fix: load the optional deltalake dependency through the shared
        # lazy loader (as compact() does) instead of a direct import, so the
        # optional-dependency handling is consistent module-wide.
        delta_table_cls = _load_delta_table()
        table_uri = _resolve_table_uri(table_name)
        opts = get_settings().get_storage_options()
        return delta_table_cls(table_uri, storage_options=opts)

    def schema_from_validation_schema(
        self, validation_schema: type[DataFrameModel] | type[Any]
    ) -> pa.Schema:
        """Build a PyArrow schema from a validation model for ingestion flows."""
        from phlo_delta.schema_conversion import pandera_to_delta

        return pandera_to_delta(validation_schema)

    def ensure_table(
        self,
        table_name: str,
        schema: pa.Schema,
        partition_spec: Sequence[tuple[str, str] | str] | None = None,
        override_ref: str | None = None,
    ) -> Any:
        """Ensure a table exists and return its handle.

        Args:
            table_name: Fully qualified table name (namespace.table).
            schema: PyArrow table schema.
            partition_spec: Optional shared partition specification.
            override_ref: Optional branch override for interface compatibility.

        Returns:
            DeltaTable: Existing or newly created Delta table.
        """
        # Delta is not branch-aware; reject any non-main ref up front.
        _resolve_delta_ref(override_ref)
        return ensure_table(
            table_name=table_name,
            schema=schema,
            partition_columns=_partition_columns_from_spec(partition_spec),
        )

    def append_parquet(
        self,
        table_name: str,
        data_path: str,
        override_ref: str | None = None,
    ) -> dict[str, int]:
        """Append parquet data into a Delta table.

        Args:
            table_name: Fully qualified table name (namespace.table).
            data_path: Path to parquet input data.
            override_ref: Optional branch override for interface compatibility.

        Returns:
            dict[str, int]: Write statistics from the append operation.

        Raises:
            Exception: Re-raises any failure from the underlying append after
                logging it with full context.
        """
        _resolve_delta_ref(override_ref)
        logger.info(
            "delta_resource_append_requested",
            table_name=table_name,
            source=data_path,
        )
        try:
            result = append_to_table(table_name=table_name, data_path=data_path)
        except Exception as exc:
            logger.error(
                "delta_resource_append_failed",
                table_name=table_name,
                source=data_path,
                error_type=type(exc).__name__,
                exc_info=True,
            )
            raise
        logger.info(
            "delta_resource_append_completed",
            table_name=table_name,
            source=data_path,
            rows_inserted=result.get("rows_inserted", 0),
            rows_deleted=result.get("rows_deleted", 0),
        )
        return result

    def merge_parquet(
        self,
        table_name: str,
        data_path: str,
        unique_key: str,
        override_ref: str | None = None,
    ) -> dict[str, int]:
        """Merge parquet data into a Delta table using a unique key.

        Args:
            table_name: Fully qualified table name (namespace.table).
            data_path: Path to parquet input data.
            unique_key: Column used to match existing rows.
            override_ref: Optional branch override for interface compatibility.

        Returns:
            dict[str, int]: Write statistics from the merge operation.

        Raises:
            Exception: Re-raises any failure from the underlying merge after
                logging it with full context.
        """
        _resolve_delta_ref(override_ref)
        logger.info(
            "delta_resource_merge_requested",
            table_name=table_name,
            source=data_path,
            unique_key=unique_key,
        )
        try:
            result = merge_to_table(
                table_name=table_name,
                data_path=data_path,
                unique_key=unique_key,
            )
        except Exception as exc:
            logger.error(
                "delta_resource_merge_failed",
                table_name=table_name,
                source=data_path,
                unique_key=unique_key,
                error_type=type(exc).__name__,
                exc_info=True,
            )
            raise
        logger.info(
            "delta_resource_merge_completed",
            table_name=table_name,
            source=data_path,
            unique_key=unique_key,
            rows_inserted=result.get("rows_inserted", 0),
            rows_deleted=result.get("rows_deleted", 0),
        )
        return result

    def overwrite_parquet(
        self,
        *,
        table_name: str,
        data_path: str,
        override_ref: str | None = None,
    ) -> dict[str, int]:
        """Overwrite a Delta table with staged parquet data.

        Args:
            table_name: Fully qualified table name (namespace.table).
            data_path: Path to parquet input data.
            override_ref: Optional branch override for interface compatibility.

        Returns:
            dict[str, int]: Write statistics from the overwrite operation.

        Raises:
            Exception: Re-raises any failure from the underlying overwrite
                after logging it with full context.
        """
        _resolve_delta_ref(override_ref)
        logger.info(
            "delta_resource_overwrite_requested",
            table_name=table_name,
            source=data_path,
        )
        try:
            result = overwrite_table(table_name=table_name, data_path=data_path)
        except Exception as exc:
            logger.error(
                "delta_resource_overwrite_failed",
                table_name=table_name,
                source=data_path,
                error_type=type(exc).__name__,
                exc_info=True,
            )
            raise
        logger.info(
            "delta_resource_overwrite_completed",
            table_name=table_name,
            source=data_path,
            rows_inserted=result.get("rows_inserted", 0),
            rows_deleted=result.get("rows_deleted", 0),
        )
        return result

    def delete_rows(
        self,
        *,
        table_name: str,
        predicate: str,
        override_ref: str | None = None,
    ) -> dict[str, int]:
        """Delete rows matching a predicate expression.

        Args:
            table_name: Fully qualified table name (namespace.table).
            predicate: Filter expression (e.g. ``"status = 'inactive'"``).
            override_ref: Optional branch override for interface compatibility.

        Returns:
            dict[str, int]: Delete statistics (rows_deleted is -1 as Delta
            does not return a count from predicate deletes).

        Raises:
            Exception: Re-raises any failure from the underlying delete after
                logging it with full context.
        """
        _resolve_delta_ref(override_ref)
        logger.info(
            "delta_resource_delete_rows_requested",
            table_name=table_name,
            predicate=predicate,
        )
        try:
            result = delete_rows_from_table(table_name=table_name, predicate=predicate)
        except Exception as exc:
            logger.error(
                "delta_resource_delete_rows_failed",
                table_name=table_name,
                predicate=predicate,
                error_type=type(exc).__name__,
                exc_info=True,
            )
            raise
        logger.info(
            "delta_resource_delete_rows_completed",
            table_name=table_name,
            predicate=predicate,
        )
        return result

    def compact(self, *, table_name: str) -> dict[str, object]:
        """Compact small files in a table using Delta OPTIMIZE.

        Args:
            table_name: Fully qualified table name (namespace.table).

        Returns:
            dict[str, object]: Compaction results.
        """
        from phlo_delta.tables import _resolve_table_uri

        table_uri = _resolve_table_uri(table_name)
        opts = get_settings().get_storage_options()
        delta_table_cls = _load_delta_table()

        logger.info("delta_resource_compact_requested", table_name=table_name)
        dt = delta_table_cls(table_uri, storage_options=opts)
        result = dt.optimize.compact()
        logger.info("delta_resource_compact_completed", table_name=table_name)
        return {"compaction": result}

    def list_snapshots(self, *, table_name: str, limit: int = 10) -> list[dict]:
        """List recent table versions (Delta equivalent of snapshots).

        Args:
            table_name: Fully qualified table name (namespace.table).
            limit: Maximum number of versions to return.

        Returns:
            list[dict]: Version metadata dicts, most recent first.
        """
        return list_table_versions(table_name=table_name, limit=limit)

    def rollback_to_snapshot(self, *, table_name: str, snapshot_id: int | str) -> dict:
        """Roll back a table to a previous version.

        Args:
            table_name: Fully qualified table name (namespace.table).
            snapshot_id: Target version number.

        Returns:
            dict: Contains ``rolled_back_to`` version.

        Raises:
            Exception: Re-raises any failure from the underlying rollback
                after logging it with full context.
        """
        logger.info(
            "delta_resource_rollback_requested",
            table_name=table_name,
            version=snapshot_id,
        )
        try:
            # snapshot_id may arrive as a string from CLI callers; coerce.
            result = rollback_table_to_version(table_name=table_name, version=int(snapshot_id))
        except Exception as exc:
            logger.error(
                "delta_resource_rollback_failed",
                table_name=table_name,
                version=snapshot_id,
                error_type=type(exc).__name__,
                exc_info=True,
            )
            raise
        logger.info(
            "delta_resource_rollback_completed",
            table_name=table_name,
            version=snapshot_id,
        )
        return result

    def vacuum(self, *, table_name: str, retain_hours: int = 168) -> dict:
        """Remove old files via Delta vacuum.

        Args:
            table_name: Fully qualified table name (namespace.table).
            retain_hours: Retention period in hours (default 168 = 7 days).

        Returns:
            dict: Vacuum results.
        """
        logger.info(
            "delta_resource_vacuum_requested",
            table_name=table_name,
            retain_hours=retain_hours,
        )
        result = remove_orphan_files(table_name=table_name, retain_hours=retain_hours)
        logger.info(
            "delta_resource_vacuum_completed",
            table_name=table_name,
            files_removed=result["files_removed"],
        )
        return result