datazone-sdk 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datazone_sdk-1.0.0/PKG-INFO +56 -0
- datazone_sdk-1.0.0/README.md +35 -0
- datazone_sdk-1.0.0/datazone/__init__.py +2 -0
- datazone_sdk-1.0.0/datazone/db/__init__.py +1 -0
- datazone_sdk-1.0.0/datazone/db/client.py +233 -0
- datazone_sdk-1.0.0/datazone/deltastorage/__init__.py +4 -0
- datazone_sdk-1.0.0/datazone/deltastorage/data_types.py +94 -0
- datazone_sdk-1.0.0/datazone/deltastorage/generated_columns.py +151 -0
- datazone_sdk-1.0.0/datazone/deltastorage/schema.py +135 -0
- datazone_sdk-1.0.0/datazone/deltastorage/slicing.py +22 -0
- datazone_sdk-1.0.0/datazone/deltastorage/store.py +138 -0
- datazone_sdk-1.0.0/datazone/deltastorage/table.py +210 -0
- datazone_sdk-1.0.0/pyproject.toml +42 -0
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: datazone-sdk
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Database and Delta storage client library for working with Delta Lake tables
|
|
5
|
+
Author: Team Enigma
|
|
6
|
+
Author-email: enigma@energinet.dk
|
|
7
|
+
Requires-Python: >=3.10
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
14
|
+
Requires-Dist: datamazing (>=5.1.6)
|
|
15
|
+
Requires-Dist: deltalake (==1.2.1)
|
|
16
|
+
Requires-Dist: obstore (>=0.8.2)
|
|
17
|
+
Requires-Dist: pandas (>=2.0.3,<3)
|
|
18
|
+
Requires-Dist: polars (>=1.33.1)
|
|
19
|
+
Requires-Dist: pyarrow (>=19.0.0)
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
|
|
22
|
+
# Datazone SDK
|
|
23
|
+
|
|
24
|
+
Database and Delta storage client library for working with Delta Lake tables.
|
|
25
|
+
|
|
26
|
+
## Overview
|
|
27
|
+
|
|
28
|
+
This package provides functionality for interacting with Delta Lake tables, including:
|
|
29
|
+
|
|
30
|
+
- **Database Client**: High-level client for querying Delta Lake tables with support for time intervals, time travel, and filtering.
|
|
31
|
+
- **Delta Storage**: Low-level components for working with Delta tables, schemas, and data types.
|
|
32
|
+
|
|
33
|
+
## Installation
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
pip install datazone-sdk
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
## Usage
|
|
40
|
+
|
|
41
|
+
```python
|
|
42
|
+
import datazone as dz
|
|
43
|
+
|
|
44
|
+
# Create a client from a storage account name
|
|
45
|
+
client = dz.DatabaseClient.from_resource_name(
|
|
46
|
+
storage_account="<storage-account>",
|
|
47
|
+
container_name="<container-name>",
|
|
48
|
+
sub_path="<sub-path>",
|
|
49
|
+
)
|
|
50
|
+
|
|
51
|
+
# Query a table
|
|
52
|
+
df = client.query(
|
|
53
|
+
table_name="my_table",
|
|
54
|
+
filters={"column": "value"},
|
|
55
|
+
)
|
|
56
|
+
```
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
# Datazone SDK
|
|
2
|
+
|
|
3
|
+
Database and Delta storage client library for working with Delta Lake tables.
|
|
4
|
+
|
|
5
|
+
## Overview
|
|
6
|
+
|
|
7
|
+
This package provides functionality for interacting with Delta Lake tables, including:
|
|
8
|
+
|
|
9
|
+
- **Database Client**: High-level client for querying Delta Lake tables with support for time intervals, time travel, and filtering.
|
|
10
|
+
- **Delta Storage**: Low-level components for working with Delta tables, schemas, and data types.
|
|
11
|
+
|
|
12
|
+
## Installation
|
|
13
|
+
|
|
14
|
+
```bash
|
|
15
|
+
pip install datazone-sdk
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
## Usage
|
|
19
|
+
|
|
20
|
+
```python
|
|
21
|
+
import datazone as dz
|
|
22
|
+
|
|
23
|
+
# Create a client from a storage account name
|
|
24
|
+
client = dz.DatabaseClient.from_resource_name(
|
|
25
|
+
storage_account="<storage-account>",
|
|
26
|
+
container_name="<container-name>",
|
|
27
|
+
sub_path="<sub-path>",
|
|
28
|
+
)
|
|
29
|
+
|
|
30
|
+
# Query a table
|
|
31
|
+
df = client.query(
|
|
32
|
+
table_name="my_table",
|
|
33
|
+
filters={"column": "value"},
|
|
34
|
+
)
|
|
35
|
+
```
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from . import client
|
|
@@ -0,0 +1,233 @@
|
|
|
1
|
+
from typing import Optional
|
|
2
|
+
|
|
3
|
+
import datamazing.pandas as pdz
|
|
4
|
+
import pandas as pd
|
|
5
|
+
import pyarrow.compute as pc
|
|
6
|
+
|
|
7
|
+
from datazone.deltastorage.slicing import HyperSlice
|
|
8
|
+
from datazone.deltastorage.store import Store
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class DatabaseClient:
|
|
12
|
+
def __init__(
|
|
13
|
+
self,
|
|
14
|
+
path: str,
|
|
15
|
+
storage_options: dict[str, str] | None = None,
|
|
16
|
+
table_prefix: str = "",
|
|
17
|
+
):
|
|
18
|
+
self.store = Store(
|
|
19
|
+
path=path,
|
|
20
|
+
storage_options=storage_options,
|
|
21
|
+
)
|
|
22
|
+
self.table_prefix = table_prefix
|
|
23
|
+
|
|
24
|
+
@classmethod
|
|
25
|
+
def from_resource_name(
|
|
26
|
+
cls,
|
|
27
|
+
storage_account: str,
|
|
28
|
+
container_name: str = "datasets",
|
|
29
|
+
sub_path: str = "",
|
|
30
|
+
table_prefix: str = "",
|
|
31
|
+
):
|
|
32
|
+
"""Create a DatabaseClient from resource name (storage account).
|
|
33
|
+
This assumes the path of the delta lake is of the form:
|
|
34
|
+
abfss://{container_name}@{storage_account}.dfs.core.windows.net/{sub_path}
|
|
35
|
+
|
|
36
|
+
Args:
|
|
37
|
+
storage_account (str): Storage account name.
|
|
38
|
+
container_name (str, optional): Container name. Defaults to "datasets".
|
|
39
|
+
sub_path (str, optional): Sub-path within the container. Defaults to "".
|
|
40
|
+
table_prefix (str, optional): Table prefix to use (e.g. `mz_` for archive).
|
|
41
|
+
Defaults to "".
|
|
42
|
+
credential (optional): Azure credential to use.
|
|
43
|
+
Defaults to DefaultAzureCredential().
|
|
44
|
+
"""
|
|
45
|
+
path = f"abfss://{container_name}@{storage_account}.dfs.core.windows.net"
|
|
46
|
+
if sub_path:
|
|
47
|
+
path += f"/{sub_path}"
|
|
48
|
+
|
|
49
|
+
storage_options = Store._get_func_storage_options()
|
|
50
|
+
|
|
51
|
+
return cls(
|
|
52
|
+
path=path, storage_options=storage_options, table_prefix=table_prefix
|
|
53
|
+
)
|
|
54
|
+
|
|
55
|
+
def get_unit_and_multiple(self, timedelta: pd.Timedelta) -> tuple[str | None, int]:
|
|
56
|
+
"""
|
|
57
|
+
Get unit and multiple of a timedelta. E.g. for a timedelta of "PT5M" then
|
|
58
|
+
unit = "minute" and multiple = 5.
|
|
59
|
+
NOTE: Timedelta must have one and only one non-zero component,
|
|
60
|
+
i.e. "PT0S" doesnt work, and neither does "PT5M10S".
|
|
61
|
+
|
|
62
|
+
Args:
|
|
63
|
+
timedelta (pd.Timedelta): Timedelta
|
|
64
|
+
|
|
65
|
+
Returns:
|
|
66
|
+
tuple[str, int]: Unit and multiple
|
|
67
|
+
"""
|
|
68
|
+
components = timedelta.components._asdict()
|
|
69
|
+
|
|
70
|
+
# remove plural ending from unit, since
|
|
71
|
+
# this is the standard pyarrow uses
|
|
72
|
+
components = {k[:-1]: v for k, v in components.items()}
|
|
73
|
+
|
|
74
|
+
non_zero_components = {
|
|
75
|
+
unit: multiple for unit, multiple in components.items() if multiple != 0
|
|
76
|
+
}
|
|
77
|
+
|
|
78
|
+
if len(non_zero_components) == 0:
|
|
79
|
+
return None, 0
|
|
80
|
+
|
|
81
|
+
if len(non_zero_components) != 1:
|
|
82
|
+
raise ValueError("Timedelta must have one and only one non-zero multiple.")
|
|
83
|
+
|
|
84
|
+
return next(iter(non_zero_components.items()))
|
|
85
|
+
|
|
86
|
+
def relative_time_travel_version(
|
|
87
|
+
self, time_column: str, block: pd.Timedelta, horizon: pd.Timedelta
|
|
88
|
+
) -> pc.Expression:
|
|
89
|
+
"""
|
|
90
|
+
Get value to use for filtering a relative time travel
|
|
91
|
+
(i.e. the interval [valid-from, valid-to] must contain
|
|
92
|
+
this value)
|
|
93
|
+
"""
|
|
94
|
+
unit, multiple = self.get_unit_and_multiple(block)
|
|
95
|
+
|
|
96
|
+
if multiple == 0:
|
|
97
|
+
# `pc.floor_temporal` fails with multiple=0,
|
|
98
|
+
# but in this case we don't need to floor
|
|
99
|
+
# the time anyway
|
|
100
|
+
start_of_block = pc.field("time_utc")
|
|
101
|
+
else:
|
|
102
|
+
start_of_block = pc.floor_temporal(
|
|
103
|
+
pc.field(time_column),
|
|
104
|
+
multiple=multiple,
|
|
105
|
+
unit=unit,
|
|
106
|
+
)
|
|
107
|
+
|
|
108
|
+
return start_of_block - horizon.to_pytimedelta()
|
|
109
|
+
|
|
110
|
+
def time_travel_filter(
|
|
111
|
+
self,
|
|
112
|
+
time_travel: pdz.TimeTravel,
|
|
113
|
+
time_column: str,
|
|
114
|
+
valid_from_column: str,
|
|
115
|
+
valid_to_column: str,
|
|
116
|
+
) -> list[HyperSlice]:
|
|
117
|
+
"""Filter delta table on a time travel
|
|
118
|
+
|
|
119
|
+
Args:
|
|
120
|
+
time_travel (pdz.TimeTravel): Time travel
|
|
121
|
+
time_column (str): Time column name
|
|
122
|
+
valid_from_column (str): Valid-from column name
|
|
123
|
+
valid_to_column (str): Valid-to column name
|
|
124
|
+
"""
|
|
125
|
+
match time_travel.tense:
|
|
126
|
+
case "absolute":
|
|
127
|
+
# If the time travel is absolute, we filter
|
|
128
|
+
# to entries where [valid-from, valid-to]
|
|
129
|
+
# contains `as_of_time`
|
|
130
|
+
version = time_travel.as_of_time.to_pydatetime()
|
|
131
|
+
case "relative":
|
|
132
|
+
version = self.relative_time_travel_version(
|
|
133
|
+
time_column, time_travel.block, time_travel.horizon
|
|
134
|
+
)
|
|
135
|
+
|
|
136
|
+
return [
|
|
137
|
+
HyperSlice((valid_from_column, "<=", version)),
|
|
138
|
+
HyperSlice((valid_to_column, ">", version)),
|
|
139
|
+
]
|
|
140
|
+
|
|
141
|
+
def query(
|
|
142
|
+
self,
|
|
143
|
+
table_name: str,
|
|
144
|
+
time_interval: Optional[pdz.TimeInterval] = None,
|
|
145
|
+
time_travel: Optional[pdz.TimeTravel] = None,
|
|
146
|
+
filters: Optional[dict[str, object]] = None,
|
|
147
|
+
columns: Optional[list[str]] = None,
|
|
148
|
+
include_validity_period_columns: bool = False,
|
|
149
|
+
include_generated_columns: bool = False,
|
|
150
|
+
) -> pd.DataFrame:
|
|
151
|
+
"""Query table.
|
|
152
|
+
Query defaults are set to match old Table Storage client behavior.
|
|
153
|
+
Time travel defaults to "as of now"
|
|
154
|
+
Validity period columns are dropped by default.
|
|
155
|
+
Generated columns are dropped by default.
|
|
156
|
+
|
|
157
|
+
Args:
|
|
158
|
+
table_name (str): Name of the table
|
|
159
|
+
time_interval (Optional[pdz.TimeInterval], optional): Time interval for the
|
|
160
|
+
query. Defaults to None.
|
|
161
|
+
time_travel (Optional[pdz.TimeTravel], optional): Time travel information.
|
|
162
|
+
Defaults to None.
|
|
163
|
+
filters (Optional[dict[str, object]], optional): Filters to apply to the
|
|
164
|
+
query.
|
|
165
|
+
Defaults to None.
|
|
166
|
+
columns (Optional[list[str]], optional): Columns to return.
|
|
167
|
+
Selecting columns can significantly improve query performance.
|
|
168
|
+
Defaults to None, meaning all columns will be returned.
|
|
169
|
+
include_validity_period_columns (bool, optional): Whether to include
|
|
170
|
+
validity period columns in the result;
|
|
171
|
+
(`valid_from_time_utc`, `valid_to_time_utc`).
|
|
172
|
+
Defaults to False.
|
|
173
|
+
include_generated_columns (bool, optional): Whether to include generated
|
|
174
|
+
columns in the result; (e.g. `valid_from_time_utc`, `valid_to_time_utc`).
|
|
175
|
+
Defaults to False.
|
|
176
|
+
|
|
177
|
+
Returns:
|
|
178
|
+
pd.DataFrame: The result of the query.
|
|
179
|
+
"""
|
|
180
|
+
# Prefix used to differentiate between operation ("{table_name}")
|
|
181
|
+
# and historical ("mz_{table_name}").
|
|
182
|
+
table_name = f"{self.table_prefix}{table_name}"
|
|
183
|
+
|
|
184
|
+
table = self.store.get_table(table_name)
|
|
185
|
+
hyper_slice = []
|
|
186
|
+
|
|
187
|
+
if filters:
|
|
188
|
+
for key, value in filters.items():
|
|
189
|
+
if isinstance(value, (list, tuple, set)):
|
|
190
|
+
hyper_slice.append((key, "in", value))
|
|
191
|
+
else:
|
|
192
|
+
hyper_slice.append((key, "=", value))
|
|
193
|
+
if time_interval:
|
|
194
|
+
hyper_slice.append(("time_utc", ">=", time_interval.left))
|
|
195
|
+
hyper_slice.append(("time_utc", "<=", time_interval.right))
|
|
196
|
+
|
|
197
|
+
if time_travel is None:
|
|
198
|
+
time_travel = pdz.TimeTravel(
|
|
199
|
+
as_of_time=pd.Timestamp.utcnow(),
|
|
200
|
+
)
|
|
201
|
+
|
|
202
|
+
tt_filter = self.time_travel_filter(
|
|
203
|
+
time_travel,
|
|
204
|
+
time_column="time_utc",
|
|
205
|
+
valid_from_column="valid_from_time_utc",
|
|
206
|
+
valid_to_column="valid_to_time_utc",
|
|
207
|
+
)
|
|
208
|
+
|
|
209
|
+
hyper_slice.extend(tt_filter)
|
|
210
|
+
pl_df = table.read(hyper_slice=HyperSlice(hyper_slice), columns=columns)
|
|
211
|
+
|
|
212
|
+
pd_df = pl_df.to_pandas()
|
|
213
|
+
|
|
214
|
+
# We truncate to second, and change to nanosecond
|
|
215
|
+
# precision because this was used by the old solution (Azure Table Storage)
|
|
216
|
+
for col in pd_df.select_dtypes(include=["datetime", "datetimetz"]).columns:
|
|
217
|
+
pd_df[col] = pd_df[col].dt.floor("s").dt.as_unit("ns")
|
|
218
|
+
|
|
219
|
+
# Drop generated columns
|
|
220
|
+
if not include_generated_columns:
|
|
221
|
+
generated_cols = []
|
|
222
|
+
for field in table.schema().fields:
|
|
223
|
+
if field.generated_as is not None:
|
|
224
|
+
generated_cols.append(field.column_name)
|
|
225
|
+
pd_df = pd_df.drop(columns=generated_cols, errors="ignore")
|
|
226
|
+
|
|
227
|
+
# Drop valid-from/to columns
|
|
228
|
+
if not include_validity_period_columns:
|
|
229
|
+
pd_df = pd_df.drop(
|
|
230
|
+
columns=["valid_from_time_utc", "valid_to_time_utc"], errors="ignore"
|
|
231
|
+
)
|
|
232
|
+
|
|
233
|
+
return pd_df
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
|
|
3
|
+
import pyarrow as pa
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class DataType(ABC):
    """Abstract base class for the supported column data types.

    Each concrete subclass maps to exactly one pyarrow data type; equality
    and hashing are delegated to that arrow representation.
    """

    @classmethod
    def from_arrow(cls, pa_type: pa.DataType) -> "DataType":
        """Build a DataType from a pyarrow type.

        Raises:
            ValueError: If the pyarrow type has no supported counterpart.
        """
        if pa.types.is_string(pa_type) or pa.types.is_large_string(pa_type):
            return String()
        elif pa.types.is_floating(pa_type):
            return Float()
        elif pa.types.is_integer(pa_type):
            return Int()
        elif pa.types.is_timestamp(pa_type):
            return Timestamp(tz=pa_type.tz)
        elif pa.types.is_date(pa_type):
            return Date()
        elif pa.types.is_boolean(pa_type):
            return Boolean()
        elif pa.types.is_null(pa_type):
            return Null()
        else:
            raise ValueError(f"Unsupported data type: {pa_type}")

    @abstractmethod
    def __str__(self):
        ...

    def __eq__(self, other: object) -> bool:
        # FIX: comparing with a non-DataType previously raised
        # AttributeError on the missing `to_arrow`; return NotImplemented
        # so Python falls back to its default comparison instead.
        if not isinstance(other, DataType):
            return NotImplemented
        return self.to_arrow() == other.to_arrow()

    def __hash__(self) -> int:
        # Defining __eq__ sets __hash__ to None by default; restore
        # hashability consistently with equality via the arrow type.
        return hash(self.to_arrow())

    @abstractmethod
    def to_arrow(self) -> pa.DataType:
        ...
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class String(DataType):
    """Variable-length UTF-8 string type."""

    def to_arrow(self) -> pa.DataType:
        return pa.string()

    def __str__(self):
        return "string"
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class Float(DataType):
    """64-bit floating point type."""

    def to_arrow(self) -> pa.DataType:
        return pa.float64()

    def __str__(self):
        return "float"
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
class Int(DataType):
    """64-bit signed integer type."""

    def to_arrow(self) -> pa.DataType:
        return pa.int64()

    def __str__(self):
        return "int"
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
class Timestamp(DataType):
    """Microsecond-precision timestamp type, optionally timezone-aware."""

    def __str__(self):
        return f"timestamp[{self.tz}]"

    def __init__(self, tz: str | None):
        # tz may be None: `DataType.from_arrow` forwards `pa_type.tz`,
        # which pyarrow reports as None for naive timestamp types.
        self.tz = tz

    def to_arrow(self) -> pa.DataType:
        # fixed microsecond ("us") precision
        return pa.timestamp("us", tz=self.tz)
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
class Date(DataType):
    """Calendar date type (32-bit arrow date)."""

    def to_arrow(self) -> pa.DataType:
        return pa.date32()

    def __str__(self):
        return "date"
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
class Boolean(DataType):
    """Boolean (true/false) type."""

    def to_arrow(self) -> pa.DataType:
        return pa.bool_()

    def __str__(self):
        return "boolean"
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
class Null(DataType):
    """Null type: a column that holds only nulls."""

    def to_arrow(self) -> pa.DataType:
        return pa.null()

    def __str__(self):
        return "null"
|
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
import copy
|
|
2
|
+
import datetime as dt
|
|
3
|
+
from abc import ABC, abstractmethod
|
|
4
|
+
from typing import Any
|
|
5
|
+
from zoneinfo import ZoneInfo
|
|
6
|
+
|
|
7
|
+
import polars as pl
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class GeneratedColumn(ABC):
|
|
11
|
+
"""Class representing a generated column."""
|
|
12
|
+
|
|
13
|
+
@property
|
|
14
|
+
@abstractmethod
|
|
15
|
+
def base_column_names(self) -> list[str]:
|
|
16
|
+
...
|
|
17
|
+
|
|
18
|
+
@abstractmethod
|
|
19
|
+
def to_metadata(self) -> dict:
|
|
20
|
+
...
|
|
21
|
+
|
|
22
|
+
@abstractmethod
|
|
23
|
+
def expression(self) -> pl.Expr:
|
|
24
|
+
"""Polars expression to compute the generated column."""
|
|
25
|
+
...
|
|
26
|
+
|
|
27
|
+
@abstractmethod
|
|
28
|
+
def get_generated_conditions(self, op: str, value) -> list[tuple[str, Any]]:
|
|
29
|
+
"""Get conditions on the generated column based on a condition
|
|
30
|
+
on the base column."""
|
|
31
|
+
...
|
|
32
|
+
|
|
33
|
+
@classmethod
|
|
34
|
+
def from_metadata(cls, metadata: dict):
|
|
35
|
+
if metadata["type"] == "date_bucket":
|
|
36
|
+
return DateBucket(
|
|
37
|
+
base_column_name=metadata["base_column_name"],
|
|
38
|
+
as_tz=metadata.get("as_tz", None),
|
|
39
|
+
)
|
|
40
|
+
elif metadata["type"] == "concat":
|
|
41
|
+
return Concat(
|
|
42
|
+
base_column_names=metadata["base_column_names"],
|
|
43
|
+
delimiter=metadata["delimiter"],
|
|
44
|
+
)
|
|
45
|
+
else:
|
|
46
|
+
raise ValueError(f"Unknown generated column type: {metadata['type']}")
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
class DateBucket(GeneratedColumn):
|
|
50
|
+
def __init__(self, base_column_name: str, as_tz: str | None = None):
|
|
51
|
+
"""Generated column which bins a timestamp column to a date,
|
|
52
|
+
optionally converting to a specific timezone first.
|
|
53
|
+
|
|
54
|
+
Args:
|
|
55
|
+
base_column_name (str): Base column name
|
|
56
|
+
as_tz (str, optional): Cast to date in this timezone.
|
|
57
|
+
Defaults to None, meaning it uses the original timezone.
|
|
58
|
+
"""
|
|
59
|
+
self.base_column_name = base_column_name
|
|
60
|
+
self.as_tz = as_tz
|
|
61
|
+
|
|
62
|
+
def __str__(self):
|
|
63
|
+
if self.as_tz is not None:
|
|
64
|
+
return f"date_bucket({self.base_column_name}, as_tz={self.as_tz})"
|
|
65
|
+
else:
|
|
66
|
+
return f"date_bucket({self.base_column_name})"
|
|
67
|
+
|
|
68
|
+
@property
|
|
69
|
+
def base_column_names(self) -> list[str]:
|
|
70
|
+
return [self.base_column_name]
|
|
71
|
+
|
|
72
|
+
def to_metadata(self) -> dict:
|
|
73
|
+
metadata = {
|
|
74
|
+
"type": "date_bucket",
|
|
75
|
+
"base_column_name": self.base_column_name,
|
|
76
|
+
}
|
|
77
|
+
if self.as_tz is not None:
|
|
78
|
+
metadata["as_tz"] = self.as_tz
|
|
79
|
+
return metadata
|
|
80
|
+
|
|
81
|
+
def expression(self) -> pl.Expr:
|
|
82
|
+
expr = pl.col(self.base_column_name)
|
|
83
|
+
if self.as_tz is not None:
|
|
84
|
+
expr = expr.dt.convert_time_zone(self.as_tz)
|
|
85
|
+
return expr.dt.date()
|
|
86
|
+
|
|
87
|
+
def get_generated_conditions(
|
|
88
|
+
self, op: str, value: dt.datetime
|
|
89
|
+
) -> list[tuple[str, dt.date]]:
|
|
90
|
+
"""Get conditions on the generated column based on a condition
|
|
91
|
+
on the base column.
|
|
92
|
+
|
|
93
|
+
Args:
|
|
94
|
+
op (str): Operator
|
|
95
|
+
value (dt.datetime): Value
|
|
96
|
+
|
|
97
|
+
Returns:
|
|
98
|
+
tuple[str, dt.date]: List of conditions on the generated column
|
|
99
|
+
"""
|
|
100
|
+
timestamp = copy.copy(value)
|
|
101
|
+
if self.as_tz is not None:
|
|
102
|
+
timestamp = timestamp.astimezone(ZoneInfo(self.as_tz))
|
|
103
|
+
date = timestamp.date()
|
|
104
|
+
|
|
105
|
+
match op:
|
|
106
|
+
case "=":
|
|
107
|
+
return [("=", date)]
|
|
108
|
+
case ("<" | "<="):
|
|
109
|
+
return [("<=", date)]
|
|
110
|
+
case (">" | ">="):
|
|
111
|
+
return [(">=", date)]
|
|
112
|
+
case _:
|
|
113
|
+
# for other operations, we cannot make any
|
|
114
|
+
# useful filters on the generated column
|
|
115
|
+
return []
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
class Concat(GeneratedColumn):
    def __init__(self, base_column_names: list[str], delimiter: str):
        """Generated column which concats multiple columns into a single string.

        Args:
            base_column_names (str): Base column names
            delimiter (str): Delimiter used for concatenation
        """
        self._base_column_names = base_column_names
        self.delimiter = delimiter

    def __str__(self):
        joined = ", ".join(self.base_column_names)
        return f"concat([{joined}], delimiter='{self.delimiter}')"

    @property
    def base_column_names(self) -> list[str]:
        return self._base_column_names

    def to_metadata(self) -> dict:
        """Serialize the rule; round-trips via `GeneratedColumn.from_metadata`."""
        return {
            "type": "concat",
            "base_column_names": self.base_column_names,
            "delimiter": self.delimiter,
        }

    def expression(self) -> pl.Expr:
        """Polars expression joining the base columns with the delimiter."""
        return pl.concat_str(self.base_column_names, separator=self.delimiter)

    def get_generated_conditions(self, op: str, value: Any) -> list[tuple[str, Any]]:
        # for concat generated columns, we cannot make any
        # useful filters on the generated column
        return []
|
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from typing import Optional
|
|
3
|
+
|
|
4
|
+
import polars as pl
|
|
5
|
+
import pyarrow as pa
|
|
6
|
+
|
|
7
|
+
from .data_types import DataType
|
|
8
|
+
from .generated_columns import GeneratedColumn
|
|
9
|
+
from .slicing import HyperSlice
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class Field:
    """A single table column: name, data type, and optional generation rule."""

    def __init__(
        self,
        column_name: str,
        data_type: DataType,
        generated_as: Optional[GeneratedColumn] = None,
    ):
        """Class representing a table field.

        Args:
            column_name (str): Column name
            data_type (DataType): Data type
            generated_as (GeneratedColumn, optional): Generated column based on a
                regular column. Defaults to None, meaning the column is not generated.
        """
        self.column_name = column_name
        self.data_type = data_type
        self.generated_as = generated_as

    def __eq__(self, other: "Field") -> bool:
        # equality delegates to the arrow field, including its metadata
        return self.to_arrow().equals(other.to_arrow(), check_metadata=True)

    def __repr__(self):
        parts = [f"{self.column_name}: {self.data_type}"]
        if self.generated_as is not None:
            parts.append(f"[generated as {self.generated_as}]")
        return " ".join(parts)

    def to_arrow(self):
        """Convert to pyarrow field. Information about generated columns
        is stored as metadata."""
        arrow_metadata = None
        if self.generated_as is not None:
            arrow_metadata = {
                "generated_column": json.dumps(self.generated_as.to_metadata())
            }
        return pa.field(self.column_name, self.data_type.to_arrow(), metadata=arrow_metadata)

    @classmethod
    def from_arrow(cls, pa_field: pa.Field) -> "Field":
        """Convert from pyarrow field"""
        generated_as = None
        if pa_field.metadata is not None:
            # arrow metadata keys/values are bytes; decode before lookup
            decoded = {key.decode(): val.decode() for key, val in pa_field.metadata.items()}
            gen_json = decoded.get("generated_column")
            if gen_json is not None:
                generated_as = GeneratedColumn.from_metadata(json.loads(gen_json))
        return cls(pa_field.name, DataType.from_arrow(pa_field.type), generated_as)
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
class Schema:
    """Ordered collection of `Field`s describing a table."""

    def __init__(
        self,
        fields: list[Field],
    ):
        """Class representing a table schema.

        Args:
            fields (list[Field]): Schema fields
        """
        self.fields = fields

    def __eq__(self, other: "Schema") -> bool:
        # equality delegates to the arrow schema, including field metadata
        return self.to_arrow().equals(other.to_arrow(), check_metadata=True)

    def __repr__(self):
        return "\n".join(repr(field) for field in self.fields)

    def to_arrow(self):
        """Convert to pyarrow schema."""
        return pa.schema([field.to_arrow() for field in self.fields])

    @classmethod
    def from_arrow(cls, pa_schema: pa.Schema) -> "Schema":
        """Convert from pyarrow schema"""
        return cls([Field.from_arrow(pa_field) for pa_field in pa_schema])

    def add_generated_columns(self, base_df: pl.DataFrame) -> pl.DataFrame:
        """Add additional columns to a dataframe derived
        from all generated columns in the schema.

        Args:
            base_df (pl.DataFrame): Input dataframe
        """
        generated_exprs = [
            field.generated_as.expression().alias(field.column_name)
            for field in self.fields
            if field.generated_as is not None
        ]
        return base_df.with_columns(generated_exprs)

    def add_generated_filters(self, base_slice: HyperSlice) -> HyperSlice:
        """Add additional filters to a hyperslice based from
        all generated columns in the schema.

        Args:
            base_slice (HyperSlice): Input hyperslice
        """
        extra_conditions = []
        for column, operator, operand in base_slice:
            for field in self.fields:
                rule = field.generated_as
                if rule is None or column not in rule.base_column_names:
                    continue
                # translate the base-column condition into conditions
                # on the generated column (may be empty)
                for gen_op, gen_val in rule.get_generated_conditions(
                    operator, operand
                ):
                    extra_conditions.append((field.column_name, gen_op, gen_val))
        return HyperSlice(list(base_slice) + extra_conditions)
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
from typing import Any
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class HyperSlice(list[tuple[str, str, Any]]):
    """A list of (column, operator, value) tuples describing an
    n-dimensional slice of a table.

    Each tuple is one filter on the table. For example, the hyper slice

        [
            ("country", "=", "Denmark"),
            ("date", ">", "2000-01-01"),
        ]

    selects all records where the country is 'Denmark' and the date is
    greater than 2000-01-01 — equivalent to the SQL WHERE clause:

        country = 'Denmark' AND date > '2000-01-01'
    """
|
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
import os
|
|
2
|
+
|
|
3
|
+
import deltalake as dl
|
|
4
|
+
import obstore as obs
|
|
5
|
+
from deltalake.exceptions import TableNotFoundError as DeltaTableNotFoundError
|
|
6
|
+
|
|
7
|
+
from .schema import Schema
|
|
8
|
+
from .table import Table
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class Store:
|
|
12
|
+
def __init__(
|
|
13
|
+
self,
|
|
14
|
+
path: str,
|
|
15
|
+
storage_options: dict[str, str] | None = None,
|
|
16
|
+
):
|
|
17
|
+
"""Class representing a store containing datasets
|
|
18
|
+
|
|
19
|
+
Args:
|
|
20
|
+
path (str): Root directory containing Delta tables
|
|
21
|
+
storage_options (dict[str, str] | None, optional): Storage options used for
|
|
22
|
+
remote cloud storage. For more information on available options,
|
|
23
|
+
go to https://delta-io.github.io/delta-rs/integrations/object-storage/.
|
|
24
|
+
Defaults to None, corresponding to the local file system.
|
|
25
|
+
"""
|
|
26
|
+
self.path = path
|
|
27
|
+
if storage_options is None:
|
|
28
|
+
storage_options = self._get_func_storage_options()
|
|
29
|
+
self.storage_options = storage_options
|
|
30
|
+
# We use obstore to interact with remote
|
|
31
|
+
# cloud storage for operations not directly
|
|
32
|
+
# supported by delta-rs (e.g. listing directories)
|
|
33
|
+
# We could use fsspec, but the `storage_options`
|
|
34
|
+
# used by delta-rs and fsspec are not compatible
|
|
35
|
+
self._obstore = obs.store.from_url(
|
|
36
|
+
url=path,
|
|
37
|
+
config=storage_options,
|
|
38
|
+
)
|
|
39
|
+
|
|
40
|
+
def __repr__(self):
|
|
41
|
+
return f"Store('{self.path}')"
|
|
42
|
+
|
|
43
|
+
def _get_table_uri(self, table_name: str) -> str:
|
|
44
|
+
return self.path + "/" + table_name
|
|
45
|
+
|
|
46
|
+
@staticmethod
|
|
47
|
+
def _get_func_storage_options() -> dict[str, str]:
|
|
48
|
+
"""Get storage options.
|
|
49
|
+
This differ depending on whether we are running
|
|
50
|
+
in cloud or locally.
|
|
51
|
+
"""
|
|
52
|
+
if "IDENTITY_ENDPOINT" in os.environ:
|
|
53
|
+
# When running in Azure Function, the environment variable IDENTITY_ENDPOINT
|
|
54
|
+
# will be set, and we use the managed identity to access the storage account
|
|
55
|
+
storage_options = {"azure_msi_endpoint": os.environ["IDENTITY_ENDPOINT"]}
|
|
56
|
+
else:
|
|
57
|
+
# When running locally, we use the Azure CLI to authenticate.
|
|
58
|
+
storage_options = {"use_azure_cli": "true"}
|
|
59
|
+
|
|
60
|
+
return storage_options
|
|
61
|
+
|
|
62
|
+
@classmethod
|
|
63
|
+
def from_func_environment_variable(cls, container_name: str = "datasets"):
|
|
64
|
+
"""Create Store instance from environment variable.
|
|
65
|
+
This uses default storage options, created in `__init__`."""
|
|
66
|
+
storage_account_name = os.environ["OPERATIONAL_DATA_STORAGE_ACCOUNT"]
|
|
67
|
+
path = f"abfss://{container_name}@{storage_account_name}.dfs.core.windows.net"
|
|
68
|
+
|
|
69
|
+
return cls(path=path)
|
|
70
|
+
|
|
71
|
+
def list_tables(self) -> list[str]:
|
|
72
|
+
"""List all Delta tables"""
|
|
73
|
+
return self._obstore.list_with_delimiter()["common_prefixes"]
|
|
74
|
+
|
|
75
|
+
def table_exists(self, table_name: str):
|
|
76
|
+
"""Check if Delta table exists
|
|
77
|
+
|
|
78
|
+
Args:
|
|
79
|
+
table_name (str): Table name
|
|
80
|
+
"""
|
|
81
|
+
# For some reason `deltalake.DeltaTable.is_deltatable()` can be very slow.
|
|
82
|
+
# deltalake has an issue open about this:
|
|
83
|
+
# https://github.com/delta-io/delta-rs/issues/3942
|
|
84
|
+
# For now we catch the exception when trying to load the table
|
|
85
|
+
try:
|
|
86
|
+
_ = dl.DeltaTable(
|
|
87
|
+
table_uri=self._get_table_uri(table_name),
|
|
88
|
+
storage_options=self.storage_options,
|
|
89
|
+
without_files=True,
|
|
90
|
+
)
|
|
91
|
+
except DeltaTableNotFoundError:
|
|
92
|
+
return False
|
|
93
|
+
return True
|
|
94
|
+
|
|
95
|
+
def create_table(
    self,
    table_name: str,
    schema: Schema,
    partition_by: list[str] | None = None,
) -> Table:
    """Create a new Delta table.

    Args:
        table_name (str): Table name.
        schema (Schema): Table schema (converted to pyarrow on write).
        partition_by (list[str] | None): Partition columns, if any.

    Returns:
        Table: Handle to the newly created table.

    Raises:
        ValueError: If `schema` is None, or a table with the same name
            already exists.
    """
    # Validate the cheap, local precondition first so a missing schema
    # fails fast without a (potentially slow) remote existence check.
    if schema is None:
        raise ValueError("Schema must be provided when creating a new table")

    if self.table_exists(table_name):
        raise ValueError(f"Table with name '{table_name}' already exists")

    pa_schema = schema.to_arrow()

    dl.DeltaTable.create(
        table_uri=self._get_table_uri(table_name),
        schema=pa_schema,
        storage_options=self.storage_options,
        partition_by=partition_by,
        configuration={
            # Short retention windows: vacuumed data files and transaction
            # log entries older than these intervals may be removed.
            "delta.deletedFileRetentionDuration": "interval 2 hours",
            "delta.logRetentionDuration": "interval 4 hours",
        },
    )

    return Table(self._get_table_uri(table_name), self.storage_options)
|
|
128
|
+
|
|
129
|
+
def get_table(self, table_name: str) -> Table:
    """Return a handle to an existing Delta table.

    Args:
        table_name (str): Table name

    Raises:
        ValueError: If no table with the given name exists.
    """
    if self.table_exists(table_name):
        return Table(self._get_table_uri(table_name), self.storage_options)
    raise ValueError(f"Table with name '{table_name}' does not exist")
|
|
@@ -0,0 +1,210 @@
|
|
|
1
|
+
from typing import Any, Optional
|
|
2
|
+
|
|
3
|
+
import deltalake as dl
|
|
4
|
+
import polars as pl
|
|
5
|
+
import pyarrow as pa
|
|
6
|
+
|
|
7
|
+
from .schema import Schema
|
|
8
|
+
from .slicing import HyperSlice
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def _dnf_to_sql(dnf: list[tuple]) -> str:
|
|
12
|
+
"""Convert DNF expression to SQL expression."""
|
|
13
|
+
if len(dnf) == 0:
|
|
14
|
+
return "1=1"
|
|
15
|
+
|
|
16
|
+
sql_parts = []
|
|
17
|
+
for col, op, val in dnf:
|
|
18
|
+
if op == "in":
|
|
19
|
+
assert isinstance(val, list)
|
|
20
|
+
lst = ", ".join([f"'{item}'" for item in val])
|
|
21
|
+
sql_parts.append(f"{col} IN ({lst})")
|
|
22
|
+
elif op in [">=", "<=", ">", "<", "="]:
|
|
23
|
+
sql_parts.append(f"{col} {op} '{val}'")
|
|
24
|
+
else:
|
|
25
|
+
raise ValueError(f"Unsupported operation: {op}")
|
|
26
|
+
|
|
27
|
+
return " AND ".join(sql_parts)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class Table:
    """Lazy handle to a single Delta Lake table at a fixed URI.

    The underlying `deltalake.DeltaTable` object is re-created on every
    access (see the `delta_table` property) rather than cached.
    """

    def __init__(
        self,
        table_uri: str,
        storage_options: dict[str, str] | None = None,
    ):
        """Class representing a dataset

        Args:
            table_uri (str): URI of the Delta table; the table name is
                derived from the last path segment.
            storage_options (dict[str, str] | None): Storage options
                passed through to deltalake when opening the table.
        """
        self.table_uri = table_uri
        self.storage_options = storage_options

        self.table_name = self.table_uri.split("/")[-1]
        # NOTE(review): never reassigned anywhere in this class; the
        # `delta_table` property always builds a fresh instance, so this
        # attribute appears vestigial — confirm before removing.
        self._delta_table = None

    def __repr__(self):
        return f"Table('{self.table_name}')"

    @property
    def delta_table(self) -> dl.DeltaTable:
        """Get the Delta table object.
        As the `Table`-class is lazily initialized,
        the `delta_table`-property is initialized when needed.
        We do not cache it, which creates a little overhead, but reduces
        the risk of false transaction issues when doing concurrent reads/writes.
        This is important because using the same instance can lead to transaction
        issues in delta as DeltaTable uses metadata (transaction id) from
        the first time the object is instantiated.

        The risk disappears when this instance is created within a lock.
        """
        return dl.DeltaTable(self.table_uri, storage_options=self.storage_options)

    def partition_cols(self) -> list[str]:
        """Get the partition columns of the table"""
        return self.delta_table.metadata().partition_columns

    def schema(self) -> Schema:
        """Get the schema of the table"""
        # Convert the deltalake schema to pyarrow before wrapping it in
        # the project's Schema type.
        pa_schema = pa.schema(self.delta_table.schema())
        return Schema.from_arrow(pa_schema)

    def read(
        self, hyper_slice: Optional[HyperSlice] = None, columns: Optional[list[str]] = None
    ) -> pl.DataFrame:
        """Read from Delta table

        Args:
            hyper_slice (HyperSlice): (column, op, value) filters used to
                restrict the rows read. If None, all rows are read.
            columns (list[str] | None): Columns to read; None reads all.

        Returns:
            pl.DataFrame: The selected data.
        """
        if hyper_slice is None:
            hyper_slice = []

        # add generated filters to hyperslice
        hyper_slice = self.schema().add_generated_filters(hyper_slice)

        delta_table = self.delta_table
        partition_cols = delta_table.metadata().partition_columns

        if len(hyper_slice) == 0:
            file_filters = None
            partition_filters = None
        else:
            # All filters act as row-level filters; the subset that targets
            # partition columns is additionally passed as partition filters
            # so whole partitions can be pruned up front.
            file_filters = hyper_slice
            partition_filters = [f for f in hyper_slice if f[0] in partition_cols]

        pyarrow_table_existing_data = delta_table.to_pyarrow_table(
            columns=columns,
            partitions=partition_filters,
            filters=file_filters,
        )

        return pl.from_arrow(pyarrow_table_existing_data)

    def _to_writable_pyarrow_table(self, df: pl.DataFrame, schema: Schema) -> pa.Table:
        """Convert Polars dataframe to pyarrow table with casted schema.
        The conversion will include generated columns.

        Args:
            df (pl.DataFrame): Dataframe to convert
            schema (Schema): Schema to cast the dataframe to

        Returns:
            pa.Table: PyArrow table with the casted schema
        """
        df = schema.add_generated_columns(df)
        pyarrow_table = df.to_arrow()

        # we need to cast the incoming data to the
        # table schema. In theory, this should automatically
        # be casted, but it seems that metadata on fields
        # gets removed otherwise.
        pa_schema = schema.to_arrow()
        # Reorder columns to the schema's order, then cast to match types.
        return pyarrow_table.select(pa_schema.names).cast(pa_schema)

    def _write_deltalake(
        self, data: pa.Table, mode: str, predicate: Optional[str]
    ) -> None:
        """Write data to Delta Lake using deltalake-python.

        Args:
            data (pa.Table): PyArrow table to write to Delta Lake
            mode (str): Write mode, either "overwrite" or "append"
            predicate (Optional[str]): SQL predicate to filter rows for update or
                delete operations. If None, the operation will apply to all rows.
        """
        # schema_mode="merge" lets writes carry columns beyond the current
        # table schema instead of failing on a strict schema check.
        dl.write_deltalake(
            table_or_uri=self.delta_table,
            data=data,
            mode=mode,
            predicate=predicate,
            schema_mode="merge",
        )

    def update(self, df: pl.DataFrame, hyper_slice: HyperSlice) -> None:
        """Update rows in Delta Lake based on a HyperSlice. This will overwrite data
        in the Delta Lake specified by the HyperSlice.

        Args:
            df (pl.DataFrame): DataFrame containing the rows to update.
            hyper_slice (HyperSlice): HyperSlice used to define rows to update.
                If None, all rows will be updated.
        """
        # NOTE(review): the docstring allows hyper_slice=None, but None is
        # passed straight to `add_generated_filters` and `len()` below —
        # confirm that an empty list (not None) is the supported "all rows"
        # spelling.
        schema = self.schema()
        data = self._to_writable_pyarrow_table(df=df, schema=schema)

        hyper_slice = schema.add_generated_filters(hyper_slice)
        if len(hyper_slice) == 0:
            # No filters: the overwrite applies to the whole table.
            predicate = None
        else:
            predicate = _dnf_to_sql(hyper_slice)

        self._write_deltalake(data=data, mode="overwrite", predicate=predicate)

    def append(self, df: pl.DataFrame) -> None:
        """Append rows to Delta Lake. This will write data to the Delta Lake.

        Args:
            df (pl.DataFrame): DataFrame containing the rows to append.
        """
        schema = self.schema()
        data = self._to_writable_pyarrow_table(df=df, schema=schema)
        self._write_deltalake(data=data, mode="append", predicate=None)

    def optimize(self) -> dict[str, Any]:
        """Optimize Delta table by compacting small files and vacuuming.

        Returns:
            dict[str, Any]: Compaction metrics from deltalake, extended
                with "numFilesVacuumed" (count of files removed by vacuum).
        """
        delta_table = self.delta_table
        metrics = delta_table.optimize.compact()

        # dry_run=False actually deletes files no longer referenced by the
        # table (subject to delta.deletedFileRetentionDuration).
        vacuumed_files = delta_table.vacuum(
            dry_run=False,
        )

        metrics["numFilesVacuumed"] = len(vacuumed_files)

        return metrics

    def delete(self, hyper_slice: HyperSlice) -> dict[str, Any]:
        """Delete data from Delta table

        Args:
            hyper_slice (HyperSlice): Hyper slice to delete.
                If None, all data will be deleted.

        Returns:
            dict[str, any]: Delete metrics.

            https://docs.databricks.com/gcp/en/delta/history#operation-metrics-keys
        """
        # NOTE(review): `[()]` is treated as a second "delete everything"
        # sentinel alongside None — presumably produced by some caller;
        # verify and document where it originates.
        if hyper_slice is None or hyper_slice == [()]:
            predicate = None
        else:
            predicate = _dnf_to_sql(hyper_slice)

        return self.delta_table.delete(predicate)
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "datazone-sdk"
|
|
3
|
+
version = "1.0.0"
|
|
4
|
+
description = "Database and Delta storage client library for working with Delta Lake tables"
|
|
5
|
+
authors = [{ name = "Team Enigma", email = "enigma@energinet.dk" }]
|
|
6
|
+
requires-python = ">=3.10"
|
|
7
|
+
readme = "README.md"
|
|
8
|
+
|
|
9
|
+
[tool.poetry]
|
|
10
|
+
packages = [{ include = "datazone" }]
|
|
11
|
+
requires-poetry = ">=2.2"
|
|
12
|
+
|
|
13
|
+
[tool.poetry.dependencies]
|
|
14
|
+
pandas = ">=2.0.3,<3"
|
|
15
|
+
polars = ">=1.33.1"
|
|
16
|
+
obstore = ">=0.8.2"
|
|
17
|
+
deltalake = "==1.2.1" # pin to avoid breaking changes in 1.3.0. Follow Github issue here https://github.com/delta-io/delta-rs/issues/3939
|
|
18
|
+
pyarrow = ">=19.0.0"
|
|
19
|
+
datamazing = ">=5.1.6"
|
|
20
|
+
|
|
21
|
+
[tool.poetry.group.dev.dependencies]
|
|
22
|
+
pre-commit = ">=2.20.0"
|
|
23
|
+
|
|
24
|
+
[tool.poetry.group.test.dependencies]
|
|
25
|
+
pytest = ">=7"
|
|
26
|
+
pytest-cov = ">=3.0.0"
|
|
27
|
+
mypy = ">=1.19.0"
|
|
28
|
+
|
|
29
|
+
[build-system]
|
|
30
|
+
requires = ["poetry-core>=2.0.0,<3.0.0"]
|
|
31
|
+
build-backend = "poetry.core.masonry.api"
|
|
32
|
+
|
|
33
|
+
[tool.isort]
|
|
34
|
+
multi_line_output = 3
|
|
35
|
+
line_length = 88
|
|
36
|
+
include_trailing_comma = true
|
|
37
|
+
|
|
38
|
+
[tool.black]
|
|
39
|
+
line_length = 88
|
|
40
|
+
|
|
41
|
+
[tool.mypy]
|
|
42
|
+
ignore_missing_imports = true
|