datazone-sdk 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,56 @@
1
+ Metadata-Version: 2.4
2
+ Name: datazone-sdk
3
+ Version: 1.0.0
4
+ Summary: Database and Delta storage client library for working with Delta Lake tables
5
+ Author: Team Enigma
6
+ Author-email: enigma@energinet.dk
7
+ Requires-Python: >=3.10
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: Programming Language :: Python :: 3.10
10
+ Classifier: Programming Language :: Python :: 3.11
11
+ Classifier: Programming Language :: Python :: 3.12
12
+ Classifier: Programming Language :: Python :: 3.13
13
+ Classifier: Programming Language :: Python :: 3.14
14
+ Requires-Dist: datamazing (>=5.1.6)
15
+ Requires-Dist: deltalake (==1.2.1)
16
+ Requires-Dist: obstore (>=0.8.2)
17
+ Requires-Dist: pandas (>=2.0.3,<3)
18
+ Requires-Dist: polars (>=1.33.1)
19
+ Requires-Dist: pyarrow (>=19.0.0)
20
+ Description-Content-Type: text/markdown
21
+
22
+ # Datazone SDK
23
+
24
+ Database and Delta storage client library for working with Delta Lake tables.
25
+
26
+ ## Overview
27
+
28
+ This package provides functionality for interacting with Delta Lake tables, including:
29
+
30
+ - **Database Client**: High-level client for querying Delta Lake tables with support for time intervals, time travel, and filtering.
31
+ - **Delta Storage**: Low-level components for working with Delta tables, schemas, and data types.
32
+
33
+ ## Installation
34
+
35
+ ```bash
36
+ pip install datazone-sdk
37
+ ```
38
+
39
+ ## Usage
40
+
41
+ ```python
42
+ import datazone as dz
43
+
44
+ # Create a client from a storage account name
45
+ client = dz.DatabaseClient.from_resource_name(
46
+ storage_account="<storage-account>",
47
+ container_name="<container-name>",
48
+ sub_path="<sub-path>",
49
+ )
50
+
51
+ # Query a table
52
+ df = client.query(
53
+ table_name="my_table",
54
+ filters={"column": "value"},
55
+ )
56
+ ```
@@ -0,0 +1,35 @@
1
+ # Datazone SDK
2
+
3
+ Database and Delta storage client library for working with Delta Lake tables.
4
+
5
+ ## Overview
6
+
7
+ This package provides functionality for interacting with Delta Lake tables, including:
8
+
9
+ - **Database Client**: High-level client for querying Delta Lake tables with support for time intervals, time travel, and filtering.
10
+ - **Delta Storage**: Low-level components for working with Delta tables, schemas, and data types.
11
+
12
+ ## Installation
13
+
14
+ ```bash
15
+ pip install datazone-sdk
16
+ ```
17
+
18
+ ## Usage
19
+
20
+ ```python
21
+ import datazone as dz
22
+
23
+ # Create a client from a storage account name
24
+ client = dz.DatabaseClient.from_resource_name(
25
+ storage_account="<storage-account>",
26
+ container_name="<container-name>",
27
+ sub_path="<sub-path>",
28
+ )
29
+
30
+ # Query a table
31
+ df = client.query(
32
+ table_name="my_table",
33
+ filters={"column": "value"},
34
+ )
35
+ ```
@@ -0,0 +1,2 @@
1
+ from datazone.db.client import DatabaseClient
2
+ from datazone.deltastorage import Field, HyperSlice, Schema, Store, Table
@@ -0,0 +1 @@
1
+ from . import client
@@ -0,0 +1,233 @@
1
+ from typing import Optional
2
+
3
+ import datamazing.pandas as pdz
4
+ import pandas as pd
5
+ import pyarrow.compute as pc
6
+
7
+ from datazone.deltastorage.slicing import HyperSlice
8
+ from datazone.deltastorage.store import Store
9
+
10
+
11
+ class DatabaseClient:
12
+ def __init__(
13
+ self,
14
+ path: str,
15
+ storage_options: dict[str, str] | None = None,
16
+ table_prefix: str = "",
17
+ ):
18
+ self.store = Store(
19
+ path=path,
20
+ storage_options=storage_options,
21
+ )
22
+ self.table_prefix = table_prefix
23
+
24
+ @classmethod
25
+ def from_resource_name(
26
+ cls,
27
+ storage_account: str,
28
+ container_name: str = "datasets",
29
+ sub_path: str = "",
30
+ table_prefix: str = "",
31
+ ):
32
+ """Create a DatabaseClient from resource name (storage account).
33
+ This assumes the path of the delta lake is of the form:
34
+ abfss://{container_name}@{storage_account}.dfs.core.windows.net/{sub_path}
35
+
36
+ Args:
37
+ storage_account (str): Storage account name.
38
+ container_name (str, optional): Container name. Defaults to "datasets".
39
+ sub_path (str, optional): Sub-path within the container. Defaults to "".
40
+ table_prefix (str, optional): Table prefix to use (e.g. `mz_` for archive).
41
+ Defaults to "".
42
+ credential (optional): Azure credential to use.
43
+ Defaults to DefaultAzureCredential().
44
+ """
45
+ path = f"abfss://{container_name}@{storage_account}.dfs.core.windows.net"
46
+ if sub_path:
47
+ path += f"/{sub_path}"
48
+
49
+ storage_options = Store._get_func_storage_options()
50
+
51
+ return cls(
52
+ path=path, storage_options=storage_options, table_prefix=table_prefix
53
+ )
54
+
55
+ def get_unit_and_multiple(self, timedelta: pd.Timedelta) -> tuple[str | None, int]:
56
+ """
57
+ Get unit and multiple of a timedelta. E.g. for a timedelta of "PT5M" then
58
+ unit = "minute" and multiple = 5.
59
+ NOTE: Timedelta must have one and only one non-zero component,
60
+ i.e. "PT0S" doesnt work, and neither does "PT5M10S".
61
+
62
+ Args:
63
+ timedelta (pd.Timedelta): Timedelta
64
+
65
+ Returns:
66
+ tuple[str, int]: Unit and multiple
67
+ """
68
+ components = timedelta.components._asdict()
69
+
70
+ # remove plural ending from unit, since
71
+ # this is the standard pyarrow uses
72
+ components = {k[:-1]: v for k, v in components.items()}
73
+
74
+ non_zero_components = {
75
+ unit: multiple for unit, multiple in components.items() if multiple != 0
76
+ }
77
+
78
+ if len(non_zero_components) == 0:
79
+ return None, 0
80
+
81
+ if len(non_zero_components) != 1:
82
+ raise ValueError("Timedelta must have one and only one non-zero multiple.")
83
+
84
+ return next(iter(non_zero_components.items()))
85
+
86
+ def relative_time_travel_version(
87
+ self, time_column: str, block: pd.Timedelta, horizon: pd.Timedelta
88
+ ) -> pc.Expression:
89
+ """
90
+ Get value to use for filtering a relative time travel
91
+ (i.e. the interval [valid-from, valid-to] must contain
92
+ this value)
93
+ """
94
+ unit, multiple = self.get_unit_and_multiple(block)
95
+
96
+ if multiple == 0:
97
+ # `pc.floor_temporal` fails with multiple=0,
98
+ # but in this case we don't need to floor
99
+ # the time anyway
100
+ start_of_block = pc.field("time_utc")
101
+ else:
102
+ start_of_block = pc.floor_temporal(
103
+ pc.field(time_column),
104
+ multiple=multiple,
105
+ unit=unit,
106
+ )
107
+
108
+ return start_of_block - horizon.to_pytimedelta()
109
+
110
+ def time_travel_filter(
111
+ self,
112
+ time_travel: pdz.TimeTravel,
113
+ time_column: str,
114
+ valid_from_column: str,
115
+ valid_to_column: str,
116
+ ) -> list[HyperSlice]:
117
+ """Filter delta table on a time travel
118
+
119
+ Args:
120
+ time_travel (pdz.TimeTravel): Time travel
121
+ time_column (str): Time column name
122
+ valid_from_column (str): Valid-from column name
123
+ valid_to_column (str): Valid-to column name
124
+ """
125
+ match time_travel.tense:
126
+ case "absolute":
127
+ # If the time travel is absolute, we filter
128
+ # to entries where [valid-from, valid-to]
129
+ # contains `as_of_time`
130
+ version = time_travel.as_of_time.to_pydatetime()
131
+ case "relative":
132
+ version = self.relative_time_travel_version(
133
+ time_column, time_travel.block, time_travel.horizon
134
+ )
135
+
136
+ return [
137
+ HyperSlice((valid_from_column, "<=", version)),
138
+ HyperSlice((valid_to_column, ">", version)),
139
+ ]
140
+
141
+ def query(
142
+ self,
143
+ table_name: str,
144
+ time_interval: Optional[pdz.TimeInterval] = None,
145
+ time_travel: Optional[pdz.TimeTravel] = None,
146
+ filters: Optional[dict[str, object]] = None,
147
+ columns: Optional[list[str]] = None,
148
+ include_validity_period_columns: bool = False,
149
+ include_generated_columns: bool = False,
150
+ ) -> pd.DataFrame:
151
+ """Query table.
152
+ Query defaults are set to match old Table Storage client behavior.
153
+ Time travel defaults to "as of now"
154
+ Validity period columns are dropped by default.
155
+ Generated columns are dropped by default.
156
+
157
+ Args:
158
+ table_name (str): Name of the table
159
+ time_interval (Optional[pdz.TimeInterval], optional): Time interval for the
160
+ query. Defaults to None.
161
+ time_travel (Optional[pdz.TimeTravel], optional): Time travel information.
162
+ Defaults to None.
163
+ filters (Optional[dict[str, object]], optional): Filters to apply to the
164
+ query.
165
+ Defaults to None.
166
+ columns (Optional[list[str]], optional): Columns to return.
167
+ Selecting columns can significantly improve query performance.
168
+ Defaults to None, meaning all columns will be returned.
169
+ include_validity_period_columns (bool, optional): Whether to include
170
+ validity period columns in the result;
171
+ (`valid_from_time_utc`, `valid_to_time_utc`).
172
+ Defaults to False.
173
+ include_generated_columns (bool, optional): Whether to include generated
174
+ columns in the result; (e.g. `valid_from_time_utc`, `valid_to_time_utc`).
175
+ Defaults to False.
176
+
177
+ Returns:
178
+ pd.DataFrame: The result of the query.
179
+ """
180
+ # Prefix used to differentiate between operation ("{table_name}")
181
+ # and historical ("mz_{table_name}").
182
+ table_name = f"{self.table_prefix}{table_name}"
183
+
184
+ table = self.store.get_table(table_name)
185
+ hyper_slice = []
186
+
187
+ if filters:
188
+ for key, value in filters.items():
189
+ if isinstance(value, (list, tuple, set)):
190
+ hyper_slice.append((key, "in", value))
191
+ else:
192
+ hyper_slice.append((key, "=", value))
193
+ if time_interval:
194
+ hyper_slice.append(("time_utc", ">=", time_interval.left))
195
+ hyper_slice.append(("time_utc", "<=", time_interval.right))
196
+
197
+ if time_travel is None:
198
+ time_travel = pdz.TimeTravel(
199
+ as_of_time=pd.Timestamp.utcnow(),
200
+ )
201
+
202
+ tt_filter = self.time_travel_filter(
203
+ time_travel,
204
+ time_column="time_utc",
205
+ valid_from_column="valid_from_time_utc",
206
+ valid_to_column="valid_to_time_utc",
207
+ )
208
+
209
+ hyper_slice.extend(tt_filter)
210
+ pl_df = table.read(hyper_slice=HyperSlice(hyper_slice), columns=columns)
211
+
212
+ pd_df = pl_df.to_pandas()
213
+
214
+ # We truncate to second, and change to nanosecond
215
+ # precision because this was used by the old solution (Azure Table Storage)
216
+ for col in pd_df.select_dtypes(include=["datetime", "datetimetz"]).columns:
217
+ pd_df[col] = pd_df[col].dt.floor("s").dt.as_unit("ns")
218
+
219
+ # Drop generated columns
220
+ if not include_generated_columns:
221
+ generated_cols = []
222
+ for field in table.schema().fields:
223
+ if field.generated_as is not None:
224
+ generated_cols.append(field.column_name)
225
+ pd_df = pd_df.drop(columns=generated_cols, errors="ignore")
226
+
227
+ # Drop valid-from/to columns
228
+ if not include_validity_period_columns:
229
+ pd_df = pd_df.drop(
230
+ columns=["valid_from_time_utc", "valid_to_time_utc"], errors="ignore"
231
+ )
232
+
233
+ return pd_df
@@ -0,0 +1,4 @@
1
+ from .schema import Field, Schema
2
+ from .slicing import HyperSlice
3
+ from .store import Store
4
+ from .table import Table
@@ -0,0 +1,94 @@
1
+ from abc import ABC, abstractmethod
2
+
3
+ import pyarrow as pa
4
+
5
+
6
class DataType(ABC):
    """Abstract data type with conversions to and from pyarrow types."""

    @classmethod
    def from_arrow(cls, pa_type: pa.DataType) -> "DataType":
        """Map a pyarrow type to the corresponding DataType instance."""
        if pa.types.is_string(pa_type) or pa.types.is_large_string(pa_type):
            return String()
        if pa.types.is_floating(pa_type):
            return Float()
        if pa.types.is_integer(pa_type):
            return Int()
        if pa.types.is_timestamp(pa_type):
            return Timestamp(tz=pa_type.tz)
        if pa.types.is_date(pa_type):
            return Date()
        if pa.types.is_boolean(pa_type):
            return Boolean()
        if pa.types.is_null(pa_type):
            return Null()
        raise ValueError(f"Unsupported data type: {pa_type}")

    @abstractmethod
    def __str__(self):
        ...

    @abstractmethod
    def to_arrow(self) -> pa.DataType:
        ...

    def __eq__(self, other: "DataType") -> bool:
        # two data types are equal iff their arrow representations are
        return self.to_arrow() == other.to_arrow()
36
+
37
+
38
class String(DataType):
    """UTF-8 string type (maps to `pa.string()`)."""

    def to_arrow(self) -> pa.DataType:
        return pa.string()

    def __str__(self):
        return "string"
44
+
45
+
46
class Float(DataType):
    """64-bit floating point type (maps to `pa.float64()`)."""

    def to_arrow(self) -> pa.DataType:
        return pa.float64()

    def __str__(self):
        return "float"
52
+
53
+
54
class Int(DataType):
    """64-bit integer type (maps to `pa.int64()`)."""

    def to_arrow(self) -> pa.DataType:
        return pa.int64()

    def __str__(self):
        return "int"
60
+
61
+
62
class Timestamp(DataType):
    """Microsecond-precision timestamp type with a fixed timezone."""

    def __init__(self, tz: str):
        # timezone string, e.g. "UTC"; may be None for naive timestamps
        self.tz = tz

    def to_arrow(self) -> pa.DataType:
        return pa.timestamp("us", tz=self.tz)

    def __str__(self):
        return f"timestamp[{self.tz}]"
71
+
72
+
73
class Date(DataType):
    """Calendar date type (maps to `pa.date32()`)."""

    def to_arrow(self) -> pa.DataType:
        return pa.date32()

    def __str__(self):
        return "date"
79
+
80
+
81
class Boolean(DataType):
    """Boolean type (maps to `pa.bool_()`)."""

    def to_arrow(self) -> pa.DataType:
        return pa.bool_()

    def __str__(self):
        return "boolean"
87
+
88
+
89
class Null(DataType):
    """Null type: a column where all values are missing."""

    def to_arrow(self) -> pa.DataType:
        return pa.null()

    def __str__(self):
        return "null"
@@ -0,0 +1,151 @@
1
+ import copy
2
+ import datetime as dt
3
+ from abc import ABC, abstractmethod
4
+ from typing import Any
5
+ from zoneinfo import ZoneInfo
6
+
7
+ import polars as pl
8
+
9
+
10
+ class GeneratedColumn(ABC):
11
+ """Class representing a generated column."""
12
+
13
+ @property
14
+ @abstractmethod
15
+ def base_column_names(self) -> list[str]:
16
+ ...
17
+
18
+ @abstractmethod
19
+ def to_metadata(self) -> dict:
20
+ ...
21
+
22
+ @abstractmethod
23
+ def expression(self) -> pl.Expr:
24
+ """Polars expression to compute the generated column."""
25
+ ...
26
+
27
+ @abstractmethod
28
+ def get_generated_conditions(self, op: str, value) -> list[tuple[str, Any]]:
29
+ """Get conditions on the generated column based on a condition
30
+ on the base column."""
31
+ ...
32
+
33
+ @classmethod
34
+ def from_metadata(cls, metadata: dict):
35
+ if metadata["type"] == "date_bucket":
36
+ return DateBucket(
37
+ base_column_name=metadata["base_column_name"],
38
+ as_tz=metadata.get("as_tz", None),
39
+ )
40
+ elif metadata["type"] == "concat":
41
+ return Concat(
42
+ base_column_names=metadata["base_column_names"],
43
+ delimiter=metadata["delimiter"],
44
+ )
45
+ else:
46
+ raise ValueError(f"Unknown generated column type: {metadata['type']}")
47
+
48
+
49
class DateBucket(GeneratedColumn):
    """Generated column which bins a timestamp column to a date,
    optionally converting to a specific timezone first."""

    def __init__(self, base_column_name: str, as_tz: str | None = None):
        """
        Args:
            base_column_name (str): Base column name
            as_tz (str, optional): Cast to date in this timezone.
                Defaults to None, meaning it uses the original timezone.
        """
        self.base_column_name = base_column_name
        self.as_tz = as_tz

    def __str__(self):
        if self.as_tz is None:
            return f"date_bucket({self.base_column_name})"
        return f"date_bucket({self.base_column_name}, as_tz={self.as_tz})"

    @property
    def base_column_names(self) -> list[str]:
        return [self.base_column_name]

    def to_metadata(self) -> dict:
        metadata = {
            "type": "date_bucket",
            "base_column_name": self.base_column_name,
        }
        # the timezone is only stored when explicitly set
        if self.as_tz is not None:
            metadata["as_tz"] = self.as_tz
        return metadata

    def expression(self) -> pl.Expr:
        base = pl.col(self.base_column_name)
        if self.as_tz is not None:
            base = base.dt.convert_time_zone(self.as_tz)
        return base.dt.date()

    def get_generated_conditions(
        self, op: str, value: dt.datetime
    ) -> list[tuple[str, dt.date]]:
        """Translate a condition on the base timestamp column into
        conditions on the generated date column.

        Args:
            op (str): Operator
            value (dt.datetime): Value

        Returns:
            list[tuple[str, dt.date]]: Conditions on the generated column
        """
        if self.as_tz is not None:
            timestamp = value.astimezone(ZoneInfo(self.as_tz))
        else:
            timestamp = value
        date = timestamp.date()

        if op == "=":
            return [("=", date)]
        if op in ("<", "<="):
            return [("<=", date)]
        if op in (">", ">="):
            return [(">=", date)]
        # for other operations, we cannot make any
        # useful filters on the generated column
        return []
116
+
117
+
118
class Concat(GeneratedColumn):
    """Generated column which concatenates multiple columns
    into a single string."""

    def __init__(self, base_column_names: list[str], delimiter: str):
        """
        Args:
            base_column_names (list[str]): Base column names
            delimiter (str): Delimiter used for concatenation
        """
        self._base_column_names = base_column_names
        self.delimiter = delimiter

    def __str__(self):
        joined = ", ".join(self.base_column_names)
        return f"concat([{joined}], delimiter='{self.delimiter}')"

    @property
    def base_column_names(self) -> list[str]:
        return self._base_column_names

    def to_metadata(self) -> dict:
        return {
            "type": "concat",
            "base_column_names": self.base_column_names,
            "delimiter": self.delimiter,
        }

    def expression(self) -> pl.Expr:
        return pl.concat_str(self.base_column_names, separator=self.delimiter)

    def get_generated_conditions(self, op: str, value: Any) -> list[tuple[str, Any]]:
        # for concat generated columns, we cannot make any
        # useful filters on the generated column
        return []
@@ -0,0 +1,135 @@
1
+ import json
2
+ from typing import Optional
3
+
4
+ import polars as pl
5
+ import pyarrow as pa
6
+
7
+ from .data_types import DataType
8
+ from .generated_columns import GeneratedColumn
9
+ from .slicing import HyperSlice
10
+
11
+
12
class Field:
    """A table field: a column name, a data type and (optionally)
    a generated-column definition."""

    def __init__(
        self,
        column_name: str,
        data_type: DataType,
        generated_as: Optional[GeneratedColumn] = None,
    ):
        """
        Args:
            column_name (str): Column name
            data_type (DataType): Data type
            generated_as (GeneratedColumn, optional): Generated column based on a
                regular column. Defaults to None, meaning the column is not
                generated.
        """
        self.column_name = column_name
        self.data_type = data_type
        self.generated_as = generated_as

    def __repr__(self):
        if self.generated_as is None:
            return f"{self.column_name}: {self.data_type}"
        return f"{self.column_name}: {self.data_type} [generated as {self.generated_as}]"

    def __eq__(self, other: "Field") -> bool:
        # equality is defined via the arrow representation,
        # including the generated-column metadata
        return self.to_arrow().equals(other.to_arrow(), check_metadata=True)

    def to_arrow(self):
        """Convert to pyarrow field. Information about generated columns
        is stored as metadata."""
        metadata = None
        if self.generated_as is not None:
            metadata = {
                "generated_column": json.dumps(self.generated_as.to_metadata())
            }
        return pa.field(self.column_name, self.data_type.to_arrow(), metadata=metadata)

    @classmethod
    def from_arrow(cls, pa_field: pa.Field) -> "Field":
        """Convert from pyarrow field"""
        generated_as = None
        if pa_field.metadata is not None:
            # arrow metadata is bytes-to-bytes; decode before lookup
            decoded = {key.decode(): val.decode() for key, val in pa_field.metadata.items()}
            gen_json = decoded.get("generated_column")
            if gen_json is not None:
                generated_as = GeneratedColumn.from_metadata(json.loads(gen_json))

        return cls(pa_field.name, DataType.from_arrow(pa_field.type), generated_as)
67
+
68
+
69
class Schema:
    """An ordered collection of `Field`s describing a table."""

    def __init__(
        self,
        fields: list[Field],
    ):
        """
        Args:
            fields (list[Field]): Schema fields
        """
        self.fields = fields

    def __eq__(self, other: "Schema") -> bool:
        return self.to_arrow().equals(other.to_arrow(), check_metadata=True)

    def __repr__(self):
        return "\n".join(repr(field) for field in self.fields)

    def to_arrow(self):
        """Convert to pyarrow schema."""
        return pa.schema([field.to_arrow() for field in self.fields])

    @classmethod
    def from_arrow(cls, pa_schema: pa.Schema) -> "Schema":
        """Convert from pyarrow schema"""
        return cls([Field.from_arrow(pa_field) for pa_field in pa_schema])

    def add_generated_columns(self, base_df: pl.DataFrame) -> pl.DataFrame:
        """Add additional columns to a dataframe derived
        from all generated columns in the schema.

        Args:
            base_df (pl.DataFrame): Input dataframe
        """
        exprs = [
            field.generated_as.expression().alias(field.column_name)
            for field in self.fields
            if field.generated_as is not None
        ]
        return base_df.with_columns(exprs)

    def add_generated_filters(self, base_slice: HyperSlice) -> HyperSlice:
        """Add additional filters to a hyperslice based on
        all generated columns in the schema.

        Args:
            base_slice (HyperSlice): Input hyperslice
        """
        extra = []
        for col, op, val in base_slice:
            for field in self.fields:
                generated = field.generated_as
                # only fields generated from the filtered column are relevant
                if generated is None or col not in generated.base_column_names:
                    continue
                extra.extend(
                    (field.column_name, gen_op, gen_val)
                    for gen_op, gen_val in generated.get_generated_conditions(op, val)
                )
        return HyperSlice(list(base_slice) + extra)
@@ -0,0 +1,22 @@
1
+ from typing import Any
2
+
3
+
4
class HyperSlice(list[tuple[str, str, Any]]):
    """An n-dimensional slice of a table, expressed as a list of
    (column, operator, value) tuples.

    Each tuple is one filter applied to the table. For example, the
    hyper slice

        [
            ("country", "=", "Denmark"),
            ("date", ">", "2000-01-01"),
        ]

    selects all records where the country is 'Denmark' and the date is
    greater than 2000-01-01. In SQL, this would be equivalent to the
    WHERE clause:

        country = 'Denmark' AND date > '2000-01-01'
    """

    ...
@@ -0,0 +1,138 @@
1
+ import os
2
+
3
+ import deltalake as dl
4
+ import obstore as obs
5
+ from deltalake.exceptions import TableNotFoundError as DeltaTableNotFoundError
6
+
7
+ from .schema import Schema
8
+ from .table import Table
9
+
10
+
11
class Store:
    """A root location containing Delta tables as datasets."""

    def __init__(
        self,
        path: str,
        storage_options: dict[str, str] | None = None,
    ):
        """Class representing a store containing datasets

        Args:
            path (str): Root directory containing Delta tables
            storage_options (dict[str, str] | None, optional): Storage options used
                for remote cloud storage. For more information on available
                options, go to
                https://delta-io.github.io/delta-rs/integrations/object-storage/.
                Defaults to None, corresponding to the local file system.
        """
        self.path = path
        self.storage_options = (
            storage_options
            if storage_options is not None
            else self._get_func_storage_options()
        )
        # We use obstore to interact with remote cloud storage for
        # operations not directly supported by delta-rs (e.g. listing
        # directories). We could use fsspec, but the `storage_options`
        # used by delta-rs and fsspec are not compatible.
        self._obstore = obs.store.from_url(
            url=path,
            config=self.storage_options,
        )

    def __repr__(self):
        return f"Store('{self.path}')"

    def _get_table_uri(self, table_name: str) -> str:
        # tables live directly under the store root
        return f"{self.path}/{table_name}"

    @staticmethod
    def _get_func_storage_options() -> dict[str, str]:
        """Get storage options appropriate for the current runtime.
        These differ depending on whether we are running
        in the cloud or locally.
        """
        identity_endpoint = os.environ.get("IDENTITY_ENDPOINT")
        if identity_endpoint is not None:
            # When running in an Azure Function, IDENTITY_ENDPOINT is set,
            # and we use the managed identity to access the storage account.
            return {"azure_msi_endpoint": identity_endpoint}
        # When running locally, we use the Azure CLI to authenticate.
        return {"use_azure_cli": "true"}

    @classmethod
    def from_func_environment_variable(cls, container_name: str = "datasets"):
        """Create Store instance from environment variable.
        This uses default storage options, created in `__init__`."""
        account = os.environ["OPERATIONAL_DATA_STORAGE_ACCOUNT"]
        return cls(path=f"abfss://{container_name}@{account}.dfs.core.windows.net")

    def list_tables(self) -> list[str]:
        """List all Delta tables"""
        return self._obstore.list_with_delimiter()["common_prefixes"]

    def table_exists(self, table_name: str):
        """Check if Delta table exists

        Args:
            table_name (str): Table name
        """
        # For some reason `deltalake.DeltaTable.is_deltatable()` can be very
        # slow. deltalake has an issue open about this:
        # https://github.com/delta-io/delta-rs/issues/3942
        # For now we catch the exception when trying to load the table
        try:
            dl.DeltaTable(
                table_uri=self._get_table_uri(table_name),
                storage_options=self.storage_options,
                without_files=True,
            )
        except DeltaTableNotFoundError:
            return False
        else:
            return True

    def create_table(
        self,
        table_name: str,
        schema: Schema,
        partition_by: list[str] | None = None,
    ) -> Table:
        """Create Delta table

        Args:
            table_name (str): Table name.
            schema (Schema): Table schema.
            partition_by (list[str], optional): Partition columns.
        """
        if self.table_exists(table_name):
            raise ValueError(f"Table with name '{table_name}' already exists")

        if schema is None:
            raise ValueError("Schema must be provided when creating a new table")

        dl.DeltaTable.create(
            table_uri=self._get_table_uri(table_name),
            schema=schema.to_arrow(),
            storage_options=self.storage_options,
            partition_by=partition_by,
            # keep history short to limit storage use
            configuration={
                "delta.deletedFileRetentionDuration": "interval 2 hours",
                "delta.logRetentionDuration": "interval 4 hours",
            },
        )

        return Table(self._get_table_uri(table_name), self.storage_options)

    def get_table(self, table_name: str) -> Table:
        """Get Delta table

        Args:
            table_name (str): Table name
        """
        if not self.table_exists(table_name):
            raise ValueError(f"Table with name '{table_name}' does not exist")

        return Table(self._get_table_uri(table_name), self.storage_options)
@@ -0,0 +1,210 @@
1
+ from typing import Any, Optional
2
+
3
+ import deltalake as dl
4
+ import polars as pl
5
+ import pyarrow as pa
6
+
7
+ from .schema import Schema
8
+ from .slicing import HyperSlice
9
+
10
+
11
+ def _dnf_to_sql(dnf: list[tuple]) -> str:
12
+ """Convert DNF expression to SQL expression."""
13
+ if len(dnf) == 0:
14
+ return "1=1"
15
+
16
+ sql_parts = []
17
+ for col, op, val in dnf:
18
+ if op == "in":
19
+ assert isinstance(val, list)
20
+ lst = ", ".join([f"'{item}'" for item in val])
21
+ sql_parts.append(f"{col} IN ({lst})")
22
+ elif op in [">=", "<=", ">", "<", "="]:
23
+ sql_parts.append(f"{col} {op} '{val}'")
24
+ else:
25
+ raise ValueError(f"Unsupported operation: {op}")
26
+
27
+ return " AND ".join(sql_parts)
28
+
29
+
30
+ class Table:
31
+ def __init__(
32
+ self,
33
+ table_uri: str,
34
+ storage_options: dict[str, str] | None = None,
35
+ ):
36
+ """Class representing a dataset
37
+
38
+ Args:
39
+ delta_table (dl.DeltaTable): Delta table
40
+ """
41
+ self.table_uri = table_uri
42
+ self.storage_options = storage_options
43
+
44
+ self.table_name = self.table_uri.split("/")[-1]
45
+ self._delta_table = None
46
+
47
+ def __repr__(self):
48
+ return f"Table('{self.table_name}')"
49
+
50
+ @property
51
+ def delta_table(self) -> dl.DeltaTable:
52
+ """Get the Delta table object.
53
+ As the `Table`-class is lazily initialized,
54
+ the `delta_table`-property is initialized when needed.
55
+ We do not cache it, which creates a little overhead, but reduces
56
+ the risk of false transaction issues when doing concurrent reads/writes.
57
+ This is important because using the same instance can lead to transaction
58
+ issues in delta as DeltaTable uses metadata (transaction id) from
59
+ the first time the object is instantiated.
60
+
61
+ The risk disappears when this instance is created within a lock.
62
+ """
63
+ return dl.DeltaTable(self.table_uri, storage_options=self.storage_options)
64
+
65
+ def partition_cols(self) -> list[str]:
66
+ """Get the partition columns of the table"""
67
+ return self.delta_table.metadata().partition_columns
68
+
69
+ def schema(self) -> Schema:
70
+ """Get the schema of the table"""
71
+ pa_schema = pa.schema(self.delta_table.schema())
72
+ return Schema.from_arrow(pa_schema)
73
+
74
+ def read(
75
+ self, hyper_slice: Optional[HyperSlice] = None, columns=None
76
+ ) -> pl.DataFrame:
77
+ """Read from Delta table
78
+
79
+ Args:
80
+ hyper_slice (HyperSlice): Hyper sliced used to filter data
81
+ """
82
+ if hyper_slice is None:
83
+ hyper_slice = []
84
+
85
+ # add generated filters to hyperslice
86
+ hyper_slice = self.schema().add_generated_filters(hyper_slice)
87
+
88
+ delta_table = self.delta_table
89
+ partition_cols = delta_table.metadata().partition_columns
90
+
91
+ if len(hyper_slice) == 0:
92
+ file_filters = None
93
+ partition_filters = None
94
+ else:
95
+ file_filters = hyper_slice
96
+ partition_filters = [f for f in hyper_slice if f[0] in partition_cols]
97
+
98
+ pyarrow_table_existing_data = delta_table.to_pyarrow_table(
99
+ columns=columns,
100
+ partitions=partition_filters,
101
+ filters=file_filters,
102
+ )
103
+
104
+ return pl.from_arrow(pyarrow_table_existing_data)
105
+
106
+ def _to_writable_pyarrow_table(self, df: pl.DataFrame, schema: Schema) -> pa.Table:
107
+ """Convert Polars dataframe to pyarrow table with casted schema.
108
+ The conversion will include generated columns.
109
+
110
+ Args:
111
+ df (pl.DataFrame): Dataframe to convert
112
+ schema (Schema): Schema to cast the dataframe to
113
+
114
+ Returns:
115
+ pa.Table: PyArrow table with the casted schema
116
+ """
117
+ df = schema.add_generated_columns(df)
118
+ pyarrow_table = df.to_arrow()
119
+
120
+ # we need to cast the incoming data to the
121
+ # table schema. In theory, this should automatically
122
+ # be casted, but it seems that metadata on fields
123
+ # gets removed otherwise.
124
+ pa_schema = schema.to_arrow()
125
+ return pyarrow_table.select(pa_schema.names).cast(pa_schema)
126
+
127
+ def _write_deltalake(
128
+ self, data: pa.Table, mode: str, predicate: Optional[str]
129
+ ) -> None:
130
+ """Write data to Delta Lake using deltalake-python.
131
+
132
+ Args:
133
+ data (pa.Table): PyArrow table to write to Delta Lake
134
+ mode (str): Write mode, either "overwrite" or "append"
135
+ predicate (Optional[str]): SQL predicate to filter rows for update or
136
+ delete operations. If None, the operation will apply to all rows.
137
+ """
138
+ dl.write_deltalake(
139
+ table_or_uri=self.delta_table,
140
+ data=data,
141
+ mode=mode,
142
+ predicate=predicate,
143
+ schema_mode="merge",
144
+ )
145
+
146
+ def update(self, df: pl.DataFrame, hyper_slice: HyperSlice) -> None:
147
+ """Update rows in Delta Lake based on a HyperSlice. This will overwrite data
148
+ in the Delta Lake specified by the HyperSlice.
149
+
150
+ Args:
151
+ df (pl.DataFrame): DataFrame containing the rows to update.
152
+ hyper_slice (HyperSlice): HyperSlice used to define rows to update.
153
+ If None, all rows will be updated.
154
+ """
155
+ schema = self.schema()
156
+ data = self._to_writable_pyarrow_table(df=df, schema=schema)
157
+
158
+ hyper_slice = schema.add_generated_filters(hyper_slice)
159
+ if len(hyper_slice) == 0:
160
+ predicate = None
161
+ else:
162
+ predicate = _dnf_to_sql(hyper_slice)
163
+
164
+ self._write_deltalake(data=data, mode="overwrite", predicate=predicate)
165
+
166
+ def append(self, df: pl.DataFrame) -> None:
167
+ """Append rows to Delta Lake. This will write data to the Delta Lake.
168
+
169
+ Args:
170
+ df (pl.DataFrame): DataFrame containing the rows to append.
171
+ """
172
+ schema = self.schema()
173
+ data = self._to_writable_pyarrow_table(df=df, schema=schema)
174
+ self._write_deltalake(data=data, mode="append", predicate=None)
175
+
176
+ def optimize(self) -> list[str]:
177
+ """Optimize Delta table by compacting and vacuuming
178
+
179
+ Returns:
180
+ list[str]: List of removed files
181
+ """
182
+ delta_table = self.delta_table
183
+ metrics = delta_table.optimize.compact()
184
+
185
+ vacuumed_files = delta_table.vacuum(
186
+ dry_run=False,
187
+ )
188
+
189
+ metrics["numFilesVacuumed"] = len(vacuumed_files)
190
+
191
+ return metrics
192
+
193
+ def delete(self, hyper_slice: HyperSlice) -> dict[str, Any]:
194
+ """Delete data from Delta table
195
+
196
+ Args:
197
+ hyper_slice (HyperSlice): Hyper slice to delete.
198
+ If None, all data will be deleted.
199
+
200
+ Returns:
201
+ dict[str, any]: Delete metrics.
202
+
203
+ https://docs.databricks.com/gcp/en/delta/history#operation-metrics-keys
204
+ """
205
+ if hyper_slice is None or hyper_slice == [()]:
206
+ predicate = None
207
+ else:
208
+ predicate = _dnf_to_sql(hyper_slice)
209
+
210
+ return self.delta_table.delete(predicate)
@@ -0,0 +1,42 @@
1
+ [project]
2
+ name = "datazone-sdk"
3
+ version = "1.0.0"
4
+ description = "Database and Delta storage client library for working with Delta Lake tables"
5
+ authors = [{ name = "Team Enigma", email = "enigma@energinet.dk" }]
6
+ requires-python = ">=3.10"
7
+ readme = "README.md"
8
+
9
+ [tool.poetry]
10
+ packages = [{ include = "datazone" }]
11
+ requires-poetry = ">=2.2"
12
+
13
+ [tool.poetry.dependencies]
14
+ pandas = ">=2.0.3,<3"
15
+ polars = ">=1.33.1"
16
+ obstore = ">=0.8.2"
17
+ deltalake = "==1.2.1" # pin to avoid breaking changes in 1.3.0. Follow Github issue here https://github.com/delta-io/delta-rs/issues/3939
18
+ pyarrow = ">=19.0.0"
19
+ datamazing = ">=5.1.6"
20
+
21
+ [tool.poetry.group.dev.dependencies]
22
+ pre-commit = ">=2.20.0"
23
+
24
+ [tool.poetry.group.test.dependencies]
25
+ pytest = ">=7"
26
+ pytest-cov = ">=3.0.0"
27
+ mypy = ">=1.19.0"
28
+
29
+ [build-system]
30
+ requires = ["poetry-core>=2.0.0,<3.0.0"]
31
+ build-backend = "poetry.core.masonry.api"
32
+
33
+ [tool.isort]
34
+ multi_line_output = 3
35
+ line_length = 88
36
+ include_trailing_comma = true
37
+
38
+ [tool.black]
39
+ line_length = 88
40
+
41
+ [tool.mypy]
42
+ ignore_missing_imports = true