ducklake-delta-exporter 0.2.0__tar.gz → 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ducklake_delta_exporter-0.2.0 → ducklake_delta_exporter-0.3.0}/PKG-INFO +1 -1
- {ducklake_delta_exporter-0.2.0 → ducklake_delta_exporter-0.3.0}/ducklake_delta_exporter/__init__.py +3 -2
- {ducklake_delta_exporter-0.2.0 → ducklake_delta_exporter-0.3.0}/ducklake_delta_exporter.egg-info/PKG-INFO +1 -1
- {ducklake_delta_exporter-0.2.0 → ducklake_delta_exporter-0.3.0}/ducklake_delta_exporter.egg-info/SOURCES.txt +2 -1
- {ducklake_delta_exporter-0.2.0 → ducklake_delta_exporter-0.3.0}/setup.py +1 -1
- ducklake_delta_exporter-0.3.0/tests/test_stats_transformation.py +656 -0
- {ducklake_delta_exporter-0.2.0 → ducklake_delta_exporter-0.3.0}/README.md +0 -0
- {ducklake_delta_exporter-0.2.0 → ducklake_delta_exporter-0.3.0}/ducklake_delta_exporter.egg-info/dependency_links.txt +0 -0
- {ducklake_delta_exporter-0.2.0 → ducklake_delta_exporter-0.3.0}/ducklake_delta_exporter.egg-info/requires.txt +0 -0
- {ducklake_delta_exporter-0.2.0 → ducklake_delta_exporter-0.3.0}/ducklake_delta_exporter.egg-info/top_level.txt +0 -0
- {ducklake_delta_exporter-0.2.0 → ducklake_delta_exporter-0.3.0}/setup.cfg +0 -0
{ducklake_delta_exporter-0.2.0 → ducklake_delta_exporter-0.3.0}/ducklake_delta_exporter/__init__.py
RENAMED
@@ -144,7 +144,8 @@ def generate_latest_delta_log(db_path: str):
                 'name': c.column_name,
                 'type':
                     CASE
-                        WHEN contains(lower(c.column_type), '
+                        WHEN contains(lower(c.column_type), 'bigint') OR
+                             (contains(lower(c.column_type), 'int') AND contains(c.column_type, '64')) THEN 'long'
                         WHEN contains(lower(c.column_type), 'int') THEN 'integer'
                         WHEN contains(lower(c.column_type), 'float') THEN 'double'
                         WHEN contains(lower(c.column_type), 'double') THEN 'double'
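
For context, this hunk widens the integer mapping so 64-bit integer types land on Delta Lake's 'long' type instead of falling through to 'integer'. A minimal standalone Python sketch of the new rule (illustrative only, not code shipped in the package; the real exporter evaluates this as SQL over ducklake_column):

    # Sketch of the 0.3.0 type-mapping rule above (assumption: standalone
    # illustration of the SQL predicate, not the package's own code).
    def map_int_type(column_type: str) -> str:
        t = column_type.lower()
        # 'bigint', or any 'int' type whose name carries '64', maps to 'long'
        if 'bigint' in t or ('int' in t and '64' in column_type):
            return 'long'
        if 'int' in t:
            return 'integer'
        return 'string'

    assert map_int_type('BIGINT') == 'long'
    assert map_int_type('INT64') == 'long'
    assert map_int_type('INTEGER') == 'integer'
    assert map_int_type('INT8') == 'integer'  # no 'bigint' or '64' substring
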
@@ -172,7 +173,7 @@ def generate_latest_delta_log(db_path: str):
             MAX(fcs.null_count) AS null_count
         FROM ducklake_data_file df
         LEFT JOIN ducklake_file_column_stats fcs ON df.data_file_id = fcs.data_file_id
-        LEFT JOIN ducklake_column c ON fcs.column_id = c.column_id
+        LEFT JOIN ducklake_column c ON fcs.column_id = c.column_id AND c.table_id = df.table_id
         WHERE df.table_id = ?
           AND df.end_snapshot IS NULL
           AND c.column_id IS NOT NULL
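
This hunk pins the column join to the owning table, fixing the stats-isolation bug exercised by test_stats_isolation_between_tables in the new test file below. A toy DuckDB reproduction of the ambiguity (assumption: simplified schemas and made-up data, not the real DuckLake metadata layout):

    import duckdb

    con = duckdb.connect()
    # Two tables whose columns happen to share a column_id value.
    con.execute("CREATE TABLE ducklake_column (table_id INT, column_id INT, column_name VARCHAR)")
    con.execute("INSERT INTO ducklake_column VALUES (1, 1, 'value'), (2, 1, 'amount')")
    con.execute("CREATE TABLE file_stats (table_id INT, column_id INT, min_value VARCHAR)")
    con.execute("INSERT INTO file_stats VALUES (1, 1, '100')")

    # Joining on column_id alone attaches table 2's 'amount' column
    # to table 1's file stats as well.
    rows = con.execute("""
        SELECT c.column_name FROM file_stats fcs
        JOIN ducklake_column c ON fcs.column_id = c.column_id
    """).fetchall()
    assert len(rows) == 2

    # The added table_id predicate restricts the match to the owning table.
    rows = con.execute("""
        SELECT c.column_name FROM file_stats fcs
        JOIN ducklake_column c ON fcs.column_id = c.column_id AND c.table_id = fcs.table_id
    """).fetchall()
    assert rows == [('value',)]
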
{ducklake_delta_exporter-0.2.0 → ducklake_delta_exporter-0.3.0}/ducklake_delta_exporter.egg-info/SOURCES.txt
RENAMED
@@ -5,4 +5,5 @@ ducklake_delta_exporter.egg-info/PKG-INFO
 ducklake_delta_exporter.egg-info/SOURCES.txt
 ducklake_delta_exporter.egg-info/dependency_links.txt
 ducklake_delta_exporter.egg-info/requires.txt
-ducklake_delta_exporter.egg-info/top_level.txt
+ducklake_delta_exporter.egg-info/top_level.txt
+tests/test_stats_transformation.py
ducklake_delta_exporter-0.3.0/tests/test_stats_transformation.py
ADDED
@@ -0,0 +1,656 @@

"""
Tests for Delta Lake stats transformation in DuckLake Delta Exporter.

Tests verify that column statistics (min/max values) are correctly transformed
from DuckLake format to Delta Lake format for all supported data types.
"""
import pytest
import duckdb
import json


class TestStatsTransformation:
    """Test stats value transformations for Delta Lake format."""

    @pytest.fixture
    def con(self):
        """Create a DuckDB connection for testing."""
        return duckdb.connect()

    def transform_value(self, con, value: str, column_type: str) -> str:
        """
        Apply the same transformation logic used in the exporter.

        This mirrors the CASE statement in file_column_stats_transformed CTE.
        """
        result = con.execute("""
            SELECT CASE
                WHEN $1 IS NULL THEN NULL
                WHEN contains(lower($2), 'timestamp') THEN
                    regexp_replace(
                        regexp_replace(replace($1, ' ', 'T'), '[+-]\\d{2}(?::\\d{2})?$', ''),
                        '^([^.]+)$', '\\1.000'
                    ) || 'Z'
                WHEN contains(lower($2), 'date') THEN $1
                WHEN contains(lower($2), 'bool') THEN CAST(lower($1) IN ('true', 't', '1', 'yes') AS VARCHAR)
                WHEN contains(lower($2), 'int') OR contains(lower($2), 'float')
                    OR contains(lower($2), 'double') OR contains(lower($2), 'decimal') THEN
                    CASE WHEN contains($1, '.') OR contains(lower($1), 'e')
                        THEN CAST(TRY_CAST($1 AS DOUBLE) AS VARCHAR)
                        ELSE CAST(TRY_CAST($1 AS BIGINT) AS VARCHAR)
                    END
                ELSE $1
            END AS transformed
        """, [value, column_type]).fetchone()[0]
        return result

    # ==================== TIMESTAMP TESTS ====================

    def test_timestamp_basic(self, con):
        """Test basic timestamp transformation."""
        result = self.transform_value(con, "2024-01-15 10:30:45", "TIMESTAMP")
        assert result == "2024-01-15T10:30:45.000Z"

    def test_timestamp_with_milliseconds(self, con):
        """Test timestamp with milliseconds."""
        result = self.transform_value(con, "2024-01-15 10:30:45.123", "TIMESTAMP")
        assert result == "2024-01-15T10:30:45.123Z"

    def test_timestamp_with_timezone_offset(self, con):
        """Test timestamp with timezone offset gets stripped."""
        result = self.transform_value(con, "2024-01-15 10:30:45+00", "TIMESTAMP")
        assert result == "2024-01-15T10:30:45.000Z"

    def test_timestamp_with_full_timezone(self, con):
        """Test timestamp with full timezone offset."""
        result = self.transform_value(con, "2024-01-15 10:30:45+05:30", "TIMESTAMP")
        assert result == "2024-01-15T10:30:45.000Z"

    def test_timestamp_negative_timezone(self, con):
        """Test timestamp with negative timezone offset."""
        result = self.transform_value(con, "2024-01-15 10:30:45-08:00", "TIMESTAMP")
        assert result == "2024-01-15T10:30:45.000Z"

    def test_timestamp_with_tz_type(self, con):
        """Test TIMESTAMP WITH TIME ZONE type."""
        result = self.transform_value(con, "2024-01-15 10:30:45", "TIMESTAMP WITH TIME ZONE")
        assert result == "2024-01-15T10:30:45.000Z"

    def test_timestamptz_type(self, con):
        """Test TIMESTAMPTZ type alias."""
        result = self.transform_value(con, "2024-01-15 10:30:45", "TIMESTAMPTZ")
        assert result == "2024-01-15T10:30:45.000Z"

    # ==================== DATE TESTS ====================

    def test_date_basic(self, con):
        """Test date pass-through."""
        result = self.transform_value(con, "2024-01-15", "DATE")
        assert result == "2024-01-15"

    def test_date_edge_cases(self, con):
        """Test date edge cases."""
        assert self.transform_value(con, "2000-01-01", "DATE") == "2000-01-01"
        assert self.transform_value(con, "2099-12-31", "DATE") == "2099-12-31"

    # ==================== BOOLEAN TESTS ====================

    def test_boolean_true_values(self, con):
        """Test various true representations."""
        assert self.transform_value(con, "true", "BOOLEAN") == "true"
        assert self.transform_value(con, "TRUE", "BOOLEAN") == "true"
        assert self.transform_value(con, "True", "BOOLEAN") == "true"
        assert self.transform_value(con, "t", "BOOLEAN") == "true"
        assert self.transform_value(con, "T", "BOOLEAN") == "true"
        assert self.transform_value(con, "1", "BOOLEAN") == "true"
        assert self.transform_value(con, "yes", "BOOLEAN") == "true"
        assert self.transform_value(con, "YES", "BOOLEAN") == "true"

    def test_boolean_false_values(self, con):
        """Test various false representations."""
        assert self.transform_value(con, "false", "BOOLEAN") == "false"
        assert self.transform_value(con, "FALSE", "BOOLEAN") == "false"
        assert self.transform_value(con, "False", "BOOLEAN") == "false"
        assert self.transform_value(con, "f", "BOOLEAN") == "false"
        assert self.transform_value(con, "F", "BOOLEAN") == "false"
        assert self.transform_value(con, "0", "BOOLEAN") == "false"
        assert self.transform_value(con, "no", "BOOLEAN") == "false"
        assert self.transform_value(con, "NO", "BOOLEAN") == "false"

    def test_bool_type_alias(self, con):
        """Test BOOL type alias."""
        assert self.transform_value(con, "true", "BOOL") == "true"
        assert self.transform_value(con, "false", "BOOL") == "false"

    # ==================== INTEGER TESTS ====================

    def test_integer_basic(self, con):
        """Test basic integer transformation."""
        assert self.transform_value(con, "42", "INTEGER") == "42"
        assert self.transform_value(con, "0", "INTEGER") == "0"
        assert self.transform_value(con, "-100", "INTEGER") == "-100"

    def test_integer_large_values(self, con):
        """Test large integer values."""
        assert self.transform_value(con, "2147483647", "INTEGER") == "2147483647"
        assert self.transform_value(con, "-2147483648", "INTEGER") == "-2147483648"

    def test_int_type_alias(self, con):
        """Test INT type alias."""
        assert self.transform_value(con, "42", "INT") == "42"

    def test_int4_type(self, con):
        """Test INT4 type."""
        assert self.transform_value(con, "42", "INT4") == "42"

    def test_smallint(self, con):
        """Test SMALLINT type."""
        assert self.transform_value(con, "32767", "SMALLINT") == "32767"

    def test_tinyint(self, con):
        """Test TINYINT type."""
        assert self.transform_value(con, "127", "TINYINT") == "127"

    # ==================== BIGINT TESTS ====================

    def test_bigint_basic(self, con):
        """Test BIGINT transformation."""
        assert self.transform_value(con, "9223372036854775807", "BIGINT") == "9223372036854775807"
        assert self.transform_value(con, "-9223372036854775808", "BIGINT") == "-9223372036854775808"

    def test_int64_type(self, con):
        """Test INT64 type alias."""
        assert self.transform_value(con, "123456789012345", "INT64") == "123456789012345"

    def test_int8_type(self, con):
        """Test INT8 type (DuckDB alias for BIGINT)."""
        assert self.transform_value(con, "123456789", "INT8") == "123456789"

    # ==================== FLOAT/DOUBLE TESTS ====================

    def test_float_basic(self, con):
        """Test float transformation."""
        result = self.transform_value(con, "3.14", "FLOAT")
        assert float(result) == pytest.approx(3.14)

    def test_double_basic(self, con):
        """Test double transformation."""
        result = self.transform_value(con, "3.141592653589793", "DOUBLE")
        assert float(result) == pytest.approx(3.141592653589793)

    def test_float_scientific_notation(self, con):
        """Test scientific notation."""
        result = self.transform_value(con, "1.5e10", "DOUBLE")
        assert float(result) == pytest.approx(1.5e10)

    def test_float_negative_exponent(self, con):
        """Test negative exponent."""
        result = self.transform_value(con, "1.5e-5", "DOUBLE")
        assert float(result) == pytest.approx(1.5e-5)

    def test_real_type(self, con):
        """Test REAL type (alias for FLOAT)."""
        result = self.transform_value(con, "2.5", "REAL")
        assert float(result) == pytest.approx(2.5)

    def test_float4_type(self, con):
        """Test FLOAT4 type."""
        result = self.transform_value(con, "2.5", "FLOAT4")
        assert float(result) == pytest.approx(2.5)

    def test_float8_type(self, con):
        """Test FLOAT8 type (alias for DOUBLE)."""
        result = self.transform_value(con, "2.5", "FLOAT8")
        assert float(result) == pytest.approx(2.5)

    # ==================== DECIMAL TESTS ====================

    def test_decimal_basic(self, con):
        """Test DECIMAL transformation."""
        result = self.transform_value(con, "123.45", "DECIMAL(10,2)")
        assert float(result) == pytest.approx(123.45)

    def test_decimal_high_precision(self, con):
        """Test high precision decimal."""
        result = self.transform_value(con, "123456.789012", "DECIMAL(18,6)")
        assert float(result) == pytest.approx(123456.789012)

    def test_numeric_type(self, con):
        """Test NUMERIC type (alias for DECIMAL)."""
        result = self.transform_value(con, "99.99", "NUMERIC(5,2)")
        assert float(result) == pytest.approx(99.99)

    def test_decimal_integer_value(self, con):
        """Test decimal with integer value (no decimal point)."""
        result = self.transform_value(con, "100", "DECIMAL(10,2)")
        assert result == "100"

    # ==================== STRING TESTS ====================

    def test_string_passthrough(self, con):
        """Test string values pass through unchanged."""
        assert self.transform_value(con, "hello world", "VARCHAR") == "hello world"
        assert self.transform_value(con, "test@example.com", "VARCHAR") == "test@example.com"

    def test_text_type(self, con):
        """Test TEXT type."""
        assert self.transform_value(con, "some text", "TEXT") == "some text"

    def test_string_type(self, con):
        """Test STRING type."""
        assert self.transform_value(con, "a string", "STRING") == "a string"

    def test_char_type(self, con):
        """Test CHAR type."""
        assert self.transform_value(con, "ABC", "CHAR(10)") == "ABC"

    def test_string_with_special_chars(self, con):
        """Test strings with special characters."""
        assert self.transform_value(con, "Hello, World!", "VARCHAR") == "Hello, World!"
        assert self.transform_value(con, "line1\nline2", "VARCHAR") == "line1\nline2"

    # ==================== NULL TESTS ====================

    def test_null_value(self, con):
        """Test NULL value handling."""
        assert self.transform_value(con, None, "INTEGER") is None
        assert self.transform_value(con, None, "VARCHAR") is None
        assert self.transform_value(con, None, "TIMESTAMP") is None

    # ==================== EDGE CASES ====================

    def test_empty_string(self, con):
        """Test empty string handling."""
        assert self.transform_value(con, "", "VARCHAR") == ""

    def test_numeric_string_in_varchar(self, con):
        """Test numeric-looking string in VARCHAR stays as string."""
        # VARCHAR should pass through without numeric conversion
        assert self.transform_value(con, "42", "VARCHAR") == "42"
        assert self.transform_value(con, "3.14", "VARCHAR") == "3.14"


class TestStatsInCheckpoint:
    """Test that stats are correctly included in checkpoint parquet output."""

    @pytest.fixture
    def con(self):
        """Create a DuckDB connection with DuckLake extension."""
        con = duckdb.connect()
        try:
            # Try loading first (if already installed)
            con.execute("LOAD ducklake")
        except Exception:
            try:
                # Try installing from community
                con.execute("INSTALL ducklake FROM community")
                con.execute("LOAD ducklake")
            except Exception as e:
                pytest.skip(f"DuckLake extension not available: {e}")
        return con

    def test_stats_structure_in_add_action(self, con, tmp_path):
        """Test that stats JSON has correct structure in add action."""
        # Create a DuckLake database with test data
        db_path = str(tmp_path / "test.ducklake")
        data_path = str(tmp_path / "data")

        con.execute(f"ATTACH 'ducklake:{db_path}' AS test_db (DATA_PATH '{data_path}')")
        con.execute("USE test_db")

        # Create table with various column types
        con.execute("""
            CREATE TABLE test_stats (
                id INTEGER,
                name VARCHAR,
                amount DOUBLE,
                created_at TIMESTAMP,
                is_active BOOLEAN,
                birth_date DATE
            )
        """)

        # Insert test data
        con.execute("""
            INSERT INTO test_stats VALUES
                (1, 'Alice', 100.50, '2024-01-15 10:30:00', true, '1990-05-20'),
                (2, 'Bob', 200.75, '2024-02-20 14:45:00', false, '1985-12-10'),
                (3, 'Charlie', 50.25, '2024-03-25 09:15:00', true, '1992-08-05')
        """)

        # Check that file column stats were recorded (query from metadata schema)
        stats = con.execute("""
            SELECT
                c.column_name,
                c.column_type,
                fcs.min_value,
                fcs.max_value,
                fcs.null_count,
                fcs.value_count
            FROM __ducklake_metadata_test_db.ducklake_file_column_stats fcs
            JOIN __ducklake_metadata_test_db.ducklake_column c ON fcs.column_id = c.column_id
            ORDER BY c.column_order
        """).fetchall()

        # Verify stats exist for all columns
        column_names = [s[0] for s in stats]
        assert 'id' in column_names
        assert 'name' in column_names
        assert 'amount' in column_names
        assert 'created_at' in column_names
        assert 'is_active' in column_names
        assert 'birth_date' in column_names

        # Verify min/max values are present (except for boolean which has no min/max)
        for stat in stats:
            col_name, col_type, min_val, max_val, null_count, value_count = stat
            if 'bool' not in col_type.lower():
                assert min_val is not None, f"min_value should not be None for {col_name}"
                assert max_val is not None, f"max_value should not be None for {col_name}"
            assert null_count == 0, f"null_count should be 0 for {col_name}"
            assert value_count == 3, f"value_count should be 3 for {col_name}"

    def test_stats_json_format(self, con, tmp_path):
        """Test that exported stats JSON is valid and has expected format."""
        # Create a DuckLake database
        db_path = str(tmp_path / "test.ducklake")
        data_path = str(tmp_path / "data")

        con.execute(f"ATTACH 'ducklake:{db_path}' AS test_db (DATA_PATH '{data_path}')")
        con.execute("USE test_db")

        con.execute("""
            CREATE TABLE stats_test (
                int_col INTEGER,
                str_col VARCHAR,
                ts_col TIMESTAMP
            )
        """)

        con.execute("""
            INSERT INTO stats_test VALUES
                (10, 'min', '2024-01-01 00:00:00'),
                (50, 'max', '2024-12-31 23:59:59')
        """)

        # Close connection before exporting (file lock)
        con.close()

        # Run the exporter
        from ducklake_delta_exporter import generate_latest_delta_log
        generate_latest_delta_log(db_path)

        # Reopen for verification
        con = duckdb.connect()

        # Read the checkpoint parquet
        checkpoint_files = list((tmp_path / "data" / "main" / "stats_test" / "_delta_log").glob("*.checkpoint.parquet"))
        assert len(checkpoint_files) == 1, "Should have exactly one checkpoint file"

        # Read the add action and verify stats
        add_rows = con.execute(f"""
            SELECT add.stats
            FROM '{checkpoint_files[0]}'
            WHERE add IS NOT NULL
        """).fetchall()

        assert len(add_rows) == 1, "Should have one add action"

        stats_json = json.loads(add_rows[0][0])

        # Verify stats structure
        assert 'numRecords' in stats_json
        assert 'minValues' in stats_json
        assert 'maxValues' in stats_json
        assert 'nullCount' in stats_json

        # Verify record count
        assert stats_json['numRecords'] == 2

        # Verify min/max values exist for columns
        assert 'int_col' in stats_json['minValues']
        assert 'str_col' in stats_json['minValues']
        assert 'ts_col' in stats_json['minValues']

        # Verify integer stats
        assert stats_json['minValues']['int_col'] == '10'
        assert stats_json['maxValues']['int_col'] == '50'

        # Verify string stats (alphabetically: 'max' < 'min')
        assert stats_json['minValues']['str_col'] == 'max'
        assert stats_json['maxValues']['str_col'] == 'min'

        # Verify timestamp format (should be ISO with Z suffix)
        assert stats_json['minValues']['ts_col'].endswith('Z')
        assert 'T' in stats_json['minValues']['ts_col']

    def test_null_count_tracking(self, con, tmp_path):
        """Test that null counts are correctly tracked in stats."""
        db_path = str(tmp_path / "test.ducklake")
        data_path = str(tmp_path / "data")

        con.execute(f"ATTACH 'ducklake:{db_path}' AS test_db (DATA_PATH '{data_path}')")
        con.execute("USE test_db")

        con.execute("""
            CREATE TABLE null_test (
                required_col INTEGER,
                nullable_col VARCHAR
            )
        """)

        con.execute("""
            INSERT INTO null_test VALUES
                (1, 'value1'),
                (2, NULL),
                (3, 'value3'),
                (4, NULL)
        """)

        # Close connection before exporting (file lock)
        con.close()

        from ducklake_delta_exporter import generate_latest_delta_log
        generate_latest_delta_log(db_path)

        # Reopen for verification
        con = duckdb.connect()

        checkpoint_files = list((tmp_path / "data" / "main" / "null_test" / "_delta_log").glob("*.checkpoint.parquet"))

        add_rows = con.execute(f"""
            SELECT add.stats
            FROM '{checkpoint_files[0]}'
            WHERE add IS NOT NULL
        """).fetchall()

        stats_json = json.loads(add_rows[0][0])

        # Verify null counts
        assert stats_json['nullCount']['required_col'] == 0
        assert stats_json['nullCount']['nullable_col'] == 2

    def test_stats_isolation_between_tables(self, con, tmp_path):
        """Test that stats from different tables don't leak into each other.

        This tests the fix for the table_id grouping bug where columns with
        the same name from different tables could have their stats mixed.
        """
        db_path = str(tmp_path / "test.ducklake")
        data_path = str(tmp_path / "data")

        con.execute(f"ATTACH 'ducklake:{db_path}' AS test_db (DATA_PATH '{data_path}')")
        con.execute("USE test_db")

        # Create two tables with overlapping column names but different data
        con.execute("""
            CREATE TABLE table_a (
                id INTEGER,
                name VARCHAR,
                value INTEGER
            )
        """)

        con.execute("""
            CREATE TABLE table_b (
                id INTEGER,
                name VARCHAR,
                amount DOUBLE,
                extra_col VARCHAR
            )
        """)

        # Insert different data ranges
        con.execute("""
            INSERT INTO table_a VALUES
                (1, 'alpha', 100),
                (2, 'beta', 200)
        """)

        con.execute("""
            INSERT INTO table_b VALUES
                (100, 'zebra', 999.99, 'extra1'),
                (200, 'yak', 888.88, 'extra2')
        """)

        # Close connection before exporting (file lock)
        con.close()

        from ducklake_delta_exporter import generate_latest_delta_log
        generate_latest_delta_log(db_path)

        # Reopen for verification
        con = duckdb.connect()
        con.execute("LOAD ducklake")

        # Check table_a stats - should only have its own columns
        checkpoint_a = list((tmp_path / "data" / "main" / "table_a" / "_delta_log").glob("*.checkpoint.parquet"))
        assert len(checkpoint_a) == 1

        stats_a = con.execute(f"""
            SELECT add.stats
            FROM '{checkpoint_a[0]}'
            WHERE add IS NOT NULL
        """).fetchone()[0]

        stats_a_json = json.loads(stats_a)

        # table_a should have id, name, value - NOT amount or extra_col
        assert 'id' in stats_a_json['minValues']
        assert 'name' in stats_a_json['minValues']
        assert 'value' in stats_a_json['minValues']
        assert 'amount' not in stats_a_json['minValues'], "table_a should not have table_b's 'amount' column"
        assert 'extra_col' not in stats_a_json['minValues'], "table_a should not have table_b's 'extra_col' column"

        # Verify table_a stats have correct values (not mixed with table_b)
        assert stats_a_json['minValues']['id'] == '1'  # table_a has 1,2 not 100,200
        assert stats_a_json['maxValues']['id'] == '2'
        assert stats_a_json['minValues']['name'] == 'alpha'  # not 'yak' or 'zebra'
        assert stats_a_json['maxValues']['name'] == 'beta'

        # Check table_b stats - should only have its own columns
        checkpoint_b = list((tmp_path / "data" / "main" / "table_b" / "_delta_log").glob("*.checkpoint.parquet"))
        assert len(checkpoint_b) == 1

        stats_b = con.execute(f"""
            SELECT add.stats
            FROM '{checkpoint_b[0]}'
            WHERE add IS NOT NULL
        """).fetchone()[0]

        stats_b_json = json.loads(stats_b)

        # table_b should have id, name, amount, extra_col - NOT value
        assert 'id' in stats_b_json['minValues']
        assert 'name' in stats_b_json['minValues']
        assert 'amount' in stats_b_json['minValues']
        assert 'extra_col' in stats_b_json['minValues']
        assert 'value' not in stats_b_json['minValues'], "table_b should not have table_a's 'value' column"

        # Verify table_b stats have correct values (not mixed with table_a)
        assert stats_b_json['minValues']['id'] == '100'  # table_b has 100,200 not 1,2
        assert stats_b_json['maxValues']['id'] == '200'
        assert stats_b_json['minValues']['name'] == 'yak'  # not 'alpha' or 'beta'
        assert stats_b_json['maxValues']['name'] == 'zebra'


class TestDeltaLakeTypeMapping:
    """Test that DuckDB types are correctly mapped to Delta Lake types in schema."""

    @pytest.fixture
    def con(self):
        """Create a DuckDB connection."""
        return duckdb.connect()

    def map_type(self, con, duckdb_type: str) -> str:
        """Map DuckDB type to Delta Lake type using the exporter's logic."""
        result = con.execute("""
            SELECT CASE
                WHEN contains(lower($1), 'bigint') OR
                    (contains(lower($1), 'int') AND contains($1, '64')) THEN 'long'
                WHEN contains(lower($1), 'int') THEN 'integer'
                WHEN contains(lower($1), 'float') THEN 'double'
                WHEN contains(lower($1), 'double') THEN 'double'
                WHEN contains(lower($1), 'bool') THEN 'boolean'
                WHEN contains(lower($1), 'timestamp') THEN 'timestamp'
                WHEN contains(lower($1), 'date') THEN 'date'
                WHEN contains(lower($1), 'decimal') THEN lower($1)
                ELSE 'string'
            END
        """, [duckdb_type]).fetchone()[0]
        return result

    def test_integer_types(self, con):
        """Test integer type mappings."""
        assert self.map_type(con, "INTEGER") == "integer"
        assert self.map_type(con, "INT") == "integer"
        assert self.map_type(con, "INT4") == "integer"
        assert self.map_type(con, "SMALLINT") == "integer"
        assert self.map_type(con, "TINYINT") == "integer"

    def test_bigint_types(self, con):
        """Test bigint type mappings."""
        assert self.map_type(con, "BIGINT") == "long"
        assert self.map_type(con, "INT64") == "long"
        assert self.map_type(con, "INT8") == "integer"  # INT8 doesn't contain '64' or 'bigint'

    def test_float_types(self, con):
        """Test float/double type mappings."""
        assert self.map_type(con, "FLOAT") == "double"
        assert self.map_type(con, "DOUBLE") == "double"
        assert self.map_type(con, "REAL") == "string"  # REAL doesn't match 'float' or 'double'
        assert self.map_type(con, "FLOAT4") == "double"
        assert self.map_type(con, "FLOAT8") == "double"

    def test_boolean_types(self, con):
        """Test boolean type mappings."""
        assert self.map_type(con, "BOOLEAN") == "boolean"
        assert self.map_type(con, "BOOL") == "boolean"

    def test_timestamp_types(self, con):
        """Test timestamp type mappings."""
        assert self.map_type(con, "TIMESTAMP") == "timestamp"
        assert self.map_type(con, "TIMESTAMP WITH TIME ZONE") == "timestamp"
        assert self.map_type(con, "TIMESTAMPTZ") == "timestamp"

    def test_date_type(self, con):
        """Test date type mapping."""
        assert self.map_type(con, "DATE") == "date"

    def test_decimal_types(self, con):
        """Test decimal type mappings."""
        assert self.map_type(con, "DECIMAL(10,2)") == "decimal(10,2)"
        assert self.map_type(con, "DECIMAL(18,6)") == "decimal(18,6)"
        assert self.map_type(con, "NUMERIC(5,2)") == "string"  # NUMERIC doesn't match 'decimal'

    def test_string_types(self, con):
        """Test string type mappings."""
        assert self.map_type(con, "VARCHAR") == "string"
        assert self.map_type(con, "VARCHAR(100)") == "string"
        assert self.map_type(con, "TEXT") == "string"
        assert self.map_type(con, "STRING") == "string"
        assert self.map_type(con, "CHAR(10)") == "string"


if __name__ == "__main__":
    pytest.main([__file__, "-v"])
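
As the trailing __main__ guard suggests, the new module can be run directly or through pytest; a typical invocation from the package root (assuming pytest and the duckdb Python package are installed) would be:

    python -m pytest tests/test_stats_transformation.py -v
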