ducklake-delta-exporter 0.2.0.tar.gz → 0.3.0.tar.gz

This diff shows the changes between two publicly released versions of this package, as published to a supported public registry. It is provided for informational purposes only.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: ducklake-delta-exporter
- Version: 0.2.0
+ Version: 0.3.0
  Summary: A utility to export DuckLake database metadata to Delta Lake transaction logs.
  Home-page: https://github.com/djouallah/ducklake_delta_exporter
  Author: mim
@@ -144,7 +144,8 @@ def generate_latest_delta_log(db_path: str):
  'name': c.column_name,
  'type':
  CASE
- WHEN contains(lower(c.column_type), 'int') AND contains(c.column_type, '64') THEN 'long'
+ WHEN contains(lower(c.column_type), 'bigint') OR
+      (contains(lower(c.column_type), 'int') AND contains(c.column_type, '64')) THEN 'long'
  WHEN contains(lower(c.column_type), 'int') THEN 'integer'
  WHEN contains(lower(c.column_type), 'float') THEN 'double'
  WHEN contains(lower(c.column_type), 'double') THEN 'double'
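
Context on this hunk (not part of the diff): in 0.2.0 the 'long' branch required both the substring 'int' and the literal '64', so DuckDB's canonical BIGINT type name, which contains 'int' but not '64', fell through to the 'integer' branch of the schema mapping; 0.3.0 adds an explicit 'bigint' check. A minimal sketch reproducing both predicates against a stock duckdb connection; only the CASE text from the hunk above is taken from the package, the helper name is illustrative:

    import duckdb

    con = duckdb.connect()

    def map_to_long_or_int(type_name: str, fixed: bool) -> str:
        # Evaluate just the 'long'/'integer' branches of the CASE from the hunk.
        long_predicate = (
            "contains(lower($1), 'bigint') OR "
            "(contains(lower($1), 'int') AND contains($1, '64'))"
            if fixed
            else "contains(lower($1), 'int') AND contains($1, '64')"
        )
        return con.execute(
            f"SELECT CASE WHEN {long_predicate} THEN 'long' "
            f"WHEN contains(lower($1), 'int') THEN 'integer' ELSE 'string' END",
            [type_name],
        ).fetchone()[0]

    assert map_to_long_or_int("BIGINT", fixed=False) == "integer"  # 0.2.0: 8-byte type demoted
    assert map_to_long_or_int("BIGINT", fixed=True) == "long"      # 0.3.0: correct
    assert map_to_long_or_int("INT64", fixed=False) == "long"      # the '64' path already worked
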
@@ -172,7 +173,7 @@ def generate_latest_delta_log(db_path: str):
  MAX(fcs.null_count) AS null_count
  FROM ducklake_data_file df
  LEFT JOIN ducklake_file_column_stats fcs ON df.data_file_id = fcs.data_file_id
- LEFT JOIN ducklake_column c ON fcs.column_id = c.column_id
+ LEFT JOIN ducklake_column c ON fcs.column_id = c.column_id AND c.table_id = df.table_id
  WHERE df.table_id = ?
  AND df.end_snapshot IS NULL
  AND c.column_id IS NOT NULL
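
Context on this hunk (not part of the diff): the added c.table_id = df.table_id condition pins the column-catalog lookup to the file's own table. As the new test_stats_isolation_between_tables test below describes, joining on column_id alone could mix stats across tables, since one stats row then matches several catalog rows. A toy repro of that fan-out; the three mini-tables here are an illustrative simplification, not the real DuckLake catalog schema:

    import duckdb

    con = duckdb.connect()
    # Hypothetical mini-catalog: column_id 1 exists in two different tables.
    con.execute("CREATE TABLE ducklake_column(column_id INT, table_id INT, column_name VARCHAR)")
    con.execute("INSERT INTO ducklake_column VALUES (1, 10, 'id'), (1, 20, 'id')")
    con.execute("CREATE TABLE ducklake_data_file(data_file_id INT, table_id INT)")
    con.execute("INSERT INTO ducklake_data_file VALUES (100, 10)")
    con.execute("CREATE TABLE ducklake_file_column_stats(data_file_id INT, column_id INT)")
    con.execute("INSERT INTO ducklake_file_column_stats VALUES (100, 1)")

    # 0.2.0-style join: the single stats row matches both catalog rows -> 2 rows.
    old = con.execute("""
        SELECT count(*) FROM ducklake_data_file df
        LEFT JOIN ducklake_file_column_stats fcs ON df.data_file_id = fcs.data_file_id
        LEFT JOIN ducklake_column c ON fcs.column_id = c.column_id
    """).fetchone()[0]

    # 0.3.0 join with the extra table_id condition -> exactly 1 row.
    new = con.execute("""
        SELECT count(*) FROM ducklake_data_file df
        LEFT JOIN ducklake_file_column_stats fcs ON df.data_file_id = fcs.data_file_id
        LEFT JOIN ducklake_column c
            ON fcs.column_id = c.column_id AND c.table_id = df.table_id
    """).fetchone()[0]

    assert (old, new) == (2, 1)
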
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: ducklake-delta-exporter
- Version: 0.2.0
+ Version: 0.3.0
  Summary: A utility to export DuckLake database metadata to Delta Lake transaction logs.
  Home-page: https://github.com/djouallah/ducklake_delta_exporter
  Author: mim
@@ -5,4 +5,5 @@ ducklake_delta_exporter.egg-info/PKG-INFO
  ducklake_delta_exporter.egg-info/SOURCES.txt
  ducklake_delta_exporter.egg-info/dependency_links.txt
  ducklake_delta_exporter.egg-info/requires.txt
- ducklake_delta_exporter.egg-info/top_level.txt
+ ducklake_delta_exporter.egg-info/top_level.txt
+ tests/test_stats_transformation.py
@@ -3,7 +3,7 @@ from setuptools import setup, find_packages
 
  setup(
      name='ducklake-delta-exporter',
-     version='0.2.0',
+     version='0.3.0',
      packages=find_packages(),
      install_requires=['duckdb>=1.4.4'],
      author='mim',
@@ -0,0 +1,656 @@
+ """
+ Tests for Delta Lake stats transformation in DuckLake Delta Exporter.
+ 
+ Tests verify that column statistics (min/max values) are correctly transformed
+ from DuckLake format to Delta Lake format for all supported data types.
+ """
+ import pytest
+ import duckdb
+ import json
+ 
+ 
+ class TestStatsTransformation:
+     """Test stats value transformations for Delta Lake format."""
+ 
+     @pytest.fixture
+     def con(self):
+         """Create a DuckDB connection for testing."""
+         return duckdb.connect()
+ 
+     def transform_value(self, con, value: str, column_type: str) -> str:
+         """
+         Apply the same transformation logic used in the exporter.
+ 
+         This mirrors the CASE statement in file_column_stats_transformed CTE.
+         """
+         result = con.execute("""
+             SELECT CASE
+                 WHEN $1 IS NULL THEN NULL
+                 WHEN contains(lower($2), 'timestamp') THEN
+                     regexp_replace(
+                         regexp_replace(replace($1, ' ', 'T'), '[+-]\\d{2}(?::\\d{2})?$', ''),
+                         '^([^.]+)$', '\\1.000'
+                     ) || 'Z'
+                 WHEN contains(lower($2), 'date') THEN $1
+                 WHEN contains(lower($2), 'bool') THEN CAST(lower($1) IN ('true', 't', '1', 'yes') AS VARCHAR)
+                 WHEN contains(lower($2), 'int') OR contains(lower($2), 'float')
+                      OR contains(lower($2), 'double') OR contains(lower($2), 'decimal') THEN
+                     CASE WHEN contains($1, '.') OR contains(lower($1), 'e')
+                          THEN CAST(TRY_CAST($1 AS DOUBLE) AS VARCHAR)
+                          ELSE CAST(TRY_CAST($1 AS BIGINT) AS VARCHAR)
+                     END
+                 ELSE $1
+             END AS transformed
+         """, [value, column_type]).fetchone()[0]
+         return result
+ 
+     # ==================== TIMESTAMP TESTS ====================
+ 
+     def test_timestamp_basic(self, con):
+         """Test basic timestamp transformation."""
+         result = self.transform_value(con, "2024-01-15 10:30:45", "TIMESTAMP")
+         assert result == "2024-01-15T10:30:45.000Z"
+ 
+     def test_timestamp_with_milliseconds(self, con):
+         """Test timestamp with milliseconds."""
+         result = self.transform_value(con, "2024-01-15 10:30:45.123", "TIMESTAMP")
+         assert result == "2024-01-15T10:30:45.123Z"
+ 
+     def test_timestamp_with_timezone_offset(self, con):
+         """Test timestamp with timezone offset gets stripped."""
+         result = self.transform_value(con, "2024-01-15 10:30:45+00", "TIMESTAMP")
+         assert result == "2024-01-15T10:30:45.000Z"
+ 
+     def test_timestamp_with_full_timezone(self, con):
+         """Test timestamp with full timezone offset."""
+         result = self.transform_value(con, "2024-01-15 10:30:45+05:30", "TIMESTAMP")
+         assert result == "2024-01-15T10:30:45.000Z"
+ 
+     def test_timestamp_negative_timezone(self, con):
+         """Test timestamp with negative timezone offset."""
+         result = self.transform_value(con, "2024-01-15 10:30:45-08:00", "TIMESTAMP")
+         assert result == "2024-01-15T10:30:45.000Z"
+ 
+     def test_timestamp_with_tz_type(self, con):
+         """Test TIMESTAMP WITH TIME ZONE type."""
+         result = self.transform_value(con, "2024-01-15 10:30:45", "TIMESTAMP WITH TIME ZONE")
+         assert result == "2024-01-15T10:30:45.000Z"
+ 
+     def test_timestamptz_type(self, con):
+         """Test TIMESTAMPTZ type alias."""
+         result = self.transform_value(con, "2024-01-15 10:30:45", "TIMESTAMPTZ")
+         assert result == "2024-01-15T10:30:45.000Z"
+ 
+     # ==================== DATE TESTS ====================
+ 
+     def test_date_basic(self, con):
+         """Test date pass-through."""
+         result = self.transform_value(con, "2024-01-15", "DATE")
+         assert result == "2024-01-15"
+ 
+     def test_date_edge_cases(self, con):
+         """Test date edge cases."""
+         assert self.transform_value(con, "2000-01-01", "DATE") == "2000-01-01"
+         assert self.transform_value(con, "2099-12-31", "DATE") == "2099-12-31"
+ 
+     # ==================== BOOLEAN TESTS ====================
+ 
+     def test_boolean_true_values(self, con):
+         """Test various true representations."""
+         assert self.transform_value(con, "true", "BOOLEAN") == "true"
+         assert self.transform_value(con, "TRUE", "BOOLEAN") == "true"
+         assert self.transform_value(con, "True", "BOOLEAN") == "true"
+         assert self.transform_value(con, "t", "BOOLEAN") == "true"
+         assert self.transform_value(con, "T", "BOOLEAN") == "true"
+         assert self.transform_value(con, "1", "BOOLEAN") == "true"
+         assert self.transform_value(con, "yes", "BOOLEAN") == "true"
+         assert self.transform_value(con, "YES", "BOOLEAN") == "true"
+ 
+     def test_boolean_false_values(self, con):
+         """Test various false representations."""
+         assert self.transform_value(con, "false", "BOOLEAN") == "false"
+         assert self.transform_value(con, "FALSE", "BOOLEAN") == "false"
+         assert self.transform_value(con, "False", "BOOLEAN") == "false"
+         assert self.transform_value(con, "f", "BOOLEAN") == "false"
+         assert self.transform_value(con, "F", "BOOLEAN") == "false"
+         assert self.transform_value(con, "0", "BOOLEAN") == "false"
+         assert self.transform_value(con, "no", "BOOLEAN") == "false"
+         assert self.transform_value(con, "NO", "BOOLEAN") == "false"
+ 
+     def test_bool_type_alias(self, con):
+         """Test BOOL type alias."""
+         assert self.transform_value(con, "true", "BOOL") == "true"
+         assert self.transform_value(con, "false", "BOOL") == "false"
+ 
+     # ==================== INTEGER TESTS ====================
+ 
+     def test_integer_basic(self, con):
+         """Test basic integer transformation."""
+         assert self.transform_value(con, "42", "INTEGER") == "42"
+         assert self.transform_value(con, "0", "INTEGER") == "0"
+         assert self.transform_value(con, "-100", "INTEGER") == "-100"
+ 
+     def test_integer_large_values(self, con):
+         """Test large integer values."""
+         assert self.transform_value(con, "2147483647", "INTEGER") == "2147483647"
+         assert self.transform_value(con, "-2147483648", "INTEGER") == "-2147483648"
+ 
+     def test_int_type_alias(self, con):
+         """Test INT type alias."""
+         assert self.transform_value(con, "42", "INT") == "42"
+ 
+     def test_int4_type(self, con):
+         """Test INT4 type."""
+         assert self.transform_value(con, "42", "INT4") == "42"
+ 
+     def test_smallint(self, con):
+         """Test SMALLINT type."""
+         assert self.transform_value(con, "32767", "SMALLINT") == "32767"
+ 
+     def test_tinyint(self, con):
+         """Test TINYINT type."""
+         assert self.transform_value(con, "127", "TINYINT") == "127"
+ 
+     # ==================== BIGINT TESTS ====================
+ 
+     def test_bigint_basic(self, con):
+         """Test BIGINT transformation."""
+         assert self.transform_value(con, "9223372036854775807", "BIGINT") == "9223372036854775807"
+         assert self.transform_value(con, "-9223372036854775808", "BIGINT") == "-9223372036854775808"
+ 
+     def test_int64_type(self, con):
+         """Test INT64 type alias."""
+         assert self.transform_value(con, "123456789012345", "INT64") == "123456789012345"
+ 
+     def test_int8_type(self, con):
+         """Test INT8 type (DuckDB alias for BIGINT)."""
+         assert self.transform_value(con, "123456789", "INT8") == "123456789"
+ 
+     # ==================== FLOAT/DOUBLE TESTS ====================
+ 
+     def test_float_basic(self, con):
+         """Test float transformation."""
+         result = self.transform_value(con, "3.14", "FLOAT")
+         assert float(result) == pytest.approx(3.14)
+ 
+     def test_double_basic(self, con):
+         """Test double transformation."""
+         result = self.transform_value(con, "3.141592653589793", "DOUBLE")
+         assert float(result) == pytest.approx(3.141592653589793)
+ 
+     def test_float_scientific_notation(self, con):
+         """Test scientific notation."""
+         result = self.transform_value(con, "1.5e10", "DOUBLE")
+         assert float(result) == pytest.approx(1.5e10)
+ 
+     def test_float_negative_exponent(self, con):
+         """Test negative exponent."""
+         result = self.transform_value(con, "1.5e-5", "DOUBLE")
+         assert float(result) == pytest.approx(1.5e-5)
+ 
+     def test_real_type(self, con):
+         """Test REAL type (alias for FLOAT)."""
+         result = self.transform_value(con, "2.5", "REAL")
+         assert float(result) == pytest.approx(2.5)
+ 
+     def test_float4_type(self, con):
+         """Test FLOAT4 type."""
+         result = self.transform_value(con, "2.5", "FLOAT4")
+         assert float(result) == pytest.approx(2.5)
+ 
+     def test_float8_type(self, con):
+         """Test FLOAT8 type (alias for DOUBLE)."""
+         result = self.transform_value(con, "2.5", "FLOAT8")
+         assert float(result) == pytest.approx(2.5)
+ 
+     # ==================== DECIMAL TESTS ====================
+ 
+     def test_decimal_basic(self, con):
+         """Test DECIMAL transformation."""
+         result = self.transform_value(con, "123.45", "DECIMAL(10,2)")
+         assert float(result) == pytest.approx(123.45)
+ 
+     def test_decimal_high_precision(self, con):
+         """Test high precision decimal."""
+         result = self.transform_value(con, "123456.789012", "DECIMAL(18,6)")
+         assert float(result) == pytest.approx(123456.789012)
+ 
+     def test_numeric_type(self, con):
+         """Test NUMERIC type (alias for DECIMAL)."""
+         result = self.transform_value(con, "99.99", "NUMERIC(5,2)")
+         assert float(result) == pytest.approx(99.99)
+ 
+     def test_decimal_integer_value(self, con):
+         """Test decimal with integer value (no decimal point)."""
+         result = self.transform_value(con, "100", "DECIMAL(10,2)")
+         assert result == "100"
+ 
+     # ==================== STRING TESTS ====================
+ 
+     def test_string_passthrough(self, con):
+         """Test string values pass through unchanged."""
+         assert self.transform_value(con, "hello world", "VARCHAR") == "hello world"
+         assert self.transform_value(con, "test@example.com", "VARCHAR") == "test@example.com"
+ 
+     def test_text_type(self, con):
+         """Test TEXT type."""
+         assert self.transform_value(con, "some text", "TEXT") == "some text"
+ 
+     def test_string_type(self, con):
+         """Test STRING type."""
+         assert self.transform_value(con, "a string", "STRING") == "a string"
+ 
+     def test_char_type(self, con):
+         """Test CHAR type."""
+         assert self.transform_value(con, "ABC", "CHAR(10)") == "ABC"
+ 
+     def test_string_with_special_chars(self, con):
+         """Test strings with special characters."""
+         assert self.transform_value(con, "Hello, World!", "VARCHAR") == "Hello, World!"
+         assert self.transform_value(con, "line1\nline2", "VARCHAR") == "line1\nline2"
+ 
+     # ==================== NULL TESTS ====================
+ 
+     def test_null_value(self, con):
+         """Test NULL value handling."""
+         assert self.transform_value(con, None, "INTEGER") is None
+         assert self.transform_value(con, None, "VARCHAR") is None
+         assert self.transform_value(con, None, "TIMESTAMP") is None
+ 
+     # ==================== EDGE CASES ====================
+ 
+     def test_empty_string(self, con):
+         """Test empty string handling."""
+         assert self.transform_value(con, "", "VARCHAR") == ""
+ 
+     def test_numeric_string_in_varchar(self, con):
+         """Test numeric-looking string in VARCHAR stays as string."""
+         # VARCHAR should pass through without numeric conversion
+         assert self.transform_value(con, "42", "VARCHAR") == "42"
+         assert self.transform_value(con, "3.14", "VARCHAR") == "3.14"
+ 
+ 
+ class TestStatsInCheckpoint:
+     """Test that stats are correctly included in checkpoint parquet output."""
+ 
+     @pytest.fixture
+     def con(self):
+         """Create a DuckDB connection with DuckLake extension."""
+         con = duckdb.connect()
+         try:
+             # Try loading first (if already installed)
+             con.execute("LOAD ducklake")
+         except Exception:
+             try:
+                 # Try installing from community
+                 con.execute("INSTALL ducklake FROM community")
+                 con.execute("LOAD ducklake")
+             except Exception as e:
+                 pytest.skip(f"DuckLake extension not available: {e}")
+         return con
+ 
+     def test_stats_structure_in_add_action(self, con, tmp_path):
+         """Test that stats JSON has correct structure in add action."""
+         # Create a DuckLake database with test data
+         db_path = str(tmp_path / "test.ducklake")
+         data_path = str(tmp_path / "data")
+ 
+         con.execute(f"ATTACH 'ducklake:{db_path}' AS test_db (DATA_PATH '{data_path}')")
+         con.execute("USE test_db")
+ 
+         # Create table with various column types
+         con.execute("""
+             CREATE TABLE test_stats (
+                 id INTEGER,
+                 name VARCHAR,
+                 amount DOUBLE,
+                 created_at TIMESTAMP,
+                 is_active BOOLEAN,
+                 birth_date DATE
+             )
+         """)
+ 
+         # Insert test data
+         con.execute("""
+             INSERT INTO test_stats VALUES
+             (1, 'Alice', 100.50, '2024-01-15 10:30:00', true, '1990-05-20'),
+             (2, 'Bob', 200.75, '2024-02-20 14:45:00', false, '1985-12-10'),
+             (3, 'Charlie', 50.25, '2024-03-25 09:15:00', true, '1992-08-05')
+         """)
+ 
+         # Check that file column stats were recorded (query from metadata schema)
+         stats = con.execute("""
+             SELECT
+                 c.column_name,
+                 c.column_type,
+                 fcs.min_value,
+                 fcs.max_value,
+                 fcs.null_count,
+                 fcs.value_count
+             FROM __ducklake_metadata_test_db.ducklake_file_column_stats fcs
+             JOIN __ducklake_metadata_test_db.ducklake_column c ON fcs.column_id = c.column_id
+             ORDER BY c.column_order
+         """).fetchall()
+ 
+         # Verify stats exist for all columns
+         column_names = [s[0] for s in stats]
+         assert 'id' in column_names
+         assert 'name' in column_names
+         assert 'amount' in column_names
+         assert 'created_at' in column_names
+         assert 'is_active' in column_names
+         assert 'birth_date' in column_names
+ 
+         # Verify min/max values are present (except for boolean which has no min/max)
+         for stat in stats:
+             col_name, col_type, min_val, max_val, null_count, value_count = stat
+             if 'bool' not in col_type.lower():
+                 assert min_val is not None, f"min_value should not be None for {col_name}"
+                 assert max_val is not None, f"max_value should not be None for {col_name}"
+             assert null_count == 0, f"null_count should be 0 for {col_name}"
+             assert value_count == 3, f"value_count should be 3 for {col_name}"
+ 
+     def test_stats_json_format(self, con, tmp_path):
+         """Test that exported stats JSON is valid and has expected format."""
+         # Create a DuckLake database
+         db_path = str(tmp_path / "test.ducklake")
+         data_path = str(tmp_path / "data")
+ 
+         con.execute(f"ATTACH 'ducklake:{db_path}' AS test_db (DATA_PATH '{data_path}')")
+         con.execute("USE test_db")
+ 
+         con.execute("""
+             CREATE TABLE stats_test (
+                 int_col INTEGER,
+                 str_col VARCHAR,
+                 ts_col TIMESTAMP
+             )
+         """)
+ 
+         con.execute("""
+             INSERT INTO stats_test VALUES
+             (10, 'min', '2024-01-01 00:00:00'),
+             (50, 'max', '2024-12-31 23:59:59')
+         """)
+ 
+         # Close connection before exporting (file lock)
+         con.close()
+ 
+         # Run the exporter
+         from ducklake_delta_exporter import generate_latest_delta_log
+         generate_latest_delta_log(db_path)
+ 
+         # Reopen for verification
+         con = duckdb.connect()
+ 
+         # Read the checkpoint parquet
+         checkpoint_files = list((tmp_path / "data" / "main" / "stats_test" / "_delta_log").glob("*.checkpoint.parquet"))
+         assert len(checkpoint_files) == 1, "Should have exactly one checkpoint file"
+ 
+         # Read the add action and verify stats
+         add_rows = con.execute(f"""
+             SELECT add.stats
+             FROM '{checkpoint_files[0]}'
+             WHERE add IS NOT NULL
+         """).fetchall()
+ 
+         assert len(add_rows) == 1, "Should have one add action"
+ 
+         stats_json = json.loads(add_rows[0][0])
+ 
+         # Verify stats structure
+         assert 'numRecords' in stats_json
+         assert 'minValues' in stats_json
+         assert 'maxValues' in stats_json
+         assert 'nullCount' in stats_json
+ 
+         # Verify record count
+         assert stats_json['numRecords'] == 2
+ 
+         # Verify min/max values exist for columns
+         assert 'int_col' in stats_json['minValues']
+         assert 'str_col' in stats_json['minValues']
+         assert 'ts_col' in stats_json['minValues']
+ 
+         # Verify integer stats
+         assert stats_json['minValues']['int_col'] == '10'
+         assert stats_json['maxValues']['int_col'] == '50'
+ 
+         # Verify string stats (alphabetically: 'max' < 'min')
+         assert stats_json['minValues']['str_col'] == 'max'
+         assert stats_json['maxValues']['str_col'] == 'min'
+ 
+         # Verify timestamp format (should be ISO with Z suffix)
+         assert stats_json['minValues']['ts_col'].endswith('Z')
+         assert 'T' in stats_json['minValues']['ts_col']
+ 
+     def test_null_count_tracking(self, con, tmp_path):
+         """Test that null counts are correctly tracked in stats."""
+         db_path = str(tmp_path / "test.ducklake")
+         data_path = str(tmp_path / "data")
+ 
+         con.execute(f"ATTACH 'ducklake:{db_path}' AS test_db (DATA_PATH '{data_path}')")
+         con.execute("USE test_db")
+ 
+         con.execute("""
+             CREATE TABLE null_test (
+                 required_col INTEGER,
+                 nullable_col VARCHAR
+             )
+         """)
+ 
+         con.execute("""
+             INSERT INTO null_test VALUES
+             (1, 'value1'),
+             (2, NULL),
+             (3, 'value3'),
+             (4, NULL)
+         """)
+ 
+         # Close connection before exporting (file lock)
+         con.close()
+ 
+         from ducklake_delta_exporter import generate_latest_delta_log
+         generate_latest_delta_log(db_path)
+ 
+         # Reopen for verification
+         con = duckdb.connect()
+ 
+         checkpoint_files = list((tmp_path / "data" / "main" / "null_test" / "_delta_log").glob("*.checkpoint.parquet"))
+ 
+         add_rows = con.execute(f"""
+             SELECT add.stats
+             FROM '{checkpoint_files[0]}'
+             WHERE add IS NOT NULL
+         """).fetchall()
+ 
+         stats_json = json.loads(add_rows[0][0])
+ 
+         # Verify null counts
+         assert stats_json['nullCount']['required_col'] == 0
+         assert stats_json['nullCount']['nullable_col'] == 2
+ 
+     def test_stats_isolation_between_tables(self, con, tmp_path):
+         """Test that stats from different tables don't leak into each other.
+ 
+         This tests the fix for the table_id grouping bug where columns with
+         the same name from different tables could have their stats mixed.
+         """
+         db_path = str(tmp_path / "test.ducklake")
+         data_path = str(tmp_path / "data")
+ 
+         con.execute(f"ATTACH 'ducklake:{db_path}' AS test_db (DATA_PATH '{data_path}')")
+         con.execute("USE test_db")
+ 
+         # Create two tables with overlapping column names but different data
+         con.execute("""
+             CREATE TABLE table_a (
+                 id INTEGER,
+                 name VARCHAR,
+                 value INTEGER
+             )
+         """)
+ 
+         con.execute("""
+             CREATE TABLE table_b (
+                 id INTEGER,
+                 name VARCHAR,
+                 amount DOUBLE,
+                 extra_col VARCHAR
+             )
+         """)
+ 
+         # Insert different data ranges
+         con.execute("""
+             INSERT INTO table_a VALUES
+             (1, 'alpha', 100),
+             (2, 'beta', 200)
+         """)
+ 
+         con.execute("""
+             INSERT INTO table_b VALUES
+             (100, 'zebra', 999.99, 'extra1'),
+             (200, 'yak', 888.88, 'extra2')
+         """)
+ 
+         # Close connection before exporting (file lock)
+         con.close()
+ 
+         from ducklake_delta_exporter import generate_latest_delta_log
+         generate_latest_delta_log(db_path)
+ 
+         # Reopen for verification
+         con = duckdb.connect()
+         con.execute("LOAD ducklake")
+ 
+         # Check table_a stats - should only have its own columns
+         checkpoint_a = list((tmp_path / "data" / "main" / "table_a" / "_delta_log").glob("*.checkpoint.parquet"))
+         assert len(checkpoint_a) == 1
+ 
+         stats_a = con.execute(f"""
+             SELECT add.stats
+             FROM '{checkpoint_a[0]}'
+             WHERE add IS NOT NULL
+         """).fetchone()[0]
+ 
+         stats_a_json = json.loads(stats_a)
+ 
+         # table_a should have id, name, value - NOT amount or extra_col
+         assert 'id' in stats_a_json['minValues']
+         assert 'name' in stats_a_json['minValues']
+         assert 'value' in stats_a_json['minValues']
+         assert 'amount' not in stats_a_json['minValues'], "table_a should not have table_b's 'amount' column"
+         assert 'extra_col' not in stats_a_json['minValues'], "table_a should not have table_b's 'extra_col' column"
+ 
+         # Verify table_a stats have correct values (not mixed with table_b)
+         assert stats_a_json['minValues']['id'] == '1'  # table_a has 1,2 not 100,200
+         assert stats_a_json['maxValues']['id'] == '2'
+         assert stats_a_json['minValues']['name'] == 'alpha'  # not 'yak' or 'zebra'
+         assert stats_a_json['maxValues']['name'] == 'beta'
+ 
+         # Check table_b stats - should only have its own columns
+         checkpoint_b = list((tmp_path / "data" / "main" / "table_b" / "_delta_log").glob("*.checkpoint.parquet"))
+         assert len(checkpoint_b) == 1
+ 
+         stats_b = con.execute(f"""
+             SELECT add.stats
+             FROM '{checkpoint_b[0]}'
+             WHERE add IS NOT NULL
+         """).fetchone()[0]
+ 
+         stats_b_json = json.loads(stats_b)
+ 
+         # table_b should have id, name, amount, extra_col - NOT value
+         assert 'id' in stats_b_json['minValues']
+         assert 'name' in stats_b_json['minValues']
+         assert 'amount' in stats_b_json['minValues']
+         assert 'extra_col' in stats_b_json['minValues']
+         assert 'value' not in stats_b_json['minValues'], "table_b should not have table_a's 'value' column"
+ 
+         # Verify table_b stats have correct values (not mixed with table_a)
+         assert stats_b_json['minValues']['id'] == '100'  # table_b has 100,200 not 1,2
+         assert stats_b_json['maxValues']['id'] == '200'
+         assert stats_b_json['minValues']['name'] == 'yak'  # not 'alpha' or 'beta'
+         assert stats_b_json['maxValues']['name'] == 'zebra'
+ 
+ 
+ class TestDeltaLakeTypeMapping:
+     """Test that DuckDB types are correctly mapped to Delta Lake types in schema."""
+ 
+     @pytest.fixture
+     def con(self):
+         """Create a DuckDB connection."""
+         return duckdb.connect()
+ 
+     def map_type(self, con, duckdb_type: str) -> str:
+         """Map DuckDB type to Delta Lake type using the exporter's logic."""
+         result = con.execute("""
+             SELECT CASE
+                 WHEN contains(lower($1), 'bigint') OR
+                      (contains(lower($1), 'int') AND contains($1, '64')) THEN 'long'
+                 WHEN contains(lower($1), 'int') THEN 'integer'
+                 WHEN contains(lower($1), 'float') THEN 'double'
+                 WHEN contains(lower($1), 'double') THEN 'double'
+                 WHEN contains(lower($1), 'bool') THEN 'boolean'
+                 WHEN contains(lower($1), 'timestamp') THEN 'timestamp'
+                 WHEN contains(lower($1), 'date') THEN 'date'
+                 WHEN contains(lower($1), 'decimal') THEN lower($1)
+                 ELSE 'string'
+             END
+         """, [duckdb_type]).fetchone()[0]
+         return result
+ 
+     def test_integer_types(self, con):
+         """Test integer type mappings."""
+         assert self.map_type(con, "INTEGER") == "integer"
+         assert self.map_type(con, "INT") == "integer"
+         assert self.map_type(con, "INT4") == "integer"
+         assert self.map_type(con, "SMALLINT") == "integer"
+         assert self.map_type(con, "TINYINT") == "integer"
+ 
+     def test_bigint_types(self, con):
+         """Test bigint type mappings."""
+         assert self.map_type(con, "BIGINT") == "long"
+         assert self.map_type(con, "INT64") == "long"
+         assert self.map_type(con, "INT8") == "integer"  # INT8 doesn't contain '64' or 'bigint'
+ 
+     def test_float_types(self, con):
+         """Test float/double type mappings."""
+         assert self.map_type(con, "FLOAT") == "double"
+         assert self.map_type(con, "DOUBLE") == "double"
+         assert self.map_type(con, "REAL") == "string"  # REAL doesn't match 'float' or 'double'
+         assert self.map_type(con, "FLOAT4") == "double"
+         assert self.map_type(con, "FLOAT8") == "double"
+ 
+     def test_boolean_types(self, con):
+         """Test boolean type mappings."""
+         assert self.map_type(con, "BOOLEAN") == "boolean"
+         assert self.map_type(con, "BOOL") == "boolean"
+ 
+     def test_timestamp_types(self, con):
+         """Test timestamp type mappings."""
+         assert self.map_type(con, "TIMESTAMP") == "timestamp"
+         assert self.map_type(con, "TIMESTAMP WITH TIME ZONE") == "timestamp"
+         assert self.map_type(con, "TIMESTAMPTZ") == "timestamp"
+ 
+     def test_date_type(self, con):
+         """Test date type mapping."""
+         assert self.map_type(con, "DATE") == "date"
+ 
+     def test_decimal_types(self, con):
+         """Test decimal type mappings."""
+         assert self.map_type(con, "DECIMAL(10,2)") == "decimal(10,2)"
+         assert self.map_type(con, "DECIMAL(18,6)") == "decimal(18,6)"
+         assert self.map_type(con, "NUMERIC(5,2)") == "string"  # NUMERIC doesn't match 'decimal'
+ 
+     def test_string_types(self, con):
+         """Test string type mappings."""
+         assert self.map_type(con, "VARCHAR") == "string"
+         assert self.map_type(con, "VARCHAR(100)") == "string"
+         assert self.map_type(con, "TEXT") == "string"
+         assert self.map_type(con, "STRING") == "string"
+         assert self.map_type(con, "CHAR(10)") == "string"
+ 
+ 
+ if __name__ == "__main__":
+     pytest.main([__file__, "-v"])
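
Two practical notes on the new test module: it can be run directly (the __main__ guard calls pytest.main with -v) or through a normal pytest invocation, and the checkpoint tests skip themselves when the ducklake extension cannot be loaded or installed. The inline comments in TestDeltaLakeTypeMapping also record known gaps that 0.3.0 leaves in place: the substring-based mapping still classifies INT8 as integer and maps REAL and NUMERIC(...) to string.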