dlt-iceberg 0.1.3__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff shows the changes between the publicly released package versions as they appear in their public registry, and is provided for informational purposes only.
dlt_iceberg/__init__.py CHANGED
@@ -13,6 +13,9 @@ from .destination import iceberg_rest as iceberg_rest_function_based
 # Export the class-based version as the primary destination
 iceberg_rest = iceberg_rest_class_based
 
+# Adapter for Iceberg-specific hints
+from .adapter import iceberg_adapter, iceberg_partition, PartitionTransform
+
 # Errors
 from .schema_casting import CastingError
 from .schema_evolution import SchemaEvolutionError
@@ -23,6 +26,9 @@ __all__ = [
     "iceberg_rest_function_based",
     "IcebergRestClient",
     "IcebergRestConfiguration",
+    "iceberg_adapter",
+    "iceberg_partition",
+    "PartitionTransform",
     "CastingError",
     "SchemaEvolutionError",
 ]
dlt_iceberg/adapter.py ADDED
@@ -0,0 +1,276 @@
+"""
+Iceberg adapter for dlt resources.
+
+Provides a way to add Iceberg-specific hints to dlt resources, following
+the adapter pattern used by BigQuery, Databricks, and other destinations.
+
+Usage:
+    from dlt_iceberg import iceberg_adapter, iceberg_partition
+
+    @dlt.resource(name="events")
+    def my_events():
+        yield {"event_date": "2024-01-01", "user_id": 123}
+
+    # Partition by month on event_date and bucket user_id
+    adapted = iceberg_adapter(
+        my_events,
+        partition=[
+            iceberg_partition.month("event_date"),
+            iceberg_partition.bucket(10, "user_id"),
+        ]
+    )
+"""
+
+import logging
+from typing import Any, List, Optional, Union, cast
+from dataclasses import dataclass
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass(frozen=True)
+class PartitionTransform:
+    """Represents an Iceberg partition transform for a column.
+
+    Attributes:
+        column: Column name to partition on
+        transform: Transform type (identity, year, month, day, hour, bucket, truncate)
+        param: Optional parameter for bucket[N] or truncate[N]
+        name: Optional custom name for the partition field
+    """
+
+    column: str
+    transform: str
+    param: Optional[int] = None
+    name: Optional[str] = None
+
+    def to_hint_value(self) -> str:
+        """Convert to partition_transform hint value."""
+        if self.param is not None:
+            return f"{self.transform}[{self.param}]"
+        return self.transform
+
+
+class iceberg_partition:
+    """Factory for Iceberg partition transforms.
+
+    Provides static methods to create partition specifications:
+
+    - identity(column, name=None): No transformation, use value as-is
+    - year(column, name=None): Extract year from timestamp/date
+    - month(column, name=None): Extract year-month from timestamp/date
+    - day(column, name=None): Extract date from timestamp/date
+    - hour(column, name=None): Extract date-hour from timestamp
+    - bucket(num_buckets, column, name=None): Hash partition into n buckets
+    - truncate(width, column, name=None): Truncate string/number to width
+
+    Examples:
+        iceberg_partition.month("created_at")
+        iceberg_partition.month("created_at", "month_created")
+        iceberg_partition.bucket(10, "user_id")
+        iceberg_partition.bucket(10, "user_id", "user_bucket")
+        iceberg_partition.truncate(4, "email")
+    """
+
+    @staticmethod
+    def identity(column: str, name: Optional[str] = None) -> PartitionTransform:
+        """Identity transform - use column value as-is for partitioning.
+
+        Args:
+            column: Column name to partition on
+            name: Optional custom name for the partition field
+        """
+        return PartitionTransform(column=column, transform="identity", name=name)
+
+    @staticmethod
+    def year(column: str, name: Optional[str] = None) -> PartitionTransform:
+        """Year transform - partition by year extracted from timestamp/date.
+
+        Args:
+            column: Column name to partition on
+            name: Optional custom name for the partition field
+        """
+        return PartitionTransform(column=column, transform="year", name=name)
+
+    @staticmethod
+    def month(column: str, name: Optional[str] = None) -> PartitionTransform:
+        """Month transform - partition by year-month extracted from timestamp/date.
+
+        Args:
+            column: Column name to partition on
+            name: Optional custom name for the partition field
+        """
+        return PartitionTransform(column=column, transform="month", name=name)
+
+    @staticmethod
+    def day(column: str, name: Optional[str] = None) -> PartitionTransform:
+        """Day transform - partition by date extracted from timestamp/date.
+
+        Args:
+            column: Column name to partition on
+            name: Optional custom name for the partition field
+        """
+        return PartitionTransform(column=column, transform="day", name=name)
+
+    @staticmethod
+    def hour(column: str, name: Optional[str] = None) -> PartitionTransform:
+        """Hour transform - partition by date-hour extracted from timestamp.
+
+        Args:
+            column: Column name to partition on
+            name: Optional custom name for the partition field
+        """
+        return PartitionTransform(column=column, transform="hour", name=name)
+
+    @staticmethod
+    def bucket(num_buckets: int, column: str, name: Optional[str] = None) -> PartitionTransform:
+        """Bucket transform - hash partition into n buckets.
+
+        Args:
+            num_buckets: Number of buckets (must be positive)
+            column: Column name to partition on
+            name: Optional custom name for the partition field
+
+        Raises:
+            ValueError: If num_buckets is not positive
+        """
+        if num_buckets <= 0:
+            raise ValueError(f"num_buckets must be positive, got {num_buckets}")
+        return PartitionTransform(column=column, transform="bucket", param=num_buckets, name=name)
+
+    @staticmethod
+    def truncate(width: int, column: str, name: Optional[str] = None) -> PartitionTransform:
+        """Truncate transform - truncate string/number to width.
+
+        Args:
+            width: Truncation width (must be positive)
+            column: Column name to partition on
+            name: Optional custom name for the partition field
+
+        Raises:
+            ValueError: If width is not positive
+        """
+        if width <= 0:
+            raise ValueError(f"width must be positive, got {width}")
+        return PartitionTransform(column=column, transform="truncate", param=width, name=name)
+
+
+def _get_resource_for_adapter(data: Any):
+    """Get or create a DltResource from data.
+
+    Follows the pattern from dlt.destinations.utils.get_resource_for_adapter.
+    """
+    import dlt
+    from dlt.extract.resource import DltResource
+    from dlt.extract.source import DltSource
+
+    if isinstance(data, DltResource):
+        return data
+
+    if isinstance(data, DltSource):
+        if len(data.selected_resources.keys()) == 1:
+            return list(data.selected_resources.values())[0]
+        else:
+            raise ValueError(
+                "You are trying to use iceberg_adapter on a DltSource with "
+                "multiple resources. You can only use adapters on: pure data, "
+                "a DltResource, or a DltSource with a single DltResource."
+            )
+
+    resource_name = None
+    if not hasattr(data, "__name__"):
+        logger.info("Setting default resource name to 'content' for adapted resource.")
+        resource_name = "content"
+
+    return cast("DltResource", dlt.resource(data, name=resource_name))
+
+
+def iceberg_adapter(
+    data: Any,
+    partition: Optional[Union[str, PartitionTransform, List[Union[str, PartitionTransform]]]] = None,
+):
+    """
+    Apply Iceberg-specific hints to a dlt resource.
+
+    This adapter prepares data for loading into Iceberg tables by setting
+    partition specifications using Iceberg's native transforms.
+
+    Args:
+        data: A dlt resource, source (with single resource), or raw data
+        partition: Partition specification(s). Can be:
+            - A column name string (uses identity transform)
+            - A single PartitionTransform
+            - A list of column names and/or PartitionTransform objects
+            Use iceberg_partition helpers to create transforms.
+
+    Returns:
+        DltResource with Iceberg-specific hints applied
+
+    Examples:
+        # Simple identity partition by column name
+        iceberg_adapter(my_resource, partition="region")
+        iceberg_adapter(my_resource, partition=["region", "category"])
+
+        # Single partition column with month transform
+        iceberg_adapter(my_resource, partition=iceberg_partition.month("created_at"))
+
+        # Multiple partition columns with mixed specs
+        iceberg_adapter(
+            my_resource,
+            partition=[
+                iceberg_partition.day("event_date"),
+                "region",  # identity partition
+                iceberg_partition.bucket(10, "user_id"),
+            ]
+        )
+
+        # Works with raw data too
+        data = [{"id": 1, "ts": "2024-01-01"}]
+        iceberg_adapter(data, partition=iceberg_partition.month("ts"))
+    """
+    resource = _get_resource_for_adapter(data)
+
+    if partition is None:
+        return resource
+
+    # Normalize to list
+    if isinstance(partition, (str, PartitionTransform)):
+        partition_list = [partition]
+    else:
+        partition_list = partition
+
+    if not partition_list:
+        return resource
+
+    # Convert strings to identity PartitionTransforms
+    partitions: List[PartitionTransform] = []
+    for p in partition_list:
+        if isinstance(p, str):
+            partitions.append(iceberg_partition.identity(p))
+        else:
+            partitions.append(p)
+
+    # Build column hints for partitioning
+    column_hints = {}
+
+    for p in partitions:
+        if p.column not in column_hints:
+            column_hints[p.column] = {"name": p.column}
+
+        # Set partition flag using x-partition (custom hint prefix)
+        column_hints[p.column]["x-partition"] = True
+
+        # Set transform (identity is handled as default in partition_builder)
+        if p.transform != "identity":
+            column_hints[p.column]["x-partition-transform"] = p.to_hint_value()
+
+        # Set custom partition field name if provided
+        if p.name:
+            column_hints[p.column]["x-partition-name"] = p.name
+
+    # Apply hints to resource
+    resource.apply_hints(columns=column_hints)
+
+    logger.info(f"Applied Iceberg partition hints: {[p.column for p in partitions]}")
+
+    return resource
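
For orientation, the adapter above is meant to compose with an ordinary dlt resource, as in this minimal sketch (resource and column names are illustrative; the Iceberg catalog and destination configuration are assumed to be set up separately):

    import dlt
    from dlt_iceberg import iceberg_adapter, iceberg_partition

    @dlt.resource(name="events")
    def events():
        yield {"event_date": "2024-01-01", "user_id": 123, "region": "eu"}

    # Identity-partition on region, month-partition on event_date, bucket user_id into 10 buckets
    events = iceberg_adapter(
        events,
        partition=[
            "region",
            iceberg_partition.month("event_date"),
            iceberg_partition.bucket(10, "user_id"),
        ],
    )

The adapter only records the x-partition, x-partition-transform and x-partition-name column hints; the destination's partition builder is expected to translate them into the actual Iceberg partition spec.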
dlt_iceberg/destination.py CHANGED
@@ -37,6 +37,87 @@ from pyiceberg.io.pyarrow import schema_to_pyarrow
 logger = logging.getLogger(__name__)
 
 
+def _get_merge_strategy(table_schema: TTableSchema) -> str:
+    """Extract merge strategy from table schema.
+
+    write_disposition can be:
+    - "merge" (string) -> use upsert (backward compatible)
+    - {"disposition": "merge", "strategy": "delete-insert"} -> explicit strategy
+
+    Returns:
+        Merge strategy: "upsert" or "delete-insert"
+    """
+    write_disposition = table_schema.get("write_disposition", "append")
+
+    if isinstance(write_disposition, dict):
+        return write_disposition.get("strategy", "delete-insert")
+
+    # String "merge" - use upsert as our default (backward compatible)
+    return "upsert"
+
+
+def _execute_delete_insert(iceberg_table, arrow_table, primary_keys: list, identifier: str):
+    """Execute delete-insert merge strategy.
+
+    Deletes rows matching primary keys in incoming data, then appends new data.
+    Uses PyIceberg transaction for atomic delete + append.
+
+    Args:
+        iceberg_table: PyIceberg table object
+        arrow_table: Arrow table with data to merge
+        primary_keys: List of primary key column names
+        identifier: Table identifier for logging
+
+    Returns:
+        Tuple of (rows_deleted_estimate, rows_inserted)
+    """
+    from pyiceberg.expressions import In, And, EqualTo, Or
+
+    # Build delete filter from primary key values in incoming data
+    if len(primary_keys) == 1:
+        pk_col = primary_keys[0]
+        pk_values = arrow_table.column(pk_col).to_pylist()
+        unique_pk_values = list(set(pk_values))
+        delete_filter = In(pk_col, unique_pk_values)
+        deleted_estimate = len(unique_pk_values)
+    else:
+        # Composite primary key - build OR of AND conditions
+        pk_tuples = set()
+        for i in range(len(arrow_table)):
+            pk_tuple = tuple(
+                arrow_table.column(pk).to_pylist()[i] for pk in primary_keys
+            )
+            pk_tuples.add(pk_tuple)
+
+        conditions = []
+        for pk_tuple in pk_tuples:
+            and_conditions = [
+                EqualTo(pk, val) for pk, val in zip(primary_keys, pk_tuple)
+            ]
+            if len(and_conditions) == 1:
+                conditions.append(and_conditions[0])
+            else:
+                conditions.append(And(*and_conditions))
+
+        if len(conditions) == 1:
+            delete_filter = conditions[0]
+        else:
+            delete_filter = Or(*conditions)
+        deleted_estimate = len(pk_tuples)
+
+    logger.info(
+        f"Delete-insert for {identifier}: deleting up to {deleted_estimate} "
+        f"matching rows, inserting {len(arrow_table)} rows"
+    )
+
+    # Execute atomic delete + append using transaction
+    with iceberg_table.transaction() as txn:
+        txn.delete(delete_filter)
+        txn.append(arrow_table)
+
+    return (deleted_estimate, len(arrow_table))
+
+
 def _iceberg_rest_handler(
     items: str,  # File path when batch_size=0
     table: TTableSchema,
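
On the resource side, the strategy read by _get_merge_strategy above is the standard dlt write_disposition hint. A hedged sketch of a resource opting into the new delete-insert strategy (table and key names are illustrative):

    import dlt

    @dlt.resource(
        name="orders",
        primary_key="order_id",
        write_disposition={"disposition": "merge", "strategy": "delete-insert"},
    )
    def orders():
        yield {"order_id": 1, "status": "shipped"}

A plain write_disposition="merge" keeps the previous upsert behaviour, as the helper's docstring notes.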
@@ -270,13 +351,18 @@ def _iceberg_rest_handler(
         raise
 
     # Write data based on disposition
-    if write_disposition == "replace":
+    # Handle both string and dict write_disposition
+    disposition_type = write_disposition
+    if isinstance(write_disposition, dict):
+        disposition_type = write_disposition.get("disposition", "append")
+
+    if disposition_type == "replace":
         logger.info(f"Overwriting table {identifier}")
         iceberg_table.overwrite(arrow_table)
-    elif write_disposition == "append":
+    elif disposition_type == "append":
         logger.info(f"Appending to table {identifier}")
         iceberg_table.append(arrow_table)
-    elif write_disposition == "merge":
+    elif disposition_type == "merge":
         # For merge, we need primary keys
         # Try multiple ways to get primary keys from dlt table schema
         primary_keys = table.get("primary_key") or table.get("x-merge-keys")
@@ -296,21 +382,36 @@ def _iceberg_rest_handler(
             )
             iceberg_table.append(arrow_table)
         else:
-            logger.info(f"Merging into table {identifier} on keys {primary_keys}")
-            # Use PyIceberg's upsert API to update existing rows and insert new ones
-            # PyIceberg will automatically match rows based on join_cols (primary keys)
-            upsert_result = iceberg_table.upsert(
-                df=arrow_table,
-                join_cols=primary_keys,
-                when_matched_update_all=True,
-                when_not_matched_insert_all=True,
-            )
+            # Get merge strategy
+            merge_strategy = _get_merge_strategy(table)
             logger.info(
-                f"Upsert completed: {upsert_result.rows_updated} updated, "
-                f"{upsert_result.rows_inserted} inserted"
+                f"Merging into table {identifier} on keys {primary_keys} "
+                f"using strategy: {merge_strategy}"
             )
+
+            if merge_strategy == "delete-insert":
+                # Atomic delete + insert
+                deleted, inserted = _execute_delete_insert(
+                    iceberg_table, arrow_table, primary_keys, identifier
+                )
+                logger.info(
+                    f"Delete-insert completed: ~{deleted} deleted, "
+                    f"{inserted} inserted"
+                )
+            else:
+                # Default: upsert strategy
+                upsert_result = iceberg_table.upsert(
+                    df=arrow_table,
+                    join_cols=primary_keys,
+                    when_matched_update_all=True,
+                    when_not_matched_insert_all=True,
+                )
+                logger.info(
+                    f"Upsert completed: {upsert_result.rows_updated} updated, "
+                    f"{upsert_result.rows_inserted} inserted"
+                )
     else:
-        raise ValueError(f"Unknown write disposition: {write_disposition}")
+        raise ValueError(f"Unknown write disposition: {disposition_type}")
 
     logger.info(f"Successfully wrote {len(arrow_table)} rows to {identifier}")
     return  # Success
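
Note the asymmetry in the dispatch above: the legacy string form "merge" stays on the upsert path, while a dict without an explicit strategy falls back to delete-insert. A small self-contained sketch of the mapping implemented by _get_merge_strategy (plain values stand in for the dlt table schema):

    # Mirrors the strategy selection in _get_merge_strategy above.
    def pick_strategy(write_disposition):
        if isinstance(write_disposition, dict):
            return write_disposition.get("strategy", "delete-insert")
        return "upsert"

    assert pick_strategy("merge") == "upsert"  # legacy string form
    assert pick_strategy({"disposition": "merge"}) == "delete-insert"  # dict default
    assert pick_strategy({"disposition": "merge", "strategy": "upsert"}) == "upsert"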
@@ -391,7 +492,7 @@ def iceberg_rest(**kwargs):
     def _raw_capabilities_with_merge():
         """Add merge support to the destination capabilities."""
         caps = original_raw_capabilities()
-        caps.supported_merge_strategies = ["upsert"]
+        caps.supported_merge_strategies = ["delete-insert", "upsert"]
         return caps
 
     # Bind the new method to the instance
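
Putting the pieces together, a rough end-to-end sketch (pipeline, resource, and column names are illustrative, and catalog credentials for the iceberg_rest destination are assumed to come from dlt config/secrets rather than being shown here):

    import dlt
    from dlt_iceberg import iceberg_adapter, iceberg_partition, iceberg_rest

    @dlt.resource(
        name="orders",
        primary_key="order_id",
        write_disposition={"disposition": "merge", "strategy": "delete-insert"},
    )
    def orders():
        yield {"order_id": 1, "order_date": "2024-01-01", "status": "shipped"}

    pipeline = dlt.pipeline(pipeline_name="orders_to_iceberg", destination=iceberg_rest)
    load_info = pipeline.run(
        iceberg_adapter(orders, partition=iceberg_partition.month("order_date"))
    )
    print(load_info)

Whether the destination is passed as the class itself or as a configured instance depends on the package's configuration API, which this diff does not show.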