deltacat 2.0.0b7__py3-none-any.whl → 2.0.0b10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73)
  1. deltacat/__init__.py +27 -6
  2. deltacat/api.py +478 -123
  3. deltacat/aws/s3u.py +2 -2
  4. deltacat/benchmarking/conftest.py +1 -1
  5. deltacat/catalog/main/impl.py +12 -6
  6. deltacat/catalog/model/catalog.py +65 -47
  7. deltacat/catalog/model/properties.py +1 -3
  8. deltacat/compute/__init__.py +14 -0
  9. deltacat/compute/converter/constants.py +5 -0
  10. deltacat/compute/converter/converter_session.py +78 -36
  11. deltacat/compute/converter/model/convert_input.py +24 -4
  12. deltacat/compute/converter/model/convert_result.py +61 -0
  13. deltacat/compute/converter/model/converter_session_params.py +52 -10
  14. deltacat/compute/converter/pyiceberg/overrides.py +181 -62
  15. deltacat/compute/converter/steps/convert.py +84 -36
  16. deltacat/compute/converter/steps/dedupe.py +25 -4
  17. deltacat/compute/converter/utils/convert_task_options.py +42 -13
  18. deltacat/compute/converter/utils/iceberg_columns.py +5 -0
  19. deltacat/compute/converter/utils/io.py +82 -11
  20. deltacat/compute/converter/utils/s3u.py +13 -4
  21. deltacat/compute/jobs/__init__.py +0 -0
  22. deltacat/compute/jobs/client.py +404 -0
  23. deltacat/constants.py +4 -4
  24. deltacat/daft/daft_scan.py +7 -3
  25. deltacat/daft/translator.py +126 -0
  26. deltacat/examples/basic_logging.py +5 -3
  27. deltacat/examples/hello_world.py +4 -2
  28. deltacat/examples/indexer/__init__.py +0 -0
  29. deltacat/examples/indexer/aws/__init__.py +0 -0
  30. deltacat/examples/indexer/gcp/__init__.py +0 -0
  31. deltacat/examples/indexer/indexer.py +163 -0
  32. deltacat/examples/indexer/job_runner.py +199 -0
  33. deltacat/io/__init__.py +13 -0
  34. deltacat/io/dataset/__init__.py +0 -0
  35. deltacat/io/dataset/deltacat_dataset.py +91 -0
  36. deltacat/io/datasink/__init__.py +0 -0
  37. deltacat/io/datasink/deltacat_datasink.py +207 -0
  38. deltacat/io/datasource/__init__.py +0 -0
  39. deltacat/io/datasource/deltacat_datasource.py +580 -0
  40. deltacat/io/reader/__init__.py +0 -0
  41. deltacat/io/reader/deltacat_read_api.py +172 -0
  42. deltacat/storage/__init__.py +2 -0
  43. deltacat/storage/model/expression/__init__.py +47 -0
  44. deltacat/storage/model/expression/expression.py +656 -0
  45. deltacat/storage/model/expression/visitor.py +248 -0
  46. deltacat/storage/model/metafile.py +74 -42
  47. deltacat/storage/model/scan/push_down.py +32 -5
  48. deltacat/storage/model/types.py +5 -3
  49. deltacat/storage/rivulet/__init__.py +4 -4
  50. deltacat/tests/_io/reader/__init__.py +0 -0
  51. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  52. deltacat/tests/compute/converter/test_convert_session.py +209 -46
  53. deltacat/tests/local_deltacat_storage/__init__.py +1 -0
  54. deltacat/tests/storage/model/test_expression.py +327 -0
  55. deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +2 -1
  56. deltacat/tests/storage/rivulet/test_dataset.py +1 -1
  57. deltacat/tests/storage/rivulet/test_manifest.py +1 -1
  58. deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +1 -1
  59. deltacat/tests/test_deltacat_api.py +50 -9
  60. deltacat/types/media.py +141 -43
  61. deltacat/types/tables.py +35 -7
  62. deltacat/utils/daft.py +2 -2
  63. deltacat/utils/filesystem.py +39 -9
  64. deltacat/utils/polars.py +128 -0
  65. deltacat/utils/pyarrow.py +151 -15
  66. deltacat/utils/ray_utils/concurrency.py +1 -1
  67. deltacat/utils/ray_utils/runtime.py +56 -4
  68. deltacat/utils/url.py +1284 -0
  69. {deltacat-2.0.0b7.dist-info → deltacat-2.0.0b10.dist-info}/METADATA +9 -6
  70. {deltacat-2.0.0b7.dist-info → deltacat-2.0.0b10.dist-info}/RECORD +73 -48
  71. {deltacat-2.0.0b7.dist-info → deltacat-2.0.0b10.dist-info}/LICENSE +0 -0
  72. {deltacat-2.0.0b7.dist-info → deltacat-2.0.0b10.dist-info}/WHEEL +0 -0
  73. {deltacat-2.0.0b7.dist-info → deltacat-2.0.0b10.dist-info}/top_level.txt +0 -0

deltacat/catalog/main/impl.py
@@ -1,6 +1,8 @@
  from typing import Any, Dict, List, Optional, Union, Tuple
  import logging

+ import deltacat as dc
+
  from deltacat.catalog import CatalogProperties
  from deltacat.exceptions import (
      NamespaceAlreadyExistsError,
@@ -34,17 +36,17 @@ from deltacat.types.tables import TableWriteMode
  from deltacat.compute.merge_on_read import MERGE_FUNC_BY_DISTRIBUTED_DATASET_TYPE
  from deltacat import logs
  from deltacat.constants import DEFAULT_NAMESPACE
- from deltacat.storage import metastore as storage_impl

  logger = logs.configure_deltacat_logger(logging.getLogger(__name__))

  """
- This is the default implementation for the Catalog interface, using DeltaCAT native storage
+ Default Catalog interface implementation using DeltaCAT native storage.

- Note that, when this catalog implementation gets called through the normal pattern of `delegate.py`, all functions
- will be called the kwarg "inner" equal to the `CatalogProperties` this was initialized with.
+ When this is used by `delegate.py`, the `Catalog` implementation's `inner`
+ property will be set to the value returned from `initialize`.

- `CatalogProperties` has all state required to implement catalog functions, such as metastore root URI
+ `CatalogProperties` has all state required to implement catalog functions,
+ such as metastore root URI.
  """


@@ -56,6 +58,10 @@ def initialize(config: CatalogProperties = None, *args, **kwargs) -> CatalogProp
      returns CatalogProperties as the "inner" state value for a DC native catalog
      """
      if config is not None:
+         if not isinstance(config, CatalogProperties):
+             raise ValueError(
+                 f"Expected `CatalogProperties` but found `{type(config)}`."
+             )
          return config
      else:
          return CatalogProperties(*args, **kwargs)
@@ -717,4 +723,4 @@ def _get_storage(**kwargs):
      if properties is not None and properties.storage is not None:
          return properties.storage
      else:
-         return storage_impl
+         return dc.storage.metastore
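
A minimal sketch of the new `initialize` guard (module path and keyword names are taken from this diff; the `root` value is a placeholder):

    from deltacat.catalog import CatalogProperties
    from deltacat.catalog.main import impl as catalog_impl

    # A CatalogProperties instance is returned unchanged as the catalog's "inner" state.
    inner = catalog_impl.initialize(CatalogProperties(root="/tmp/.deltacat"))

    # Anything else is now rejected instead of being passed through silently.
    try:
        catalog_impl.initialize(config={"root": "/tmp/.deltacat"})
    except ValueError as err:
        print(err)  # Expected `CatalogProperties` but found `<class 'dict'>`.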

deltacat/catalog/model/catalog.py
@@ -10,7 +10,7 @@ import ray

  from deltacat import logs
  from deltacat.annotations import ExperimentalAPI
- from deltacat.catalog.main import impl as DeltacatCatalog
+ from deltacat.catalog.main import impl as DeltaCatCatalog
  from deltacat.catalog.iceberg import impl as IcebergCatalog
  from deltacat.catalog import CatalogProperties
  from deltacat.catalog.iceberg import IcebergCatalogConfig
@@ -22,17 +22,14 @@ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))


  class Catalog:
-     def __init__(self, impl: ModuleType = DeltacatCatalog, *args, **kwargs):
+     def __init__(self, impl: ModuleType = DeltaCatCatalog, *args, **kwargs):
          """
          Constructor for a Catalog.

-         The args and kwargs here will be plumbed through to the catalog initialize function, and the results
-         are stored in Catalog.inner. Any state which is required (like: metastore root URI, pyiceberg native catalog)
-         MUST be returned by initialize.
-
-         Note: all initialization configuration MUST be pickle-able. When `Catalog` is pickled, _inner is excluded.
-         Instead, we only pass impl/args/kwargs, which are pickled and then _inner is re-constituted by calling __init__.
-         See `ray.util.register_serializer` in Catalogs class.
+         Invokes `impl.initialize(*args, **kwargs)` and stores its return value
+         in the `inner` property, which captures all state required to
+         deterministically reconstruct this Catalog instance on any node (and
+         must therefore be pickleable by Ray cloudpickle).
          """
          if not isinstance(self, Catalog):
              # self may contain the tuple returned from __reduce__ (ray pickle bug?)
@@ -68,7 +65,7 @@ class Catalog:

          Uses CatalogProperties as configuration
          """
-         return cls(impl=DeltacatCatalog, *args, **{"config": config, **kwargs})
+         return cls(impl=DeltaCatCatalog, *args, **{"config": config, **kwargs})

      @property
      def impl(self):
@@ -104,25 +101,17 @@ class Catalogs:
          self,
          catalogs: Union[Catalog, Dict[str, Catalog]],
          default: Optional[str] = None,
-         *args,
-         **kwargs,
      ):
-         if default and default not in catalogs:
-             raise ValueError(
-                 f"Catalog {default} not found " f"in catalogs to register: {catalogs}"
-             )
-         if not catalogs:
-             raise ValueError(
-                 f"No catalogs given to register. "
-                 f"Please specify one or more catalogs."
-             )
-
-         # if user only provides single Catalog, override it to be a map with default key
          if isinstance(catalogs, Catalog):
              catalogs = {DEFAULT_CATALOG: catalogs}
-
+         elif not isinstance(catalogs, dict):
+             raise ValueError(f"Expected Catalog or dict, but found: {catalogs}")
          self.catalogs: Dict[str, Catalog] = catalogs
          if default:
+             if default not in catalogs:
+                 raise ValueError(
+                     f"Default catalog `{default}` not found in: {catalogs}"
+                 )
              self.default_catalog = self.catalogs[default]
          elif len(catalogs) == 1:
              self.default_catalog = list(self.catalogs.values())[0]
@@ -140,7 +129,7 @@
          if set_default:
              self.default_catalog = catalog

-     def get(self, name) -> Catalog:
+     def get(self, name) -> Optional[Catalog]:
          return self.catalogs.get(name)

      def default(self) -> Optional[Catalog]:
@@ -149,7 +138,7 @@

  def is_initialized(*args, **kwargs) -> bool:
      """
-     Check if DeltaCAT is initialized
+     Check if DeltaCAT is initialized.
      """
      global all_catalogs

@@ -162,22 +151,36 @@ def is_initialized(*args, **kwargs) -> bool:
      return all_catalogs is not None


+ def raise_if_not_initialized(
+     err_msg: str = "DeltaCAT is not initialized. Please call `deltacat.init()` and try again.",
+ ) -> None:
+     """
+     Raises a RuntimeError with the given error message if DeltaCAT is not
+     initialized.
+
+     :param err_msg: Custom error message to raise if DeltaCAT is not
+         initialized. If unspecified, the default error message is used.
+     """
+     if not is_initialized():
+         raise RuntimeError(err_msg)
+
+
  def init(
-     catalogs: Union[Dict[str, Catalog], Catalog],
+     catalogs: Union[Dict[str, Catalog], Catalog] = {},
      default: Optional[str] = None,
      ray_init_args: Dict[str, Any] = None,
-     *args,
+     *,
      force_reinitialize=False,
-     **kwargs,
  ) -> None:
      """
      Initialize DeltaCAT catalogs.

-     :param catalogs: Either a single Catalog instance or a map of string to Catalog instance
-     :param default: The Catalog to use by default. If only one Catalog is provided, it will
-         be set as the default
-     :param ray_init_args: kwargs to pass to ray initialization
-     :param force_reinitialize: if True, force the reinitialization of Ray. If false, will do nothing if ray already initialized
+     :param catalogs: A single Catalog instance or a map of catalog names to
+         Catalog instances.
+     :param default: The name of the default Catalog. If only one Catalog is
+         provided, it will always be the default.
+     :param ray_init_args: Keyword arguments to pass to `ray.init()`.
+     :param force_reinitialize: Whether to force Ray reinitialization.
      """
      global all_catalogs

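With this change, `catalogs` may be omitted and `force_reinitialize` is keyword-only. A usage sketch, assuming `init` and `raise_if_not_initialized` are re-exported from the top-level `deltacat` package (the `deltacat/__init__.py` changes are not expanded in this diff):

    import deltacat as dc

    # Catalogs can now be registered later; init() no longer requires them up front.
    dc.init()

    # force_reinitialize must be passed by keyword after this change.
    dc.init(ray_init_args={"ignore_reinit_error": True}, force_reinitialize=True)

    # New guard helper: raises RuntimeError if deltacat.init() was never called.
    dc.raise_if_not_initialized()
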
@@ -195,6 +198,8 @@ def init(
      ray.util.register_serializer(
          Catalog, serializer=Catalog.__reduce__, deserializer=Catalog.__init__
      )
+     # TODO(pdames): If no catalogs are provided then re-initialize DeltaCAT
+     #  with all catalogs from the last session
      all_catalogs = Catalogs.remote(catalogs=catalogs, default=default)


@@ -216,7 +221,6 @@ def get_catalog(name: Optional[str] = None, **kwargs) -> Catalog:
              "`deltacat.init(catalogs={...})` to register one or more "
              "catalogs then retry."
          )
-
      if name is not None:
          catalog = ray.get(all_catalogs.get.remote(name))
          if not catalog:
@@ -225,17 +229,16 @@ def get_catalog(name: Optional[str] = None, **kwargs) -> Catalog:
                  f"Catalog '{name}' not found. Available catalogs: "
                  f"{available_catalogs}."
              )
-         return catalog
-
      else:
          catalog = ray.get(all_catalogs.default.remote())
          if not catalog:
              available_catalogs = ray.get(all_catalogs.all.remote()).values()
              raise ValueError(
-                 f"Call to get_catalog without name set failed because there is no default Catalog set. Available catalogs: "
+                 f"Call to get_catalog without name set failed because there "
+                 f"is no default Catalog set. Available catalogs: "
                  f"{available_catalogs}."
              )
-         return catalog
+     return catalog


  def put_catalog(
@@ -246,23 +249,37 @@ def put_catalog(
      ray_init_args: Dict[str, Any] = None,
      fail_if_exists: bool = False,
      **kwargs,
- ) -> None:
+ ) -> Catalog:
      """
      Add a named catalog to the global map of named catalogs. Initializes ray if not already initialized.

      Args:
-         name: name of catalog
-         catalog: catalog instance to use, if provided
-         default: Make this the default catalog if multiple catalogs are available.
-             ignored if this is the only catalog available, since it will always be the default catalog.
-         ray_init_args: ray initialization args (used only if ray not already initialized)
-         fail_if_exists: if True, raises KeyError if the catalog name already exists. Otherwise, overwrite catalog
+         name: Name of the catalog.
+         catalog: Catalog instance to use. If none is provided, then all
+             additional keyword arguments will be forwarded to
+             `CatalogProperties` for a default DeltaCAT native Catalog.
+         default: Make this the default catalog if multiple catalogs are
+             available. If only one catalog is available, it will always be the
+             default.
+         ray_init_args: Ray initialization args (used only if ray not already
+             initialized)
+         fail_if_exists: if True, raises an error if a catalog with the given
+             name already exists. If False, inserts or replaces the given
+             catalog name.
+         kwargs: Additional keyword arguments to forward to `CatalogProperties`
+             for a default DeltaCAT native Catalog.
+
+     Returns:
+         The catalog put in the named catalog map.
      """
      global all_catalogs

+     if not catalog:
+         catalog = Catalog(**kwargs)
+
      # Initialize, if necessary
      if not is_initialized():
-         # NOTE - since we are initializing with a single catalog, it will be set to the default
+         # We are initializing a single catalog - make it the default
          if not default:
              logger.info(
                  f"Calling put_catalog with set_as_default=False, "
@@ -288,3 +305,4 @@

      # Add the catalog (which may overwrite existing if fail_if_exists=False)
      ray.get(all_catalogs.put.remote(name, catalog, default))
+     return catalog
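
Since `put_catalog` now returns the registered `Catalog` and builds a default DeltaCAT native catalog from leftover keyword arguments, a hedged usage sketch (assuming `put_catalog` and `get_catalog` remain re-exported at the package root; Ray is started locally on first use):

    import deltacat as dc

    # With no `catalog` argument, remaining kwargs (e.g. `root`) are forwarded to
    # CatalogProperties and a default DeltaCAT native Catalog is registered.
    catalog = dc.put_catalog("my_catalog", root="/tmp/.deltacat")

    # Retrieval by name is unchanged; an unknown name still raises ValueError.
    same_catalog = dc.get_catalog("my_catalog")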

deltacat/catalog/model/properties.py
@@ -45,7 +45,7 @@ class CatalogProperties:
      Attributes:
          root (str): URI string of the root path where catalog metadata and data
              files are stored. Root is determined (in precedence order) by:
-             1. check kwargs for "root"
+             1. check "root" input argument
              2. check env variable "DELTACAT_ROOT"
              3. default to ${cwd}/.deltacat

@@ -61,8 +61,6 @@
          root: Optional[str] = None,
          filesystem: Optional[pyarrow.fs.FileSystem] = None,
          storage=None,
-         *args,
-         **kwargs,
      ):
          """
          Initialize a CatalogProperties instance.
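
The documented root-resolution order can be restated as a small standalone helper (a hypothetical illustration of the precedence rules above, not the library's actual implementation):

    import os

    def resolve_catalog_root(root=None):
        # 1. an explicit "root" argument wins
        if root is not None:
            return root
        # 2. otherwise fall back to the DELTACAT_ROOT environment variable
        env_root = os.environ.get("DELTACAT_ROOT")
        if env_root:
            return env_root
        # 3. otherwise default to ${cwd}/.deltacat
        return os.path.join(os.getcwd(), ".deltacat")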

deltacat/compute/__init__.py
@@ -0,0 +1,14 @@
+ from deltacat.compute.jobs.client import (
+     DeltaCatJobClient,
+     job_client,
+     local_job_client,
+ )
+
+ from ray.job_submission import JobStatus
+
+ __all__ = [
+     "job_client",
+     "local_job_client",
+     "DeltaCatJobClient",
+     "JobStatus",
+ ]
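
The new `deltacat.compute` package simply re-exports the job client added in `deltacat/compute/jobs/client.py` together with Ray's `JobStatus`; client constructor signatures are not shown in this diff, so only the imports are sketched here:

    from deltacat.compute import DeltaCatJobClient, JobStatus, job_client, local_job_client

    # JobStatus comes straight from ray.job_submission, so terminal states can be
    # checked exactly as with Ray's JobSubmissionClient.
    TERMINAL_STATES = {JobStatus.SUCCEEDED, JobStatus.FAILED, JobStatus.STOPPED}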

deltacat/compute/converter/constants.py
@@ -2,3 +2,8 @@ DEFAULT_CONVERTER_TASK_MAX_PARALLELISM = 4096

  # Safe limit ONLY considering CPU limit, typically 32 for a 8x-large worker
  DEFAULT_MAX_PARALLEL_DATA_FILE_DOWNLOAD = 30
+
+
+ # Unique identifier delimiter to ensure different primary keys don't end up with the same hash when concatenated.
+ # e.g.: pk values ("12", "1") and ("1", "21") would both become "121" without a delimiter.
+ IDENTIFIER_FIELD_DELIMITER = "c303282d"
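
To see the collision the delimiter guards against (a standalone sketch; the join shown here is illustrative, since the identifier-concatenation code itself is not part of this diff):

    IDENTIFIER_FIELD_DELIMITER = "c303282d"

    # Without a delimiter, two distinct composite primary keys collide:
    assert "12" + "1" == "1" + "21" == "121"

    # With the delimiter, the concatenated identifiers stay distinct:
    left = IDENTIFIER_FIELD_DELIMITER.join(["12", "1"])   # "12c303282d1"
    right = IDENTIFIER_FIELD_DELIMITER.join(["1", "21"])  # "1c303282d21"
    assert left != right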

deltacat/compute/converter/converter_session.py
@@ -1,4 +1,3 @@
- # from pyiceberg.typedef import EMPTY_DICT, Identifier, Properties
  from deltacat.utils.ray_utils.concurrency import (
      invoke_parallel,
      task_resource_options_provider,
@@ -20,7 +19,6 @@ from deltacat.compute.converter.steps.convert import convert
  from deltacat.compute.converter.model.convert_input import ConvertInput
  from deltacat.compute.converter.pyiceberg.overrides import (
      fetch_all_bucket_files,
-     parquet_files_dict_to_iceberg_data_files,
  )
  from deltacat.compute.converter.utils.converter_session_utils import (
      construct_iceberg_table_prefix,
@@ -48,32 +46,46 @@ def converter_session(params: ConverterSessionParams, **kwargs):
      table_name = params.iceberg_table_name
      iceberg_table = load_table(catalog, table_name)
      enforce_primary_key_uniqueness = params.enforce_primary_key_uniqueness
+     iceberg_warehouse_bucket_name = params.iceberg_warehouse_bucket_name
+     iceberg_namespace = params.iceberg_namespace
+     merge_keys = params.merge_keys
+     compact_previous_position_delete_files = (
+         params.compact_previous_position_delete_files
+     )
+     task_max_parallelism = params.task_max_parallelism
+     s3_client_kwargs = params.s3_client_kwargs
+     s3_file_system = params.s3_file_system
+     location_provider_prefix_override = params.location_provider_prefix_override
+     position_delete_for_multiple_data_files = (
+         params.position_delete_for_multiple_data_files
+     )
+
      data_file_dict, equality_delete_dict, pos_delete_dict = fetch_all_bucket_files(
          iceberg_table
      )
+
      convert_input_files_for_all_buckets = group_all_files_to_each_bucket(
          data_file_dict=data_file_dict,
          equality_delete_dict=equality_delete_dict,
          pos_delete_dict=pos_delete_dict,
      )
-     iceberg_warehouse_bucket_name = params.iceberg_warehouse_bucket_name
-     iceberg_namespace = params.iceberg_namespace
-     iceberg_table_warehouse_prefix = construct_iceberg_table_prefix(
-         iceberg_warehouse_bucket_name=iceberg_warehouse_bucket_name,
-         table_name=table_name,
-         iceberg_namespace=iceberg_namespace,
-     )
-     merge_keys = params.merge_keys
+
+     if not location_provider_prefix_override:
+         iceberg_table_warehouse_prefix = construct_iceberg_table_prefix(
+             iceberg_warehouse_bucket_name=iceberg_warehouse_bucket_name,
+             table_name=table_name,
+             iceberg_namespace=iceberg_namespace,
+         )
+     else:
+         iceberg_table_warehouse_prefix = location_provider_prefix_override
+
      # Using table identifier fields as merge keys if merge keys not provided
      if not merge_keys:
          identifier_fields_set = iceberg_table.schema().identifier_field_names()
          identifier_fields = list(identifier_fields_set)
      else:
          identifier_fields = merge_keys
-     if len(identifier_fields) > 1:
-         raise NotImplementedError(
-             f"Multiple identifier fields lookup not supported yet."
-         )
+
      convert_options_provider = functools.partial(
          task_resource_options_provider,
          resource_amount_provider=convert_resource_options_provider,
@@ -86,58 +98,88 @@ def converter_session(params: ConverterSessionParams, **kwargs):
      # Note that approach 2 will ideally require shared object store to avoid download equality delete files * number of child tasks times.
      max_parallel_data_file_download = DEFAULT_MAX_PARALLEL_DATA_FILE_DOWNLOAD

-     compact_small_files = params.compact_small_files
-     position_delete_for_multiple_data_files = (
-         params.position_delete_for_multiple_data_files
-     )
-     task_max_parallelism = params.task_max_parallelism
-
      def convert_input_provider(index, item):
          return {
              "convert_input": ConvertInput.of(
-                 files_for_each_bucket=item,
+                 convert_input_files=item,
                  convert_task_index=index,
                  iceberg_table_warehouse_prefix=iceberg_table_warehouse_prefix,
                  identifier_fields=identifier_fields,
-                 compact_small_files=compact_small_files,
+                 compact_previous_position_delete_files=compact_previous_position_delete_files,
+                 table_io=iceberg_table.io,
+                 table_metadata=iceberg_table.metadata,
                  enforce_primary_key_uniqueness=enforce_primary_key_uniqueness,
                  position_delete_for_multiple_data_files=position_delete_for_multiple_data_files,
                  max_parallel_data_file_download=max_parallel_data_file_download,
+                 s3_client_kwargs=s3_client_kwargs,
+                 s3_file_system=s3_file_system,
              )
          }

+     logger.info(f"Getting remote convert tasks...")
      # Ray remote task: convert
-     # Assuming that memory consume by each bucket doesn't exceed one node's memory limit.
      # TODO: Add split mechanism to split large buckets
      convert_tasks_pending = invoke_parallel(
-         items=convert_input_files_for_all_buckets.items(),
+         items=convert_input_files_for_all_buckets,
          ray_task=convert,
          max_parallelism=task_max_parallelism,
         options_provider=convert_options_provider,
          kwargs_provider=convert_input_provider,
      )
+
      to_be_deleted_files_list = []
-     to_be_added_files_dict_list = []
+     logger.info(f"Finished invoking {len(convert_tasks_pending)} convert tasks.")
+
      convert_results = ray.get(convert_tasks_pending)
-     for convert_result in convert_results:
-         to_be_deleted_files_list.extend(convert_result[0].values())
-         to_be_added_files_dict_list.append(convert_result[1])
+     logger.info(f"Got {len(convert_tasks_pending)} convert tasks.")

-     new_position_delete_files = parquet_files_dict_to_iceberg_data_files(
-         io=iceberg_table.io,
-         table_metadata=iceberg_table.metadata,
-         files_dict_list=to_be_added_files_dict_list,
+     total_position_delete_record_count = sum(
+         convert_result.position_delete_record_count
+         for convert_result in convert_results
+     )
+     total_input_data_file_record_count = sum(
+         convert_result.input_data_files_record_count
+         for convert_result in convert_results
+     )
+     total_data_file_hash_columns_in_memory_sizes = sum(
+         convert_result.input_data_files_hash_columns_in_memory_sizes
+         for convert_result in convert_results
+     )
+     total_position_delete_file_in_memory_sizes = sum(
+         convert_result.position_delete_in_memory_sizes
+         for convert_result in convert_results
+     )
+     total_position_delete_on_disk_sizes = sum(
+         convert_result.position_delete_on_disk_sizes
+         for convert_result in convert_results
      )

-     if not to_be_deleted_files_list:
+     to_be_added_files_list = []
+     for convert_result in convert_results:
+         to_be_added_files = convert_result.to_be_added_files
+         to_be_deleted_files = convert_result.to_be_deleted_files
+
+         to_be_deleted_files_list.extend(to_be_deleted_files.values())
+         to_be_added_files_list.extend(to_be_added_files)
+
+     if not to_be_deleted_files_list and to_be_added_files_list:
          commit_append_snapshot(
              iceberg_table=iceberg_table,
-             new_position_delete_files=new_position_delete_files,
+             new_position_delete_files=to_be_added_files_list,
          )
      else:
          commit_replace_snapshot(
              iceberg_table=iceberg_table,
-             # equality_delete_files + data file that all rows are deleted
              to_be_deleted_files_list=to_be_deleted_files_list,
-             new_position_delete_files=new_position_delete_files,
+             new_position_delete_files=to_be_added_files_list,
          )
+     logger.info(
+         f"Aggregated stats for {table_name}: "
+         f"total position delete record count: {total_position_delete_record_count}, "
+         f"total input data file record_count: {total_input_data_file_record_count}, "
+         f"total data file hash columns in memory sizes: {total_data_file_hash_columns_in_memory_sizes}, "
+         f"total position delete file in memory sizes: {total_position_delete_file_in_memory_sizes}, "
+         f"total position delete file on disk sizes: {total_position_delete_on_disk_sizes}."
+     )
+
+     logger.info(f"Committed new Iceberg snapshot.")
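
Restated outside the session flow, the new snapshot-commit rule reduces to the following (the helper name is illustrative, not part of DeltaCAT):

    def choose_snapshot_commit(to_be_deleted_files_list, to_be_added_files_list):
        # Pure additions of new position delete files -> append snapshot;
        # anything that removes or rewrites existing files -> replace snapshot.
        if not to_be_deleted_files_list and to_be_added_files_list:
            return "append"
        return "replace"

    assert choose_snapshot_commit([], ["pos-delete-0.parquet"]) == "append"
    assert choose_snapshot_commit(["stale-file.parquet"], ["pos-delete-0.parquet"]) == "replace"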

deltacat/compute/converter/model/convert_input.py
@@ -10,11 +10,14 @@ class ConvertInput(Dict):
          convert_task_index,
          iceberg_table_warehouse_prefix,
          identifier_fields,
-         compact_small_files,
+         table_io,
+         table_metadata,
+         compact_previous_position_delete_files,
          enforce_primary_key_uniqueness,
          position_delete_for_multiple_data_files,
          max_parallel_data_file_download,
          s3_file_system,
+         s3_client_kwargs,
      ) -> ConvertInput:

          result = ConvertInput()
@@ -22,13 +25,18 @@ class ConvertInput(Dict):
          result["convert_task_index"] = convert_task_index
          result["identifier_fields"] = identifier_fields
          result["iceberg_table_warehouse_prefix"] = iceberg_table_warehouse_prefix
-         result["compact_small_files"] = compact_small_files
+         result["table_io"] = table_io
+         result["table_metadata"] = table_metadata
+         result[
+             "compact_previous_position_delete_files"
+         ] = compact_previous_position_delete_files
          result["enforce_primary_key_uniqueness"] = enforce_primary_key_uniqueness
          result[
              "position_delete_for_multiple_data_files"
          ] = position_delete_for_multiple_data_files
          result["max_parallel_data_file_download"] = max_parallel_data_file_download
          result["s3_file_system"] = s3_file_system
+         result["s3_client_kwargs"] = s3_client_kwargs

          return result

@@ -49,8 +57,16 @@ class ConvertInput(Dict):
          return self["iceberg_table_warehouse_prefix"]

      @property
-     def compact_small_files(self) -> bool:
-         return self["compact_small_files"]
+     def table_io(self):
+         return self["table_io"]
+
+     @property
+     def table_metadata(self):
+         return self["table_metadata"]
+
+     @property
+     def compact_previous_position_delete_files(self) -> bool:
+         return self["compact_previous_position_delete_files"]

      @property
      def enforce_primary_key_uniqueness(self) -> bool:
@@ -67,3 +83,7 @@ class ConvertInput(Dict):
      @property
      def s3_file_system(self):
          return self["s3_file_system"]
+
+     @property
+     def s3_client_kwargs(self):
+         return self["s3_client_kwargs"]
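
A construction sketch for the updated `ConvertInput` (keyword names are taken from the `of(...)` signature above and its call site in `converter_session.py`; `ConvertInput` is a plain `Dict` subclass, so `None` placeholders are enough to demonstrate the new accessors):

    from deltacat.compute.converter.model.convert_input import ConvertInput

    convert_input = ConvertInput.of(
        convert_input_files=None,                 # real callers pass one bucket's grouped files
        convert_task_index=0,
        iceberg_table_warehouse_prefix="s3://warehouse/namespace/table",
        identifier_fields=["pk"],
        table_io=None,                            # real callers pass iceberg_table.io
        table_metadata=None,                      # ...and iceberg_table.metadata
        compact_previous_position_delete_files=False,
        enforce_primary_key_uniqueness=True,
        position_delete_for_multiple_data_files=True,
        max_parallel_data_file_download=30,
        s3_file_system=None,
        s3_client_kwargs={"region_name": "us-east-1"},
    )
    assert convert_input.compact_previous_position_delete_files is False
    assert convert_input.s3_client_kwargs == {"region_name": "us-east-1"}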

deltacat/compute/converter/model/convert_result.py
@@ -0,0 +1,61 @@
+ from __future__ import annotations
+ from typing import Dict
+
+
+ class ConvertResult(Dict):
+     @staticmethod
+     def of(
+         convert_task_index,
+         to_be_added_files,
+         to_be_deleted_files,
+         position_delete_record_count,
+         input_data_files_record_count,
+         input_data_files_hash_columns_in_memory_sizes,
+         position_delete_in_memory_sizes,
+         position_delete_on_disk_sizes,
+     ) -> ConvertResult:
+
+         result = ConvertResult()
+         result["convert_task_index"] = convert_task_index
+         result["to_be_added_files"] = to_be_added_files
+         result["to_be_deleted_files"] = to_be_deleted_files
+         result["position_delete_record_count"] = position_delete_record_count
+         result["input_data_files_record_count"] = input_data_files_record_count
+         result[
+             "input_data_files_hash_columns_in_memory_sizes"
+         ] = input_data_files_hash_columns_in_memory_sizes
+         result["position_delete_in_memory_sizes"] = position_delete_in_memory_sizes
+         result["position_delete_on_disk_sizes"] = position_delete_on_disk_sizes
+         return result
+
+     @property
+     def convert_task_index(self) -> int:
+         return self["convert_task_index"]
+
+     @property
+     def to_be_added_files(self):
+         return self["to_be_added_files"]
+
+     @property
+     def to_be_deleted_files(self):
+         return self["to_be_deleted_files"]
+
+     @property
+     def position_delete_record_count(self):
+         return self["position_delete_record_count"]
+
+     @property
+     def input_data_files_record_count(self):
+         return self["input_data_files_record_count"]
+
+     @property
+     def input_data_files_hash_columns_in_memory_sizes(self):
+         return self["input_data_files_hash_columns_in_memory_sizes"]
+
+     @property
+     def position_delete_in_memory_sizes(self):
+         return self["position_delete_in_memory_sizes"]
+
+     @property
+     def position_delete_on_disk_sizes(self):
+         return self["position_delete_on_disk_sizes"]
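
A small usage sketch for the new `ConvertResult` container, mirroring the aggregation loop in `converter_session.py` above (all values are illustrative):

    from deltacat.compute.converter.model.convert_result import ConvertResult

    results = [
        ConvertResult.of(
            convert_task_index=i,
            to_be_added_files=[f"s3://warehouse/pos-delete-{i}.parquet"],
            to_be_deleted_files={},
            position_delete_record_count=10 * i,
            input_data_files_record_count=100 * i,
            input_data_files_hash_columns_in_memory_sizes=1_000 * i,
            position_delete_in_memory_sizes=500 * i,
            position_delete_on_disk_sizes=250 * i,
        )
        for i in range(1, 3)
    ]

    total_position_deletes = sum(r.position_delete_record_count for r in results)
    assert total_position_deletes == 30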