deltacat 2.0.0b9__py3-none-any.whl → 2.0.0b11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (175)
  1. deltacat/__init__.py +41 -16
  2. deltacat/api.py +478 -123
  3. deltacat/aws/s3u.py +2 -2
  4. deltacat/benchmarking/benchmark_engine.py +4 -2
  5. deltacat/benchmarking/conftest.py +1 -1
  6. deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
  7. deltacat/catalog/__init__.py +62 -5
  8. deltacat/catalog/main/impl.py +26 -10
  9. deltacat/catalog/model/catalog.py +165 -109
  10. deltacat/catalog/model/properties.py +25 -24
  11. deltacat/compute/__init__.py +14 -0
  12. deltacat/compute/converter/constants.py +5 -0
  13. deltacat/compute/converter/converter_session.py +78 -36
  14. deltacat/compute/converter/model/convert_input.py +24 -4
  15. deltacat/compute/converter/model/convert_result.py +61 -0
  16. deltacat/compute/converter/model/converter_session_params.py +52 -10
  17. deltacat/compute/converter/pyiceberg/overrides.py +181 -62
  18. deltacat/compute/converter/steps/convert.py +84 -36
  19. deltacat/compute/converter/steps/dedupe.py +25 -4
  20. deltacat/compute/converter/utils/convert_task_options.py +42 -13
  21. deltacat/compute/converter/utils/iceberg_columns.py +5 -0
  22. deltacat/compute/converter/utils/io.py +82 -11
  23. deltacat/compute/converter/utils/s3u.py +13 -4
  24. deltacat/compute/jobs/client.py +406 -0
  25. deltacat/constants.py +5 -6
  26. deltacat/env.py +10 -0
  27. deltacat/examples/basic_logging.py +6 -6
  28. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +3 -5
  29. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
  30. deltacat/examples/hello_world.py +4 -2
  31. deltacat/examples/indexer/indexer.py +163 -0
  32. deltacat/examples/indexer/job_runner.py +198 -0
  33. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  34. deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
  35. deltacat/{catalog → experimental/catalog}/iceberg/impl.py +27 -9
  36. deltacat/{storage → experimental/storage}/iceberg/iceberg_scan_planner.py +1 -1
  37. deltacat/{storage → experimental/storage}/iceberg/impl.py +1 -1
  38. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  39. deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
  40. deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -9
  41. deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
  42. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  43. deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
  44. deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
  45. deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
  46. deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
  47. deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
  48. deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
  49. deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -1
  50. deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
  51. deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
  52. deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
  53. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  54. deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
  55. deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
  56. deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
  57. deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
  58. deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
  59. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +4 -4
  60. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
  61. deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
  62. deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
  63. deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
  64. deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
  65. deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
  66. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  67. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  68. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  69. deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
  70. deltacat/io/__init__.py +13 -0
  71. deltacat/io/dataset/__init__.py +0 -0
  72. deltacat/io/dataset/deltacat_dataset.py +91 -0
  73. deltacat/io/datasink/__init__.py +0 -0
  74. deltacat/io/datasink/deltacat_datasink.py +207 -0
  75. deltacat/io/datasource/__init__.py +0 -0
  76. deltacat/io/datasource/deltacat_datasource.py +580 -0
  77. deltacat/io/reader/__init__.py +0 -0
  78. deltacat/io/reader/deltacat_read_api.py +172 -0
  79. deltacat/storage/__init__.py +2 -0
  80. deltacat/storage/model/expression/__init__.py +47 -0
  81. deltacat/storage/model/expression/expression.py +656 -0
  82. deltacat/storage/model/expression/visitor.py +248 -0
  83. deltacat/storage/model/metafile.py +74 -42
  84. deltacat/storage/model/scan/push_down.py +32 -5
  85. deltacat/storage/model/shard.py +6 -2
  86. deltacat/storage/model/types.py +5 -3
  87. deltacat/tests/_io/reader/__init__.py +0 -0
  88. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  89. deltacat/tests/catalog/data/__init__.py +0 -0
  90. deltacat/tests/catalog/main/__init__.py +0 -0
  91. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  92. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +436 -0
  93. deltacat/tests/catalog/model/__init__.py +0 -0
  94. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  95. deltacat/tests/catalog/test_catalogs.py +52 -98
  96. deltacat/tests/catalog/test_default_catalog_impl.py +1 -2
  97. deltacat/tests/compute/converter/test_convert_session.py +209 -46
  98. deltacat/tests/daft/__init__.py +0 -0
  99. deltacat/tests/daft/test_model.py +97 -0
  100. deltacat/tests/experimental/__init__.py +0 -0
  101. deltacat/tests/experimental/catalog/__init__.py +0 -0
  102. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  103. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  104. deltacat/tests/experimental/daft/__init__.py +0 -0
  105. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  106. deltacat/tests/experimental/storage/__init__.py +0 -0
  107. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  108. deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
  109. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  110. deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -2
  111. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  112. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  113. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  114. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  115. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  116. deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
  117. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  118. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  119. deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +6 -4
  120. deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
  121. deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
  122. deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
  123. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  124. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
  125. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
  126. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
  127. deltacat/tests/local_deltacat_storage/__init__.py +1 -0
  128. deltacat/tests/storage/model/test_expression.py +327 -0
  129. deltacat/tests/storage/model/test_shard.py +3 -1
  130. deltacat/tests/test_deltacat_api.py +50 -9
  131. deltacat/types/media.py +141 -43
  132. deltacat/types/tables.py +35 -7
  133. deltacat/utils/daft.py +531 -5
  134. deltacat/utils/export.py +3 -1
  135. deltacat/utils/filesystem.py +39 -9
  136. deltacat/utils/polars.py +128 -0
  137. deltacat/utils/pyarrow.py +151 -15
  138. deltacat/utils/ray_utils/concurrency.py +1 -1
  139. deltacat/utils/ray_utils/runtime.py +56 -4
  140. deltacat/utils/url.py +1284 -0
  141. {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b11.dist-info}/METADATA +11 -9
  142. {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b11.dist-info}/RECORD +168 -123
  143. deltacat/catalog/iceberg/__init__.py +0 -4
  144. deltacat/daft/daft_scan.py +0 -111
  145. deltacat/daft/model.py +0 -258
  146. deltacat/examples/common/fixtures.py +0 -15
  147. deltacat/storage/rivulet/__init__.py +0 -11
  148. deltacat/storage/rivulet/feather/__init__.py +0 -5
  149. deltacat/storage/rivulet/parquet/__init__.py +0 -5
  150. /deltacat/{daft → compute/jobs}/__init__.py +0 -0
  151. /deltacat/examples/{common → experimental}/__init__.py +0 -0
  152. /deltacat/examples/{iceberg → experimental/iceberg}/__init__.py +0 -0
  153. /deltacat/{storage/iceberg → examples/indexer}/__init__.py +0 -0
  154. /deltacat/{storage/rivulet/arrow → examples/indexer/aws}/__init__.py +0 -0
  155. /deltacat/{storage/rivulet/fs → examples/indexer/gcp}/__init__.py +0 -0
  156. /deltacat/{storage/rivulet/metastore → experimental/catalog}/__init__.py +0 -0
  157. /deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +0 -0
  158. /deltacat/{storage/rivulet/reader → experimental/storage}/__init__.py +0 -0
  159. /deltacat/{storage/rivulet/schema → experimental/storage/iceberg}/__init__.py +0 -0
  160. /deltacat/{storage → experimental/storage}/iceberg/model.py +0 -0
  161. /deltacat/{storage/rivulet/writer → experimental/storage/rivulet/arrow}/__init__.py +0 -0
  162. /deltacat/{tests/storage/rivulet → experimental/storage/rivulet/fs}/__init__.py +0 -0
  163. /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
  164. /deltacat/{tests/storage/rivulet/fs → experimental/storage/rivulet/metastore}/__init__.py +0 -0
  165. /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
  166. /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
  167. /deltacat/{storage → experimental/storage}/rivulet/parquet/data_reader.py +0 -0
  168. /deltacat/{tests/storage/rivulet/schema → experimental/storage/rivulet/reader}/__init__.py +0 -0
  169. /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
  170. /deltacat/{tests/storage/rivulet/writer → experimental/storage/rivulet/schema}/__init__.py +0 -0
  171. /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
  172. /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
  173. {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b11.dist-info}/LICENSE +0 -0
  174. {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b11.dist-info}/WHEEL +0 -0
  175. {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b11.dist-info}/top_level.txt +0 -0
deltacat/catalog/model/properties.py
@@ -1,6 +1,9 @@
 from __future__ import annotations
+
 from typing import Optional, Any
 
+import os
+
 import pyarrow
 from deltacat.constants import DELTACAT_ROOT
 
@@ -8,18 +11,17 @@ from deltacat.utils.filesystem import resolve_path_and_filesystem
 
 
 def get_catalog_properties(
-    *args,
+    *,
     catalog: Optional[CatalogProperties] = None,
     inner: Optional[CatalogProperties] = None,
     **kwargs,
 ) -> CatalogProperties:
     """
-    Helper function to fetch CatalogProperties instance. You are meant to call this by providing your functions
-    kwargs, OR to directly pass through CatalogProperty configuration keys like "root" in kwargs.
+    Helper function to fetch CatalogProperties instance.
 
-    This will look for a CatalogProperty value in the kwargs "catalog" or "inner". If these are found, it returns
-    the CatalogProperty value under that kwarg. Otherwise, it will pass through kwargs to the CatalogProperties
-    constructor.
+    This will look first look for CatalogProperties in either "catalog"
+    or "inner" and otherwise passes all keyword arguments to the
+    CatalogProperties constructor.
     """
     properties = catalog if catalog is not None else inner
     if properties is not None and isinstance(properties, CatalogProperties):
@@ -39,21 +41,22 @@ class CatalogProperties:
     DeltaCAT catalog instance. Properties are set from system environment
     variables unless explicit overrides are provided during initialization.
 
-    Catalog and storage APIs rely on the property catalog to retrieve durable state about the catalog they're
-    working against.
+    Catalog and storage APIs rely on the property catalog to retrieve durable
+    state about the catalog they're working against.
 
     Attributes:
-        root (str): URI string The root path where catalog metadata and data
-            files are stored. Root is determined (in prededence order) by:
-            1. check kwargs for "root"
-            2. check env variable "DELTACAT_ROOT"
-            3. default to ${cwd}/.deltacat
+        root: The root path for catalog metadata and data storage. Resolved by
+            searching for the root path in the following order:
+            1. "root" constructor input argument
+            2. "DELTACAT_ROOT" system environment variable
+            3. default to "./.deltacat/"
 
        filesystem: The filesystem implementation that should be used for
            reading/writing files. If None, a filesystem will be inferred from
            the catalog root path.
 
-        storage: Storage class implementation (overrides default filesystem storage impl)
+        storage: Storage class implementation (overrides default filesystem
+            storage impl)
     """
 
     def __init__(
@@ -61,28 +64,26 @@ class CatalogProperties:
         root: Optional[str] = None,
         filesystem: Optional[pyarrow.fs.FileSystem] = None,
         storage=None,
-        *args,
-        **kwargs,
     ):
         """
         Initialize a CatalogProperties instance.
 
         Args:
-            root: A single directory path that serves as the catalog root dir.
+            root: Catalog root directory path. Uses the "DELTACAT_ROOT"
+                system environment variable if not set, and defaults to
+                "./.deltacat/" if this environment variable is not set.
            filesystem: The filesystem implementation that should be used for
                reading these files. If None, a filesystem will be inferred.
-                If not None, the provided filesystem will still be validated
-                against the provided path to ensure compatibility.
+                If provided, this will be validated for compatibility with the
+                catalog root path.
        """
        # set root, using precedence rules described in pydoc
        if root is None:
            # Check environment variables
-            # This is set or defaulted in constants.py
            root = DELTACAT_ROOT
-            if root is None:
-                raise ValueError(
-                    "Expected environment variable DELTACAT_ROOT to be set or defaulted"
-                )
+        if not root:
+            # Default to "./.deltacat/"
+            root = os.path.join(os.getcwd(), ".deltacat")
 
        resolved_root, resolved_filesystem = resolve_path_and_filesystem(
            path=root,
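The root resolution order described in the updated docstring can be summarized with a small sketch. The precedence below is taken from the hunk itself; the note about import-time resolution of DELTACAT_ROOT is an assumption based on the constant being imported from deltacat.constants, so treat this as illustrative rather than documented usage.

import os

# Assumption: DELTACAT_ROOT is read in deltacat.constants at import time,
# so the environment variable must be set before deltacat is imported.
os.environ["DELTACAT_ROOT"] = "/data/deltacat"

from deltacat.catalog.model.properties import CatalogProperties

# 1. An explicit "root" argument takes precedence over everything else.
explicit = CatalogProperties(root="/tmp/my-catalog")

# 2. Without a "root" argument, the DELTACAT_ROOT environment variable is used.
from_env = CatalogProperties()

# 3. If DELTACAT_ROOT is unset or empty, root falls back to
#    os.path.join(os.getcwd(), ".deltacat").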
deltacat/compute/__init__.py
@@ -0,0 +1,14 @@
+from deltacat.compute.jobs.client import (
+    DeltaCatJobClient,
+    job_client,
+    local_job_client,
+)
+
+from ray.job_submission import JobStatus
+
+__all__ = [
+    "job_client",
+    "local_job_client",
+    "DeltaCatJobClient",
+    "JobStatus",
+]
deltacat/compute/converter/constants.py
@@ -2,3 +2,8 @@ DEFAULT_CONVERTER_TASK_MAX_PARALLELISM = 4096
 
 # Safe limit ONLY considering CPU limit, typically 32 for a 8x-large worker
 DEFAULT_MAX_PARALLEL_DATA_FILE_DOWNLOAD = 30
+
+
+# Unique identifier delimiter to ensure different primary key don't end up with same hash when concatenated.
+# e.g.: pk column a with value: 1, 12; pk column b with value: 12, 1; Without delimiter will both become "121".
+IDENTIFIER_FIELD_DELIMITER = "c303282d"
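The collision the new comment guards against is easy to reproduce. The snippet below only illustrates the concatenation problem; it is not how the converter itself builds or hashes its identifiers.

IDENTIFIER_FIELD_DELIMITER = "c303282d"

row_1 = ("1", "21")   # primary key columns (a, b) of one row
row_2 = ("12", "1")   # primary key columns (a, b) of another row

# Naive concatenation collides: both rows become "121".
assert "".join(row_1) == "".join(row_2)

# Joining with the delimiter keeps the concatenated identifiers distinct,
# so any hash computed over them stays distinct as well.
assert IDENTIFIER_FIELD_DELIMITER.join(row_1) != IDENTIFIER_FIELD_DELIMITER.join(row_2)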
deltacat/compute/converter/converter_session.py
@@ -1,4 +1,3 @@
-# from pyiceberg.typedef import EMPTY_DICT, Identifier, Properties
 from deltacat.utils.ray_utils.concurrency import (
     invoke_parallel,
     task_resource_options_provider,
@@ -20,7 +19,6 @@ from deltacat.compute.converter.steps.convert import convert
 from deltacat.compute.converter.model.convert_input import ConvertInput
 from deltacat.compute.converter.pyiceberg.overrides import (
     fetch_all_bucket_files,
-    parquet_files_dict_to_iceberg_data_files,
 )
 from deltacat.compute.converter.utils.converter_session_utils import (
     construct_iceberg_table_prefix,
@@ -48,32 +46,46 @@ def converter_session(params: ConverterSessionParams, **kwargs):
     table_name = params.iceberg_table_name
     iceberg_table = load_table(catalog, table_name)
     enforce_primary_key_uniqueness = params.enforce_primary_key_uniqueness
+    iceberg_warehouse_bucket_name = params.iceberg_warehouse_bucket_name
+    iceberg_namespace = params.iceberg_namespace
+    merge_keys = params.merge_keys
+    compact_previous_position_delete_files = (
+        params.compact_previous_position_delete_files
+    )
+    task_max_parallelism = params.task_max_parallelism
+    s3_client_kwargs = params.s3_client_kwargs
+    s3_file_system = params.s3_file_system
+    location_provider_prefix_override = params.location_provider_prefix_override
+    position_delete_for_multiple_data_files = (
+        params.position_delete_for_multiple_data_files
+    )
+
     data_file_dict, equality_delete_dict, pos_delete_dict = fetch_all_bucket_files(
         iceberg_table
     )
+
     convert_input_files_for_all_buckets = group_all_files_to_each_bucket(
         data_file_dict=data_file_dict,
         equality_delete_dict=equality_delete_dict,
         pos_delete_dict=pos_delete_dict,
     )
-    iceberg_warehouse_bucket_name = params.iceberg_warehouse_bucket_name
-    iceberg_namespace = params.iceberg_namespace
-    iceberg_table_warehouse_prefix = construct_iceberg_table_prefix(
-        iceberg_warehouse_bucket_name=iceberg_warehouse_bucket_name,
-        table_name=table_name,
-        iceberg_namespace=iceberg_namespace,
-    )
-    merge_keys = params.merge_keys
+
+    if not location_provider_prefix_override:
+        iceberg_table_warehouse_prefix = construct_iceberg_table_prefix(
+            iceberg_warehouse_bucket_name=iceberg_warehouse_bucket_name,
+            table_name=table_name,
+            iceberg_namespace=iceberg_namespace,
+        )
+    else:
+        iceberg_table_warehouse_prefix = location_provider_prefix_override
+
     # Using table identifier fields as merge keys if merge keys not provided
     if not merge_keys:
         identifier_fields_set = iceberg_table.schema().identifier_field_names()
         identifier_fields = list(identifier_fields_set)
     else:
         identifier_fields = merge_keys
-    if len(identifier_fields) > 1:
-        raise NotImplementedError(
-            f"Multiple identifier fields lookup not supported yet."
-        )
+
     convert_options_provider = functools.partial(
         task_resource_options_provider,
         resource_amount_provider=convert_resource_options_provider,
@@ -86,58 +98,88 @@ def converter_session(params: ConverterSessionParams, **kwargs):
     # Note that approach 2 will ideally require shared object store to avoid download equality delete files * number of child tasks times.
     max_parallel_data_file_download = DEFAULT_MAX_PARALLEL_DATA_FILE_DOWNLOAD
 
-    compact_small_files = params.compact_small_files
-    position_delete_for_multiple_data_files = (
-        params.position_delete_for_multiple_data_files
-    )
-    task_max_parallelism = params.task_max_parallelism
-
     def convert_input_provider(index, item):
         return {
             "convert_input": ConvertInput.of(
-                files_for_each_bucket=item,
+                convert_input_files=item,
                 convert_task_index=index,
                 iceberg_table_warehouse_prefix=iceberg_table_warehouse_prefix,
                 identifier_fields=identifier_fields,
-                compact_small_files=compact_small_files,
+                compact_previous_position_delete_files=compact_previous_position_delete_files,
+                table_io=iceberg_table.io,
+                table_metadata=iceberg_table.metadata,
                 enforce_primary_key_uniqueness=enforce_primary_key_uniqueness,
                 position_delete_for_multiple_data_files=position_delete_for_multiple_data_files,
                 max_parallel_data_file_download=max_parallel_data_file_download,
+                s3_client_kwargs=s3_client_kwargs,
+                s3_file_system=s3_file_system,
             )
         }
 
+    logger.info(f"Getting remote convert tasks...")
     # Ray remote task: convert
-    # Assuming that memory consume by each bucket doesn't exceed one node's memory limit.
     # TODO: Add split mechanism to split large buckets
     convert_tasks_pending = invoke_parallel(
-        items=convert_input_files_for_all_buckets.items(),
+        items=convert_input_files_for_all_buckets,
        ray_task=convert,
        max_parallelism=task_max_parallelism,
        options_provider=convert_options_provider,
        kwargs_provider=convert_input_provider,
    )
+
    to_be_deleted_files_list = []
-    to_be_added_files_dict_list = []
+    logger.info(f"Finished invoking {len(convert_tasks_pending)} convert tasks.")
+
    convert_results = ray.get(convert_tasks_pending)
-    for convert_result in convert_results:
-        to_be_deleted_files_list.extend(convert_result[0].values())
-        to_be_added_files_dict_list.append(convert_result[1])
+    logger.info(f"Got {len(convert_tasks_pending)} convert tasks.")
 
-    new_position_delete_files = parquet_files_dict_to_iceberg_data_files(
-        io=iceberg_table.io,
-        table_metadata=iceberg_table.metadata,
-        files_dict_list=to_be_added_files_dict_list,
+    total_position_delete_record_count = sum(
+        convert_result.position_delete_record_count
+        for convert_result in convert_results
+    )
+    total_input_data_file_record_count = sum(
+        convert_result.input_data_files_record_count
+        for convert_result in convert_results
+    )
+    total_data_file_hash_columns_in_memory_sizes = sum(
+        convert_result.input_data_files_hash_columns_in_memory_sizes
+        for convert_result in convert_results
+    )
+    total_position_delete_file_in_memory_sizes = sum(
+        convert_result.position_delete_in_memory_sizes
+        for convert_result in convert_results
+    )
+    total_position_delete_on_disk_sizes = sum(
+        convert_result.position_delete_on_disk_sizes
+        for convert_result in convert_results
    )
 
-    if not to_be_deleted_files_list:
+    to_be_added_files_list = []
+    for convert_result in convert_results:
+        to_be_added_files = convert_result.to_be_added_files
+        to_be_deleted_files = convert_result.to_be_deleted_files
+
+        to_be_deleted_files_list.extend(to_be_deleted_files.values())
+        to_be_added_files_list.extend(to_be_added_files)
+
+    if not to_be_deleted_files_list and to_be_added_files_list:
        commit_append_snapshot(
            iceberg_table=iceberg_table,
-            new_position_delete_files=new_position_delete_files,
+            new_position_delete_files=to_be_added_files_list,
        )
    else:
        commit_replace_snapshot(
            iceberg_table=iceberg_table,
-            # equality_delete_files + data file that all rows are deleted
            to_be_deleted_files_list=to_be_deleted_files_list,
-            new_position_delete_files=new_position_delete_files,
+            new_position_delete_files=to_be_added_files_list,
        )
+    logger.info(
+        f"Aggregated stats for {table_name}: "
+        f"total position delete record count: {total_position_delete_record_count}, "
+        f"total input data file record_count: {total_input_data_file_record_count}, "
+        f"total data file hash columns in memory sizes: {total_data_file_hash_columns_in_memory_sizes}, "
+        f"total position delete file in memory sizes: {total_position_delete_file_in_memory_sizes}, "
+        f"total position delete file on disk sizes: {total_position_delete_on_disk_sizes}."
+    )
+
+    logger.info(f"Committed new Iceberg snapshot.")
deltacat/compute/converter/model/convert_input.py
@@ -10,11 +10,14 @@ class ConvertInput(Dict):
         convert_task_index,
         iceberg_table_warehouse_prefix,
         identifier_fields,
-        compact_small_files,
+        table_io,
+        table_metadata,
+        compact_previous_position_delete_files,
         enforce_primary_key_uniqueness,
         position_delete_for_multiple_data_files,
         max_parallel_data_file_download,
         s3_file_system,
+        s3_client_kwargs,
     ) -> ConvertInput:
 
         result = ConvertInput()
@@ -22,13 +25,18 @@ class ConvertInput(Dict):
         result["convert_task_index"] = convert_task_index
         result["identifier_fields"] = identifier_fields
         result["iceberg_table_warehouse_prefix"] = iceberg_table_warehouse_prefix
-        result["compact_small_files"] = compact_small_files
+        result["table_io"] = table_io
+        result["table_metadata"] = table_metadata
+        result[
+            "compact_previous_position_delete_files"
+        ] = compact_previous_position_delete_files
         result["enforce_primary_key_uniqueness"] = enforce_primary_key_uniqueness
        result[
            "position_delete_for_multiple_data_files"
        ] = position_delete_for_multiple_data_files
        result["max_parallel_data_file_download"] = max_parallel_data_file_download
        result["s3_file_system"] = s3_file_system
+        result["s3_client_kwargs"] = s3_client_kwargs
 
        return result
 
@@ -49,8 +57,16 @@ class ConvertInput(Dict):
         return self["iceberg_table_warehouse_prefix"]
 
     @property
-    def compact_small_files(self) -> bool:
-        return self["compact_small_files"]
+    def table_io(self):
+        return self["table_io"]
+
+    @property
+    def table_metadata(self):
+        return self["table_metadata"]
+
+    @property
+    def compact_previous_position_delete_files(self) -> bool:
+        return self["compact_previous_position_delete_files"]
 
     @property
     def enforce_primary_key_uniqueness(self) -> bool:
@@ -67,3 +83,7 @@ class ConvertInput(Dict):
     @property
     def s3_file_system(self):
         return self["s3_file_system"]
+
+    @property
+    def s3_client_kwargs(self):
+        return self["s3_client_kwargs"]
deltacat/compute/converter/model/convert_result.py
@@ -0,0 +1,61 @@
+from __future__ import annotations
+from typing import Dict
+
+
+class ConvertResult(Dict):
+    @staticmethod
+    def of(
+        convert_task_index,
+        to_be_added_files,
+        to_be_deleted_files,
+        position_delete_record_count,
+        input_data_files_record_count,
+        input_data_files_hash_columns_in_memory_sizes,
+        position_delete_in_memory_sizes,
+        position_delete_on_disk_sizes,
+    ) -> ConvertResult:
+
+        result = ConvertResult()
+        result["convert_task_index"] = convert_task_index
+        result["to_be_added_files"] = to_be_added_files
+        result["to_be_deleted_files"] = to_be_deleted_files
+        result["position_delete_record_count"] = position_delete_record_count
+        result["input_data_files_record_count"] = input_data_files_record_count
+        result[
+            "input_data_files_hash_columns_in_memory_sizes"
+        ] = input_data_files_hash_columns_in_memory_sizes
+        result["position_delete_in_memory_sizes"] = position_delete_in_memory_sizes
+        result["position_delete_on_disk_sizes"] = position_delete_on_disk_sizes
+        return result
+
+    @property
+    def convert_task_index(self) -> int:
+        return self["convert_task_index"]
+
+    @property
+    def to_be_added_files(self):
+        return self["to_be_added_files"]
+
+    @property
+    def to_be_deleted_files(self):
+        return self["to_be_deleted_files"]
+
+    @property
+    def position_delete_record_count(self):
+        return self["position_delete_record_count"]
+
+    @property
+    def input_data_files_record_count(self):
+        return self["input_data_files_record_count"]
+
+    @property
+    def input_data_files_hash_columns_in_memory_sizes(self):
+        return self["input_data_files_hash_columns_in_memory_sizes"]
+
+    @property
+    def position_delete_in_memory_sizes(self):
+        return self["position_delete_in_memory_sizes"]
+
+    @property
+    def position_delete_on_disk_sizes(self):
+        return self["position_delete_on_disk_sizes"]
deltacat/compute/converter/model/converter_session_params.py
@@ -1,6 +1,10 @@
 from __future__ import annotations
 from typing import Optional, Dict
-from deltacat.compute.converter.constants import DEFAULT_CONVERTER_TASK_MAX_PARALLELISM
+from deltacat.compute.converter.constants import (
+    DEFAULT_CONVERTER_TASK_MAX_PARALLELISM,
+)
+from deltacat.constants import DEFAULT_NAMESPACE
+from fsspec import AbstractFileSystem
 
 
 class ConverterSessionParams(dict):
@@ -18,15 +22,15 @@ class ConverterSessionParams(dict):
         assert (
             params.get("iceberg_warehouse_bucket_name") is not None
         ), "iceberg_warehouse_bucket_name is a required arg"
-        assert (
-            params.get("iceberg_namespace") is not None
-        ), "iceberg_namespace is a required arg"
         result = ConverterSessionParams(params)
 
+        result.iceberg_namespace = params.get("iceberg_namespace", DEFAULT_NAMESPACE)
         result.enforce_primary_key_uniqueness = params.get(
             "enforce_primary_key_uniqueness", False
         )
-        result.compact_small_files = params.get("compact_small_files", False)
+        result.compact_previous_position_delete_files = params.get(
+            "compact_previous_position_delete_files", False
+        )
 
        # For Iceberg v3 spec, option to produce delete vector that can establish 1:1 mapping with data files.
        result.position_delete_for_multiple_data_files = params.get(
@@ -36,6 +40,10 @@ class ConverterSessionParams(dict):
             "task_max_parallelism", DEFAULT_CONVERTER_TASK_MAX_PARALLELISM
         )
         result.merge_keys = params.get("merge_keys", None)
+        result.s3_client_kwargs = params.get("s3_client_kwargs", {})
+        result.s3_file_system = params.get("s3_file_system", None)
+        result.s3_prefix_override = params.get("s3_prefix_override", None)
+
        return result
 
    @property
@@ -54,6 +62,10 @@ class ConverterSessionParams(dict):
     def iceberg_namespace(self) -> str:
         return self["iceberg_namespace"]
 
+    @iceberg_namespace.setter
+    def iceberg_namespace(self, iceberg_namespace) -> None:
+        self["iceberg_namespace"] = iceberg_namespace
+
     @property
     def enforce_primary_key_uniqueness(self) -> bool:
         return self["enforce_primary_key_uniqueness"]
@@ -63,12 +75,16 @@ class ConverterSessionParams(dict):
         self["enforce_primary_key_uniqueness"] = enforce_primary_key_uniqueness
 
     @property
-    def compact_small_files(self) -> bool:
-        return self["compact_small_files"]
+    def compact_previous_position_delete_files(self) -> bool:
+        return self["compact_previous_position_delete_files"]
 
-    @compact_small_files.setter
-    def compact_small_files(self, compact_small_files) -> None:
-        self["compact_small_files"] = compact_small_files
+    @compact_previous_position_delete_files.setter
+    def compact_previous_position_delete_files(
+        self, compact_previous_position_delete_files
+    ) -> None:
+        self[
+            "compact_previous_position_delete_files"
+        ] = compact_previous_position_delete_files
 
     @property
     def position_delete_for_multiple_data_files(self) -> bool:
@@ -97,3 +113,29 @@ class ConverterSessionParams(dict):
     @merge_keys.setter
     def merge_keys(self, merge_keys) -> None:
         self["merge_keys"] = merge_keys
+
+    @property
+    def s3_client_kwargs(self) -> Dict:
+        return self["s3_client_kwargs"]
+
+    @s3_client_kwargs.setter
+    def s3_client_kwargs(self, s3_client_kwargs) -> None:
+        self["s3_client_kwargs"] = s3_client_kwargs
+
+    @property
+    def s3_file_system(self) -> AbstractFileSystem:
+        return self["s3_file_system"]
+
+    @s3_file_system.setter
+    def s3_file_system(self, s3_file_system) -> None:
+        self["s3_file_system"] = s3_file_system
+
+    @property
+    def location_provider_prefix_override(self) -> str:
+        return self["location_provider_prefix_override"]
+
+    @location_provider_prefix_override.setter
+    def location_provider_prefix_override(
+        self, location_provider_prefix_override
+    ) -> None:
+        self["location_provider_prefix_override"] = location_provider_prefix_override
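Taken together, the parameter changes in this release (iceberg_namespace now defaulting to DEFAULT_NAMESPACE, compact_small_files renamed to compact_previous_position_delete_files, and the new s3_client_kwargs / s3_file_system / prefix-override options) suggest a converter session is configured roughly as below. The ConverterSessionParams.of factory call, the "catalog" and "iceberg_table_name" keys, and the my_iceberg_catalog placeholder are inferred from the hunks above or hypothetical, so treat this as a sketch rather than documented usage.

from deltacat.compute.converter.converter_session import converter_session
from deltacat.compute.converter.model.converter_session_params import (
    ConverterSessionParams,
)

params = ConverterSessionParams.of(
    {
        "catalog": my_iceberg_catalog,  # hypothetical PyIceberg catalog handle
        "iceberg_table_name": "my_namespace.my_table",
        "iceberg_warehouse_bucket_name": "s3://my-warehouse-bucket",  # required per the assert above
        # "iceberg_namespace" is now optional and defaults to DEFAULT_NAMESPACE
        "enforce_primary_key_uniqueness": True,
        "merge_keys": ["pk"],  # optional; table identifier fields are used if omitted
        "s3_client_kwargs": {},  # new in this release
        "location_provider_prefix_override": None,  # optional warehouse prefix override
    }
)

converter_session(params=params)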