deltacat 2.0.0b11__py3-none-any.whl → 2.0.0b12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (194) hide show
  1. deltacat/__init__.py +78 -3
  2. deltacat/api.py +122 -67
  3. deltacat/aws/constants.py +0 -23
  4. deltacat/aws/s3u.py +4 -631
  5. deltacat/benchmarking/conftest.py +0 -18
  6. deltacat/catalog/__init__.py +2 -0
  7. deltacat/catalog/delegate.py +445 -63
  8. deltacat/catalog/interface.py +188 -62
  9. deltacat/catalog/main/impl.py +2417 -271
  10. deltacat/catalog/model/catalog.py +49 -10
  11. deltacat/catalog/model/properties.py +38 -0
  12. deltacat/compute/compactor/compaction_session.py +97 -75
  13. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  14. deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
  15. deltacat/compute/compactor/model/round_completion_info.py +16 -6
  16. deltacat/compute/compactor/repartition_session.py +8 -21
  17. deltacat/compute/compactor/steps/hash_bucket.py +5 -5
  18. deltacat/compute/compactor/steps/materialize.py +9 -7
  19. deltacat/compute/compactor/steps/repartition.py +12 -11
  20. deltacat/compute/compactor/utils/io.py +6 -5
  21. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  22. deltacat/compute/compactor/utils/system_columns.py +3 -1
  23. deltacat/compute/compactor_v2/compaction_session.py +17 -14
  24. deltacat/compute/compactor_v2/constants.py +30 -1
  25. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  26. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  27. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  28. deltacat/compute/compactor_v2/model/merge_input.py +33 -8
  29. deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
  30. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  31. deltacat/compute/compactor_v2/steps/merge.py +267 -55
  32. deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
  33. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  34. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  35. deltacat/compute/compactor_v2/utils/io.py +11 -4
  36. deltacat/compute/compactor_v2/utils/merge.py +15 -2
  37. deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
  38. deltacat/compute/compactor_v2/utils/task_options.py +45 -33
  39. deltacat/compute/converter/converter_session.py +145 -32
  40. deltacat/compute/converter/model/convert_input.py +26 -19
  41. deltacat/compute/converter/model/convert_input_files.py +33 -16
  42. deltacat/compute/converter/model/convert_result.py +35 -16
  43. deltacat/compute/converter/model/converter_session_params.py +24 -21
  44. deltacat/compute/converter/pyiceberg/catalog.py +21 -18
  45. deltacat/compute/converter/pyiceberg/overrides.py +18 -9
  46. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
  47. deltacat/compute/converter/steps/convert.py +157 -50
  48. deltacat/compute/converter/steps/dedupe.py +24 -11
  49. deltacat/compute/converter/utils/convert_task_options.py +27 -12
  50. deltacat/compute/converter/utils/converter_session_utils.py +126 -60
  51. deltacat/compute/converter/utils/iceberg_columns.py +8 -8
  52. deltacat/compute/converter/utils/io.py +101 -12
  53. deltacat/compute/converter/utils/s3u.py +33 -27
  54. deltacat/compute/janitor.py +205 -0
  55. deltacat/compute/jobs/client.py +19 -8
  56. deltacat/compute/resource_estimation/delta.py +38 -6
  57. deltacat/compute/resource_estimation/model.py +8 -0
  58. deltacat/constants.py +44 -0
  59. deltacat/docs/autogen/schema/__init__.py +0 -0
  60. deltacat/docs/autogen/schema/inference/__init__.py +0 -0
  61. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  62. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  63. deltacat/examples/compactor/__init__.py +0 -0
  64. deltacat/examples/compactor/aws/__init__.py +1 -0
  65. deltacat/examples/compactor/bootstrap.py +863 -0
  66. deltacat/examples/compactor/compactor.py +373 -0
  67. deltacat/examples/compactor/explorer.py +473 -0
  68. deltacat/examples/compactor/gcp/__init__.py +1 -0
  69. deltacat/examples/compactor/job_runner.py +439 -0
  70. deltacat/examples/compactor/utils/__init__.py +1 -0
  71. deltacat/examples/compactor/utils/common.py +261 -0
  72. deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
  73. deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
  74. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  75. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  76. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  77. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  78. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  79. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  80. deltacat/exceptions.py +66 -4
  81. deltacat/experimental/catalog/iceberg/impl.py +2 -2
  82. deltacat/experimental/compatibility/__init__.py +0 -0
  83. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  84. deltacat/experimental/converter_agent/__init__.py +0 -0
  85. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  86. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  87. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  88. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +105 -4
  89. deltacat/experimental/storage/iceberg/impl.py +5 -3
  90. deltacat/experimental/storage/iceberg/model.py +7 -3
  91. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  92. deltacat/experimental/storage/rivulet/dataset.py +0 -3
  93. deltacat/experimental/storage/rivulet/metastore/delta.py +0 -2
  94. deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +3 -2
  95. deltacat/io/datasource/deltacat_datasource.py +0 -1
  96. deltacat/storage/__init__.py +20 -2
  97. deltacat/storage/interface.py +54 -32
  98. deltacat/storage/main/impl.py +1494 -541
  99. deltacat/storage/model/delta.py +27 -3
  100. deltacat/storage/model/locator.py +6 -12
  101. deltacat/storage/model/manifest.py +182 -6
  102. deltacat/storage/model/metafile.py +151 -78
  103. deltacat/storage/model/namespace.py +8 -1
  104. deltacat/storage/model/partition.py +117 -42
  105. deltacat/storage/model/schema.py +2427 -159
  106. deltacat/storage/model/sort_key.py +40 -0
  107. deltacat/storage/model/stream.py +9 -2
  108. deltacat/storage/model/table.py +12 -1
  109. deltacat/storage/model/table_version.py +11 -0
  110. deltacat/storage/model/transaction.py +1184 -208
  111. deltacat/storage/model/transform.py +81 -2
  112. deltacat/storage/model/types.py +48 -26
  113. deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
  114. deltacat/tests/aws/test_s3u.py +2 -31
  115. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1606 -70
  116. deltacat/tests/catalog/test_catalogs.py +54 -11
  117. deltacat/tests/catalog/test_default_catalog_impl.py +12152 -71
  118. deltacat/tests/compute/compact_partition_test_cases.py +35 -8
  119. deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
  120. deltacat/tests/compute/compactor/utils/test_io.py +124 -120
  121. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  122. deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
  123. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
  124. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
  125. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
  126. deltacat/tests/compute/conftest.py +8 -44
  127. deltacat/tests/compute/converter/test_convert_session.py +675 -490
  128. deltacat/tests/compute/converter/utils.py +15 -6
  129. deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
  130. deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
  131. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
  132. deltacat/tests/compute/test_compact_partition_params.py +13 -8
  133. deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
  134. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
  135. deltacat/tests/compute/test_janitor.py +236 -0
  136. deltacat/tests/compute/test_util_common.py +716 -43
  137. deltacat/tests/compute/test_util_constant.py +0 -1
  138. deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
  139. deltacat/tests/experimental/__init__.py +1 -0
  140. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  141. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  142. deltacat/tests/storage/main/test_main_storage.py +6900 -95
  143. deltacat/tests/storage/model/test_metafile_io.py +78 -173
  144. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  145. deltacat/tests/storage/model/test_schema.py +171 -0
  146. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  147. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  148. deltacat/tests/storage/model/test_transaction.py +393 -48
  149. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  150. deltacat/tests/test_deltacat_api.py +988 -4
  151. deltacat/tests/test_exceptions.py +9 -5
  152. deltacat/tests/test_utils/pyarrow.py +52 -21
  153. deltacat/tests/test_utils/storage.py +23 -34
  154. deltacat/tests/types/__init__.py +0 -0
  155. deltacat/tests/types/test_tables.py +104 -0
  156. deltacat/tests/utils/exceptions.py +22 -0
  157. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  158. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  159. deltacat/tests/utils/test_daft.py +121 -31
  160. deltacat/tests/utils/test_numpy.py +1193 -0
  161. deltacat/tests/utils/test_pandas.py +1106 -0
  162. deltacat/tests/utils/test_polars.py +1040 -0
  163. deltacat/tests/utils/test_pyarrow.py +1370 -89
  164. deltacat/types/media.py +221 -11
  165. deltacat/types/tables.py +2329 -59
  166. deltacat/utils/arguments.py +33 -1
  167. deltacat/utils/daft.py +411 -150
  168. deltacat/utils/filesystem.py +100 -0
  169. deltacat/utils/metafile_locator.py +2 -1
  170. deltacat/utils/numpy.py +118 -26
  171. deltacat/utils/pandas.py +577 -48
  172. deltacat/utils/polars.py +658 -27
  173. deltacat/utils/pyarrow.py +1258 -213
  174. deltacat/utils/ray_utils/dataset.py +101 -10
  175. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  176. deltacat/utils/url.py +56 -15
  177. deltacat-2.0.0b12.dist-info/METADATA +1163 -0
  178. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/RECORD +183 -145
  179. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/WHEEL +1 -1
  180. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  181. deltacat/compute/merge_on_read/__init__.py +0 -4
  182. deltacat/compute/merge_on_read/daft.py +0 -40
  183. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  184. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  185. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  186. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
  187. deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
  188. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  189. deltacat/utils/s3fs.py +0 -21
  190. deltacat-2.0.0b11.dist-info/METADATA +0 -67
  191. /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
  192. /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
  193. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info/licenses}/LICENSE +0 -0
  194. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/top_level.txt +0 -0
deltacat/__init__.py CHANGED
@@ -8,6 +8,11 @@ from deltacat.api import (
8
8
  list,
9
9
  put,
10
10
  )
11
+ from deltacat.storage.model.transaction import (
12
+ transaction,
13
+ transactions,
14
+ read_transaction,
15
+ )
11
16
  from deltacat.catalog import ( # noqa: F401
12
17
  alter_namespace,
13
18
  alter_table,
@@ -28,6 +33,7 @@ from deltacat.catalog import ( # noqa: F401
28
33
  truncate_table,
29
34
  write_to_table,
30
35
  init,
36
+ init_local,
31
37
  is_initialized,
32
38
  clear_catalogs,
33
39
  get_catalog,
@@ -44,6 +50,12 @@ from deltacat.compute import (
44
50
  local_job_client,
45
51
  )
46
52
  from deltacat.storage import (
53
+ BucketingStrategy,
54
+ BucketTransform,
55
+ BucketTransformParameters,
56
+ DayTransform,
57
+ HourTransform,
58
+ IdentityTransform,
47
59
  Dataset,
48
60
  DistributedDataset,
49
61
  Field,
@@ -59,6 +71,17 @@ from deltacat.storage import (
59
71
  SortKey,
60
72
  SortOrder,
61
73
  SortScheme,
74
+ TableProperties,
75
+ TransactionStatus,
76
+ Transform,
77
+ TransformName,
78
+ TransformParameters,
79
+ TruncateTransform,
80
+ TruncateTransformParameters,
81
+ TruncateStrategy,
82
+ UnknownTransform,
83
+ VoidTransform,
84
+ YearTransform,
62
85
  NullOrder,
63
86
  )
64
87
  from deltacat.types.media import (
@@ -67,10 +90,26 @@ from deltacat.types.media import (
67
90
  DatasetType,
68
91
  DatastoreType,
69
92
  )
70
-
71
- from deltacat.types.tables import TableWriteMode
93
+ from deltacat.types.tables import (
94
+ TableWriteMode,
95
+ TableProperty,
96
+ TableReadOptimizationLevel,
97
+ SchemaEvolutionMode,
98
+ from_pandas,
99
+ from_pyarrow,
100
+ from_manifest_table,
101
+ to_pyarrow,
102
+ to_pandas,
103
+ dataset_length,
104
+ dataset_size,
105
+ dataset_column_names,
106
+ dataset_schema,
107
+ )
72
108
  from deltacat.utils.url import DeltaCatUrl
73
109
 
110
+ write = write_to_table
111
+ read = read_table
112
+
74
113
  __iceberg__ = []
75
114
  if importlib.util.find_spec("pyiceberg") is not None:
76
115
  from deltacat.experimental.catalog.iceberg import ( # noqa: F401
@@ -83,7 +122,7 @@ if importlib.util.find_spec("pyiceberg") is not None:
83
122
 
84
123
  deltacat.logs.configure_deltacat_logger(logging.getLogger(__name__))
85
124
 
86
- __version__ = "2.0.0b11"
125
+ __version__ = "2.0.0b12"
87
126
 
88
127
 
89
128
  __all__ = [
@@ -94,6 +133,8 @@ __all__ = [
94
133
  "get",
95
134
  "list",
96
135
  "put",
136
+ "transaction",
137
+ "transactions",
97
138
  "alter_table",
98
139
  "create_table",
99
140
  "drop_table",
@@ -110,9 +151,13 @@ __all__ = [
110
151
  "create_namespace",
111
152
  "drop_namespace",
112
153
  "default_namespace",
154
+ "write",
113
155
  "write_to_table",
156
+ "read",
114
157
  "read_table",
158
+ "read_transaction",
115
159
  "init",
160
+ "init_local",
116
161
  "is_initialized",
117
162
  "clear_catalogs",
118
163
  "get_catalog",
@@ -120,6 +165,18 @@ __all__ = [
120
165
  "pop_catalog",
121
166
  "put_catalog",
122
167
  "raise_if_not_initialized",
168
+ "dataset_length",
169
+ "dataset_size",
170
+ "dataset_column_names",
171
+ "dataset_schema",
172
+ "from_pandas",
173
+ "from_pyarrow",
174
+ "from_manifest_table",
175
+ "to_pandas",
176
+ "to_pyarrow",
177
+ "BucketingStrategy",
178
+ "BucketTransform",
179
+ "BucketTransformParameters",
123
180
  "Catalog",
124
181
  "CatalogProperties",
125
182
  "ContentType",
@@ -127,13 +184,17 @@ __all__ = [
127
184
  "Dataset",
128
185
  "DatasetType",
129
186
  "DatastoreType",
187
+ "DayTransform",
130
188
  "DeltaCatUrl",
131
189
  "DistributedDataset",
132
190
  "Field",
191
+ "HourTransform",
192
+ "IdentityTransform",
133
193
  "LifecycleState",
134
194
  "ListResult",
135
195
  "LocalDataset",
136
196
  "LocalTable",
197
+ "MonthTransform",
137
198
  "Namespace",
138
199
  "NullOrder",
139
200
  "PartitionKey",
@@ -145,6 +206,20 @@ __all__ = [
145
206
  "SortScheme",
146
207
  "TableDefinition",
147
208
  "TableWriteMode",
209
+ "TableProperties",
210
+ "TableProperty",
211
+ "TableReadOptimizationLevel",
212
+ "SchemaEvolutionMode",
213
+ "TransactionStatus",
214
+ "Transform",
215
+ "TransformName",
216
+ "TransformParameters",
217
+ "TruncateTransform",
218
+ "TruncateTransformParameters",
219
+ "TruncateStrategy",
220
+ "UnknownTransform",
221
+ "VoidTransform",
222
+ "YearTransform",
148
223
  ]
149
224
 
150
225
  __all__ += __iceberg__
deltacat/api.py CHANGED
@@ -1,6 +1,7 @@
1
1
  import time
2
2
  from dataclasses import dataclass
3
3
  from typing import Any, Union, List, Optional, Dict, Callable, Tuple
4
+ import logging
4
5
 
5
6
  import ray
6
7
  import deltacat as dc
@@ -15,6 +16,12 @@ from deltacat.io import (
15
16
  DeltacatReadType,
16
17
  )
17
18
  from deltacat.storage import (
19
+ Namespace,
20
+ Table,
21
+ TableVersion,
22
+ Stream,
23
+ Partition,
24
+ Delta,
18
25
  Dataset,
19
26
  DistributedDataset,
20
27
  ListResult,
@@ -44,6 +51,9 @@ from deltacat.utils.ray_utils.runtime import (
44
51
  other_live_node_resource_keys,
45
52
  find_max_single_node_resource_type,
46
53
  )
54
+ from deltacat import logs
55
+
56
+ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
47
57
 
48
58
  """
49
59
  # CLI Example of Copying from Source to Dest without file conversion
@@ -72,38 +82,6 @@ from deltacat.utils.ray_utils.runtime import (
72
82
  """
73
83
 
74
84
 
75
- def _copy_dc(
76
- source: DeltaCatUrl,
77
- destination: DeltaCatUrl,
78
- recursive: bool = False,
79
- ) -> Metafile:
80
- if recursive:
81
- src_obj = list(source, recursive=True)
82
- else:
83
- src_obj = get(source) if not source.url.endswith("/*") else list(source)
84
- """
85
- dc_dest_url = DeltacatUrl(destination)
86
- # TODO(pdames): Add writer with support for Ray Dataset DeltaCAT Sink &
87
- # Recursive DeltaCAT source object copies. Ideally, the Ray Dataset read
88
- # is lazy, and only indexes metadata about the objects at source instead
89
- # of eagerly converting them to PyArrow-based Blocks.
90
- dc_dest_url.writer(src_obj, recursive=recursive)
91
- """
92
-
93
- src_parts = source.url.split("/")
94
- src_parts = [part for part in src_parts if part]
95
- dst_parts = destination.url.split("/")
96
- dst_parts = [part for part in dst_parts if part]
97
- dc.raise_if_not_initialized()
98
- if len(src_parts) != len(dst_parts):
99
- # TODO(pdames): Better error message.
100
- raise ValueError(
101
- f"Cannot copy {source} to {destination}. "
102
- f"Source and destination must share the same type."
103
- )
104
- return put(destination, metafile=src_obj)
105
-
106
-
107
85
  def copy(
108
86
  src: DeltaCatUrl,
109
87
  dst: DeltaCatUrl,
@@ -123,6 +101,7 @@ def copy(
123
101
  "gz": 35,
124
102
  "bz2": 35,
125
103
  "zip": 35,
104
+ "zst": 35,
126
105
  "7z": 35,
127
106
  "*": 2.5,
128
107
  },
@@ -135,11 +114,10 @@ def copy(
135
114
  Copies data from the source datastore to the destination datastore. By
136
115
  default, this method launches one parallel Ray process to read/transform
137
116
  each input file found in the source followed by one parallel Ray process
138
- to write each output file to the destination. Files written to the
139
- destination are split or combined to contain uniform record counts. To
140
- ensure that adequate resources are available to complete the operation,
141
- you may optionally specify minimum cluster and/or worker CPUs to wait for
142
- before starting parallel processing.
117
+ to write each output file to the destination. To ensure that adequate
118
+ resources are available to complete the operation, you may optionally
119
+ specify minimum cluster and/or worker CPUs to wait for before starting
120
+ parallel processing.
143
121
 
144
122
  Args:
145
123
  src: DeltaCAT URL of the source datastore to read.
@@ -190,6 +168,73 @@ def copy(
190
168
  )
191
169
 
192
170
 
171
+ def _copy_objects_in_order(
172
+ src_objects: List[Metafile],
173
+ destination: DeltaCatUrl,
174
+ ) -> Union[Metafile, List[Metafile]]:
175
+ dc_dest_url = DeltaCatUrl(destination.url)
176
+ catalog_name = dc_dest_url.catalog_name
177
+
178
+ copied_results = []
179
+
180
+ # Group objects by type for hierarchical copying
181
+ # Copy objects in strict hierarchical order
182
+ # Namespace -> Table -> TableVersion -> Stream -> Partition -> Delta
183
+ ordered_objects_by_type = {
184
+ Namespace: [],
185
+ Table: [],
186
+ TableVersion: [],
187
+ Stream: [],
188
+ Partition: [],
189
+ Delta: [],
190
+ }
191
+
192
+ for obj in src_objects:
193
+ obj_class = Metafile.get_class(obj.to_serializable())
194
+ ordered_objects_by_type[obj_class].append(obj)
195
+
196
+ # TODO(pdames): Support copying uncommitted streams/partitions.
197
+ # TODO(pdames): Support parallel/distributed copies.
198
+ for obj_class, objects in ordered_objects_by_type.items():
199
+ if objects:
200
+ logger.info(f"Copying {len(objects)} {obj_class} objects...")
201
+ if obj_class == TableVersion:
202
+ # sort table versions by ascending table version
203
+ objects.sort(key=lambda x: x.current_version_number())
204
+ if obj_class == Delta:
205
+ # sort deltas by ascending stream position
206
+ objects.sort(key=lambda x: x.stream_position)
207
+ for i, obj in enumerate(objects):
208
+ logger.info(f"Copying object {i+1}/{len(objects)}: {obj.url}")
209
+ dest_url = DeltaCatUrl(obj.url(catalog_name=catalog_name))
210
+ logger.info(f"Destination URL for object {i+1}/{len(objects)}: {dest_url}")
211
+ result = put(dest_url, metafile=obj)
212
+ copied_results.append(result)
213
+ logger.info(f"Successfully copied object {i+1}/{len(objects)}")
214
+ return copied_results[0] if len(copied_results) == 1 else copied_results
215
+
216
+
217
+ def _copy_dc(
218
+ source: DeltaCatUrl,
219
+ destination: DeltaCatUrl,
220
+ recursive: bool = False,
221
+ ) -> Union[Metafile, List[Metafile]]:
222
+ dc.raise_if_not_initialized()
223
+ if len(source.url.split("/")) != len(destination.url.split("/")):
224
+ # TODO(pdames): Better error message.
225
+ raise ValueError(
226
+ f"Cannot copy {source} to {destination}. "
227
+ f"Source and destination must share the same type."
228
+ )
229
+ if recursive:
230
+ src_objects = list(DeltaCatUrl(source.url.rstrip("/**")), recursive=True)
231
+ elif source.url.endswith("/*"):
232
+ src_objects = list(DeltaCatUrl(source.url.rstrip("/*")))
233
+ else:
234
+ src_objects = [get(source)]
235
+ return _copy_objects_in_order(src_objects, destination)
236
+
237
+
193
238
  def concat(source, destination):
194
239
  raise NotImplementedError
195
240
 
@@ -214,9 +259,13 @@ def _list_all_metafiles(
214
259
  metafiles: ListResult[Metafile] = lister(**kwargs)
215
260
  list_results.append(metafiles)
216
261
  if recursive:
262
+ # Process each level of the hierarchy
263
+ current_level_metafiles = [mf for mf in metafiles.all_items()]
264
+
217
265
  for lister, kwarg_name, kwarg_val_resolver_fn in reader.listers:
266
+ next_level_metafiles = []
218
267
  # each subsequent lister needs to inject missing keyword args from the parent metafile
219
- for metafile in metafiles.all_items():
268
+ for metafile in current_level_metafiles:
220
269
  kwargs_update = (
221
270
  {kwarg_name: kwarg_val_resolver_fn(metafile)}
222
271
  if kwarg_name and kwarg_val_resolver_fn
@@ -226,8 +275,11 @@ def _list_all_metafiles(
226
275
  **kwargs,
227
276
  **kwargs_update,
228
277
  }
229
- metafiles = lister(**lister_kwargs)
230
- list_results.append(metafiles)
278
+ child_metafiles = lister(**lister_kwargs)
279
+ list_results.append(child_metafiles)
280
+ next_level_metafiles.extend(child_metafiles.all_items())
281
+ # Move to the next level for the next iteration
282
+ current_level_metafiles = next_level_metafiles
231
283
  return [
232
284
  metafile for list_result in list_results for metafile in list_result.all_items()
233
285
  ]
@@ -308,7 +360,7 @@ def put(
308
360
  *args,
309
361
  **kwargs,
310
362
  ) -> Union[Metafile, str]:
311
- writer = DeltaCatUrlWriter(url, metafile)
363
+ writer = DeltaCatUrlWriter(url, metafile=metafile)
312
364
  return writer.write(*args, **kwargs)
313
365
 
314
366
 
@@ -351,6 +403,7 @@ def _copy_external_ray(
351
403
  "gz": 35,
352
404
  "bz2": 35,
353
405
  "zip": 35,
406
+ "zst": 35,
354
407
  "7z": 35,
355
408
  "*": 2.5,
356
409
  },
@@ -359,7 +412,7 @@ def _copy_external_ray(
359
412
  writer_args: Dict[str, Any] = {},
360
413
  filesystem: pafs.FileSystem = None,
361
414
  ) -> str:
362
- print(f"DeltaCAT Copy Invocation Received at: {time.time_ns()}")
415
+ logger.info(f"DeltaCAT Copy Invocation Received at: {time.time_ns()}")
363
416
 
364
417
  if not isinstance(src, DeltaCatUrl):
365
418
  raise ValueError(f"Expected `src` to be a `DeltaCatUrl` but got `{src}`.")
@@ -367,30 +420,32 @@ def _copy_external_ray(
367
420
  # wait for required resources
368
421
  head_cpu_count = int(current_node_resources()["CPU"])
369
422
  if minimum_worker_cpus > 0:
370
- print(f"Waiting for {minimum_worker_cpus} worker CPUs...")
423
+ logger.info(f"Waiting for {minimum_worker_cpus} worker CPUs...")
371
424
  live_cpu_waiter(
372
425
  min_live_cpus=minimum_worker_cpus + head_cpu_count,
373
426
  )
374
- print(f"{minimum_worker_cpus} worker CPUs found!")
427
+ logger.info(f"{minimum_worker_cpus} worker CPUs found!")
375
428
  # start job execution
376
429
  cluster_resources = ray.cluster_resources()
377
- print(f"Cluster Resources: {cluster_resources}")
378
- print(f"Available Cluster Resources: {ray.available_resources()}")
430
+ logger.info(f"Cluster Resources: {cluster_resources}")
431
+ logger.info(f"Available Cluster Resources: {ray.available_resources()}")
379
432
  cluster_cpus = int(cluster_resources["CPU"])
380
- print(f"Cluster CPUs: {cluster_cpus}")
433
+ logger.info(f"Cluster CPUs: {cluster_cpus}")
381
434
  all_node_resource_keys = live_node_resource_keys()
382
- print(f"Found {len(all_node_resource_keys)} live nodes: {all_node_resource_keys}")
435
+ logger.info(
436
+ f"Found {len(all_node_resource_keys)} live nodes: {all_node_resource_keys}"
437
+ )
383
438
  worker_node_resource_keys = other_live_node_resource_keys()
384
- print(
439
+ logger.info(
385
440
  f"Found {len(worker_node_resource_keys)} live worker nodes: {worker_node_resource_keys}"
386
441
  )
387
442
  worker_cpu_count = cluster_cpus - head_cpu_count
388
- print(f"Total worker CPUs: {worker_cpu_count}")
443
+ logger.info(f"Total worker CPUs: {worker_cpu_count}")
389
444
 
390
445
  # estimate memory requirements based on file extension
391
446
  estimated_memory_bytes = 0
392
447
  if extension_to_memory_multiplier:
393
- print(f"Resolving stats collection filesystem for: {src.url_path}.")
448
+ logger.info(f"Resolving stats collection filesystem for: {src.url_path}.")
394
449
  path, filesystem = resolve_path_and_filesystem(src.url_path, filesystem)
395
450
  if isinstance(filesystem, pafs.GcsFileSystem):
396
451
  from datetime import timedelta
@@ -402,7 +457,7 @@ def _copy_external_ray(
402
457
  anonymous=True,
403
458
  retry_time_limit=timedelta(seconds=10),
404
459
  )
405
- print(f"Using filesystem {type(filesystem)} to get file size of: {path}")
460
+ logger.info(f"Using filesystem {type(filesystem)} to get file size of: {path}")
406
461
  file_info = get_file_info(path, filesystem)
407
462
  if file_info.type != FileType.File:
408
463
  raise ValueError(
@@ -413,11 +468,11 @@ def _copy_external_ray(
413
468
  if inflation_multiplier is None:
414
469
  inflation_multiplier = extension_to_memory_multiplier.get("*")
415
470
  estimated_memory_bytes = inflation_multiplier * file_info.size
416
- print(
471
+ logger.info(
417
472
  f"Estimated Memory Required for Copy: "
418
473
  f"{estimated_memory_bytes/BYTES_PER_GIBIBYTE} GiB"
419
474
  )
420
- print(f"Starting DeltaCAT Copy at: {time.time_ns()}")
475
+ logger.info(f"Starting DeltaCAT Copy at: {time.time_ns()}")
421
476
 
422
477
  index_result = None
423
478
  num_cpus = 1
@@ -436,31 +491,31 @@ def _copy_external_ray(
436
491
  reader_args=reader_args,
437
492
  writer_args=writer_args,
438
493
  )
439
- print(f"Time to Launch Copy Task: {latency} seconds")
494
+ logger.info(f"Time to Launch Copy Task: {latency} seconds")
440
495
  try:
441
496
  index_result, latency = timed_invocation(
442
497
  ray.get,
443
498
  copy_task_pending,
444
499
  )
445
500
  except OutOfMemoryError as e:
446
- print(f"Copy Task Ran Out of Memory: {e}")
501
+ logger.warning(f"Copy Task Ran Out of Memory: {e}")
447
502
  max_single_node_cpus = min(
448
503
  max_allowed_cpus, find_max_single_node_resource_type("CPU")
449
504
  )
450
505
  num_cpus += 1
451
506
  if num_cpus > max_single_node_cpus:
452
507
  raise e
453
- print(f"Retrying Failed Copy Task with {num_cpus} dedicated CPUs")
508
+ logger.info(f"Retrying Failed Copy Task with {num_cpus} dedicated CPUs")
454
509
 
455
- print(f"Time to Launch Copy Task: {latency} seconds")
456
- print(f"Time to Complete Copy Task: {latency} seconds")
510
+ logger.info(f"Time to Launch Copy Task: {latency} seconds")
511
+ logger.info(f"Time to Complete Copy Task: {latency} seconds")
457
512
 
458
513
  total_gib_indexed = index_result.table_size / BYTES_PER_GIBIBYTE
459
514
 
460
- print(f"Records Copied: {index_result.table_length}")
461
- print(f"Bytes Copied: {total_gib_indexed} GiB")
462
- print(f"Conversion Rate: {total_gib_indexed/latency} GiB/s")
463
- print(f"Finished Copy at: {time.time_ns()}")
515
+ logger.info(f"Records Copied: {index_result.table_length}")
516
+ logger.info(f"Bytes Copied: {total_gib_indexed} GiB")
517
+ logger.info(f"Conversion Rate: {total_gib_indexed/latency} GiB/s")
518
+ logger.info(f"Finished Copy at: {time.time_ns()}")
464
519
 
465
520
  return dst.url
466
521
 
@@ -484,13 +539,13 @@ def copy_task(
484
539
  transforms=transforms,
485
540
  reader_args=reader_args,
486
541
  )
487
- print(f"Time to read {src.url_path}: {latency} seconds")
542
+ logger.debug(f"Time to read {src.url_path}: {latency} seconds")
488
543
 
489
544
  table_size = get_table_size(table)
490
- print(f"Table Size: {table_size/BYTES_PER_GIBIBYTE} GiB")
545
+ logger.debug(f"Table Size: {table_size/BYTES_PER_GIBIBYTE} GiB")
491
546
 
492
547
  table_length = get_table_length(table)
493
- print(f"Table Records: {table_length}")
548
+ logger.debug(f"Table Records: {table_length}")
494
549
 
495
550
  writer = DeltaCatUrlWriter(dest, dataset_type)
496
551
  written_file_path, latency = timed_invocation(
@@ -499,7 +554,7 @@ def copy_task(
499
554
  table,
500
555
  **writer_args,
501
556
  )
502
- print(f"Time to write {written_file_path}: {latency}")
557
+ logger.debug(f"Time to write {written_file_path}: {latency}")
503
558
 
504
559
  return CopyResult(table_size, table_length)
505
560
 
deltacat/aws/constants.py CHANGED
@@ -1,32 +1,9 @@
1
- import botocore
2
1
  from typing import Set
3
- from daft.exceptions import DaftTransientError
4
2
  from deltacat.utils.common import env_integer, env_string
5
3
 
6
4
 
7
5
  DAFT_MAX_S3_CONNECTIONS_PER_FILE = env_integer("DAFT_MAX_S3_CONNECTIONS_PER_FILE", 8)
8
- DEFAULT_FILE_READ_TIMEOUT_MS = env_integer(
9
- "DEFAULT_FILE_READ_TIMEOUT_MS", 300_000
10
- ) # 5 mins
11
6
  BOTO_MAX_RETRIES = env_integer("BOTO_MAX_RETRIES", 5)
12
7
  BOTO_TIMEOUT_ERROR_CODES: Set[str] = {"ReadTimeoutError", "ConnectTimeoutError"}
13
8
  BOTO_THROTTLING_ERROR_CODES: Set[str] = {"Throttling", "SlowDown"}
14
- RETRYABLE_TRANSIENT_ERRORS = (
15
- OSError,
16
- botocore.exceptions.ConnectionError,
17
- botocore.exceptions.HTTPClientError,
18
- botocore.exceptions.NoCredentialsError,
19
- botocore.exceptions.ConnectTimeoutError,
20
- botocore.exceptions.ReadTimeoutError,
21
- DaftTransientError,
22
- )
23
9
  AWS_REGION = env_string("AWS_REGION", "us-east-1")
24
- UPLOAD_DOWNLOAD_RETRY_STOP_AFTER_DELAY = env_integer(
25
- "UPLOAD_DOWNLOAD_RETRY_STOP_AFTER_DELAY", 10 * 60
26
- )
27
- UPLOAD_SLICED_TABLE_RETRY_STOP_AFTER_DELAY = env_integer(
28
- "UPLOAD_SLICED_TABLE_RETRY_STOP_AFTER_DELAY", 30 * 60
29
- )
30
- DOWNLOAD_MANIFEST_ENTRY_RETRY_STOP_AFTER_DELAY = env_integer(
31
- "DOWNLOAD_MANIFEST_ENTRY_RETRY_STOP_AFTER_DELAY", 30 * 60
32
- )