deltacat 0.1.6__py3-none-any.whl → 0.1.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +41 -15
- deltacat/aws/clients.py +12 -31
- deltacat/aws/constants.py +1 -1
- deltacat/aws/redshift/__init__.py +7 -2
- deltacat/aws/redshift/model/manifest.py +54 -50
- deltacat/aws/s3u.py +183 -194
- deltacat/catalog/delegate.py +151 -185
- deltacat/catalog/interface.py +78 -97
- deltacat/catalog/model/catalog.py +21 -21
- deltacat/catalog/model/table_definition.py +11 -9
- deltacat/compute/compactor/__init__.py +12 -16
- deltacat/compute/compactor/compaction_session.py +249 -198
- deltacat/compute/compactor/model/delta_annotated.py +60 -44
- deltacat/compute/compactor/model/delta_file_envelope.py +5 -6
- deltacat/compute/compactor/model/delta_file_locator.py +10 -8
- deltacat/compute/compactor/model/materialize_result.py +6 -7
- deltacat/compute/compactor/model/primary_key_index.py +38 -34
- deltacat/compute/compactor/model/pyarrow_write_result.py +3 -4
- deltacat/compute/compactor/model/round_completion_info.py +25 -19
- deltacat/compute/compactor/model/sort_key.py +18 -15
- deltacat/compute/compactor/steps/dedupe.py +153 -260
- deltacat/compute/compactor/steps/hash_bucket.py +56 -56
- deltacat/compute/compactor/steps/materialize.py +139 -100
- deltacat/compute/compactor/steps/rehash/rehash_bucket.py +13 -13
- deltacat/compute/compactor/steps/rehash/rewrite_index.py +11 -13
- deltacat/compute/compactor/utils/io.py +59 -47
- deltacat/compute/compactor/utils/primary_key_index.py +131 -90
- deltacat/compute/compactor/utils/round_completion_file.py +22 -23
- deltacat/compute/compactor/utils/system_columns.py +33 -42
- deltacat/compute/metastats/meta_stats.py +235 -157
- deltacat/compute/metastats/model/partition_stats_dict.py +7 -10
- deltacat/compute/metastats/model/stats_cluster_size_estimator.py +13 -5
- deltacat/compute/metastats/stats.py +95 -64
- deltacat/compute/metastats/utils/io.py +100 -53
- deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py +5 -2
- deltacat/compute/metastats/utils/ray_utils.py +38 -33
- deltacat/compute/stats/basic.py +107 -69
- deltacat/compute/stats/models/delta_column_stats.py +11 -8
- deltacat/compute/stats/models/delta_stats.py +59 -32
- deltacat/compute/stats/models/delta_stats_cache_result.py +4 -1
- deltacat/compute/stats/models/manifest_entry_stats.py +12 -6
- deltacat/compute/stats/models/stats_result.py +24 -14
- deltacat/compute/stats/utils/intervals.py +16 -9
- deltacat/compute/stats/utils/io.py +86 -51
- deltacat/compute/stats/utils/manifest_stats_file.py +24 -33
- deltacat/constants.py +8 -10
- deltacat/io/__init__.py +2 -2
- deltacat/io/aws/redshift/redshift_datasource.py +157 -143
- deltacat/io/dataset.py +14 -17
- deltacat/io/read_api.py +36 -33
- deltacat/logs.py +94 -42
- deltacat/storage/__init__.py +18 -8
- deltacat/storage/interface.py +196 -213
- deltacat/storage/model/delta.py +45 -51
- deltacat/storage/model/list_result.py +12 -8
- deltacat/storage/model/namespace.py +4 -5
- deltacat/storage/model/partition.py +42 -42
- deltacat/storage/model/stream.py +29 -30
- deltacat/storage/model/table.py +14 -14
- deltacat/storage/model/table_version.py +32 -31
- deltacat/storage/model/types.py +1 -0
- deltacat/tests/stats/test_intervals.py +11 -24
- deltacat/tests/utils/__init__.py +0 -0
- deltacat/tests/utils/test_record_batch_tables.py +284 -0
- deltacat/types/media.py +3 -4
- deltacat/types/tables.py +31 -21
- deltacat/utils/common.py +5 -11
- deltacat/utils/numpy.py +20 -22
- deltacat/utils/pandas.py +73 -100
- deltacat/utils/performance.py +3 -9
- deltacat/utils/placement.py +276 -228
- deltacat/utils/pyarrow.py +302 -89
- deltacat/utils/ray_utils/collections.py +2 -1
- deltacat/utils/ray_utils/concurrency.py +36 -29
- deltacat/utils/ray_utils/dataset.py +28 -28
- deltacat/utils/ray_utils/performance.py +5 -9
- deltacat/utils/ray_utils/runtime.py +9 -10
- {deltacat-0.1.6.dist-info → deltacat-0.1.11.dist-info}/METADATA +21 -11
- deltacat-0.1.11.dist-info/RECORD +110 -0
- {deltacat-0.1.6.dist-info → deltacat-0.1.11.dist-info}/WHEEL +1 -1
- deltacat-0.1.6.dist-info/RECORD +0 -108
- {deltacat-0.1.6.dist-info → deltacat-0.1.11.dist-info}/LICENSE +0 -0
- {deltacat-0.1.6.dist-info → deltacat-0.1.11.dist-info}/top_level.txt +0 -0
@@ -1,10 +1,11 @@
|
|
1
|
-
import pyarrow as pa
|
2
|
-
import numpy as np
|
3
1
|
from itertools import repeat
|
4
2
|
from typing import Union
|
5
3
|
|
6
|
-
|
4
|
+
import numpy as np
|
5
|
+
import pyarrow as pa
|
6
|
+
|
7
7
|
from deltacat.compute.compactor import DeltaFileEnvelope
|
8
|
+
from deltacat.storage import DeltaType
|
8
9
|
|
9
10
|
_SYS_COL_UUID = "4000f124-dfbd-48c6-885b-7b22621a6d41"
|
10
11
|
|
@@ -65,10 +66,7 @@ _IS_SOURCE_COLUMN_FIELD = pa.field(
|
|
65
66
|
|
66
67
|
|
67
68
|
def get_pk_hash_column_array(obj) -> Union[pa.Array, pa.ChunkedArray]:
|
68
|
-
return pa.array(
|
69
|
-
obj,
|
70
|
-
_PK_HASH_COLUMN_TYPE
|
71
|
-
)
|
69
|
+
return pa.array(obj, _PK_HASH_COLUMN_TYPE)
|
72
70
|
|
73
71
|
|
74
72
|
def pk_hash_column_np(table: pa.Table) -> np.ndarray:
|
@@ -79,6 +77,10 @@ def pk_hash_column(table: pa.Table) -> pa.ChunkedArray:
|
|
79
77
|
return table[_PK_HASH_COLUMN_NAME]
|
80
78
|
|
81
79
|
|
80
|
+
def delta_type_column_np(table: pa.Table) -> np.ndarray:
|
81
|
+
return table[_DELTA_TYPE_COLUMN_NAME].to_numpy()
|
82
|
+
|
83
|
+
|
82
84
|
def delta_type_column(table: pa.Table) -> pa.ChunkedArray:
|
83
85
|
return table[_DELTA_TYPE_COLUMN_NAME]
|
84
86
|
|
@@ -101,8 +103,7 @@ def stream_position_column_np(table: pa.Table) -> np.ndarray:
|
|
101
103
|
return table[_PARTITION_STREAM_POSITION_COLUMN_NAME].to_numpy()
|
102
104
|
|
103
105
|
|
104
|
-
def get_file_index_column_array(obj)
|
105
|
-
-> Union[pa.Array, pa.ChunkedArray]:
|
106
|
+
def get_file_index_column_array(obj) -> Union[pa.Array, pa.ChunkedArray]:
|
106
107
|
return pa.array(
|
107
108
|
obj,
|
108
109
|
_ORDERED_FILE_IDX_COLUMN_TYPE,
|
@@ -113,8 +114,7 @@ def file_index_column_np(table: pa.Table) -> np.ndarray:
|
|
113
114
|
return table[_ORDERED_FILE_IDX_COLUMN_NAME].to_numpy()
|
114
115
|
|
115
116
|
|
116
|
-
def get_record_index_column_array(obj) ->
|
117
|
-
Union[pa.Array, pa.ChunkedArray]:
|
117
|
+
def get_record_index_column_array(obj) -> Union[pa.Array, pa.ChunkedArray]:
|
118
118
|
return pa.array(
|
119
119
|
obj,
|
120
120
|
_ORDERED_RECORD_IDX_COLUMN_TYPE,
|
@@ -144,7 +144,8 @@ def get_is_source_column_array(obj) -> Union[pa.Array, pa.ChunkedArray]:
|
|
144
144
|
|
145
145
|
|
146
146
|
def project_delta_file_metadata_on_table(
|
147
|
-
|
147
|
+
delta_file_envelope: DeltaFileEnvelope,
|
148
|
+
) -> pa.Table:
|
148
149
|
|
149
150
|
table = delta_file_envelope.table
|
150
151
|
|
@@ -181,42 +182,33 @@ def project_delta_file_metadata_on_table(
|
|
181
182
|
return table
|
182
183
|
|
183
184
|
|
184
|
-
def append_stream_position_column(
|
185
|
-
table: pa.Table,
|
186
|
-
stream_positions):
|
185
|
+
def append_stream_position_column(table: pa.Table, stream_positions):
|
187
186
|
|
188
187
|
table = table.append_column(
|
189
188
|
_PARTITION_STREAM_POSITION_COLUMN_FIELD,
|
190
|
-
get_stream_position_column_array(stream_positions)
|
189
|
+
get_stream_position_column_array(stream_positions),
|
191
190
|
)
|
192
191
|
return table
|
193
192
|
|
194
193
|
|
195
|
-
def append_file_idx_column(
|
196
|
-
table: pa.Table,
|
197
|
-
ordered_file_indices):
|
194
|
+
def append_file_idx_column(table: pa.Table, ordered_file_indices):
|
198
195
|
|
199
196
|
table = table.append_column(
|
200
197
|
_ORDERED_FILE_IDX_COLUMN_FIELD,
|
201
|
-
get_file_index_column_array(ordered_file_indices)
|
198
|
+
get_file_index_column_array(ordered_file_indices),
|
202
199
|
)
|
203
200
|
return table
|
204
201
|
|
205
202
|
|
206
|
-
def append_pk_hash_column(
|
207
|
-
table: pa.Table,
|
208
|
-
pk_hashes) -> pa.Table:
|
203
|
+
def append_pk_hash_column(table: pa.Table, pk_hashes) -> pa.Table:
|
209
204
|
|
210
205
|
table = table.append_column(
|
211
|
-
_PK_HASH_COLUMN_FIELD,
|
212
|
-
get_pk_hash_column_array(pk_hashes)
|
206
|
+
_PK_HASH_COLUMN_FIELD, get_pk_hash_column_array(pk_hashes)
|
213
207
|
)
|
214
208
|
return table
|
215
209
|
|
216
210
|
|
217
|
-
def append_record_idx_col(
|
218
|
-
table: pa.Table,
|
219
|
-
ordered_record_indices) -> pa.Table:
|
211
|
+
def append_record_idx_col(table: pa.Table, ordered_record_indices) -> pa.Table:
|
220
212
|
|
221
213
|
table = table.append_column(
|
222
214
|
_ORDERED_RECORD_IDX_COLUMN_FIELD,
|
@@ -225,9 +217,7 @@ def append_record_idx_col(
|
|
225
217
|
return table
|
226
218
|
|
227
219
|
|
228
|
-
def append_dedupe_task_idx_col(
|
229
|
-
table: pa.Table,
|
230
|
-
dedupe_task_indices) -> pa.Table:
|
220
|
+
def append_dedupe_task_idx_col(table: pa.Table, dedupe_task_indices) -> pa.Table:
|
231
221
|
|
232
222
|
table = table.append_column(
|
233
223
|
_DEDUPE_TASK_IDX_COLUMN_FIELD,
|
@@ -244,9 +234,7 @@ def delta_type_from_field(delta_type_field: bool) -> DeltaType:
|
|
244
234
|
return DeltaType.UPSERT if delta_type_field else DeltaType.DELETE
|
245
235
|
|
246
236
|
|
247
|
-
def append_delta_type_col(
|
248
|
-
table: pa.Table,
|
249
|
-
delta_types) -> pa.Table:
|
237
|
+
def append_delta_type_col(table: pa.Table, delta_types) -> pa.Table:
|
250
238
|
|
251
239
|
table = table.append_column(
|
252
240
|
_DELTA_TYPE_COLUMN_FIELD,
|
@@ -255,9 +243,7 @@ def append_delta_type_col(
|
|
255
243
|
return table
|
256
244
|
|
257
245
|
|
258
|
-
def append_is_source_col(
|
259
|
-
table: pa.Table,
|
260
|
-
booleans) -> pa.Table:
|
246
|
+
def append_is_source_col(table: pa.Table, booleans) -> pa.Table:
|
261
247
|
|
262
248
|
table = table.append_column(
|
263
249
|
_IS_SOURCE_COLUMN_FIELD,
|
@@ -267,8 +253,13 @@ def append_is_source_col(
|
|
267
253
|
|
268
254
|
|
269
255
|
def get_minimal_hb_schema() -> pa.schema:
|
270
|
-
return pa.schema(
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
256
|
+
return pa.schema(
|
257
|
+
[
|
258
|
+
_PK_HASH_COLUMN_FIELD,
|
259
|
+
_ORDERED_RECORD_IDX_COLUMN_FIELD,
|
260
|
+
_ORDERED_FILE_IDX_COLUMN_FIELD,
|
261
|
+
_PARTITION_STREAM_POSITION_COLUMN_FIELD,
|
262
|
+
_DELTA_TYPE_COLUMN_FIELD,
|
263
|
+
_IS_SOURCE_COLUMN_FIELD,
|
264
|
+
]
|
265
|
+
)
|