deltacat-0.1.6-py3-none-any.whl → deltacat-0.1.11-py3-none-any.whl

This diff compares the contents of two publicly available versions of a package, each released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
Files changed (83)
  1. deltacat/__init__.py +41 -15
  2. deltacat/aws/clients.py +12 -31
  3. deltacat/aws/constants.py +1 -1
  4. deltacat/aws/redshift/__init__.py +7 -2
  5. deltacat/aws/redshift/model/manifest.py +54 -50
  6. deltacat/aws/s3u.py +183 -194
  7. deltacat/catalog/delegate.py +151 -185
  8. deltacat/catalog/interface.py +78 -97
  9. deltacat/catalog/model/catalog.py +21 -21
  10. deltacat/catalog/model/table_definition.py +11 -9
  11. deltacat/compute/compactor/__init__.py +12 -16
  12. deltacat/compute/compactor/compaction_session.py +249 -198
  13. deltacat/compute/compactor/model/delta_annotated.py +60 -44
  14. deltacat/compute/compactor/model/delta_file_envelope.py +5 -6
  15. deltacat/compute/compactor/model/delta_file_locator.py +10 -8
  16. deltacat/compute/compactor/model/materialize_result.py +6 -7
  17. deltacat/compute/compactor/model/primary_key_index.py +38 -34
  18. deltacat/compute/compactor/model/pyarrow_write_result.py +3 -4
  19. deltacat/compute/compactor/model/round_completion_info.py +25 -19
  20. deltacat/compute/compactor/model/sort_key.py +18 -15
  21. deltacat/compute/compactor/steps/dedupe.py +153 -260
  22. deltacat/compute/compactor/steps/hash_bucket.py +56 -56
  23. deltacat/compute/compactor/steps/materialize.py +139 -100
  24. deltacat/compute/compactor/steps/rehash/rehash_bucket.py +13 -13
  25. deltacat/compute/compactor/steps/rehash/rewrite_index.py +11 -13
  26. deltacat/compute/compactor/utils/io.py +59 -47
  27. deltacat/compute/compactor/utils/primary_key_index.py +131 -90
  28. deltacat/compute/compactor/utils/round_completion_file.py +22 -23
  29. deltacat/compute/compactor/utils/system_columns.py +33 -42
  30. deltacat/compute/metastats/meta_stats.py +235 -157
  31. deltacat/compute/metastats/model/partition_stats_dict.py +7 -10
  32. deltacat/compute/metastats/model/stats_cluster_size_estimator.py +13 -5
  33. deltacat/compute/metastats/stats.py +95 -64
  34. deltacat/compute/metastats/utils/io.py +100 -53
  35. deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py +5 -2
  36. deltacat/compute/metastats/utils/ray_utils.py +38 -33
  37. deltacat/compute/stats/basic.py +107 -69
  38. deltacat/compute/stats/models/delta_column_stats.py +11 -8
  39. deltacat/compute/stats/models/delta_stats.py +59 -32
  40. deltacat/compute/stats/models/delta_stats_cache_result.py +4 -1
  41. deltacat/compute/stats/models/manifest_entry_stats.py +12 -6
  42. deltacat/compute/stats/models/stats_result.py +24 -14
  43. deltacat/compute/stats/utils/intervals.py +16 -9
  44. deltacat/compute/stats/utils/io.py +86 -51
  45. deltacat/compute/stats/utils/manifest_stats_file.py +24 -33
  46. deltacat/constants.py +8 -10
  47. deltacat/io/__init__.py +2 -2
  48. deltacat/io/aws/redshift/redshift_datasource.py +157 -143
  49. deltacat/io/dataset.py +14 -17
  50. deltacat/io/read_api.py +36 -33
  51. deltacat/logs.py +94 -42
  52. deltacat/storage/__init__.py +18 -8
  53. deltacat/storage/interface.py +196 -213
  54. deltacat/storage/model/delta.py +45 -51
  55. deltacat/storage/model/list_result.py +12 -8
  56. deltacat/storage/model/namespace.py +4 -5
  57. deltacat/storage/model/partition.py +42 -42
  58. deltacat/storage/model/stream.py +29 -30
  59. deltacat/storage/model/table.py +14 -14
  60. deltacat/storage/model/table_version.py +32 -31
  61. deltacat/storage/model/types.py +1 -0
  62. deltacat/tests/stats/test_intervals.py +11 -24
  63. deltacat/tests/utils/__init__.py +0 -0
  64. deltacat/tests/utils/test_record_batch_tables.py +284 -0
  65. deltacat/types/media.py +3 -4
  66. deltacat/types/tables.py +31 -21
  67. deltacat/utils/common.py +5 -11
  68. deltacat/utils/numpy.py +20 -22
  69. deltacat/utils/pandas.py +73 -100
  70. deltacat/utils/performance.py +3 -9
  71. deltacat/utils/placement.py +276 -228
  72. deltacat/utils/pyarrow.py +302 -89
  73. deltacat/utils/ray_utils/collections.py +2 -1
  74. deltacat/utils/ray_utils/concurrency.py +36 -29
  75. deltacat/utils/ray_utils/dataset.py +28 -28
  76. deltacat/utils/ray_utils/performance.py +5 -9
  77. deltacat/utils/ray_utils/runtime.py +9 -10
  78. {deltacat-0.1.6.dist-info → deltacat-0.1.11.dist-info}/METADATA +21 -11
  79. deltacat-0.1.11.dist-info/RECORD +110 -0
  80. {deltacat-0.1.6.dist-info → deltacat-0.1.11.dist-info}/WHEEL +1 -1
  81. deltacat-0.1.6.dist-info/RECORD +0 -108
  82. {deltacat-0.1.6.dist-info → deltacat-0.1.11.dist-info}/LICENSE +0 -0
  83. {deltacat-0.1.6.dist-info → deltacat-0.1.11.dist-info}/top_level.txt +0 -0
@@ -1,10 +1,11 @@
1
- import pyarrow as pa
2
- import numpy as np
3
1
  from itertools import repeat
4
2
  from typing import Union
5
3
 
6
- from deltacat.storage import DeltaType
4
+ import numpy as np
5
+ import pyarrow as pa
6
+
7
7
  from deltacat.compute.compactor import DeltaFileEnvelope
8
+ from deltacat.storage import DeltaType
8
9
 
9
10
  _SYS_COL_UUID = "4000f124-dfbd-48c6-885b-7b22621a6d41"
10
11
 
@@ -65,10 +66,7 @@ _IS_SOURCE_COLUMN_FIELD = pa.field(
65
66
 
66
67
 
67
68
  def get_pk_hash_column_array(obj) -> Union[pa.Array, pa.ChunkedArray]:
68
- return pa.array(
69
- obj,
70
- _PK_HASH_COLUMN_TYPE
71
- )
69
+ return pa.array(obj, _PK_HASH_COLUMN_TYPE)
72
70
 
73
71
 
74
72
  def pk_hash_column_np(table: pa.Table) -> np.ndarray:
@@ -79,6 +77,10 @@ def pk_hash_column(table: pa.Table) -> pa.ChunkedArray:
79
77
  return table[_PK_HASH_COLUMN_NAME]
80
78
 
81
79
 
80
+ def delta_type_column_np(table: pa.Table) -> np.ndarray:
81
+ return table[_DELTA_TYPE_COLUMN_NAME].to_numpy()
82
+
83
+
82
84
  def delta_type_column(table: pa.Table) -> pa.ChunkedArray:
83
85
  return table[_DELTA_TYPE_COLUMN_NAME]
84
86
 
@@ -101,8 +103,7 @@ def stream_position_column_np(table: pa.Table) -> np.ndarray:
101
103
  return table[_PARTITION_STREAM_POSITION_COLUMN_NAME].to_numpy()
102
104
 
103
105
 
104
- def get_file_index_column_array(obj) \
105
- -> Union[pa.Array, pa.ChunkedArray]:
106
+ def get_file_index_column_array(obj) -> Union[pa.Array, pa.ChunkedArray]:
106
107
  return pa.array(
107
108
  obj,
108
109
  _ORDERED_FILE_IDX_COLUMN_TYPE,
@@ -113,8 +114,7 @@ def file_index_column_np(table: pa.Table) -> np.ndarray:
113
114
  return table[_ORDERED_FILE_IDX_COLUMN_NAME].to_numpy()
114
115
 
115
116
 
116
- def get_record_index_column_array(obj) -> \
117
- Union[pa.Array, pa.ChunkedArray]:
117
+ def get_record_index_column_array(obj) -> Union[pa.Array, pa.ChunkedArray]:
118
118
  return pa.array(
119
119
  obj,
120
120
  _ORDERED_RECORD_IDX_COLUMN_TYPE,
@@ -144,7 +144,8 @@ def get_is_source_column_array(obj) -> Union[pa.Array, pa.ChunkedArray]:
144
144
 
145
145
 
146
146
  def project_delta_file_metadata_on_table(
147
- delta_file_envelope: DeltaFileEnvelope) -> pa.Table:
147
+ delta_file_envelope: DeltaFileEnvelope,
148
+ ) -> pa.Table:
148
149
 
149
150
  table = delta_file_envelope.table
150
151
 
@@ -181,42 +182,33 @@ def project_delta_file_metadata_on_table(
181
182
  return table
182
183
 
183
184
 
184
- def append_stream_position_column(
185
- table: pa.Table,
186
- stream_positions):
185
+ def append_stream_position_column(table: pa.Table, stream_positions):
187
186
 
188
187
  table = table.append_column(
189
188
  _PARTITION_STREAM_POSITION_COLUMN_FIELD,
190
- get_stream_position_column_array(stream_positions)
189
+ get_stream_position_column_array(stream_positions),
191
190
  )
192
191
  return table
193
192
 
194
193
 
195
- def append_file_idx_column(
196
- table: pa.Table,
197
- ordered_file_indices):
194
+ def append_file_idx_column(table: pa.Table, ordered_file_indices):
198
195
 
199
196
  table = table.append_column(
200
197
  _ORDERED_FILE_IDX_COLUMN_FIELD,
201
- get_file_index_column_array(ordered_file_indices)
198
+ get_file_index_column_array(ordered_file_indices),
202
199
  )
203
200
  return table
204
201
 
205
202
 
206
- def append_pk_hash_column(
207
- table: pa.Table,
208
- pk_hashes) -> pa.Table:
203
+ def append_pk_hash_column(table: pa.Table, pk_hashes) -> pa.Table:
209
204
 
210
205
  table = table.append_column(
211
- _PK_HASH_COLUMN_FIELD,
212
- get_pk_hash_column_array(pk_hashes)
206
+ _PK_HASH_COLUMN_FIELD, get_pk_hash_column_array(pk_hashes)
213
207
  )
214
208
  return table
215
209
 
216
210
 
217
- def append_record_idx_col(
218
- table: pa.Table,
219
- ordered_record_indices) -> pa.Table:
211
+ def append_record_idx_col(table: pa.Table, ordered_record_indices) -> pa.Table:
220
212
 
221
213
  table = table.append_column(
222
214
  _ORDERED_RECORD_IDX_COLUMN_FIELD,
@@ -225,9 +217,7 @@ def append_record_idx_col(
225
217
  return table
226
218
 
227
219
 
228
- def append_dedupe_task_idx_col(
229
- table: pa.Table,
230
- dedupe_task_indices) -> pa.Table:
220
+ def append_dedupe_task_idx_col(table: pa.Table, dedupe_task_indices) -> pa.Table:
231
221
 
232
222
  table = table.append_column(
233
223
  _DEDUPE_TASK_IDX_COLUMN_FIELD,
@@ -244,9 +234,7 @@ def delta_type_from_field(delta_type_field: bool) -> DeltaType:
244
234
  return DeltaType.UPSERT if delta_type_field else DeltaType.DELETE
245
235
 
246
236
 
247
- def append_delta_type_col(
248
- table: pa.Table,
249
- delta_types) -> pa.Table:
237
+ def append_delta_type_col(table: pa.Table, delta_types) -> pa.Table:
250
238
 
251
239
  table = table.append_column(
252
240
  _DELTA_TYPE_COLUMN_FIELD,
@@ -255,9 +243,7 @@ def append_delta_type_col(
255
243
  return table
256
244
 
257
245
 
258
- def append_is_source_col(
259
- table: pa.Table,
260
- booleans) -> pa.Table:
246
+ def append_is_source_col(table: pa.Table, booleans) -> pa.Table:
261
247
 
262
248
  table = table.append_column(
263
249
  _IS_SOURCE_COLUMN_FIELD,
@@ -267,8 +253,13 @@ def append_is_source_col(
267
253
 
268
254
 
269
255
  def get_minimal_hb_schema() -> pa.schema:
270
- return pa.schema([
271
- _PK_HASH_COLUMN_FIELD,
272
- _ORDERED_RECORD_IDX_COLUMN_FIELD,
273
- _ORDERED_FILE_IDX_COLUMN_FIELD
274
- ])
256
+ return pa.schema(
257
+ [
258
+ _PK_HASH_COLUMN_FIELD,
259
+ _ORDERED_RECORD_IDX_COLUMN_FIELD,
260
+ _ORDERED_FILE_IDX_COLUMN_FIELD,
261
+ _PARTITION_STREAM_POSITION_COLUMN_FIELD,
262
+ _DELTA_TYPE_COLUMN_FIELD,
263
+ _IS_SOURCE_COLUMN_FIELD,
264
+ ]
265
+ )