deltacat 0.1.18b14__py3-none-any.whl → 0.1.18b16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79) hide show
  1. deltacat/__init__.py +1 -1
  2. deltacat/aws/clients.py +17 -6
  3. deltacat/aws/redshift/model/manifest.py +4 -0
  4. deltacat/aws/s3u.py +24 -1
  5. deltacat/compute/compactor/compaction_session.py +42 -18
  6. deltacat/compute/compactor/model/compact_partition_params.py +297 -58
  7. deltacat/compute/compactor/model/compaction_session_audit_info.py +163 -9
  8. deltacat/compute/compactor/model/delta_annotated.py +95 -9
  9. deltacat/compute/compactor/model/delta_file_envelope.py +14 -2
  10. deltacat/compute/compactor/model/round_completion_info.py +17 -1
  11. deltacat/compute/compactor/repartition_session.py +4 -1
  12. deltacat/compute/compactor/steps/dedupe.py +9 -6
  13. deltacat/compute/compactor/steps/hash_bucket.py +24 -3
  14. deltacat/compute/compactor/steps/materialize.py +11 -6
  15. deltacat/compute/compactor/steps/repartition.py +22 -1
  16. deltacat/compute/compactor/utils/io.py +40 -23
  17. deltacat/compute/compactor/utils/sort_key.py +5 -0
  18. deltacat/compute/compactor/utils/system_columns.py +43 -0
  19. deltacat/compute/compactor_v2/compaction_session.py +509 -0
  20. deltacat/compute/compactor_v2/constants.py +37 -0
  21. deltacat/compute/compactor_v2/model/hash_bucket_input.py +78 -0
  22. deltacat/compute/compactor_v2/model/hash_bucket_result.py +12 -0
  23. deltacat/compute/compactor_v2/model/merge_input.py +143 -0
  24. deltacat/compute/compactor_v2/model/merge_result.py +12 -0
  25. deltacat/compute/compactor_v2/steps/__init__.py +0 -0
  26. deltacat/compute/compactor_v2/steps/hash_bucket.py +203 -0
  27. deltacat/compute/compactor_v2/steps/merge.py +469 -0
  28. deltacat/compute/compactor_v2/utils/__init__.py +0 -0
  29. deltacat/compute/compactor_v2/utils/content_type_params.py +66 -0
  30. deltacat/compute/compactor_v2/utils/dedupe.py +58 -0
  31. deltacat/compute/compactor_v2/utils/io.py +152 -0
  32. deltacat/compute/compactor_v2/utils/primary_key_index.py +341 -0
  33. deltacat/compute/compactor_v2/utils/task_options.py +221 -0
  34. deltacat/compute/metastats/meta_stats.py +4 -2
  35. deltacat/compute/metastats/stats.py +1 -0
  36. deltacat/compute/metastats/utils/io.py +4 -0
  37. deltacat/compute/stats/utils/io.py +20 -5
  38. deltacat/exceptions.py +4 -0
  39. deltacat/io/memcached_object_store.py +37 -14
  40. deltacat/logs.py +4 -3
  41. deltacat/storage/interface.py +8 -1
  42. deltacat/storage/model/types.py +2 -1
  43. deltacat/tests/aws/test_clients.py +16 -3
  44. deltacat/tests/compute/__init__.py +0 -0
  45. deltacat/tests/compute/common.py +96 -0
  46. deltacat/tests/compute/compactor/__init__.py +0 -0
  47. deltacat/tests/compute/compactor/steps/__init__.py +0 -0
  48. deltacat/tests/{test_repartition.py → compute/compactor/steps/test_repartition.py} +34 -8
  49. deltacat/tests/compute/compactor/utils/__init__.py +0 -0
  50. deltacat/tests/{compactor → compute/compactor}/utils/test_io.py +47 -5
  51. deltacat/tests/compute/compactor_v2/__init__.py +0 -0
  52. deltacat/tests/{compactor → compute}/test_compact_partition_params.py +14 -30
  53. deltacat/tests/compute/test_compaction_session_incremental.py +363 -0
  54. deltacat/tests/compute/testcases.py +395 -0
  55. deltacat/tests/io/test_memcached_object_store.py +5 -4
  56. deltacat/tests/local_deltacat_storage/__init__.py +62 -19
  57. deltacat/tests/test_utils/pyarrow.py +49 -0
  58. deltacat/tests/test_utils/utils.py +13 -0
  59. deltacat/tests/utils/data/__init__.py +0 -0
  60. deltacat/tests/utils/test_daft.py +76 -0
  61. deltacat/tests/utils/test_pyarrow.py +133 -0
  62. deltacat/tests/utils/test_resources.py +23 -20
  63. deltacat/types/media.py +1 -0
  64. deltacat/types/partial_download.py +83 -0
  65. deltacat/types/tables.py +6 -0
  66. deltacat/utils/arguments.py +25 -0
  67. deltacat/utils/daft.py +87 -0
  68. deltacat/utils/placement.py +20 -3
  69. deltacat/utils/pyarrow.py +218 -1
  70. deltacat/utils/ray_utils/concurrency.py +26 -1
  71. deltacat/utils/resources.py +72 -1
  72. deltacat/utils/s3fs.py +21 -0
  73. {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b16.dist-info}/METADATA +17 -3
  74. {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b16.dist-info}/RECORD +79 -47
  75. {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b16.dist-info}/WHEEL +1 -1
  76. /deltacat/{tests/compactor → compute/compactor_v2}/__init__.py +0 -0
  77. /deltacat/{tests/compactor/utils → compute/compactor_v2/model}/__init__.py +0 -0
  78. {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b16.dist-info}/LICENSE +0 -0
  79. {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b16.dist-info}/top_level.txt +0 -0
@@ -1,10 +1,29 @@
1
1
  from __future__ import annotations
2
-
2
+ import importlib
3
3
  import copy
4
4
  import json
5
5
  from typing import Any, Dict, List, Optional
6
-
6
+ from deltacat.io.object_store import IObjectStore
7
+ from deltacat.utils.common import ReadKwargsProvider
7
8
  from deltacat.types.media import ContentType
9
+ from deltacat.utils.placement import PlacementGroupConfig
10
+ from deltacat.io.ray_plasma_object_store import RayPlasmaObjectStore
11
+ from deltacat.storage import (
12
+ interface as unimplemented_deltacat_storage,
13
+ PartitionLocator,
14
+ SortKey,
15
+ )
16
+ from deltacat.compute.compactor_v2.constants import (
17
+ MAX_RECORDS_PER_COMPACTED_FILE,
18
+ MIN_DELTA_BYTES_IN_BATCH,
19
+ MIN_FILES_IN_BATCH,
20
+ AVERAGE_RECORD_SIZE_BYTES,
21
+ TASK_MAX_PARALLELISM,
22
+ DROP_DUPLICATES,
23
+ )
24
+ from deltacat.constants import PYARROW_INFLATION_MULTIPLIER
25
+ from deltacat.compute.compactor.utils.sort_key import validate_sort_keys
26
+ from deltacat.utils.metrics import MetricsConfig
8
27
 
9
28
 
10
29
  class CompactPartitionParams(dict):
@@ -16,104 +35,324 @@ class CompactPartitionParams(dict):
16
35
  def of(params: Optional[Dict]) -> CompactPartitionParams:
17
36
  if params is None:
18
37
  params = {}
19
- compact_partition_params = CompactPartitionParams()
20
- compact_partition_params["destination_partition_locator"] = params.get(
21
- "destination_partition_locator"
22
- )
23
- compact_partition_params["last_stream_position_to_compact"] = params.get(
24
- "last_stream_position_to_compact"
38
+
39
+ assert (
40
+ params.get("destination_partition_locator") is not None
41
+ ), "destination_partition_locator is a required arg"
42
+ assert (
43
+ params.get("last_stream_position_to_compact") is not None
44
+ ), "last_stream_position_to_compact is a required arg"
45
+ assert (
46
+ params.get("source_partition_locator") is not None
47
+ ), "source_partition_locator is a required arg"
48
+ assert (
49
+ params.get("compaction_artifact_s3_bucket") is not None
50
+ ), "compaction_artifact_s3_bucket is a required arg"
51
+
52
+ result = CompactPartitionParams(params)
53
+
54
+ # TODO: move defaults to single file
55
+ result.records_per_compacted_file = params.get(
56
+ "records_per_compacted_file", MAX_RECORDS_PER_COMPACTED_FILE
25
57
  )
26
- compact_partition_params["source_partition_locator"] = params.get(
27
- "source_partition_locator"
58
+ result.compacted_file_content_type = params.get(
59
+ "compacted_file_content_type", ContentType.PARQUET
28
60
  )
29
- compact_partition_params["primary_keys"] = params.get("primary_keys")
30
- compact_partition_params["rebase_source_partition_locator"] = params.get(
31
- "rebase_source_partition_locator"
61
+ result.object_store = params.get("object_store", RayPlasmaObjectStore())
62
+ result.enable_profiler = params.get("enable_profiler", False)
63
+ result.deltacat_storage = params.get(
64
+ "deltacat_storage", unimplemented_deltacat_storage
32
65
  )
33
- compact_partition_params["rebase_source_partition_high_watermark"] = params.get(
34
- "rebase_source_partition_high_watermark"
66
+ result.s3_client_kwargs = params.get("s3_client_kwargs", {})
67
+ result.deltacat_storage_kwargs = params.get("deltacat_storage_kwargs", {})
68
+ result.list_deltas_kwargs = params.get("list_deltas_kwargs", {})
69
+ result.s3_table_writer_kwargs = params.get("s3_table_writer_kwargs", {})
70
+ result.bit_width_of_sort_keys = validate_sort_keys(
71
+ result.source_partition_locator,
72
+ result.sort_keys,
73
+ result.deltacat_storage,
74
+ result.deltacat_storage_kwargs,
35
75
  )
36
- compact_partition_params["hash_bucket_count"] = params.get("hash_bucket_count")
37
- compact_partition_params["deltacat_storage"] = params.get("deltacat_storage")
38
- compact_partition_params["compaction_artifact_s3_bucket"] = params.get(
39
- "compaction_artifact_s3_bucket"
76
+ result.task_max_parallelism = params.get(
77
+ "task_max_parallelism", TASK_MAX_PARALLELISM
40
78
  )
41
- compact_partition_params["properties"] = params.get("properties")
42
- compact_partition_params["compacted_file_content_type"] = params.get(
43
- "compacted_file_content_type"
79
+ result.min_files_in_batch = params.get("min_files_in_batch", MIN_FILES_IN_BATCH)
80
+ result.min_delta_bytes_in_batch = params.get(
81
+ "min_delta_bytes_in_batch", MIN_DELTA_BYTES_IN_BATCH
44
82
  )
45
- compact_partition_params["list_deltas_kwargs"] = params.get(
46
- "list_deltas_kwargs"
83
+ result.previous_inflation = params.get(
84
+ "previous_inflation", PYARROW_INFLATION_MULTIPLIER
47
85
  )
48
- compact_partition_params["pg_config"] = params.get("pg_config")
49
- compact_partition_params["read_kwargs_provider"] = params.get(
50
- "read_kwargs_provider"
86
+ result.average_record_size_bytes = params.get(
87
+ "average_record_size_bytes", AVERAGE_RECORD_SIZE_BYTES
51
88
  )
52
- compact_partition_params["s3_table_writer_kwargs"] = params.get(
53
- "s3_table_writer_kwargs"
89
+ result.hash_group_count = params.get(
90
+ "hash_group_count", result.hash_bucket_count
54
91
  )
55
- return compact_partition_params
92
+ result.drop_duplicates = params.get("drop_duplicates", DROP_DUPLICATES)
93
+
94
+ if not importlib.util.find_spec("memray"):
95
+ result.enable_profiler = False
96
+
97
+ if result.primary_keys:
98
+ result.primary_keys = sorted(result.primary_keys)
99
+
100
+ # assertions
101
+ assert (
102
+ result.source_partition_locator.partition_values
103
+ == result.destination_partition_locator.partition_values
104
+ ), "Source and destination partitions values must be equal"
105
+
106
+ assert (
107
+ result.records_per_compacted_file and result.records_per_compacted_file >= 1
108
+ ), "Max records per output file must be a positive value"
109
+
110
+ return result
56
111
 
57
112
  @property
58
- def destination_partition_locator(self) -> Optional[dict]:
59
- return self["destination_partition_locator"]
113
+ def destination_partition_locator(self) -> PartitionLocator:
114
+ val = self["destination_partition_locator"]
115
+ if not isinstance(val, PartitionLocator):
116
+ val = PartitionLocator(val)
117
+
118
+ return val
119
+
120
+ @destination_partition_locator.setter
121
+ def destination_partition_locator(self, locator: PartitionLocator) -> None:
122
+ self["destination_partition_locator"] = locator
60
123
 
61
124
  @property
62
- def last_stream_position_to_compact(self) -> Optional[int]:
125
+ def last_stream_position_to_compact(self) -> int:
63
126
  return self["last_stream_position_to_compact"]
64
127
 
128
+ @last_stream_position_to_compact.setter
129
+ def last_stream_position_to_compact(self, stream_position: int) -> None:
130
+ self["last_stream_position_to_compact"] = stream_position
131
+
65
132
  @property
66
- def source_partition_locator(self) -> Optional[dict]:
67
- return self["source_partition_locator"]
133
+ def source_partition_locator(self) -> PartitionLocator:
134
+ val = self["source_partition_locator"]
135
+ if not isinstance(val, PartitionLocator):
136
+ val = PartitionLocator(val)
137
+ return val
138
+
139
+ @source_partition_locator.setter
140
+ def source_partition_locator(self, locator: PartitionLocator) -> None:
141
+ self["source_partition_locator"] = locator
68
142
 
69
143
  @property
70
- def primary_keys(self) -> Optional[List[str]]:
71
- return list(self["primary_keys"])
144
+ def compaction_artifact_s3_bucket(self) -> str:
145
+ return self["compaction_artifact_s3_bucket"]
146
+
147
+ @compaction_artifact_s3_bucket.setter
148
+ def compaction_artifact_s3_bucket(self, s3_bucket: str) -> None:
149
+ self["compaction_artifact_s3_bucket"] = s3_bucket
72
150
 
73
151
  @property
74
- def rebase_source_partition_locator(self) -> Optional[dict]:
75
- return self["rebase_source_partition_locator"]
152
+ def deltacat_storage(self) -> unimplemented_deltacat_storage:
153
+ return self["deltacat_storage"]
154
+
155
+ @deltacat_storage.setter
156
+ def deltacat_storage(self, storage: unimplemented_deltacat_storage) -> None:
157
+ self["deltacat_storage"] = storage
76
158
 
77
159
  @property
78
- def rebase_source_partition_high_watermark(self) -> Optional[int]:
79
- return self["rebase_source_partition_high_watermark"]
160
+ def object_store(self) -> IObjectStore:
161
+ return self["object_store"]
162
+
163
+ @object_store.setter
164
+ def object_store(self, obj_store: IObjectStore) -> None:
165
+ self["object_store"] = obj_store
80
166
 
81
167
  @property
82
- def hash_bucket_count(self) -> Optional[int]:
83
- return self["hash_bucket_count"]
168
+ def compacted_file_content_type(self) -> ContentType:
169
+ return self["compacted_file_content_type"]
170
+
171
+ @compacted_file_content_type.setter
172
+ def compacted_file_content_type(self, content_type: ContentType) -> None:
173
+ self["compacted_file_content_type"] = content_type
84
174
 
85
175
  @property
86
- def deltacat_storage(self) -> Optional[str]:
87
- return self["deltacat_storage"]
176
+ def task_max_parallelism(self) -> int:
177
+ return self["task_max_parallelism"]
178
+
179
+ @task_max_parallelism.setter
180
+ def task_max_parallelism(self, max_parallelism: int) -> None:
181
+ self["task_max_parallelism"] = max_parallelism
88
182
 
89
183
  @property
90
- def compaction_artifact_s3_bucket(self) -> Optional[str]:
91
- return self["compaction_artifact_s3_bucket"]
184
+ def average_record_size_bytes(self) -> float:
185
+ return self["average_record_size_bytes"]
186
+
187
+ @average_record_size_bytes.setter
188
+ def average_record_size_bytes(self, average_record_size_bytes: float) -> None:
189
+ self["average_record_size_bytes"] = average_record_size_bytes
92
190
 
93
191
  @property
94
- def properties(self) -> Optional[Dict[str, str]]:
95
- return self["properties"]
192
+ def min_files_in_batch(self) -> float:
193
+ return self["min_files_in_batch"]
194
+
195
+ @min_files_in_batch.setter
196
+ def min_files_in_batch(self, min_files_in_batch: float) -> None:
197
+ self["min_files_in_batch"] = min_files_in_batch
96
198
 
97
199
  @property
98
- def compacted_file_content_type(self) -> Optional[ContentType]:
99
- return self["compacted_file_content_type"]
200
+ def min_delta_bytes_in_batch(self) -> float:
201
+ return self["min_delta_bytes_in_batch"]
202
+
203
+ @min_delta_bytes_in_batch.setter
204
+ def min_delta_bytes_in_batch(self, min_delta_bytes_in_batch: float) -> None:
205
+ self["min_delta_bytes_in_batch"] = min_delta_bytes_in_batch
100
206
 
101
207
  @property
102
- def list_deltas_kwargs(self) -> Optional[dict]:
103
- return self["list_deltas_kwargs"]
208
+ def previous_inflation(self) -> float:
209
+ return self["previous_inflation"]
210
+
211
+ @previous_inflation.setter
212
+ def previous_inflation(self, previous_inflation: float) -> None:
213
+ self["previous_inflation"] = previous_inflation
104
214
 
105
215
  @property
106
- def pg_config(self) -> Optional[Any]:
107
- return self["pg_config"]
216
+ def enable_profiler(self) -> bool:
217
+ return self["enable_profiler"]
218
+
219
+ @enable_profiler.setter
220
+ def enable_profiler(self, value: bool) -> None:
221
+ self["enable_profiler"] = value
108
222
 
109
223
  @property
110
- def read_kwargs_provider(self) -> Optional[Any]:
111
- return self["read_kwargs_provider"]
224
+ def list_deltas_kwargs(self) -> dict:
225
+ return self["list_deltas_kwargs"]
226
+
227
+ @list_deltas_kwargs.setter
228
+ def list_deltas_kwargs(self, kwargs: dict) -> None:
229
+ self["list_deltas_kwargs"] = kwargs
112
230
 
113
231
  @property
114
- def s3_table_writer_kwargs(self) -> Optional[Any]:
232
+ def s3_table_writer_kwargs(self) -> dict:
115
233
  return self["s3_table_writer_kwargs"]
116
234
 
235
+ @s3_table_writer_kwargs.setter
236
+ def s3_table_writer_kwargs(self, kwargs: dict) -> None:
237
+ self["s3_table_writer_kwargs"] = kwargs
238
+
239
+ @property
240
+ def deltacat_storage_kwargs(self) -> dict:
241
+ return self["deltacat_storage_kwargs"]
242
+
243
+ @deltacat_storage_kwargs.setter
244
+ def deltacat_storage_kwargs(self, kwargs: dict) -> None:
245
+ self["deltacat_storage_kwargs"] = kwargs
246
+
247
+ @property
248
+ def s3_client_kwargs(self) -> dict:
249
+ return self["s3_client_kwargs"]
250
+
251
+ @s3_client_kwargs.setter
252
+ def s3_client_kwargs(self, kwargs: dict) -> None:
253
+ self["s3_client_kwargs"] = kwargs
254
+
255
+ @property
256
+ def records_per_compacted_file(self) -> int:
257
+ return self["records_per_compacted_file"]
258
+
259
+ @records_per_compacted_file.setter
260
+ def records_per_compacted_file(self, count: int) -> None:
261
+ self["records_per_compacted_file"] = count
262
+
263
+ @property
264
+ def drop_duplicates(self) -> bool:
265
+ return self["drop_duplicates"]
266
+
267
+ @drop_duplicates.setter
268
+ def drop_duplicates(self, value: bool):
269
+ self["drop_duplicates"] = value
270
+
271
+ @property
272
+ def bit_width_of_sort_keys(self) -> int:
273
+ return self["bit_width_of_sort_keys"]
274
+
275
+ @bit_width_of_sort_keys.setter
276
+ def bit_width_of_sort_keys(self, width: int) -> None:
277
+ self["bit_width_of_sort_keys"] = width
278
+
279
+ @property
280
+ def hash_bucket_count(self) -> Optional[int]:
281
+ return self.get("hash_bucket_count")
282
+
283
+ @hash_bucket_count.setter
284
+ def hash_bucket_count(self, count: int) -> None:
285
+ self["hash_bucket_count"] = count
286
+
287
+ @property
288
+ def hash_group_count(self) -> int:
289
+ return self["hash_group_count"]
290
+
291
+ @hash_group_count.setter
292
+ def hash_group_count(self, count: int) -> None:
293
+ self["hash_group_count"] = count
294
+
295
+ @property
296
+ def primary_keys(self) -> Optional[List[str]]:
297
+ return self.get("primary_keys")
298
+
299
+ @primary_keys.setter
300
+ def primary_keys(self, keys: List[str]) -> None:
301
+ self["primary_keys"] = keys
302
+
303
+ @property
304
+ def rebase_source_partition_locator(self) -> Optional[PartitionLocator]:
305
+ val = self.get("rebase_source_partition_locator")
306
+
307
+ if val and not isinstance(val, PartitionLocator):
308
+ val = PartitionLocator(val)
309
+
310
+ return val
311
+
312
+ @rebase_source_partition_locator.setter
313
+ def rebase_source_partition_locator(self, locator: PartitionLocator) -> None:
314
+ self["rebase_source_partition_locator"] = locator
315
+
316
+ @property
317
+ def rebase_source_partition_high_watermark(self) -> Optional[int]:
318
+ return self.get("rebase_source_partition_high_watermark")
319
+
320
+ @rebase_source_partition_high_watermark.setter
321
+ def rebase_source_partition_high_watermark(self, high_watermark: int) -> None:
322
+ self["rebase_source_partition_high_watermark"] = high_watermark
323
+
324
+ @property
325
+ def pg_config(self) -> Optional[PlacementGroupConfig]:
326
+ return self.get("pg_config")
327
+
328
+ @pg_config.setter
329
+ def pg_config(self, config: PlacementGroupConfig) -> None:
330
+ self["pg_config"] = config
331
+
332
+ @property
333
+ def read_kwargs_provider(self) -> Optional[ReadKwargsProvider]:
334
+ return self.get("read_kwargs_provider")
335
+
336
+ @read_kwargs_provider.setter
337
+ def read_kwargs_provider(self, kwargs_provider: ReadKwargsProvider) -> None:
338
+ self["read_kwargs_provider"] = kwargs_provider
339
+
340
+ @property
341
+ def sort_keys(self) -> Optional[List[SortKey]]:
342
+ return self.get("sort_keys")
343
+
344
+ @sort_keys.setter
345
+ def sort_keys(self, keys: List[SortKey]) -> None:
346
+ self["sort_keys"] = keys
347
+
348
+ @property
349
+ def metrics_config(self) -> Optional[MetricsConfig]:
350
+ return self.get("metrics_config")
351
+
352
+ @metrics_config.setter
353
+ def metrics_config(self, config: MetricsConfig) -> None:
354
+ self["metrics_config"] = config
355
+
117
356
  @staticmethod
118
357
  def json_handler_for_compact_partition_params(obj):
119
358
  """