deltacat 0.1.8__py3-none-any.whl → 0.1.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105)
  1. deltacat/__init__.py +41 -15
  2. deltacat/aws/clients.py +12 -31
  3. deltacat/aws/constants.py +1 -1
  4. deltacat/aws/redshift/__init__.py +7 -2
  5. deltacat/aws/redshift/model/manifest.py +54 -50
  6. deltacat/aws/s3u.py +188 -218
  7. deltacat/catalog/delegate.py +151 -185
  8. deltacat/catalog/interface.py +78 -97
  9. deltacat/catalog/model/catalog.py +21 -21
  10. deltacat/catalog/model/table_definition.py +11 -9
  11. deltacat/compute/compactor/__init__.py +12 -16
  12. deltacat/compute/compactor/compaction_session.py +259 -316
  13. deltacat/compute/compactor/model/delta_annotated.py +60 -44
  14. deltacat/compute/compactor/model/delta_file_envelope.py +5 -6
  15. deltacat/compute/compactor/model/delta_file_locator.py +10 -8
  16. deltacat/compute/compactor/model/materialize_result.py +6 -7
  17. deltacat/compute/compactor/model/primary_key_index.py +38 -34
  18. deltacat/compute/compactor/model/pyarrow_write_result.py +3 -4
  19. deltacat/compute/compactor/model/round_completion_info.py +25 -19
  20. deltacat/compute/compactor/model/sort_key.py +18 -15
  21. deltacat/compute/compactor/steps/dedupe.py +152 -259
  22. deltacat/compute/compactor/steps/hash_bucket.py +57 -73
  23. deltacat/compute/compactor/steps/materialize.py +138 -99
  24. deltacat/compute/compactor/steps/rehash/rehash_bucket.py +13 -13
  25. deltacat/compute/compactor/steps/rehash/rewrite_index.py +11 -13
  26. deltacat/compute/compactor/utils/io.py +59 -47
  27. deltacat/compute/compactor/utils/primary_key_index.py +131 -90
  28. deltacat/compute/compactor/utils/round_completion_file.py +22 -23
  29. deltacat/compute/compactor/utils/system_columns.py +33 -42
  30. deltacat/compute/metastats/meta_stats.py +235 -157
  31. deltacat/compute/metastats/model/partition_stats_dict.py +7 -10
  32. deltacat/compute/metastats/model/stats_cluster_size_estimator.py +13 -5
  33. deltacat/compute/metastats/stats.py +95 -64
  34. deltacat/compute/metastats/utils/io.py +100 -53
  35. deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py +5 -2
  36. deltacat/compute/metastats/utils/ray_utils.py +38 -33
  37. deltacat/compute/stats/basic.py +107 -69
  38. deltacat/compute/stats/models/delta_column_stats.py +11 -8
  39. deltacat/compute/stats/models/delta_stats.py +59 -32
  40. deltacat/compute/stats/models/delta_stats_cache_result.py +4 -1
  41. deltacat/compute/stats/models/manifest_entry_stats.py +12 -6
  42. deltacat/compute/stats/models/stats_result.py +24 -14
  43. deltacat/compute/stats/utils/intervals.py +16 -9
  44. deltacat/compute/stats/utils/io.py +86 -51
  45. deltacat/compute/stats/utils/manifest_stats_file.py +24 -33
  46. deltacat/constants.py +8 -10
  47. deltacat/io/__init__.py +2 -2
  48. deltacat/io/aws/redshift/redshift_datasource.py +157 -143
  49. deltacat/io/dataset.py +14 -17
  50. deltacat/io/read_api.py +36 -33
  51. deltacat/logs.py +94 -42
  52. deltacat/storage/__init__.py +18 -8
  53. deltacat/storage/interface.py +196 -213
  54. deltacat/storage/model/delta.py +45 -51
  55. deltacat/storage/model/list_result.py +12 -8
  56. deltacat/storage/model/namespace.py +4 -5
  57. deltacat/storage/model/partition.py +42 -42
  58. deltacat/storage/model/stream.py +29 -30
  59. deltacat/storage/model/table.py +14 -14
  60. deltacat/storage/model/table_version.py +32 -31
  61. deltacat/storage/model/types.py +1 -0
  62. deltacat/tests/stats/test_intervals.py +11 -24
  63. deltacat/tests/utils/test_record_batch_tables.py +284 -0
  64. deltacat/types/media.py +3 -4
  65. deltacat/types/tables.py +31 -21
  66. deltacat/utils/common.py +5 -11
  67. deltacat/utils/numpy.py +20 -22
  68. deltacat/utils/pandas.py +73 -100
  69. deltacat/utils/performance.py +3 -9
  70. deltacat/utils/placement.py +276 -231
  71. deltacat/utils/pyarrow.py +302 -89
  72. deltacat/utils/ray_utils/collections.py +2 -1
  73. deltacat/utils/ray_utils/concurrency.py +38 -32
  74. deltacat/utils/ray_utils/dataset.py +28 -28
  75. deltacat/utils/ray_utils/performance.py +5 -9
  76. deltacat/utils/ray_utils/runtime.py +9 -10
  77. {deltacat-0.1.8.dist-info → deltacat-0.1.11.dist-info}/METADATA +22 -12
  78. deltacat-0.1.11.dist-info/RECORD +110 -0
  79. {deltacat-0.1.8.dist-info → deltacat-0.1.11.dist-info}/WHEEL +1 -1
  80. deltacat/autoscaler/events/__init__.py +0 -0
  81. deltacat/autoscaler/events/compaction/__init__.py +0 -0
  82. deltacat/autoscaler/events/compaction/cluster.py +0 -82
  83. deltacat/autoscaler/events/compaction/collections/__init__.py +0 -0
  84. deltacat/autoscaler/events/compaction/collections/partition_key_value.py +0 -36
  85. deltacat/autoscaler/events/compaction/dispatcher.py +0 -28
  86. deltacat/autoscaler/events/compaction/input.py +0 -27
  87. deltacat/autoscaler/events/compaction/process.py +0 -25
  88. deltacat/autoscaler/events/compaction/session_manager.py +0 -13
  89. deltacat/autoscaler/events/compaction/utils.py +0 -216
  90. deltacat/autoscaler/events/compaction/workflow.py +0 -303
  91. deltacat/autoscaler/events/dispatcher.py +0 -95
  92. deltacat/autoscaler/events/dynamodb/__init__.py +0 -0
  93. deltacat/autoscaler/events/dynamodb/event_store.py +0 -164
  94. deltacat/autoscaler/events/event_store.py +0 -55
  95. deltacat/autoscaler/events/exceptions.py +0 -6
  96. deltacat/autoscaler/events/processor.py +0 -177
  97. deltacat/autoscaler/events/session_manager.py +0 -25
  98. deltacat/autoscaler/events/states.py +0 -88
  99. deltacat/autoscaler/events/workflow.py +0 -54
  100. deltacat/autoscaler/node_group.py +0 -230
  101. deltacat/autoscaler/utils.py +0 -69
  102. deltacat-0.1.8.dist-info/RECORD +0 -131
  103. /deltacat/{autoscaler → tests/utils}/__init__.py +0 -0
  104. {deltacat-0.1.8.dist-info → deltacat-0.1.11.dist-info}/LICENSE +0 -0
  105. {deltacat-0.1.8.dist-info → deltacat-0.1.11.dist-info}/top_level.txt +0 -0
deltacat/compute/compactor/steps/hash_bucket.py
@@ -1,37 +1,37 @@
-import ray
-import pyarrow as pa
-import numpy as np
 import logging
-
-from deltacat.compute.compactor.model.delta_file_envelope import DeltaFileEnvelopeGroups
 from itertools import chain
+from typing import Generator, List, Optional, Tuple
+
+import numpy as np
+import pyarrow as pa
+import ray
+from ray.types import ObjectRef

 from deltacat import logs
-from deltacat.compute.compactor import DeltaAnnotated, DeltaFileEnvelope, \
-    SortKey
-from deltacat.compute.compactor.utils.primary_key_index import \
-    group_hash_bucket_indices, group_record_indices_by_hash_bucket
+from deltacat.compute.compactor import DeltaAnnotated, DeltaFileEnvelope, SortKey
+from deltacat.compute.compactor.model.delta_file_envelope import DeltaFileEnvelopeGroups
+from deltacat.compute.compactor.utils import system_columns as sc
+from deltacat.compute.compactor.utils.primary_key_index import (
+    group_hash_bucket_indices,
+    group_record_indices_by_hash_bucket,
+)
 from deltacat.storage import interface as unimplemented_deltacat_storage
 from deltacat.types.media import StorageType
 from deltacat.utils.common import sha1_digest
-from deltacat.compute.compactor.utils import system_columns as sc
-
-from typing import List, Optional, Generator, Tuple
-
-from ray.types import ObjectRef

 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))

-_PK_BYTES_DELIMITER = b'L6kl7u5f'
+_PK_BYTES_DELIMITER = b"L6kl7u5f"

 HashBucketGroupToObjectId = np.ndarray
-HashBucketResult = Tuple[HashBucketGroupToObjectId, List[ObjectRef[DeltaFileEnvelopeGroups]]]
+HashBucketResult = Tuple[
+    HashBucketGroupToObjectId, List[ObjectRef[DeltaFileEnvelopeGroups]]
+]


-def group_by_pk_hash_bucket(
-        table: pa.Table,
-        num_buckets: int,
-        primary_keys: List[str]) -> np.ndarray:
+def _group_by_pk_hash_bucket(
+    table: pa.Table, num_buckets: int, primary_keys: List[str]
+) -> np.ndarray:

     # generate the primary key digest column
     all_pk_column_fields = []
@@ -39,7 +39,7 @@ def group_by_pk_hash_bucket(
         # casting a primary key column to numpy also ensures no nulls exist
         column_fields = table[pk_name].to_numpy()
         all_pk_column_fields.append(column_fields)
-    hash_column_generator = hash_pk_bytes_generator(all_pk_column_fields)
+    hash_column_generator = _hash_pk_bytes_generator(all_pk_column_fields)
     table = sc.append_pk_hash_column(table, hash_column_generator)

     # drop primary key columns to free up memory
@@ -62,31 +62,27 @@ def group_by_pk_hash_bucket(
     return hash_bucket_to_table


-def hash_pk_bytes_generator(all_column_fields) -> Generator[bytes, None, None]:
+def _hash_pk_bytes_generator(all_column_fields) -> Generator[bytes, None, None]:
     for field_index in range(len(all_column_fields[0])):
         bytes_to_join = []
         for column_fields in all_column_fields:
-            bytes_to_join.append(
-                bytes(str(column_fields[field_index]), "utf-8")
-            )
+            bytes_to_join.append(bytes(str(column_fields[field_index]), "utf-8"))
         yield sha1_digest(_PK_BYTES_DELIMITER.join(bytes_to_join))


-def group_file_records_by_pk_hash_bucket(
-        annotated_delta: DeltaAnnotated,
-        num_hash_buckets: int,
-        primary_keys: List[str],
-        sort_key_names: List[str],
-        ignore_missing_manifest: bool = False,
-        deltacat_storage=unimplemented_deltacat_storage) \
-        -> Optional[DeltaFileEnvelopeGroups]:
+def _group_file_records_by_pk_hash_bucket(
+    annotated_delta: DeltaAnnotated,
+    num_hash_buckets: int,
+    primary_keys: List[str],
+    sort_key_names: List[str],
+    deltacat_storage=unimplemented_deltacat_storage,
+) -> Optional[DeltaFileEnvelopeGroups]:

     # read input parquet s3 objects into a list of delta file envelopes
-    delta_file_envelopes = read_delta_file_envelopes(
+    delta_file_envelopes = _read_delta_file_envelopes(
         annotated_delta,
         primary_keys,
         sort_key_names,
-        ignore_missing_manifest,
         deltacat_storage,
     )
     if delta_file_envelopes is None:
@@ -95,7 +91,7 @@ def group_file_records_by_pk_hash_bucket(
     # group the data by primary key hash value
     hb_to_delta_file_envelopes = np.empty([num_hash_buckets], dtype="object")
     for dfe in delta_file_envelopes:
-        hash_bucket_to_table = group_by_pk_hash_bucket(
+        hash_bucket_to_table = _group_by_pk_hash_bucket(
             dfe.table,
             num_hash_buckets,
             primary_keys,
@@ -106,44 +102,33 @@
                    hb_to_delta_file_envelopes[hb] = []
                hb_to_delta_file_envelopes[hb].append(
                    DeltaFileEnvelope.of(
-                        dfe.stream_position,
-                        dfe.file_index,
-                        dfe.delta_type,
-                        table))
+                        dfe.stream_position, dfe.file_index, dfe.delta_type, table
+                    )
+                )
     return hb_to_delta_file_envelopes


-def read_delta_file_envelopes(
-        annotated_delta: DeltaAnnotated,
-        primary_keys: List[str],
-        sort_key_names: List[str],
-        ignore_missing_manifest: bool = False,
-        deltacat_storage=unimplemented_deltacat_storage) \
-        -> Optional[List[DeltaFileEnvelope]]:
+def _read_delta_file_envelopes(
+    annotated_delta: DeltaAnnotated,
+    primary_keys: List[str],
+    sort_key_names: List[str],
+    deltacat_storage=unimplemented_deltacat_storage,
+) -> Optional[List[DeltaFileEnvelope]]:

     columns_to_read = list(chain(primary_keys, sort_key_names))
-    missing_ids=[]
-    tables_and_missing_ids = deltacat_storage.download_delta(
+    tables = deltacat_storage.download_delta(
         annotated_delta,
-        max_parallelism=1, # if >1, will use python multiprocessing
+        max_parallelism=1,
         columns=columns_to_read,
         storage_type=StorageType.LOCAL,
-        ignore_missing_manifest=ignore_missing_manifest,
     )
-    if ignore_missing_manifest:
-        missing_ids = tables_and_missing_ids[1]
-        tables=tables_and_missing_ids[0]
-    else:
-        tables = tables_and_missing_ids
     annotations = annotated_delta.annotations
-    if len(missing_ids)>0:
-        print(f"missing files:{len(missing_ids)}")
-        for id_missing in sorted(missing_ids, reverse=True):
-            del annotations[id_missing]
-    assert(len(tables) == len(annotations),
-           f"Unexpected Error: Length of downloaded delta manifest tables "
-           f"({len(tables)}) doesn't match the length of delta manifest "
-           f"annotations ({len(annotations)}).")
+    assert (
+        len(tables) == len(annotations),
+        f"Unexpected Error: Length of downloaded delta manifest tables "
+        f"({len(tables)}) doesn't match the length of delta manifest "
+        f"annotations ({len(annotations)}).",
+    )
     if not tables:
         return None

@@ -161,22 +146,21 @@ def read_delta_file_envelopes(

 @ray.remote(num_returns=2)
 def hash_bucket(
-        annotated_delta: DeltaAnnotated,
-        primary_keys: List[str],
-        sort_keys: List[SortKey],
-        num_buckets: int,
-        num_groups: int,
-        ignore_missing_manifest: bool = False,
-        deltacat_storage=unimplemented_deltacat_storage) -> HashBucketResult:
+    annotated_delta: DeltaAnnotated,
+    primary_keys: List[str],
+    sort_keys: List[SortKey],
+    num_buckets: int,
+    num_groups: int,
+    deltacat_storage=unimplemented_deltacat_storage,
+) -> HashBucketResult:

     logger.info(f"Starting hash bucket task...")
     sort_key_names = [key.key_name for key in sort_keys]
-    delta_file_envelope_groups = group_file_records_by_pk_hash_bucket(
+    delta_file_envelope_groups = _group_file_records_by_pk_hash_bucket(
         annotated_delta,
         num_buckets,
         primary_keys,
         sort_key_names,
-        ignore_missing_manifest,
         deltacat_storage,
     )
     hash_bucket_group_to_obj_id, object_refs = group_hash_bucket_indices(
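
One pattern worth flagging in the reformatted code above: the new parenthesized assert in _read_delta_file_envelopes (and the matching one in _materialize in materialize.py below) wraps the condition and the message in a single pair of parentheses with a trailing comma. Python evaluates that as a non-empty tuple, and non-empty tuples are always truthy, so the check can never fail. A minimal illustration of the semantics (plain Python, not deltacat code):

# A parenthesized condition-plus-message builds a 2-tuple; non-empty tuples are
# truthy, so this assert always passes (recent CPython versions emit a
# SyntaxWarning: "assertion is always true, perhaps remove parentheses?").
assert (
    1 + 1 == 3,
    "never raised",
)

# Keeping the message outside the parentheses restores the intended check; this
# form raises AssertionError with the given message when the condition is false.
assert 1 + 1 == 3, "raised as AssertionError on failure"
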
deltacat/compute/compactor/steps/materialize.py
@@ -1,57 +1,113 @@
-import logging,time
-import ray
-import pyarrow as pa
-
+import logging
+import time
 from collections import defaultdict
-
-from deltacat.compute.compactor.steps.dedupe import DedupeTaskIndexWithObjectId, \
-    DeltaFileLocatorToRecords
 from itertools import chain, repeat
+from typing import List, Optional, Tuple

-from pyarrow import compute as pc
-
+import pyarrow as pa
+import ray
 from ray import cloudpickle

 from deltacat import logs
-from deltacat.storage import Delta, DeltaLocator, Partition, PartitionLocator, \
-    interface as unimplemented_deltacat_storage
-from deltacat.compute.compactor import MaterializeResult, PyArrowWriteResult, \
-    RoundCompletionInfo
-from deltacat.compute.compactor.utils import system_columns as sc
-from deltacat.types.media import ContentType, DELIMITED_TEXT_CONTENT_TYPES
-from typing import List, Tuple, Optional
-
-from deltacat.utils.pyarrow import ReadKwargsProviderPyArrowSchemaOverride
-
+from deltacat.compute.compactor import (
+    MaterializeResult,
+    PyArrowWriteResult,
+    RoundCompletionInfo,
+)
+from deltacat.compute.compactor.steps.dedupe import (
+    DedupeTaskIndexWithObjectId,
+    DeltaFileLocatorToRecords,
+)
+from deltacat.storage import Delta, DeltaLocator, Partition, PartitionLocator
+from deltacat.storage import interface as unimplemented_deltacat_storage
+from deltacat.types.media import DELIMITED_TEXT_CONTENT_TYPES, ContentType
 from deltacat.types.tables import TABLE_CLASS_TO_SIZE_FUNC
-from deltacat.utils.pyarrow import ReadKwargsProviderPyArrowCsvPureUtf8
+from deltacat.utils.performance import timed_invocation
+from deltacat.utils.pyarrow import (
+    ReadKwargsProviderPyArrowCsvPureUtf8,
+    ReadKwargsProviderPyArrowSchemaOverride,
+    RecordBatchTables,
+)

 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))


 @ray.remote
 def materialize(
-        source_partition_locator: PartitionLocator,
-        round_completion_info: Optional[RoundCompletionInfo],
-        partition: Partition,
-        mat_bucket_index: int,
-        dedupe_task_idx_and_obj_id_tuples: List[DedupeTaskIndexWithObjectId],
-        max_records_per_output_file: int,
-        compacted_file_content_type: ContentType,
-        schema: Optional[pa.Schema] = None,
-        deltacat_storage=unimplemented_deltacat_storage) -> MaterializeResult:
+    source_partition_locator: PartitionLocator,
+    round_completion_info: Optional[RoundCompletionInfo],
+    partition: Partition,
+    mat_bucket_index: int,
+    dedupe_task_idx_and_obj_id_tuples: List[DedupeTaskIndexWithObjectId],
+    max_records_per_output_file: int,
+    compacted_file_content_type: ContentType,
+    schema: Optional[pa.Schema] = None,
+    deltacat_storage=unimplemented_deltacat_storage,
+) -> MaterializeResult:
+    # TODO (rkenmi): Add docstrings for the steps in the compaction workflow
+    # https://github.com/ray-project/deltacat/issues/79
+    def _materialize(compacted_tables: List[pa.Table]) -> MaterializeResult:
+        compacted_table = pa.concat_tables(compacted_tables)
+
+        if compacted_file_content_type in DELIMITED_TEXT_CONTENT_TYPES:
+            # TODO (rkenmi): Investigate if we still need to convert this table to pandas DataFrame
+            # TODO (pdames): compare performance to pandas-native materialize path
+            df = compacted_table.to_pandas(
+                split_blocks=True, self_destruct=True, zero_copy_only=True
+            )
+            compacted_table = df
+        delta, stage_delta_time = timed_invocation(
+            deltacat_storage.stage_delta,
+            compacted_table,
+            partition,
+            max_records_per_entry=max_records_per_output_file,
+            content_type=compacted_file_content_type,
+        )
+        compacted_table_size = TABLE_CLASS_TO_SIZE_FUNC[type(compacted_table)](
+            compacted_table
+        )
+        logger.debug(
+            f"Time taken for materialize task"
+            f" to upload {len(compacted_table)} records"
+            f" of size {compacted_table_size} is: {stage_delta_time}s"
+        )
+        manifest = delta.manifest
+        manifest_records = manifest.meta.record_count
+        assert (
+            manifest_records == len(compacted_table),
+            f"Unexpected Error: Materialized delta manifest record count "
+            f"({manifest_records}) does not equal compacted table record count "
+            f"({len(compacted_table)})",
+        )
+        materialize_result = MaterializeResult.of(
+            delta,
+            mat_bucket_index,
+            # TODO (pdames): Generalize WriteResult to contain in-memory-table-type
+            # and in-memory-table-bytes instead of tight coupling to paBytes
+            PyArrowWriteResult.of(
+                len(manifest.entries),
+                TABLE_CLASS_TO_SIZE_FUNC[type(compacted_table)](compacted_table),
+                manifest.meta.content_length,
+                len(compacted_table),
+            ),
+        )
+        logger.info(f"Materialize result: {materialize_result}")
+        return materialize_result

-    logger.info(f"Starting materialize task...")
+    logger.info(
+        f"Starting materialize task with"
+        f" materialize bucket index: {mat_bucket_index}..."
+    )
+    start = time.time()
     dedupe_task_idx_and_obj_ref_tuples = [
         (
-            t[0],
-            cloudpickle.loads(t[1])
-        ) for t in dedupe_task_idx_and_obj_id_tuples
+            t1,
+            cloudpickle.loads(t2),
+        )
+        for t1, t2 in dedupe_task_idx_and_obj_id_tuples
     ]
     logger.info(f"Resolved materialize task obj refs...")
-    dedupe_task_indices, obj_refs = zip(
-        *dedupe_task_idx_and_obj_ref_tuples
-    )
+    dedupe_task_indices, obj_refs = zip(*dedupe_task_idx_and_obj_ref_tuples)
     # this depends on `ray.get` result order matching input order, as per the
     # contract established in: https://github.com/ray-project/ray/pull/16763
     src_file_records_list = ray.get(list(obj_refs))
@@ -63,19 +119,23 @@ def materialize(
                 (record_numbers, repeat(dedupe_task_idx, len(record_numbers)))
             )
     manifest_cache = {}
-    compacted_tables = []
+    materialized_results: List[MaterializeResult] = []
+    record_batch_tables = RecordBatchTables(max_records_per_output_file)
     for src_dfl in sorted(all_src_file_records.keys()):
-        record_numbers_dd_task_idx_tpl_list: List[Tuple[DeltaFileLocatorToRecords, repeat]] = \
-            all_src_file_records[src_dfl]
+        record_numbers_dd_task_idx_tpl_list: List[
+            Tuple[DeltaFileLocatorToRecords, repeat]
+        ] = all_src_file_records[src_dfl]
         record_numbers_tpl, dedupe_task_idx_iter_tpl = zip(
             *record_numbers_dd_task_idx_tpl_list
         )
         is_src_partition_file_np = src_dfl.is_source_delta
         src_stream_position_np = src_dfl.stream_position
         src_file_idx_np = src_dfl.file_index
-        src_file_partition_locator = source_partition_locator \
-            if is_src_partition_file_np \
+        src_file_partition_locator = (
+            source_partition_locator
+            if is_src_partition_file_np
             else round_completion_info.compacted_delta_locator.partition_locator
+        )
         delta_locator = DeltaLocator.of(
             src_file_partition_locator,
             src_stream_position_np.item(),
@@ -95,75 +155,54 @@ def materialize(
         # enforce a consistent schema if provided, when reading files into PyArrow tables
         elif schema is not None:
             read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(
-                schema=schema)
-        pa_table = deltacat_storage.download_delta_manifest_entry(
+                schema=schema
+            )
+        pa_table, download_delta_manifest_entry_time = timed_invocation(
+            deltacat_storage.download_delta_manifest_entry,
             Delta.of(delta_locator, None, None, None, manifest),
             src_file_idx_np.item(),
-            file_reader_kwargs=read_kwargs_provider,
+            file_reader_kwargs_provider=read_kwargs_provider,
+        )
+        logger.debug(
+            f"Time taken for materialize task"
+            f" to download delta locator {delta_locator} with entry ID {src_file_idx_np.item()}"
+            f" is: {download_delta_manifest_entry_time}s"
         )
         mask_pylist = list(repeat(False, len(pa_table)))
         record_numbers = chain.from_iterable(record_numbers_tpl)
+        # TODO(raghumdani): reference the same file URIs while writing the files
+        # instead of copying the data over and creating new files.
         for record_number in record_numbers:
             mask_pylist[record_number] = True
         mask = pa.array(mask_pylist)
-        compacted_table = pa_table.filter(mask)
+        pa_table = pa_table.filter(mask)
+        record_batch_tables.append(pa_table)
+        if record_batch_tables.has_batches():
+            batched_tables = record_batch_tables.evict()
+            materialized_results.append(_materialize(batched_tables))

-        # appending, sorting, taking, and dropping has 2-3X latency of a
-        # single filter on average, and thus provides better average
-        # performance than repeatedly filtering the table in dedupe task index
-        # order
-        dedupe_task_indices = chain.from_iterable(dedupe_task_idx_iter_tpl)
-        compacted_table = sc.append_dedupe_task_idx_col(
-            compacted_table,
-            dedupe_task_indices,
-        )
-        pa_sort_keys = [(sc._DEDUPE_TASK_IDX_COLUMN_NAME, "ascending")]
-        compacted_table = compacted_table.take(
-            pc.sort_indices(compacted_table, sort_keys=pa_sort_keys),
-        )
-        compacted_table = compacted_table.drop(
-            [sc._DEDUPE_TASK_IDX_COLUMN_NAME]
-        )
-        compacted_tables.append(compacted_table)
+    if record_batch_tables.has_remaining():
+        materialized_results.append(_materialize(record_batch_tables.remaining))

-    # TODO (pdames): save memory by writing output files eagerly whenever
-    # len(compacted_table) >= max_records_per_output_file (but don't write
-    # partial slices from the compacted_table remainder every time!)
-    compacted_table = pa.concat_tables(compacted_tables)
-    if compacted_file_content_type in DELIMITED_TEXT_CONTENT_TYPES:
-        # convert to pandas since pyarrow doesn't support custom delimiters
-        # and doesn't support utf-8 conversion of all types (e.g. Decimal128)
-        # TODO (pdames): compare performance to pandas-native materialize path
-        df = compacted_table.to_pandas(
-            split_blocks=True,
-            self_destruct=True,
-        )
-        del compacted_table
-        compacted_table = df
-    delta = deltacat_storage.stage_delta(
-        compacted_table,
-        partition,
-        max_records_per_entry=max_records_per_output_file,
-        content_type=compacted_file_content_type,
+    merged_delta = Delta.merge_deltas([mr.delta for mr in materialized_results])
+    assert (
+        materialized_results and len(materialized_results) > 0
+    ), f"Expected at least one materialized result in materialize step."
+
+    write_results = [mr.pyarrow_write_result for mr in materialized_results]
+    logger.debug(
+        f"{len(write_results)} files written"
+        f" with records: {[wr.records for wr in write_results]}"
     )
-    manifest = delta.manifest
-    manifest_records = manifest.meta.record_count
-    assert(manifest_records == len(compacted_table),
-           f"Unexpected Error: Materialized delta manifest record count "
-           f"({manifest_records}) does not equal compacted table record count "
-           f"({len(compacted_table)})")
-    materialize_result = MaterializeResult.of(
-        delta,
-        mat_bucket_index,
-        # TODO (pdames): Generalize WriteResult to contain in-memory-table-type
-        # and in-memory-table-bytes instead of tight coupling to paBytes
-        PyArrowWriteResult.of(
-            len(manifest.entries),
-            TABLE_CLASS_TO_SIZE_FUNC[type(compacted_table)](compacted_table),
-            manifest.meta.content_length,
-            len(compacted_table),
+    # Merge all new deltas into one for this materialize bucket index
+    merged_materialize_result = MaterializeResult.of(
+        merged_delta,
+        materialized_results[0].task_index,
+        PyArrowWriteResult.union(
+            [mr.pyarrow_write_result for mr in materialized_results]
         ),
     )
-    logger.info(f"Materialize result: {materialize_result}")
     logger.info(f"Finished materialize task...")
-    return materialize_result
+    end = time.time()
+    logger.info(f"Materialize task ended in {end - start}s")
+    return merged_materialize_result
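
The rewritten materialize task above stops buffering every filtered table for a single write at the end and instead batches eagerly with RecordBatchTables: filtered tables are appended, batches of up to max_records_per_output_file records are evicted and materialized inside the loop, any remainder is flushed afterwards, and the per-batch results are merged via Delta.merge_deltas and PyArrowWriteResult.union. A rough sketch of that batching contract is below; SimpleRecordBatcher is an illustrative stand-in, not the deltacat RecordBatchTables class (which presumably evicts only full batches and keeps any partial remainder buffered):

import pyarrow as pa


class SimpleRecordBatcher:
    """Illustrative stand-in for the eager batching pattern used above."""

    def __init__(self, max_records: int):
        self._max_records = max_records
        self._tables = []
        self._records = 0

    def append(self, table: pa.Table) -> None:
        self._tables.append(table)
        self._records += len(table)

    def has_batches(self) -> bool:
        return self._records >= self._max_records

    def evict(self):
        # Hand back everything buffered so far and reset; the real implementation
        # is assumed to slice off full batches and keep any partial remainder.
        tables, self._tables, self._records = self._tables, [], 0
        return tables

    def has_remaining(self) -> bool:
        return self._records > 0

    @property
    def remaining(self):
        return self._tables


# Usage mirrors the loop in the new materialize step: write a batch as soon as
# enough records accumulate, then flush whatever is left after the loop.
batcher = SimpleRecordBatcher(max_records=2)
for t in [pa.table({"a": [1]}), pa.table({"a": [2]}), pa.table({"a": [3]})]:
    batcher.append(t)
    if batcher.has_batches():
        print("write batch:", pa.concat_tables(batcher.evict()).num_rows, "rows")
if batcher.has_remaining():
    print("write remainder:", pa.concat_tables(batcher.remaining).num_rows, "rows")
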
deltacat/compute/compactor/steps/rehash/rehash_bucket.py
@@ -1,22 +1,21 @@
 import logging
-import ray
-import pyarrow as pa
-import numpy as np
+from typing import List, Tuple

+import numpy as np
+import pyarrow as pa
+import ray
 from ray.types import ObjectRef

 from deltacat import logs
 from deltacat.compute.compactor import PrimaryKeyIndexVersionLocator
 from deltacat.compute.compactor.utils import primary_key_index as pki

-from typing import List, Tuple
-
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))


 def group_file_records_by_pk_hash_bucket(
-        pki_table: pa.Table,
-        num_buckets: int) -> np.ndarray:
+    pki_table: pa.Table, num_buckets: int
+) -> np.ndarray:
     # generate the new table for each new hash bucket
     hash_bucket_to_indices = pki.group_record_indices_by_hash_bucket(
         pki_table,
@@ -29,13 +28,14 @@ def group_file_records_by_pk_hash_bucket(
     return hash_bucket_to_table


-@ray.remote(num_cpus=1,num_returns=2)
+@ray.remote(num_cpus=1, num_returns=2)
 def rehash_bucket(
-        hash_bucket_index: int,
-        s3_bucket: str,
-        old_pki_version_locator: PrimaryKeyIndexVersionLocator,
-        num_buckets: int,
-        num_groups: int) -> Tuple[np.ndarray, List[ObjectRef]]:
+    hash_bucket_index: int,
+    s3_bucket: str,
+    old_pki_version_locator: PrimaryKeyIndexVersionLocator,
+    num_buckets: int,
+    num_groups: int,
+) -> Tuple[np.ndarray, List[ObjectRef]]:

     logger.info(f"Starting rehash bucket task...")
     tables = pki.download_hash_bucket_entries(
deltacat/compute/compactor/steps/rehash/rewrite_index.py
@@ -1,28 +1,26 @@
-import ray
 import logging
-import pyarrow as pa
 from collections import defaultdict
-from ray import cloudpickle
-from deltacat import logs
+from typing import Any, List, Tuple

+import pyarrow as pa
+import ray
+from ray import cloudpickle
 from ray.types import ObjectRef

-from deltacat.compute.compactor import PrimaryKeyIndexVersionLocator, \
-    PyArrowWriteResult
+from deltacat import logs
+from deltacat.compute.compactor import PrimaryKeyIndexVersionLocator, PyArrowWriteResult
 from deltacat.compute.compactor.utils import primary_key_index as pki

-from typing import Any, List, Tuple
-
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))


 @ray.remote(num_cpus=1, num_returns=2)
 def rewrite_index(
-        object_ids: List[Any],
-        s3_bucket: str,
-        new_primary_key_index_version_locator: PrimaryKeyIndexVersionLocator,
-        max_records_per_index_file: int) -> \
-        Tuple[PyArrowWriteResult, List[ObjectRef]]:
+    object_ids: List[Any],
+    s3_bucket: str,
+    new_primary_key_index_version_locator: PrimaryKeyIndexVersionLocator,
+    max_records_per_index_file: int,
+) -> Tuple[PyArrowWriteResult, List[ObjectRef]]:

     logger.info(f"Starting rewrite primary key index task...")
     object_refs = [cloudpickle.loads(obj_id_pkl) for obj_id_pkl in object_ids]