deltacat 0.1.18b3__py3-none-any.whl → 0.1.18b7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. deltacat/__init__.py +1 -1
  2. deltacat/compute/compactor/compaction_session.py +184 -29
  3. deltacat/compute/compactor/model/compact_partition_params.py +153 -0
  4. deltacat/compute/compactor/model/compaction_session_audit_info.py +725 -0
  5. deltacat/compute/compactor/model/dedupe_result.py +3 -0
  6. deltacat/compute/compactor/model/delta_file_envelope.py +8 -0
  7. deltacat/compute/compactor/model/delta_file_locator.py +11 -6
  8. deltacat/compute/compactor/model/hash_bucket_result.py +3 -0
  9. deltacat/compute/compactor/model/materialize_result.py +27 -6
  10. deltacat/compute/compactor/model/round_completion_info.py +9 -0
  11. deltacat/compute/compactor/steps/dedupe.py +35 -19
  12. deltacat/compute/compactor/steps/hash_bucket.py +41 -16
  13. deltacat/compute/compactor/steps/materialize.py +73 -70
  14. deltacat/compute/compactor/utils/io.py +15 -0
  15. deltacat/compute/compactor/utils/primary_key_index.py +9 -15
  16. deltacat/compute/compactor/utils/round_completion_file.py +13 -4
  17. deltacat/compute/compactor/utils/system_columns.py +32 -0
  18. deltacat/io/__init__.py +0 -7
  19. deltacat/io/file_object_store.py +48 -0
  20. deltacat/io/memcached_object_store.py +121 -0
  21. deltacat/io/object_store.py +51 -0
  22. deltacat/io/ray_plasma_object_store.py +23 -0
  23. deltacat/io/redis_object_store.py +114 -0
  24. deltacat/io/s3_object_store.py +44 -0
  25. deltacat/storage/model/delta.py +2 -1
  26. deltacat/tests/compactor/test_compact_partition_params.py +237 -0
  27. deltacat/tests/compactor/utils/test_io.py +27 -5
  28. deltacat/tests/io/__init__.py +0 -0
  29. deltacat/tests/io/test_file_object_store.py +86 -0
  30. deltacat/tests/io/test_memcached_object_store.py +158 -0
  31. deltacat/tests/io/test_ray_plasma_object_store.py +54 -0
  32. deltacat/tests/io/test_redis_object_store.py +103 -0
  33. deltacat/tests/io/test_s3_object_store.py +59 -0
  34. deltacat/tests/utils/test_record_batch_tables.py +1 -1
  35. deltacat/tests/utils/test_resources.py +9 -0
  36. deltacat/utils/ray_utils/concurrency.py +0 -2
  37. deltacat/utils/resources.py +30 -18
  38. {deltacat-0.1.18b3.dist-info → deltacat-0.1.18b7.dist-info}/METADATA +3 -1
  39. {deltacat-0.1.18b3.dist-info → deltacat-0.1.18b7.dist-info}/RECORD +42 -27
  40. {deltacat-0.1.18b3.dist-info → deltacat-0.1.18b7.dist-info}/LICENSE +0 -0
  41. {deltacat-0.1.18b3.dist-info → deltacat-0.1.18b7.dist-info}/WHEEL +0 -0
  42. {deltacat-0.1.18b3.dist-info → deltacat-0.1.18b7.dist-info}/top_level.txt +0 -0
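The common thread across these changes is that the compactor's hash bucket, dedupe, and materialize steps now exchange intermediate data through a pluggable object store (deltacat/io/object_store.py) instead of raw ray.put/cloudpickle object references. That interface file is not displayed below, so the following is only a sketch of its surface, inferred from the put/put_many/get/get_many calls made in the diffs that follow; method signatures and bodies are assumptions.

from typing import Any, List


class IObjectStore:
    """Sketch of the pluggable object store surface; details are assumed."""

    def put(self, obj: object, *args, **kwargs) -> Any:
        """Store a single object and return a reference to it."""
        raise NotImplementedError

    def put_many(self, objects: List[object], *args, **kwargs) -> List[Any]:
        """Store many objects and return their references in order."""
        raise NotImplementedError

    def get(self, ref: Any, *args, **kwargs) -> object:
        """Load a single object by reference."""
        raise NotImplementedError

    def get_many(self, refs: List[Any], *args, **kwargs) -> List[object]:
        """Load many objects; result order matches the input references."""
        raise NotImplementedError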
deltacat/compute/compactor/steps/materialize.py CHANGED
@@ -5,10 +5,10 @@ from uuid import uuid4
  from collections import defaultdict
  from contextlib import nullcontext
  from itertools import chain, repeat
- from typing import List, Optional, Tuple, Dict, Any, Union
+ from typing import List, Optional, Tuple, Dict, Any
  import pyarrow as pa
+ import numpy as np
  import ray
- from ray import cloudpickle
  from deltacat import logs
  from deltacat.compute.compactor import (
  MaterializeResult,
@@ -27,15 +27,13 @@ from deltacat.storage import (
  PartitionLocator,
  Manifest,
  ManifestEntry,
- LocalDataset,
- LocalTable,
- DistributedDataset,
  )
  from deltacat.storage import interface as unimplemented_deltacat_storage
  from deltacat.utils.common import ReadKwargsProvider
  from deltacat.types.media import DELIMITED_TEXT_CONTENT_TYPES, ContentType
  from deltacat.types.tables import TABLE_CLASS_TO_SIZE_FUNC
  from deltacat.utils.performance import timed_invocation
+ from deltacat.io.object_store import IObjectStore
  from deltacat.utils.pyarrow import (
  ReadKwargsProviderPyArrowCsvPureUtf8,
  ReadKwargsProviderPyArrowSchemaOverride,
@@ -46,6 +44,7 @@ from deltacat.utils.ray_utils.runtime import (
  get_current_ray_worker_id,
  )
  from deltacat.utils.metrics import emit_timer_metrics, MetricsConfig
+ from deltacat.utils.resources import get_current_node_peak_memory_usage_in_bytes
 
  if importlib.util.find_spec("memray"):
  import memray
@@ -62,29 +61,15 @@ def materialize(
  dedupe_task_idx_and_obj_id_tuples: List[DedupeTaskIndexWithObjectId],
  max_records_per_output_file: int,
  compacted_file_content_type: ContentType,
+ enable_manifest_entry_copy_by_reference: bool,
  enable_profiler: bool,
  metrics_config: MetricsConfig,
  schema: Optional[pa.Schema] = None,
  read_kwargs_provider: Optional[ReadKwargsProvider] = None,
  s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
+ object_store: Optional[IObjectStore] = None,
  deltacat_storage=unimplemented_deltacat_storage,
  ):
- def _stage_delta_implementation(
- data: Union[LocalTable, LocalDataset, DistributedDataset, Manifest],
- partition: Partition,
- stage_delta_from_existing_manifest: Optional[bool],
- ) -> Delta:
- if stage_delta_from_existing_manifest:
- delta = Delta.of(
- locator=DeltaLocator.of(partition.locator),
- delta_type=DeltaType.UPSERT,
- meta=manifest.meta,
- manifest=data,
- previous_stream_position=partition.stream_position,
- properties={},
- )
- return delta
-
  def _stage_delta_from_manifest_entry_reference_list(
  manifest_entry_list_reference: List[ManifestEntry],
  partition: Partition,
@@ -94,11 +79,13 @@ def materialize(
  delta_type == DeltaType.UPSERT
  ), "Stage delta with existing manifest entries only supports UPSERT delta type!"
  manifest = Manifest.of(entries=manifest_entry_list_reference, uuid=str(uuid4()))
- delta = _stage_delta_implementation(
- data=manifest,
- partition=partition,
+ delta = Delta.of(
+ locator=DeltaLocator.of(partition.locator),
  delta_type=delta_type,
- stage_delta_from_existing_manifest=True,
+ meta=manifest.meta,
+ manifest=manifest,
+ previous_stream_position=partition.stream_position,
+ properties={},
  )
  return delta
 
@@ -160,18 +147,11 @@ def materialize(
  f"dedupe_{worker_id}_{task_id}.bin"
  ) if enable_profiler else nullcontext():
  start = time.time()
- dedupe_task_idx_and_obj_ref_tuples = [
- (
- t1,
- cloudpickle.loads(t2),
- )
- for t1, t2 in dedupe_task_idx_and_obj_id_tuples
- ]
  logger.info(f"Resolved materialize task obj refs...")
- dedupe_task_indices, obj_refs = zip(*dedupe_task_idx_and_obj_ref_tuples)
+ dedupe_task_indices, obj_refs = zip(*dedupe_task_idx_and_obj_id_tuples)
  # this depends on `ray.get` result order matching input order, as per the
  # contract established in: https://github.com/ray-project/ray/pull/16763
- src_file_records_list = ray.get(list(obj_refs))
+ src_file_records_list = object_store.get_many(list(obj_refs))
  all_src_file_records = defaultdict(list)
  for i, src_file_records in enumerate(src_file_records_list):
  dedupe_task_idx = dedupe_task_indices[i]
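Before this change, dedupe handed each materialize task a list of cloudpickled ObjectRefs that had to be unpickled and resolved with ray.get; hash bucketing did the mirror image with ray.put and cloudpickle.dumps (see the primary_key_index.py hunks further down). Both sides now go through object_store.put/get_many. The packaged Ray plasma implementation (deltacat/io/ray_plasma_object_store.py, +23 lines) is not shown in this diff, so the class below is only a hypothetical reconstruction of that old pattern wrapped behind the new interface:

import ray
from ray import cloudpickle
from typing import Any, List

from deltacat.io.object_store import IObjectStore


class PlasmaObjectStoreSketch(IObjectStore):  # hypothetical name, illustration only
    """Keeps objects in Ray's object store; refs are cloudpickled ObjectRefs."""

    def put(self, obj: object, *args, **kwargs) -> Any:
        obj_ref = ray.put(obj)
        return cloudpickle.dumps(obj_ref)

    def put_many(self, objects: List[object], *args, **kwargs) -> List[Any]:
        return [self.put(obj) for obj in objects]

    def get(self, ref: Any, *args, **kwargs) -> object:
        return ray.get(cloudpickle.loads(ref))

    def get_many(self, refs: List[Any], *args, **kwargs) -> List[object]:
        return ray.get([cloudpickle.loads(ref) for ref in refs])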
@@ -195,13 +175,13 @@ def materialize(
  is_src_partition_file_np = src_dfl.is_source_delta
  src_stream_position_np = src_dfl.stream_position
  src_file_idx_np = src_dfl.file_index
+ src_file_record_count = src_dfl.file_record_count.item()
  count_of_src_dfl += 1
  src_file_partition_locator = (
  source_partition_locator
  if is_src_partition_file_np
  else round_completion_info.compacted_delta_locator.partition_locator
  )
-
  delta_locator = DeltaLocator.of(
  src_file_partition_locator,
  src_stream_position_np.item(),
@@ -223,43 +203,45 @@ def materialize(
  read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(
  schema=schema
  )
- pa_table, download_delta_manifest_entry_time = timed_invocation(
- deltacat_storage.download_delta_manifest_entry,
- Delta.of(delta_locator, None, None, None, manifest),
- src_file_idx_np.item(),
- file_reader_kwargs_provider=read_kwargs_provider,
- )
- logger.debug(
- f"Time taken for materialize task"
- f" to download delta locator {delta_locator} with entry ID {src_file_idx_np.item()}"
- f" is: {download_delta_manifest_entry_time}s"
- )
  record_numbers = chain.from_iterable(record_numbers_tpl)
  record_numbers_length = 0
- mask_pylist = list(repeat(False, len(pa_table)))
+ mask_pylist = list(repeat(False, src_file_record_count))
  for record_number in record_numbers:
  record_numbers_length += 1
  mask_pylist[record_number] = True
  if (
- record_numbers_length == len(pa_table)
+ round_completion_info
+ and enable_manifest_entry_copy_by_reference
+ and record_numbers_length == src_file_record_count
  and src_file_partition_locator
  == round_completion_info.compacted_delta_locator.partition_locator
  ):
  logger.debug(
  f"Untouched manifest file found, "
  f"record numbers length: {record_numbers_length} "
- f"same as downloaded table length: {len(pa_table)}"
+ f"same as downloaded table length: {src_file_record_count}"
  )
  untouched_src_manifest_entry = manifest.entries[src_file_idx_np.item()]
  manifest_entry_list_reference.append(untouched_src_manifest_entry)
  referenced_pyarrow_write_result = PyArrowWriteResult.of(
- len(untouched_src_manifest_entry.entries),
- TABLE_CLASS_TO_SIZE_FUNC[type(pa_table)](pa_table),
- manifest.meta.content_length,
- len(pa_table),
+ 1,
+ untouched_src_manifest_entry.meta.source_content_length,
+ untouched_src_manifest_entry.meta.content_length,
+ src_file_record_count,
  )
  referenced_pyarrow_write_results.append(referenced_pyarrow_write_result)
  else:
+ pa_table, download_delta_manifest_entry_time = timed_invocation(
+ deltacat_storage.download_delta_manifest_entry,
+ Delta.of(delta_locator, None, None, None, manifest),
+ src_file_idx_np.item(),
+ file_reader_kwargs_provider=read_kwargs_provider,
+ )
+ logger.debug(
+ f"Time taken for materialize task"
+ f" to download delta locator {delta_locator} with entry ID {src_file_idx_np.item()}"
+ f" is: {download_delta_manifest_entry_time}s"
+ )
  mask = pa.array(mask_pylist)
  pa_table = pa_table.filter(mask)
  record_batch_tables.append(pa_table)
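The hunk above is the heart of the new copy-by-reference path: when a source file in the previously compacted partition survives dedupe with every record intact and the feature flag is on, its existing manifest entry is appended by reference and the table download and rewrite are skipped entirely. Distilled as a standalone predicate (illustrative only, not a helper that exists in the codebase):

def can_copy_by_reference(
    round_completion_info,
    enable_manifest_entry_copy_by_reference: bool,
    kept_record_count: int,
    src_file_record_count: int,
    src_file_partition_locator,
) -> bool:
    # All records of the source file survived dedupe, the flag is enabled, and
    # the file already lives in the previously compacted partition.
    return bool(
        round_completion_info
        and enable_manifest_entry_copy_by_reference
        and kept_record_count == src_file_record_count
        and src_file_partition_locator
        == round_completion_info.compacted_delta_locator.partition_locator
    )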
@@ -274,15 +256,11 @@ def materialize(
 
  referenced_manifest_delta = (
  _stage_delta_from_manifest_entry_reference_list(
- manifest_entry_list_reference
+ manifest_entry_list_reference, partition
  )
  if manifest_entry_list_reference
  else None
  )
- if referenced_manifest_delta:
- logger.info(
- f"Got delta with {len(referenced_manifest_delta.manifest.entries)} referenced manifest entries"
- )
 
  merged_materialized_delta = [mr.delta for mr in materialized_results]
  merged_materialized_delta.append(referenced_manifest_delta)
@@ -290,33 +268,58 @@ def materialize(
  [d for d in merged_materialized_delta if d is not None]
  )
 
- write_results_union = referenced_pyarrow_write_results
+ write_results_union = [*referenced_pyarrow_write_results]
  if materialized_results:
  for mr in materialized_results:
  write_results_union.append(mr.pyarrow_write_result)
  write_result = PyArrowWriteResult.union(write_results_union)
+ referenced_write_result = PyArrowWriteResult.union(
+ referenced_pyarrow_write_results
+ )
+
+ if referenced_manifest_delta:
+ logger.info(
+ f"Got delta with {len(referenced_manifest_delta.manifest.entries)} referenced manifest entries"
+ )
+ assert referenced_write_result.files == len(
+ referenced_manifest_delta.manifest.entries
+ ), "The files referenced must match with the entries in the delta"
+
+ assert write_result.files == len(
+ merged_delta.manifest.entries
+ ), "The total number of files written by materialize must match manifest entries"
 
  logger.debug(
- f"{len(write_results_union)} files written"
- f" with records: {[wr.records for wr in write_results_union]}"
- )
- # Merge all new deltas into one for this materialize bucket index
- merged_materialize_result = MaterializeResult.of(
- merged_delta,
- mat_bucket_index,
- write_result,
- len(manifest_entry_list_reference),
- count_of_src_dfl,
+ f"{write_result.files} files written"
+ f" with records: {write_result.records}"
  )
 
  logger.info(f"Finished materialize task...")
  end = time.time()
  duration = end - start
+
+ emit_metrics_time = 0.0
  if metrics_config:
- emit_timer_metrics(
+ emit_result, latency = timed_invocation(
+ func=emit_timer_metrics,
  metrics_name="materialize",
  value=duration,
  metrics_config=metrics_config,
  )
+ emit_metrics_time = latency
  logger.info(f"Materialize task ended in {end - start}s")
+
+ peak_memory_usage_bytes = get_current_node_peak_memory_usage_in_bytes()
+
+ # Merge all new deltas into one for this materialize bucket index
+ merged_materialize_result = MaterializeResult.of(
+ merged_delta,
+ mat_bucket_index,
+ write_result,
+ referenced_write_result,
+ np.double(peak_memory_usage_bytes),
+ np.double(emit_metrics_time),
+ np.double(time.time()),
+ )
+
  return merged_materialize_result
deltacat/compute/compactor/utils/io.py CHANGED
@@ -16,6 +16,9 @@ from deltacat import logs
  from deltacat.compute.compactor import DeltaAnnotated
  from typing import Dict, List, Optional, Tuple, Union
  from deltacat.compute.compactor import HighWatermark
+ from deltacat.compute.compactor.model.compaction_session_audit_info import (
+ CompactionSessionAuditInfo,
+ )
 
  logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
@@ -94,6 +97,7 @@ def limit_input_deltas(
  hash_bucket_count: int,
  user_hash_bucket_chunk_size: int,
  input_deltas_stats: Dict[int, DeltaStats],
+ compaction_audit: CompactionSessionAuditInfo,
  deltacat_storage=unimplemented_deltacat_storage,
  ) -> Tuple[List[DeltaAnnotated], int, HighWatermark, bool]:
  # TODO (pdames): when row counts are available in metadata, use them
@@ -236,6 +240,11 @@ def limit_input_deltas(
  # TODO (pdames): Test and add value for min_file_counts
  )
 
+ compaction_audit.set_input_size_bytes(delta_bytes)
+ compaction_audit.set_input_file_count(delta_manifest_entries)
+ compaction_audit.set_total_cluster_memory_bytes(worker_task_mem)
+ compaction_audit.set_hash_bucket_count(hash_bucket_count)
+
  logger.info(f"Hash bucket chunk size: {hash_bucket_chunk_size}")
  logger.info(f"Hash bucket count: {hash_bucket_count}")
  logger.info(f"Input uniform delta count: {len(rebatched_da_list)}")
@@ -246,6 +255,7 @@ def limit_input_deltas(
  def fit_input_deltas(
  input_deltas: List[Delta],
  cluster_resources: Dict[str, float],
+ compaction_audit: CompactionSessionAuditInfo,
  hash_bucket_count: Optional[int],
  deltacat_storage=unimplemented_deltacat_storage,
  ) -> Tuple[List[DeltaAnnotated], int, HighWatermark, bool]:
@@ -314,6 +324,11 @@ def fit_input_deltas(
  math.ceil(total_memory / MEMORY_TO_HASH_BUCKET_COUNT_RATIO)
  )
 
+ compaction_audit.set_input_file_count(total_files)
+ compaction_audit.set_input_size_bytes(delta_bytes)
+ compaction_audit.set_total_cluster_memory_bytes(total_memory)
+ compaction_audit.set_hash_bucket_count(hash_bucket_count)
+
  logger.info(
  f"Input delta bytes: {delta_bytes}, Total files: {total_files}, The worker_cpus: {worker_cpus}, "
  f" total_memory: {total_memory}, and hash_bucket_count: {hash_bucket_count}"
deltacat/compute/compactor/utils/primary_key_index.py CHANGED
@@ -7,7 +7,6 @@ import numpy as np
  import pyarrow as pa
  import ray
  import s3fs
- from ray import cloudpickle
  from ray.types import ObjectRef
 
  from deltacat import logs
@@ -30,6 +29,7 @@ from deltacat.types.media import ContentEncoding, ContentType
  from deltacat.types.tables import get_table_slicer, get_table_writer
  from deltacat.utils.common import ReadKwargsProvider
  from deltacat.utils.ray_utils.concurrency import invoke_parallel
+ from deltacat.io.object_store import IObjectStore
 
  logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
@@ -203,7 +203,10 @@ def group_record_indices_by_hash_bucket(
 
 
  def group_hash_bucket_indices(
- hash_bucket_object_groups: np.ndarray, num_buckets: int, num_groups: int
+ hash_bucket_object_groups: np.ndarray,
+ num_buckets: int,
+ num_groups: int,
+ object_store: Optional[IObjectStore] = None,
  ) -> Tuple[np.ndarray, List[ObjectRef]]:
  """
  Groups all the ObjectRef that belongs to a particular hash bucket group and hash bucket index.
@@ -226,19 +229,10 @@ def group_hash_bucket_indices(
  for hb_group, obj in enumerate(hb_group_to_object):
  if obj is None:
  continue
- obj_ref = ray.put(obj)
- pickled_obj_ref = cloudpickle.dumps(obj_ref)
- object_refs.append(pickled_obj_ref)
- hash_bucket_group_to_obj_id[hb_group] = pickled_obj_ref
- # NOTE: The cloudpickle.dumps API call creates an out of band object reference to the object_ref variable.
- # After pickling, Ray cannot track the serialized copy of the object or determine when the ObjectRef has been deserialized
- # (e.g., if the ObjectRef is deserialized by a non-Ray process).
- # Thus the object_ref cannot be tracked by Ray's distributed reference counter, even if it goes out of scope.
- # The object now has a permanent reference and the data can't be freed from Ray’s object store.
- # Manually deleting the untrackable object references offsets these permanent references and
- # helps to allow these objects to be garbage collected normally.
- del obj_ref
- del pickled_obj_ref
+ object_ref = object_store.put(obj)
+ object_refs.append(object_ref)
+ hash_bucket_group_to_obj_id[hb_group] = object_ref
+ del object_ref
  return hash_bucket_group_to_obj_id, object_refs
 
 
deltacat/compute/compactor/utils/round_completion_file.py CHANGED
@@ -1,6 +1,6 @@
  import json
  import logging
-
+ from typing import Dict, Any
  from deltacat import logs
  from deltacat.compute.compactor import RoundCompletionInfo
  from deltacat.storage import PartitionLocator
@@ -19,7 +19,9 @@ def get_round_completion_file_s3_url(
 
 
  def read_round_completion_file(
- bucket: str, source_partition_locator: PartitionLocator
+ bucket: str,
+ source_partition_locator: PartitionLocator,
+ **s3_client_kwargs: Optional[Dict[str, Any]],
  ) -> RoundCompletionInfo:
 
  round_completion_file_url = get_round_completion_file_s3_url(
@@ -28,7 +30,7 @@ def read_round_completion_file(
  )
  logger.info(f"reading round completion file from: {round_completion_file_url}")
  round_completion_info = None
- result = s3_utils.download(round_completion_file_url, False)
+ result = s3_utils.download(round_completion_file_url, False, **s3_client_kwargs)
  if result:
  json_str = result["Body"].read().decode("utf-8")
  round_completion_info = RoundCompletionInfo(json.loads(json_str))
@@ -41,7 +43,10 @@ def write_round_completion_file(
  source_partition_locator: Optional[PartitionLocator],
  round_completion_info: RoundCompletionInfo,
  completion_file_s3_url: str = None,
+ **s3_client_kwargs: Optional[Dict[str, Any]],
  ) -> str:
+ if bucket is None and completion_file_s3_url is None:
+ raise AssertionError("Either bucket or completion_file_s3_url must be passed")
 
  logger.info(f"writing round completion file contents: {round_completion_info}")
  if completion_file_s3_url is None:
@@ -50,6 +55,10 @@ def write_round_completion_file(
  source_partition_locator,
  )
  logger.info(f"writing round completion file to: {completion_file_s3_url}")
- s3_utils.upload(completion_file_s3_url, str(json.dumps(round_completion_info)))
+ s3_utils.upload(
+ completion_file_s3_url,
+ str(json.dumps(round_completion_info)),
+ **s3_client_kwargs,
+ )
  logger.info(f"round completion file written to: {completion_file_s3_url}")
  return completion_file_s3_url
deltacat/compute/compactor/utils/system_columns.py CHANGED
@@ -64,6 +64,13 @@ _IS_SOURCE_COLUMN_FIELD = pa.field(
  _IS_SOURCE_COLUMN_TYPE,
  )
 
+ _FILE_RECORD_COUNT_COLUMN_NAME = _get_sys_col_name("file_record_count")
+ _FILE_RECORD_COUNT_COLUMN_TYPE = pa.int64()
+ _FILE_RECORD_COUNT_COLUMN_FIELD = pa.field(
+ _FILE_RECORD_COUNT_COLUMN_NAME,
+ _FILE_RECORD_COUNT_COLUMN_TYPE,
+ )
+
 
  def get_pk_hash_column_array(obj) -> Union[pa.Array, pa.ChunkedArray]:
  return pa.array(obj, _PK_HASH_COLUMN_TYPE)
@@ -143,6 +150,17 @@ def get_is_source_column_array(obj) -> Union[pa.Array, pa.ChunkedArray]:
  )
 
 
+ def file_record_count_column_np(table: pa.Table) -> np.ndarray:
+ return table[_FILE_RECORD_COUNT_COLUMN_NAME].to_numpy()
+
+
+ def get_file_record_count_column_array(obj) -> Union[pa.Array, pa.ChunkedArray]:
+ return pa.array(
+ obj,
+ _FILE_RECORD_COUNT_COLUMN_TYPE,
+ )
+
+
  def project_delta_file_metadata_on_table(
  delta_file_envelope: DeltaFileEnvelope,
  ) -> pa.Table:
@@ -179,6 +197,12 @@ def project_delta_file_metadata_on_table(
  len(table),
  )
  table = append_is_source_col(table, is_source_iterator)
+
+ # append row count column
+ file_record_count_iterator = repeat(
+ delta_file_envelope.file_record_count, len(table)
+ )
+ table = append_file_record_count_col(table, file_record_count_iterator)
  return table
 
 
@@ -252,6 +276,14 @@ def append_is_source_col(table: pa.Table, booleans) -> pa.Table:
  return table
 
 
+ def append_file_record_count_col(table: pa.Table, file_record_count):
+ table = table.append_column(
+ _FILE_RECORD_COUNT_COLUMN_FIELD,
+ get_file_record_count_column_array(file_record_count),
+ )
+ return table
+
+
  def get_minimal_hb_schema() -> pa.schema:
  return pa.schema(
  [
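The new file_record_count system column carries each source file's total row count through hash bucketing so that materialize can detect untouched files without re-downloading them. A small round trip with the helpers added above, assuming the module import path implied by the file list:

from itertools import repeat

import pyarrow as pa

from deltacat.compute.compactor.utils import system_columns as sc

table = pa.table({"pk": ["a", "b", "c"]})
# Stamp every row with the originating file's record count (3 here).
table = sc.append_file_record_count_col(table, repeat(3, len(table)))
print(sc.file_record_count_column_np(table))  # -> [3 3 3]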
deltacat/io/__init__.py CHANGED
@@ -1,7 +0,0 @@
- from deltacat.io.dataset import DeltacatDataset
- from deltacat.io.read_api import read_redshift
-
- __all__ = [
- "DeltacatDataset",
- "read_redshift",
- ]
deltacat/io/file_object_store.py ADDED
@@ -0,0 +1,48 @@
+ import logging
+ from ray import cloudpickle
+ import time
+ from deltacat.io.object_store import IObjectStore
+ from typing import Any, List
+ from deltacat import logs
+ import os
+ import uuid
+ from builtins import open
+
+ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+
+ class FileObjectStore(IObjectStore):
+ """
+ An implementation of object store that uses file system.
+ """
+
+ def __init__(self, dir_path: str) -> None:
+ self.dir_path = dir_path
+ super().__init__()
+
+ def put_many(self, objects: List[object], *args, **kwargs) -> List[Any]:
+ result = []
+
+ for obj in objects:
+ serialized = cloudpickle.dumps(obj)
+ ref = f"{self.dir_path}/{uuid.uuid4()}"
+ with open(ref, "xb") as f:
+ f.write(serialized)
+
+ result.append(ref)
+
+ return result
+
+ def get_many(self, refs: List[Any], *args, **kwargs) -> List[object]:
+ result = []
+ start = time.monotonic()
+ for ref in refs:
+ with open(ref, "rb") as f:
+ serialized = f.read()
+ loaded = cloudpickle.loads(serialized)
+ result.append(loaded)
+ os.remove(ref)
+ end = time.monotonic()
+
+ logger.info(f"The total time taken to read all objects is: {end - start}")
+ return result
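FileObjectStore round-trips objects through cloudpickle files in a shared directory and deletes each file after a successful read. A quick usage sketch:

import tempfile

from deltacat.io.file_object_store import FileObjectStore

with tempfile.TemporaryDirectory() as tmp_dir:
    store = FileObjectStore(dir_path=tmp_dir)
    refs = store.put_many([{"rows": 3}, [1, 2, 3]])
    # Files are removed as they are read back.
    objects = store.get_many(refs)
    assert objects == [{"rows": 3}, [1, 2, 3]]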
deltacat/io/memcached_object_store.py ADDED
@@ -0,0 +1,121 @@
+ import logging
+ from ray import cloudpickle
+ from collections import defaultdict
+ import time
+ from deltacat.io.object_store import IObjectStore
+ from typing import Any, List
+ from deltacat import logs
+ import uuid
+ import socket
+ from pymemcache.client.base import Client
+ from pymemcache.client.retrying import RetryingClient
+ from pymemcache.exceptions import MemcacheUnexpectedCloseError
+
+ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+
+ class MemcachedObjectStore(IObjectStore):
+ """
+ An implementation of object store that uses Memcached.
+ """
+
+ def __init__(self, port=11212) -> None:
+ self.client_cache = {}
+ self.current_ip = None
+ self.SEPARATOR = "_"
+ self.port = port
+ super().__init__()
+
+ def put_many(self, objects: List[object], *args, **kwargs) -> List[Any]:
+ input = {}
+ result = []
+ current_ip = self._get_current_ip()
+ for obj in objects:
+ serialized = cloudpickle.dumps(obj)
+ uid = uuid.uuid4()
+ ref = self._create_ref(uid, current_ip)
+ input[uid.__str__()] = serialized
+ result.append(ref)
+
+ client = self._get_client_by_ip(current_ip)
+ if client.set_many(input, noreply=False):
+ raise RuntimeError("Unable to write few keys to cache")
+
+ return result
+
+ def put(self, obj: object, *args, **kwargs) -> Any:
+ serialized = cloudpickle.dumps(obj)
+ uid = uuid.uuid4()
+ current_ip = self._get_current_ip()
+ ref = self._create_ref(uid, current_ip)
+ client = self._get_client_by_ip(current_ip)
+
+ if client.set(uid.__str__(), serialized):
+ return ref
+ else:
+ raise RuntimeError("Unable to write to cache")
+
+ def get_many(self, refs: List[Any], *args, **kwargs) -> List[object]:
+ result = []
+ uid_per_ip = defaultdict(lambda: [])
+
+ start = time.monotonic()
+ for ref in refs:
+ uid, ip = ref.split(self.SEPARATOR)
+ uid_per_ip[ip].append(uid)
+
+ for (ip, uids) in uid_per_ip.items():
+ client = self._get_client_by_ip(ip)
+ cache_result = client.get_many(uids)
+ assert len(cache_result) == len(
+ uids
+ ), f"Not all values were returned from cache as {len(cache_result)} != {len(uids)}"
+
+ values = cache_result.values()
+ total_bytes = 0
+
+ deserialize_start = time.monotonic()
+ for serialized in values:
+ deserialized = cloudpickle.loads(serialized)
+ total_bytes += len(serialized)
+ result.append(deserialized)
+
+ deserialize_end = time.monotonic()
+ logger.debug(
+ f"The time taken to deserialize {total_bytes} bytes is: {deserialize_end - deserialize_start}",
+ )
+
+ end = time.monotonic()
+
+ logger.info(f"The total time taken to read all objects is: {end - start}")
+ return result
+
+ def get(self, ref: Any, *args, **kwargs) -> object:
+ uid, ip = ref.split(self.SEPARATOR)
+ client = self._get_client_by_ip(ip)
+ serialized = client.get(uid)
+ return cloudpickle.loads(serialized)
+
+ def _create_ref(self, uid, ip) -> str:
+ return f"{uid}{self.SEPARATOR}{ip}"
+
+ def _get_client_by_ip(self, ip_address: str):
+ if ip_address in self.client_cache:
+ return self.client_cache[ip_address]
+
+ base_client = Client((ip_address, self.port))
+ client = RetryingClient(
+ base_client,
+ attempts=3,
+ retry_delay=0.01,
+ retry_for=[MemcacheUnexpectedCloseError],
+ )
+
+ self.client_cache[ip_address] = client
+ return client
+
+ def _get_current_ip(self):
+ if self.current_ip is None:
+ self.current_ip = socket.gethostbyname(socket.gethostname())
+
+ return self.current_ip
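MemcachedObjectStore shards writes by the writing node's IP, encodes that IP into every returned ref ("<uuid>_<node-ip>"), and fans get_many out per node. Usage, assuming a memcached daemon is reachable on each node at the default port 11212:

from deltacat.io.memcached_object_store import MemcachedObjectStore

store = MemcachedObjectStore(port=11212)

ref = store.put({"rows": 42})          # ref looks like "<uuid>_<node-ip>"
assert store.get(ref) == {"rows": 42}

refs = store.put_many(["a", "b", "c"])
# Results are grouped by node during retrieval, so order may not match the input refs.
print(store.get_many(refs))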