deltacat 0.1.18b4__py3-none-any.whl → 0.1.18b7__py3-none-any.whl

This diff compares the contents of two publicly available versions of the package as released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions exactly as they appear in their public registry.
deltacat/__init__.py CHANGED
@@ -43,7 +43,7 @@ from deltacat.types.tables import TableWriteMode
 
 deltacat.logs.configure_deltacat_logger(logging.getLogger(__name__))
 
-__version__ = "0.1.18b4"
+__version__ = "0.1.18b7"
 
 
 __all__ = [
@@ -16,6 +16,8 @@ from deltacat.compute.compactor import (
 )
 from deltacat.compute.compactor.model.dedupe_result import DedupeResult
 from deltacat.compute.compactor.model.hash_bucket_result import HashBucketResult
+from deltacat.io.object_store import IObjectStore
+from deltacat.io.ray_plasma_object_store import RayPlasmaObjectStore
 from deltacat.compute.compactor.model.materialize_result import MaterializeResult
 from deltacat.compute.stats.models.delta_stats import DeltaStats
 from deltacat.storage import (
@@ -112,6 +114,8 @@ def compact_partition(
     list_deltas_kwargs: Optional[Dict[str, Any]] = None,
     read_kwargs_provider: Optional[ReadKwargsProvider] = None,
     s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
+    object_store: Optional[IObjectStore] = RayPlasmaObjectStore(),
+    s3_client_kwargs: Optional[Dict[str, Any]] = None,
     deltacat_storage=unimplemented_deltacat_storage,
     **kwargs,
 ) -> Optional[str]:
@@ -151,6 +155,8 @@ def compact_partition(
         list_deltas_kwargs,
         read_kwargs_provider,
         s3_table_writer_kwargs,
+        object_store,
+        s3_client_kwargs,
         deltacat_storage,
         **kwargs,
     )
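For callers, the two new keyword arguments are independent knobs: object_store selects where intermediate hash-bucket/dedupe results are staged (the new signature defaults it to RayPlasmaObjectStore()), and s3_client_kwargs is splatted into the audit and round-completion s3_utils calls. The sketch below only shows how these arguments might be prepared; the boto3-style key inside s3_client_kwargs (region_name) is my assumption about what the underlying S3 client accepts, not something stated in this diff. Note also that the upload call sites splat **s3_client_kwargs, so a dict is expected at that point; any normalization of the None default would have to happen in code outside these hunks.

    # Hedged sketch: preparing the two new compact_partition keyword arguments.
    # The boto3-style key inside s3_client_kwargs is an assumption, not part of this diff.
    from deltacat.io.ray_plasma_object_store import RayPlasmaObjectStore

    object_store = RayPlasmaObjectStore()  # the default declared in the new signature
    s3_client_kwargs = {"region_name": "us-east-1"}  # forwarded as **kwargs to s3_utils.upload/download

    # These would then be passed to compact_partition(..., object_store=object_store,
    # s3_client_kwargs=s3_client_kwargs) alongside the existing required arguments.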
@@ -196,6 +202,8 @@ def _execute_compaction_round(
     list_deltas_kwargs: Optional[Dict[str, Any]],
     read_kwargs_provider: Optional[ReadKwargsProvider],
     s3_table_writer_kwargs: Optional[Dict[str, Any]],
+    object_store: Optional[IObjectStore],
+    s3_client_kwargs: Optional[Dict[str, Any]],
     deltacat_storage=unimplemented_deltacat_storage,
     **kwargs,
 ) -> Tuple[Optional[Partition], Optional[RoundCompletionInfo], Optional[str]]:
@@ -287,6 +295,13 @@ def _execute_compaction_round(
     )
     logger.info(f"Round completion file: {round_completion_info}")
 
+    enable_manifest_entry_copy_by_reference = (
+        False if rebase_source_partition_locator else True
+    )
+    logger.info(
+        f"Enable manifest entry copy by reference is set to: {enable_manifest_entry_copy_by_reference}"
+    )
+
     # discover input delta files
     # For rebase:
     # Copy the old compacted table to a new destination, plus any new deltas from rebased source
@@ -318,7 +333,11 @@ def _execute_compaction_round(
         delta_discovery_end - delta_discovery_start
     )
 
-    s3_utils.upload(compaction_audit.audit_url, str(json.dumps(compaction_audit)))
+    s3_utils.upload(
+        compaction_audit.audit_url,
+        str(json.dumps(compaction_audit)),
+        **s3_client_kwargs,
+    )
 
     if not input_deltas:
         logger.info("No input deltas found to compact.")
@@ -392,6 +411,7 @@ def _execute_compaction_round(
         enable_profiler=enable_profiler,
         metrics_config=metrics_config,
         read_kwargs_provider=read_kwargs_provider,
+        object_store=object_store,
         deltacat_storage=deltacat_storage,
     )
 
@@ -411,7 +431,11 @@ def _execute_compaction_round(
         hb_end - hb_start,
     )
 
-    s3_utils.upload(compaction_audit.audit_url, str(json.dumps(compaction_audit)))
+    s3_utils.upload(
+        compaction_audit.audit_url,
+        str(json.dumps(compaction_audit)),
+        **s3_client_kwargs,
+    )
 
     all_hash_group_idx_to_obj_id = defaultdict(list)
     for hb_result in hb_results:
@@ -453,11 +477,16 @@ def _execute_compaction_round(
     logger.info(f"Materialize Bucket Count: {num_materialize_buckets}")
 
     dedupe_start = time.monotonic()
-
+    dd_max_parallelism = int(
+        max_parallelism * kwargs.get("dd_max_parallelism_ratio", 1)
+    )
+    logger.info(
+        f"dd max_parallelism is set to {dd_max_parallelism}, max_parallelism is {max_parallelism}"
+    )
     dd_tasks_pending = invoke_parallel(
         items=all_hash_group_idx_to_obj_id.values(),
         ray_task=dd.dedupe,
-        max_parallelism=max_parallelism,
+        max_parallelism=dd_max_parallelism,
         options_provider=round_robin_opt_provider,
         kwargs_provider=lambda index, item: {
             "dedupe_task_index": index,
@@ -467,6 +496,7 @@ def _execute_compaction_round(
         num_materialize_buckets=num_materialize_buckets,
         enable_profiler=enable_profiler,
         metrics_config=metrics_config,
+        object_store=object_store,
     )
 
     dedupe_invoke_end = time.monotonic()
@@ -520,7 +550,11 @@ def _execute_compaction_round(
     # parallel step 3:
     # materialize records to keep by index
 
-    s3_utils.upload(compaction_audit.audit_url, str(json.dumps(compaction_audit)))
+    s3_utils.upload(
+        compaction_audit.audit_url,
+        str(json.dumps(compaction_audit)),
+        **s3_client_kwargs,
+    )
 
     materialize_start = time.monotonic()
 
@@ -537,12 +571,14 @@ def _execute_compaction_round(
         round_completion_info=round_completion_info,
         source_partition_locator=source_partition_locator,
         partition=partition,
+        enable_manifest_entry_copy_by_reference=enable_manifest_entry_copy_by_reference,
         max_records_per_output_file=records_per_compacted_file,
         compacted_file_content_type=compacted_file_content_type,
         enable_profiler=enable_profiler,
         metrics_config=metrics_config,
         read_kwargs_provider=read_kwargs_provider,
         s3_table_writer_kwargs=s3_table_writer_kwargs,
+        object_store=object_store,
         deltacat_storage=deltacat_storage,
     )
 
@@ -620,7 +656,11 @@ def _execute_compaction_round(
         mat_results, telemetry_time_hb + telemetry_time_dd + telemetry_time_materialize
     )
 
-    s3_utils.upload(compaction_audit.audit_url, str(json.dumps(compaction_audit)))
+    s3_utils.upload(
+        compaction_audit.audit_url,
+        str(json.dumps(compaction_audit)),
+        **s3_client_kwargs,
+    )
 
     new_round_completion_info = RoundCompletionInfo.of(
         last_stream_position_compacted,
@@ -1,5 +1,6 @@
 import importlib
 import logging
+from typing import Optional
 import time
 from collections import defaultdict
 from contextlib import nullcontext
@@ -8,7 +9,6 @@ import numpy as np
 import pyarrow as pa
 import pyarrow.compute as pc
 import ray
-from ray import cloudpickle
 
 from deltacat import logs
 from deltacat.compute.compactor import (
@@ -25,6 +25,7 @@ from deltacat.utils.ray_utils.runtime import (
 )
 from deltacat.utils.performance import timed_invocation
 from deltacat.utils.metrics import emit_timer_metrics, MetricsConfig
+from deltacat.io.object_store import IObjectStore
 from deltacat.utils.resources import get_current_node_peak_memory_usage_in_bytes
 
 if importlib.util.find_spec("memray"):
@@ -106,6 +107,7 @@ def _timed_dedupe(
     num_materialize_buckets: int,
     dedupe_task_index: int,
     enable_profiler: bool,
+    object_store: Optional[IObjectStore],
 ):
     task_id = get_current_ray_task_id()
     worker_id = get_current_ray_worker_id()
@@ -114,15 +116,12 @@ def _timed_dedupe(
     ) if enable_profiler else nullcontext():
         # TODO (pdames): mitigate risk of running out of memory here in cases of
         # severe skew of primary key updates in deltas
-        src_file_records_obj_refs = [
-            cloudpickle.loads(obj_id_pkl) for obj_id_pkl in object_ids
-        ]
         logger.info(
             f"[Dedupe task {dedupe_task_index}] Getting delta file envelope "
-            f"groups for {len(src_file_records_obj_refs)} object refs..."
+            f"groups for {len(object_ids)} object refs..."
        )
 
-        delta_file_envelope_groups_list = ray.get(src_file_records_obj_refs)
+        delta_file_envelope_groups_list = object_store.get_many(object_ids)
         hb_index_to_delta_file_envelopes_list = defaultdict(list)
         for delta_file_envelope_groups in delta_file_envelope_groups_list:
             for hb_idx, dfes in enumerate(delta_file_envelope_groups):
@@ -201,7 +200,6 @@ def _timed_dedupe(
                 src_file_id_to_row_indices[src_dfl].append(row_idx_col[row_idx])
 
         logger.info(f"Finished all dedupe rounds...")
-        mat_bucket_to_src_file_record_count = defaultdict(dict)
         mat_bucket_to_src_file_records: Dict[
             MaterializeBucketIndex, DeltaFileLocatorToRecords
         ] = defaultdict(dict)
@@ -213,22 +211,17 @@ def _timed_dedupe(
                 mat_bucket_to_src_file_records[mat_bucket][src_dfl] = np.array(
                     src_row_indices,
                 )
-                mat_bucket_to_src_file_record_count[mat_bucket][src_dfl] = len(
-                    src_row_indices
-                )
 
         mat_bucket_to_dd_idx_obj_id: Dict[
             MaterializeBucketIndex, DedupeTaskIndexWithObjectId
         ] = {}
         for mat_bucket, src_file_records in mat_bucket_to_src_file_records.items():
-            object_ref = ray.put(src_file_records)
-            pickled_object_ref = cloudpickle.dumps(object_ref)
+            object_ref = object_store.put(src_file_records)
             mat_bucket_to_dd_idx_obj_id[mat_bucket] = (
                 dedupe_task_index,
-                pickled_object_ref,
+                object_ref,
             )
             del object_ref
-            del pickled_object_ref
         logger.info(
             f"Count of materialize buckets with object refs: "
             f"{len(mat_bucket_to_dd_idx_obj_id)}"
@@ -253,6 +246,7 @@ def dedupe(
     dedupe_task_index: int,
     enable_profiler: bool,
     metrics_config: MetricsConfig,
+    object_store: Optional[IObjectStore],
 ) -> DedupeResult:
     logger.info(f"[Dedupe task {dedupe_task_index}] Starting dedupe task...")
     dedupe_result, duration = timed_invocation(
@@ -262,6 +256,7 @@ def dedupe(
         num_materialize_buckets=num_materialize_buckets,
         dedupe_task_index=dedupe_task_index,
         enable_profiler=enable_profiler,
+        object_store=object_store,
     )
 
     emit_metrics_time = 0.0
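Across the dedupe, hash bucket, and materialize steps, the raw ray.put/cloudpickle.dumps/ray.get plumbing is replaced by calls on the injected IObjectStore (put, put_many, get_many). The diff does not include deltacat/io/object_store.py itself, so the following is only a minimal sketch of the interface implied by these call sites and by the FileObjectStore shown at the end of this diff; the actual abstract class may differ.

    # Minimal sketch (assumption): an interface consistent with the call sites in this diff.
    from typing import Any, List


    class IObjectStore:
        """Object store interface used to stage intermediate compaction results."""

        def put(self, obj: object, *args, **kwargs) -> Any:
            """Store a single object and return an opaque reference to it."""
            return self.put_many([obj], *args, **kwargs)[0]

        def put_many(self, objects: List[object], *args, **kwargs) -> List[Any]:
            """Store many objects and return one reference per object."""
            raise NotImplementedError

        def get(self, ref: Any, *args, **kwargs) -> object:
            """Fetch a single object by reference."""
            return self.get_many([ref], *args, **kwargs)[0]

        def get_many(self, refs: List[Any], *args, **kwargs) -> List[object]:
            """Fetch many objects; result order matches the order of the given refs."""
            raise NotImplementedError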
@@ -31,6 +31,7 @@ from deltacat.utils.ray_utils.runtime import (
 from deltacat.utils.common import ReadKwargsProvider
 from deltacat.utils.performance import timed_invocation
 from deltacat.utils.metrics import emit_timer_metrics, MetricsConfig
+from deltacat.io.object_store import IObjectStore
 from deltacat.utils.resources import get_current_node_peak_memory_usage_in_bytes
 
 if importlib.util.find_spec("memray"):
@@ -179,6 +180,7 @@ def _timed_hash_bucket(
     num_groups: int,
     enable_profiler: bool,
     read_kwargs_provider: Optional[ReadKwargsProvider] = None,
+    object_store: Optional[IObjectStore] = None,
     deltacat_storage=unimplemented_deltacat_storage,
 ):
     task_id = get_current_ray_task_id()
@@ -207,9 +209,7 @@ def _timed_hash_bucket(
             deltacat_storage,
         )
         hash_bucket_group_to_obj_id, _ = group_hash_bucket_indices(
-            delta_file_envelope_groups,
-            num_buckets,
-            num_groups,
+            delta_file_envelope_groups, num_buckets, num_groups, object_store
         )
 
         peak_memory_usage_bytes = get_current_node_peak_memory_usage_in_bytes()
@@ -233,6 +233,7 @@ def hash_bucket(
     enable_profiler: bool,
     metrics_config: MetricsConfig,
     read_kwargs_provider: Optional[ReadKwargsProvider],
+    object_store: Optional[IObjectStore],
     deltacat_storage=unimplemented_deltacat_storage,
 ) -> HashBucketResult:
 
@@ -247,6 +248,7 @@ def hash_bucket(
         num_groups=num_groups,
         enable_profiler=enable_profiler,
         read_kwargs_provider=read_kwargs_provider,
+        object_store=object_store,
         deltacat_storage=deltacat_storage,
     )
 
@@ -5,11 +5,10 @@ from uuid import uuid4
 from collections import defaultdict
 from contextlib import nullcontext
 from itertools import chain, repeat
-from typing import List, Optional, Tuple, Dict, Any, Union
+from typing import List, Optional, Tuple, Dict, Any
 import pyarrow as pa
 import numpy as np
 import ray
-from ray import cloudpickle
 from deltacat import logs
 from deltacat.compute.compactor import (
     MaterializeResult,
@@ -28,15 +27,13 @@ from deltacat.storage import (
     PartitionLocator,
     Manifest,
     ManifestEntry,
-    LocalDataset,
-    LocalTable,
-    DistributedDataset,
 )
 from deltacat.storage import interface as unimplemented_deltacat_storage
 from deltacat.utils.common import ReadKwargsProvider
 from deltacat.types.media import DELIMITED_TEXT_CONTENT_TYPES, ContentType
 from deltacat.types.tables import TABLE_CLASS_TO_SIZE_FUNC
 from deltacat.utils.performance import timed_invocation
+from deltacat.io.object_store import IObjectStore
 from deltacat.utils.pyarrow import (
     ReadKwargsProviderPyArrowCsvPureUtf8,
     ReadKwargsProviderPyArrowSchemaOverride,
@@ -64,29 +61,15 @@ def materialize(
     dedupe_task_idx_and_obj_id_tuples: List[DedupeTaskIndexWithObjectId],
     max_records_per_output_file: int,
     compacted_file_content_type: ContentType,
+    enable_manifest_entry_copy_by_reference: bool,
     enable_profiler: bool,
     metrics_config: MetricsConfig,
     schema: Optional[pa.Schema] = None,
     read_kwargs_provider: Optional[ReadKwargsProvider] = None,
     s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
+    object_store: Optional[IObjectStore] = None,
     deltacat_storage=unimplemented_deltacat_storage,
 ):
-    def _stage_delta_implementation(
-        data: Union[LocalTable, LocalDataset, DistributedDataset, Manifest],
-        partition: Partition,
-        stage_delta_from_existing_manifest: Optional[bool],
-    ) -> Delta:
-        if stage_delta_from_existing_manifest:
-            delta = Delta.of(
-                locator=DeltaLocator.of(partition.locator),
-                delta_type=DeltaType.UPSERT,
-                meta=manifest.meta,
-                manifest=data,
-                previous_stream_position=partition.stream_position,
-                properties={},
-            )
-        return delta
-
     def _stage_delta_from_manifest_entry_reference_list(
         manifest_entry_list_reference: List[ManifestEntry],
         partition: Partition,
@@ -96,10 +79,13 @@ def materialize(
             delta_type == DeltaType.UPSERT
         ), "Stage delta with existing manifest entries only supports UPSERT delta type!"
         manifest = Manifest.of(entries=manifest_entry_list_reference, uuid=str(uuid4()))
-        delta = _stage_delta_implementation(
-            data=manifest,
-            partition=partition,
-            stage_delta_from_existing_manifest=True,
+        delta = Delta.of(
+            locator=DeltaLocator.of(partition.locator),
+            delta_type=delta_type,
+            meta=manifest.meta,
+            manifest=manifest,
+            previous_stream_position=partition.stream_position,
+            properties={},
         )
         return delta
 
@@ -161,18 +147,11 @@ def materialize(
         f"dedupe_{worker_id}_{task_id}.bin"
     ) if enable_profiler else nullcontext():
         start = time.time()
-        dedupe_task_idx_and_obj_ref_tuples = [
-            (
-                t1,
-                cloudpickle.loads(t2),
-            )
-            for t1, t2 in dedupe_task_idx_and_obj_id_tuples
-        ]
         logger.info(f"Resolved materialize task obj refs...")
-        dedupe_task_indices, obj_refs = zip(*dedupe_task_idx_and_obj_ref_tuples)
+        dedupe_task_indices, obj_refs = zip(*dedupe_task_idx_and_obj_id_tuples)
         # this depends on `ray.get` result order matching input order, as per the
         # contract established in: https://github.com/ray-project/ray/pull/16763
-        src_file_records_list = ray.get(list(obj_refs))
+        src_file_records_list = object_store.get_many(list(obj_refs))
         all_src_file_records = defaultdict(list)
         for i, src_file_records in enumerate(src_file_records_list):
             dedupe_task_idx = dedupe_task_indices[i]
@@ -231,7 +210,9 @@ def materialize(
                 record_numbers_length += 1
                 mask_pylist[record_number] = True
             if (
-                record_numbers_length == src_file_record_count
+                round_completion_info
+                and enable_manifest_entry_copy_by_reference
+                and record_numbers_length == src_file_record_count
                 and src_file_partition_locator
                 == round_completion_info.compacted_delta_locator.partition_locator
             ):
@@ -244,8 +225,8 @@ def materialize(
                 manifest_entry_list_reference.append(untouched_src_manifest_entry)
                 referenced_pyarrow_write_result = PyArrowWriteResult.of(
                     1,
-                    manifest.meta.source_content_length,
-                    manifest.meta.content_length,
+                    untouched_src_manifest_entry.meta.source_content_length,
+                    untouched_src_manifest_entry.meta.content_length,
                     src_file_record_count,
                 )
                 referenced_pyarrow_write_results.append(referenced_pyarrow_write_result)
@@ -7,7 +7,6 @@ import numpy as np
 import pyarrow as pa
 import ray
 import s3fs
-from ray import cloudpickle
 from ray.types import ObjectRef
 
 from deltacat import logs
@@ -30,6 +29,7 @@ from deltacat.types.media import ContentEncoding, ContentType
 from deltacat.types.tables import get_table_slicer, get_table_writer
 from deltacat.utils.common import ReadKwargsProvider
 from deltacat.utils.ray_utils.concurrency import invoke_parallel
+from deltacat.io.object_store import IObjectStore
 
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
@@ -203,7 +203,10 @@ def group_record_indices_by_hash_bucket(
 
 
 def group_hash_bucket_indices(
-    hash_bucket_object_groups: np.ndarray, num_buckets: int, num_groups: int
+    hash_bucket_object_groups: np.ndarray,
+    num_buckets: int,
+    num_groups: int,
+    object_store: Optional[IObjectStore] = None,
 ) -> Tuple[np.ndarray, List[ObjectRef]]:
     """
     Groups all the ObjectRef that belongs to a particular hash bucket group and hash bucket index.
@@ -226,19 +229,10 @@ def group_hash_bucket_indices(
     for hb_group, obj in enumerate(hb_group_to_object):
         if obj is None:
             continue
-        obj_ref = ray.put(obj)
-        pickled_obj_ref = cloudpickle.dumps(obj_ref)
-        object_refs.append(pickled_obj_ref)
-        hash_bucket_group_to_obj_id[hb_group] = pickled_obj_ref
-        # NOTE: The cloudpickle.dumps API call creates an out of band object reference to the object_ref variable.
-        # After pickling, Ray cannot track the serialized copy of the object or determine when the ObjectRef has been deserialized
-        # (e.g., if the ObjectRef is deserialized by a non-Ray process).
-        # Thus the object_ref cannot be tracked by Ray's distributed reference counter, even if it goes out of scope.
-        # The object now has a permanent reference and the data can't be freed from Ray's object store.
-        # Manually deleting the untrackable object references offsets these permanent references and
-        # helps to allow these objects to be garbage collected normally.
-        del obj_ref
-        del pickled_obj_ref
+        object_ref = object_store.put(obj)
+        object_refs.append(object_ref)
+        hash_bucket_group_to_obj_id[hb_group] = object_ref
+        del object_ref
     return hash_bucket_group_to_obj_id, object_refs
 
 
@@ -1,6 +1,6 @@
 import json
 import logging
-
+from typing import Dict, Any
 from deltacat import logs
 from deltacat.compute.compactor import RoundCompletionInfo
 from deltacat.storage import PartitionLocator
@@ -19,7 +19,9 @@ def get_round_completion_file_s3_url(
 
 
 def read_round_completion_file(
-    bucket: str, source_partition_locator: PartitionLocator
+    bucket: str,
+    source_partition_locator: PartitionLocator,
+    **s3_client_kwargs: Optional[Dict[str, Any]],
 ) -> RoundCompletionInfo:
 
     round_completion_file_url = get_round_completion_file_s3_url(
@@ -28,7 +30,7 @@ def read_round_completion_file(
     )
     logger.info(f"reading round completion file from: {round_completion_file_url}")
     round_completion_info = None
-    result = s3_utils.download(round_completion_file_url, False)
+    result = s3_utils.download(round_completion_file_url, False, **s3_client_kwargs)
     if result:
         json_str = result["Body"].read().decode("utf-8")
         round_completion_info = RoundCompletionInfo(json.loads(json_str))
@@ -41,6 +43,7 @@ def write_round_completion_file(
     source_partition_locator: Optional[PartitionLocator],
     round_completion_info: RoundCompletionInfo,
     completion_file_s3_url: str = None,
+    **s3_client_kwargs: Optional[Dict[str, Any]],
 ) -> str:
     if bucket is None and completion_file_s3_url is None:
         raise AssertionError("Either bucket or completion_file_s3_url must be passed")
@@ -52,6 +55,10 @@ def write_round_completion_file(
         source_partition_locator,
     )
     logger.info(f"writing round completion file to: {completion_file_s3_url}")
-    s3_utils.upload(completion_file_s3_url, str(json.dumps(round_completion_info)))
+    s3_utils.upload(
+        completion_file_s3_url,
+        str(json.dumps(round_completion_info)),
+        **s3_client_kwargs,
+    )
    logger.info(f"round completion file written to: {completion_file_s3_url}")
    return completion_file_s3_url
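read_round_completion_file and write_round_completion_file now accept arbitrary **s3_client_kwargs and forward them to s3_utils.download/s3_utils.upload. The toy below is a self-contained sketch of that pass-through pattern only; the download function here is a stand-in for s3_utils.download (not the real implementation), and the bucket name and region_name key are hypothetical, assuming the kwargs are boto3-style S3 client options (e.g. built via boto3.client("s3", **s3_client_kwargs)).

    # Self-contained toy sketch of the **s3_client_kwargs pass-through pattern.
    def download(url: str, fail_if_not_found: bool = True, **s3_client_kwargs):
        # stand-in for s3_utils.download; a real implementation would build the
        # S3 client from these kwargs
        return {"url": url, "fail_if_not_found": fail_if_not_found, "client_kwargs": s3_client_kwargs}


    def read_round_completion_file_sketch(bucket: str, **s3_client_kwargs):
        # mirrors the new signature: extra keywords are captured and forwarded verbatim
        return download(f"s3://{bucket}/rcf/example.json", False, **s3_client_kwargs)


    result = read_round_completion_file_sketch("my-bucket", region_name="us-east-1")
    assert result["client_kwargs"] == {"region_name": "us-east-1"}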
deltacat/io/__init__.py CHANGED
@@ -1,7 +0,0 @@
1
- from deltacat.io.dataset import DeltacatDataset
2
- from deltacat.io.read_api import read_redshift
3
-
4
- __all__ = [
5
- "DeltacatDataset",
6
- "read_redshift",
7
- ]
@@ -0,0 +1,48 @@
+import logging
+from ray import cloudpickle
+import time
+from deltacat.io.object_store import IObjectStore
+from typing import Any, List
+from deltacat import logs
+import os
+import uuid
+from builtins import open
+
+logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+
+class FileObjectStore(IObjectStore):
+    """
+    An implementation of object store that uses file system.
+    """
+
+    def __init__(self, dir_path: str) -> None:
+        self.dir_path = dir_path
+        super().__init__()
+
+    def put_many(self, objects: List[object], *args, **kwargs) -> List[Any]:
+        result = []
+
+        for obj in objects:
+            serialized = cloudpickle.dumps(obj)
+            ref = f"{self.dir_path}/{uuid.uuid4()}"
+            with open(ref, "xb") as f:
+                f.write(serialized)
+
+            result.append(ref)
+
+        return result
+
+    def get_many(self, refs: List[Any], *args, **kwargs) -> List[object]:
+        result = []
+        start = time.monotonic()
+        for ref in refs:
+            with open(ref, "rb") as f:
+                serialized = f.read()
+                loaded = cloudpickle.loads(serialized)
+                result.append(loaded)
+            os.remove(ref)
+        end = time.monotonic()
+
+        logger.info(f"The total time taken to read all objects is: {end - start}")
+        return result
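The new FileObjectStore writes each object to <dir_path>/<uuid4> via cloudpickle and deletes the backing file on read, so a reference can be consumed only once. A hedged usage sketch follows; the import path is an assumption (the diff omits the new file's name), and the temp-dir setup is illustrative only.

    # Hedged usage sketch for FileObjectStore; assumes deltacat is installed and
    # that the class lives at deltacat.io.file_object_store (path not shown in this diff).
    import tempfile

    from deltacat.io.file_object_store import FileObjectStore

    with tempfile.TemporaryDirectory() as tmp_dir:
        store = FileObjectStore(dir_path=tmp_dir)

        refs = store.put_many([{"a": 1}, [1, 2, 3]])  # each object lands in tmp_dir/<uuid4>
        objects = store.get_many(refs)                # reads, deserializes, and deletes the files

        assert objects == [{"a": 1}, [1, 2, 3]]
        # Note: get_many removes the backing files, so the same refs cannot be read twice.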