deltacat 1.1.28__py3-none-any.whl → 1.1.29__py3-none-any.whl

deltacat/__init__.py CHANGED
@@ -44,7 +44,7 @@ from deltacat.types.tables import TableWriteMode
 
 deltacat.logs.configure_deltacat_logger(logging.getLogger(__name__))
 
-__version__ = "1.1.28"
+__version__ = "1.1.29"
 
 
 __all__ = [
deltacat/compute/compactor_v2/constants.py CHANGED
@@ -1,3 +1,5 @@
+from deltacat.utils.common import env_bool, env_integer
+
 TOTAL_BYTES_IN_SHA1_HASH = 20
 
 PK_DELIMITER = "L6kl7u5f"
@@ -31,7 +33,9 @@ TOTAL_MEMORY_BUFFER_PERCENTAGE = 30
 # The total size of records that will be hash bucketed at once
 # Since, sorting is nlogn, we ensure that is not performed
 # on a very large dataset for best performance.
-MAX_SIZE_OF_RECORD_BATCH_IN_GIB = 2 * 1024 * 1024 * 1024
+MAX_SIZE_OF_RECORD_BATCH_IN_GIB = env_integer(
+    "MAX_SIZE_OF_RECORD_BATCH_IN_GIB", 2 * 1024 * 1024 * 1024
+)
 
 # Whether to drop duplicates during merge.
 DROP_DUPLICATES = True
@@ -78,3 +82,13 @@ COMPACT_PARTITION_METRIC_PREFIX = "compact_partition"
 # Number of rounds to run hash/merge for a single
 # partition. (For large table support)
 DEFAULT_NUM_ROUNDS = 1
+
+# Whether to perform sha1 hashing when required to
+# optimize memory. For example, hashing is always
+# required for bucketing where it's not mandatory
+# when dropping duplicates. Setting this to True
+# will disable sha1 hashing in cases where it isn't
+# mandatory. This flag is False by default.
+SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED = env_bool(
+    "SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED", False
+)
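
Both new constants are resolved through env_integer/env_bool, so a deployment can tune them without a code change. A minimal sketch of such an override, assuming the helpers read os.environ when the constants module is imported (the boolean string accepted by env_bool, "True" below, is an assumption):

import os

# Hypothetical tuning for a memory-constrained worker: cap grouped record
# batches at 1 GiB and skip the optional sha1 hashing of primary keys.
os.environ["MAX_SIZE_OF_RECORD_BATCH_IN_GIB"] = str(1 * 1024 * 1024 * 1024)
os.environ["SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED"] = "True"

# The constants are evaluated at import time, so set the variables before
# deltacat's compactor modules are imported.
from deltacat.compute.compactor_v2 import constants

print(constants.MAX_SIZE_OF_RECORD_BATCH_IN_GIB)  # 1073741824
print(constants.SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED)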
deltacat/compute/compactor_v2/steps/merge.py CHANGED
@@ -7,6 +7,7 @@ import ray
 import itertools
 import time
 import pyarrow.compute as pc
+from deltacat.utils.pyarrow import MAX_INT_BYTES
 import deltacat.compute.compactor_v2.utils.merge as merge_utils
 from uuid import uuid4
 from deltacat import logs
@@ -147,10 +148,32 @@ def _merge_tables(
     if compacted_table:
         compacted_table = all_tables[0]
 
+        compacted_pk_hash_str = compacted_table[sc._PK_HASH_STRING_COLUMN_NAME]
+        incremental_pk_hash_str = incremental_table[sc._PK_HASH_STRING_COLUMN_NAME]
+
+        logger.info(
+            f"Size of compacted pk hash={compacted_pk_hash_str.nbytes} "
+            f"and incremental pk hash={incremental_pk_hash_str.nbytes}."
+        )
+
+        if (
+            compacted_table[sc._PK_HASH_STRING_COLUMN_NAME].nbytes >= MAX_INT_BYTES
+            or incremental_table[sc._PK_HASH_STRING_COLUMN_NAME].nbytes >= MAX_INT_BYTES
+        ):
+            logger.info("Casting compacted and incremental pk hash to large_string...")
+            # is_in combines the chunks of the chunked array passed which can cause
+            # ArrowCapacityError if the total size of string array is over 2GB.
+            # Using a large_string would resolve this issue.
+            # The cast here should be zero-copy in most cases.
+            compacted_pk_hash_str = pc.cast(compacted_pk_hash_str, pa.large_string())
+            incremental_pk_hash_str = pc.cast(
+                incremental_pk_hash_str, pa.large_string()
+            )
+
         records_to_keep = pc.invert(
             pc.is_in(
-                compacted_table[sc._PK_HASH_STRING_COLUMN_NAME],
-                incremental_table[sc._PK_HASH_STRING_COLUMN_NAME],
+                compacted_pk_hash_str,
+                incremental_pk_hash_str,
             )
         )
 
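
The new guard works around the 32-bit offset limit of Arrow's string type: pc.is_in concatenates the chunks of a ChunkedArray, which raises ArrowCapacityError once the combined string data approaches 2GB, while large_string uses 64-bit offsets. A small, self-contained sketch of the same pattern (sizes shrunk so it runs instantly; in the real merge the cast only triggers once a column's nbytes reaches MAX_INT_BYTES):

import pyarrow as pa
import pyarrow.compute as pc

MAX_INT_BYTES = 2147483646  # same threshold merge.py now imports

pk_hashes = pa.chunked_array([["hash-a", "hash-b"], ["hash-c"]])  # tiny stand-in
if pk_hashes.nbytes >= MAX_INT_BYTES:  # true only for multi-GiB columns
    pk_hashes = pc.cast(pk_hashes, pa.large_string())

# Same shape as _merge_tables: keep compacted rows whose pk hash does not
# reappear in the incremental table.
mask = pc.invert(pc.is_in(pk_hashes, pa.array(["hash-b"], type=pk_hashes.type)))
print(mask)  # [true, false, true]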
@@ -492,9 +515,11 @@ def _copy_manifests_from_hash_bucketing(
 def _timed_merge(input: MergeInput) -> MergeResult:
     task_id = get_current_ray_task_id()
     worker_id = get_current_ray_worker_id()
-    with memray.Tracker(
-        f"merge_{worker_id}_{task_id}.bin"
-    ) if input.enable_profiler else nullcontext():
+    with (
+        memray.Tracker(f"merge_{worker_id}_{task_id}.bin")
+        if input.enable_profiler
+        else nullcontext()
+    ):
         total_input_records, total_deduped_records = 0, 0
         total_dropped_records = 0
         materialized_results: List[MaterializeResult] = []
deltacat/compute/compactor_v2/utils/dedupe.py CHANGED
@@ -25,7 +25,7 @@ def _create_chunked_index_array(array: pa.Array) -> pa.Array:
         result[index] = np.arange(cl, dtype="int32")
 
     chunk_lengths = ([0] + chunk_lengths)[:-1]
-    result = pa.chunked_array(result + np.cumsum(chunk_lengths))
+    result = pa.chunked_array(result + np.cumsum(chunk_lengths), type=pa.int32())
     return result
 
 
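The explicit type=pa.int32() pins the index array to 32-bit integers: depending on the numpy version's promotion rules, adding the int64 cumsum offsets to the int32 arange chunks can yield int64 arrays, which pa.chunked_array would then infer as an int64 column. A small sketch of that effect (the dtype shown by the first print varies with the numpy version):

import numpy as np
import pyarrow as pa

chunks = np.zeros(2, dtype="object")
chunks[0] = np.arange(3, dtype="int32")
chunks[1] = np.arange(2, dtype="int32")
offsets = np.cumsum([0, 3])  # [0, 3], typically int64

shifted = chunks + offsets   # element-wise add; may promote each chunk to int64
print([a.dtype for a in shifted])
print(pa.chunked_array(shifted, type=pa.int32()).type)  # int32 regardless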
deltacat/compute/compactor_v2/utils/primary_key_index.py CHANGED
@@ -10,6 +10,7 @@ from deltacat.compute.compactor_v2.constants import (
     TOTAL_BYTES_IN_SHA1_HASH,
     PK_DELIMITER,
     MAX_SIZE_OF_RECORD_BATCH_IN_GIB,
+    SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED,
 )
 import time
 from deltacat.compute.compactor.model.delta_file_envelope import DeltaFileEnvelope
@@ -48,6 +49,13 @@ def _is_sha1_desired(hash_columns: List[pa.Array]) -> bool:
         f"Found total length of hash column={total_len} and total_size={total_size}"
     )
 
+    if SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED:
+        logger.info(
+            f"SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED is True. "
+            f"Returning False for is_sha1_desired"
+        )
+        return False
+
     return total_size > TOTAL_BYTES_IN_SHA1_HASH * total_len
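
For context, the unchanged return expression encodes a simple trade-off: sha1-hashing the primary key column only saves memory when the raw keys are, on average, longer than the 20-byte digest, and the new flag short-circuits that check entirely. A worked example with made-up numbers:

TOTAL_BYTES_IN_SHA1_HASH = 20

total_len = 1_000_000    # number of primary keys (hypothetical)
total_size = 35_000_000  # combined size in bytes, i.e. 35 bytes per key on average

# Hashing wins only when the raw keys outweigh the fixed-size digests.
print(total_size > TOTAL_BYTES_IN_SHA1_HASH * total_len)  # True -> sha1 is desired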
@@ -108,9 +116,10 @@ def _optimized_group_record_batches_by_hash_bucket(
     record_batches = []
     result_len = 0
     for record_batch in table_batches:
-        current_bytes += record_batch.nbytes
-        record_batches.append(record_batch)
-        if current_bytes >= MAX_SIZE_OF_RECORD_BATCH_IN_GIB:
+        if (
+            record_batches
+            and current_bytes + record_batch.nbytes >= MAX_SIZE_OF_RECORD_BATCH_IN_GIB
+        ):
             logger.info(
                 f"Total number of record batches without exceeding {MAX_SIZE_OF_RECORD_BATCH_IN_GIB} "
                 f"is {len(record_batches)} and size {current_bytes}"
@@ -128,6 +137,9 @@ def _optimized_group_record_batches_by_hash_bucket(
             current_bytes = 0
             record_batches.clear()
 
+        current_bytes += record_batch.nbytes
+        record_batches.append(record_batch)
+
     if record_batches:
         appended_len, append_latency = timed_invocation(
             _append_table_by_hash_bucket,
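
The reordering changes the loop from "append, then check" to "check, then append": a record batch that would push the running total past MAX_SIZE_OF_RECORD_BATCH_IN_GIB now starts a fresh group, and a single oversized batch still forms a group of its own rather than triggering an empty flush. A standalone sketch of the same accumulation pattern, with plain integers standing in for record-batch byte sizes (group_by_budget is a hypothetical name, not a deltacat function):

from typing import Iterable, List


def group_by_budget(sizes: Iterable[int], budget: int) -> List[List[int]]:
    groups, current, current_bytes = [], [], 0
    for size in sizes:
        # Flush *before* appending when adding this item would exceed the
        # budget, but never flush an empty group.
        if current and current_bytes + size >= budget:
            groups.append(current)
            current, current_bytes = [], 0
        current.append(size)
        current_bytes += size
    if current:
        groups.append(current)
    return groups


print(group_by_budget([3, 3, 3, 10, 1], budget=8))  # [[3, 3], [3], [10], [1]]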
deltacat/tests/compute/compact_partition_test_cases.py CHANGED
@@ -601,6 +601,38 @@ INCREMENTAL_TEST_CASES: Dict[str, IncrementalCompactionTestCaseParams] = {
         skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
         assert_compaction_audit=None,
     ),
+    "15-incremental-empty-input-with-single-hash-bucket": IncrementalCompactionTestCaseParams(
+        primary_keys={"pk_col_1"},
+        sort_keys=[SortKey.of(key_name="sk_col_1")],
+        partition_keys=ZERO_VALUED_PARTITION_KEYS_PARAM,
+        partition_values=ZERO_VALUED_PARTITION_VALUES_PARAM,
+        input_deltas=pa.Table.from_arrays(
+            [
+                pa.array([]),
+                pa.array([]),
+            ],
+            names=["pk_col_1", "sk_col_1"],
+        ),
+        input_deltas_delta_type=DeltaType.UPSERT,
+        expected_terminal_compact_partition_result=pa.Table.from_arrays(
+            [
+                pa.array([]),
+                pa.array([]),
+            ],
+            names=["pk_col_1", "sk_col_1"],
+        ),
+        expected_terminal_exception=None,
+        expected_terminal_exception_message=None,
+        do_create_placement_group=False,
+        records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
+        hash_bucket_count=1,
+        read_kwargs_provider=None,
+        drop_duplicates=True,
+        is_inplace=False,
+        add_late_deltas=None,
+        skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
+        assert_compaction_audit=assert_compaction_audit_no_hash_bucket,
+    ),
 }
 
 INCREMENTAL_TEST_CASES = with_compactor_version_func_test_param(INCREMENTAL_TEST_CASES)
deltacat/tests/compute/compactor_v2/test_compaction_session.py CHANGED
@@ -1,6 +1,7 @@
 from typing import Dict, Any
 import ray
 import os
+import pyarrow as pa
 import pytest
 import boto3
 from deltacat.compute.compactor.model.compaction_session_audit_info import (
@@ -76,6 +77,17 @@ def local_deltacat_storage_kwargs(request: pytest.FixtureRequest):
     os.remove(DATABASE_FILE_PATH_VALUE)
 
 
+@pytest.fixture(scope="function")
+def disable_sha1(monkeypatch):
+    import deltacat.compute.compactor_v2.utils.primary_key_index
+
+    monkeypatch.setattr(
+        deltacat.compute.compactor_v2.utils.primary_key_index,
+        "SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED",
+        True,
+    )
+
+
 class TestCompactionSession:
     """
     This class adds specific tests that aren't part of the parametrized test suite.
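
Note that the fixture patches the flag on deltacat.compute.compactor_v2.utils.primary_key_index rather than on the constants module: primary_key_index imports the constant by value via "from ... import SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED", so patching only the constants module would leave the copy actually used by the code under test untouched.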
@@ -556,3 +568,124 @@ class TestCompactionSession:
                 }
             )
         )
+
+    def test_compact_partition_when_incremental_pk_hash_is_over_2gb(
+        self, s3_resource, local_deltacat_storage_kwargs, disable_sha1
+    ):
+        """
+        A test case which ensures the compaction succeeds even if the incremental
+        arrow table size is over 2GB. It is added to prevent ArrowCapacityError
+        when running is_in operation during merge.
+
+        Note that we set SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED to bypass sha1 hashing
+        which truncates the lengths of pk strings when deduping.
+        """
+        # setup
+        staged_source = stage_partition_from_file_paths(
+            self.NAMESPACE, ["source"], **local_deltacat_storage_kwargs
+        )
+        # we create chunked array to avoid ArrowCapacityError
+        chunked_pk_array = pa.chunked_array([["13bytesstring"], ["12bytestring"]])
+        table = pa.table([chunked_pk_array], names=["pk"])
+        source_delta = commit_delta_to_staged_partition(
+            staged_source, pa_table=table, **local_deltacat_storage_kwargs
+        )
+
+        staged_dest = stage_partition_from_file_paths(
+            self.NAMESPACE, ["destination"], **local_deltacat_storage_kwargs
+        )
+        dest_partition = ds.commit_partition(
+            staged_dest, **local_deltacat_storage_kwargs
+        )
+
+        # rebase first
+        rebase_url = compact_partition(
+            CompactPartitionParams.of(
+                {
+                    "compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
+                    "compacted_file_content_type": ContentType.PARQUET,
+                    "dd_max_parallelism_ratio": 1.0,
+                    "deltacat_storage": ds,
+                    "deltacat_storage_kwargs": local_deltacat_storage_kwargs,
+                    "destination_partition_locator": dest_partition.locator,
+                    "drop_duplicates": True,
+                    "hash_bucket_count": 1,
+                    "last_stream_position_to_compact": source_delta.stream_position,
+                    "list_deltas_kwargs": {
+                        **local_deltacat_storage_kwargs,
+                        **{"equivalent_table_types": []},
+                    },
+                    "primary_keys": ["pk"],
+                    "rebase_source_partition_locator": source_delta.partition_locator,
+                    "rebase_source_partition_high_watermark": source_delta.stream_position,
+                    "records_per_compacted_file": 4000,
+                    "s3_client_kwargs": {},
+                    "source_partition_locator": source_delta.partition_locator,
+                    "resource_estimation_method": ResourceEstimationMethod.PREVIOUS_INFLATION,
+                }
+            )
+        )
+
+        rebased_rcf = get_rcf(s3_resource, rebase_url)
+
+        assert rebased_rcf.compacted_pyarrow_write_result.files == 1
+        assert rebased_rcf.compacted_pyarrow_write_result.records == 2
+
+        # Run incremental with a small delta on source
+        chunked_pk_array = pa.chunked_array(
+            [["13bytesstring" * 95_000_000], ["12bytestring" * 95_000_000]]
+        )  # 2.3GB
+        table = pa.table([chunked_pk_array], names=["pk"])
+
+        incremental_source_delta = commit_delta_to_partition(
+            source_delta.partition_locator,
+            pa_table=table,
+            **local_deltacat_storage_kwargs,
+        )
+        assert (
+            incremental_source_delta.partition_locator == source_delta.partition_locator
+        ), "source partition locator should not change"
+        dest_partition = ds.get_partition(
+            dest_partition.stream_locator,
+            dest_partition.partition_values,
+            **local_deltacat_storage_kwargs,
+        )
+
+        assert (
+            dest_partition.locator
+            == rebased_rcf.compacted_delta_locator.partition_locator
+        ), "The new destination partition should be same as compacted partition"
+
+        # Run incremental
+        incremental_url = compact_partition(
+            CompactPartitionParams.of(
+                {
+                    "compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
+                    "compacted_file_content_type": ContentType.PARQUET,
+                    "dd_max_parallelism_ratio": 1.0,
+                    "deltacat_storage": ds,
+                    "deltacat_storage_kwargs": local_deltacat_storage_kwargs,
+                    "destination_partition_locator": dest_partition.locator,
+                    "drop_duplicates": True,
+                    "hash_bucket_count": 1,
+                    "last_stream_position_to_compact": incremental_source_delta.stream_position,
+                    "list_deltas_kwargs": {
+                        **local_deltacat_storage_kwargs,
+                        **{"equivalent_table_types": []},
+                    },
+                    "primary_keys": ["pk"],
+                    "records_per_compacted_file": 4000,
+                    "s3_client_kwargs": {},
+                    "source_partition_locator": incremental_source_delta.partition_locator,
+                    "resource_estimation_method": ResourceEstimationMethod.PREVIOUS_INFLATION,
+                }
+            )
+        )
+
+        incremental_rcf = get_rcf(s3_resource, incremental_url)
+
+        assert incremental_rcf.compacted_pyarrow_write_result.files == 1
+        assert (
+            incremental_rcf.compacted_pyarrow_write_result.pyarrow_bytes >= 2300000000
+        )
+        assert incremental_rcf.compacted_pyarrow_write_result.records == 4
deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py ADDED
@@ -0,0 +1,45 @@
+import pyarrow as pa
+from deltacat.compute.compactor_v2.utils.primary_key_index import (
+    group_by_pk_hash_bucket,
+)
+
+
+class TestGroupByPkHashBucket:
+    def test_sanity(self):
+        record = pa.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
+        pk = pa.array(["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"])
+        record_batch = pa.RecordBatch.from_arrays([record, pk], names=["record", "pk"])
+        table = pa.Table.from_batches([record_batch])
+        grouped_array = group_by_pk_hash_bucket(table, 3, ["pk"])
+
+        assert len(grouped_array) == 3
+        total_records = 0
+        for arr in grouped_array:
+            if arr is not None:
+                total_records += len(arr[1])
+
+        assert total_records == len(table)
+
+    def test_when_record_batches_exceed_int_max_size(self):
+        record = pa.array(["12bytestring" * 90_000_000])
+        record_batch = pa.RecordBatch.from_arrays([record], names=["pk"])
+        table = pa.Table.from_batches([record_batch, record_batch])
+
+        grouped_array = group_by_pk_hash_bucket(table, 3, ["pk"])
+
+        assert len(grouped_array) == 3
+        # two record batches are preserved as combining them
+        # would exceed 2GB.
+        assert len(grouped_array[2].to_batches()) == 2
+
+    def test_when_record_batches_less_than_int_max_size(self):
+        record = pa.array(["12bytestring" * 90_000])
+        record_batch = pa.RecordBatch.from_arrays([record], names=["pk"])
+        table = pa.Table.from_batches([record_batch, record_batch])
+
+        grouped_array = group_by_pk_hash_bucket(table, 3, ["pk"])
+
+        assert len(grouped_array) == 3
+        # Combined the arrays into one record batch as the size
+        # would not exceed 2GB.
+        assert len(grouped_array[1].to_batches()) == 1
deltacat/tests/test_utils/pyarrow.py CHANGED
@@ -47,7 +47,8 @@ def stage_partition_from_file_paths(
 
 def commit_delta_to_staged_partition(
     staged_partition,
-    file_paths: List[str],
+    file_paths: List[str] = None,
+    pa_table: pa.Table = None,
     content_type: ContentType = ContentType.PARQUET,
     *args,
     **kwargs,
@@ -57,6 +58,7 @@ def commit_delta_to_staged_partition(
         *args,
         file_paths=file_paths,
         content_type=content_type,
+        pa_table=pa_table,
        **kwargs,
     )
     ds.commit_partition(staged_partition, **kwargs)
@@ -76,23 +78,28 @@ def download_delta(delta_like: Union[Delta, DeltaLocator], *args, **kwargs) -> D
 
 def commit_delta_to_partition(
     partition: Union[Partition, PartitionLocator],
-    file_paths: List[str],
+    file_paths: List[str] = None,
+    pa_table: pa.Table = None,
     content_type: ContentType = ContentType.PARQUET,
     *args,
     **kwargs,
 ) -> Delta:
-    tables = []
 
     if isinstance(partition, PartitionLocator):
         partition = ds.get_partition(
             partition.stream_locator, partition.partition_values, *args, **kwargs
         )
+    if pa_table is None:
+        assert file_paths is not None, "One of pa_table or file_paths must be passed."
+        tables = []
+        for file_path in file_paths:
+            table = pa.csv.read_csv(file_path)
+            tables.append(table)
 
-    for file_path in file_paths:
-        table = pa.csv.read_csv(file_path)
-        tables.append(table)
+        pa_table = pa.concat_tables(tables)
 
-    table = pa.concat_tables(tables)
-    staged_delta = ds.stage_delta(table, partition, content_type=content_type, **kwargs)
+    staged_delta = ds.stage_delta(
+        pa_table, partition, content_type=content_type, **kwargs
+    )
 
     return ds.commit_delta(staged_delta, **kwargs)
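
With these changes the test helpers accept either CSV file paths or an in-memory table. A short sketch of the new calling convention used by the 2GB test above (namespace and storage_kwargs stand in for whatever the calling test provides):

import pyarrow as pa

from deltacat.tests.test_utils.pyarrow import (
    commit_delta_to_staged_partition,
    stage_partition_from_file_paths,
)


def stage_and_commit(namespace, storage_kwargs):
    staged = stage_partition_from_file_paths(namespace, ["source"], **storage_kwargs)
    # Pass pa_table directly instead of writing the data to CSV files first.
    table = pa.table({"pk": ["a", "b", "c"]})
    return commit_delta_to_staged_partition(staged, pa_table=table, **storage_kwargs)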
deltacat/tests/utils/test_pyarrow.py CHANGED
@@ -16,6 +16,7 @@ from pyarrow.parquet import ParquetFile
 import pyarrow as pa
 
 PARQUET_FILE_PATH = "deltacat/tests/utils/data/test_file.parquet"
+PARQUET_GZIP_COMPRESSED_FILE_PATH = "deltacat/tests/utils/data/test_file.parquet.gz"
 EMPTY_UTSV_PATH = "deltacat/tests/utils/data/empty.csv"
 NON_EMPTY_VALID_UTSV_PATH = "deltacat/tests/utils/data/non_empty_valid.csv"
 OVERFLOWING_DECIMAL_PRECISION_UTSV_PATH = (
@@ -789,3 +790,25 @@ class TestS3FileToTable(TestCase):
             self.assertEqual(field.name, schema.field(index).name)
 
         self.assertEqual(result.schema.field(1).type, "string")
+
+    def test_s3_file_to_table_when_parquet_gzip(self):
+
+        pa_kwargs_provider = lambda content_type, kwargs: {
+            "reader_type": "pyarrow",
+            **kwargs,
+        }
+
+        result = s3_file_to_table(
+            PARQUET_GZIP_COMPRESSED_FILE_PATH,
+            ContentType.PARQUET.value,
+            ContentEncoding.GZIP.value,
+            ["n_legs", "animal"],
+            ["n_legs"],
+            pa_read_func_kwargs_provider=pa_kwargs_provider,
+        )
+
+        self.assertEqual(len(result), 6)
+        self.assertEqual(len(result.column_names), 1)
+        schema = result.schema
+        schema_index = schema.get_field_index("n_legs")
+        self.assertEqual(schema.field(schema_index).type, "int64")
deltacat/utils/pyarrow.py CHANGED
@@ -58,6 +58,7 @@ RAISE_ON_DECIMAL_OVERFLOW = "raise_on_decimal_overflow"
 # Note the maximum from https://arrow.apache.org/docs/python/generated/pyarrow.Decimal256Type.html#pyarrow.Decimal256Type
 DECIMAL256_DEFAULT_SCALE = 38
 DECIMAL256_MAX_PRECISION = 76
+MAX_INT_BYTES = 2147483646
 
 
 def _filter_schema_for_columns(schema: pa.Schema, columns: List[str]) -> pa.Schema:
@@ -129,9 +130,11 @@ def _read_csv_rounding_decimal_columns_to_fit_scale(
     # conversion to decimal256 isn't implemented as of pyarrow==12.0.1
     new_schema = _new_schema_with_replaced_fields(
         schema,
-        lambda fld: pa.field(fld.name, pa.string(), metadata=fld.metadata)
-        if pa.types.is_decimal128(fld.type) or pa.types.is_decimal256(fld.type)
-        else None,
+        lambda fld: (
+            pa.field(fld.name, pa.string(), metadata=fld.metadata)
+            if pa.types.is_decimal128(fld.type) or pa.types.is_decimal256(fld.type)
+            else None
+        ),
     )
     new_kwargs = sanitize_kwargs_by_supported_kwargs(
         ["read_options", "parse_options", "convert_options", "memory_pool"],
@@ -569,8 +572,8 @@ def s3_file_to_table(
                 **s3_client_kwargs,
             )
 
-        if READER_TYPE_KWARG in kwargs:
-            kwargs.pop(READER_TYPE_KWARG)
+    if READER_TYPE_KWARG in kwargs:
+        kwargs.pop(READER_TYPE_KWARG)
 
     filesystem = io
     if s3_url.startswith("s3://"):
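
The removed and re-added READER_TYPE_KWARG lines appear to differ only in indentation (easy to miss in this rendering): the cleanup that pops reader_type from the kwargs moves out of the identity-encoded parquet branch so it also runs for other encodings, which is what the new gzip-compressed parquet test relies on.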
@@ -912,7 +915,6 @@ def sliced_string_cast(array: pa.ChunkedArray) -> pa.ChunkedArray:
     TODO: deprecate this function when pyarrow performs proper ChunkedArray -> ChunkedArray casting
     """
     dtype = array.type
-    MAX_BYTES = 2147483646
     max_str_len = None
     if pa.types.is_integer(dtype):
         max_str_len = _int_max_string_len()
@@ -924,7 +926,7 @@ def sliced_string_cast(array: pa.ChunkedArray) -> pa.ChunkedArray:
         max_str_len = _max_decimal256_string_len()
 
     if max_str_len is not None:
-        max_elems_per_chunk = MAX_BYTES // (2 * max_str_len)  # safety factor of 2
+        max_elems_per_chunk = MAX_INT_BYTES // (2 * max_str_len)  # safety factor of 2
         all_chunks = []
         for chunk in array.chunks:
             if len(chunk) < max_elems_per_chunk:
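
Hoisting the constant to module level as MAX_INT_BYTES ties sliced_string_cast's chunk sizing to the same int32 ceiling the merge step now checks against. The arithmetic behind the per-chunk element cap, using an illustrative 20-character maximum string width:

MAX_INT_BYTES = 2147483646  # ~2 GiB, the int32 offset ceiling
max_str_len = 20            # e.g. the widest rendered value (illustrative)

max_elems_per_chunk = MAX_INT_BYTES // (2 * max_str_len)  # safety factor of 2
print(max_elems_per_chunk)                # 53687091 elements per slice
print(max_elems_per_chunk * max_str_len)  # ~1 GiB of string data per slice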
{deltacat-1.1.28.dist-info → deltacat-1.1.29.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: deltacat
-Version: 1.1.28
+Version: 1.1.29
 Summary: A scalable, fast, ACID-compliant Data Catalog powered by Ray.
 Home-page: https://github.com/ray-project/deltacat
 Author: Ray Team
{deltacat-1.1.28.dist-info → deltacat-1.1.29.dist-info}/RECORD RENAMED
@@ -1,4 +1,4 @@
-deltacat/__init__.py,sha256=GPlTQc6AW4ig_nZJ7kMVe-kbZxYfrSVGFN1YEqY8dXU,1778
+deltacat/__init__.py,sha256=DoUiDxmgMh8HUGOEAG7CUY0Q9Ip-S7gePDsL8XQO5kk,1778
 deltacat/constants.py,sha256=TUJLXUJ9xq1Ryil72yLkKR8EDH_Irp5wUg56QstbRNE,2181
 deltacat/exceptions.py,sha256=7sjk3BuMY5Oo-6OvAfHncZx_OcvtEL47BblWr2F7waE,12740
 deltacat/logs.py,sha256=EQSDin1deehzz5xlLV1_TrFJrO_IBZ9Ahp7MdL-4cK8,9363
@@ -51,7 +51,7 @@ deltacat/compute/compactor/utils/sort_key.py,sha256=oK6otg-CSsma6zlGPaKg-KNEvcZR
 deltacat/compute/compactor/utils/system_columns.py,sha256=CNIgAGos0xAGEpdaQIH7KfbSRrGZgjRbItXMararqXQ,9399
 deltacat/compute/compactor_v2/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/compute/compactor_v2/compaction_session.py,sha256=COtol2s63DRPbd-AN9KCiWr4exLX8x5Tvxea_7cOGEQ,8078
-deltacat/compute/compactor_v2/constants.py,sha256=AOvnIxQfKOnLubrUsg4g8OPLgqvOT46LE_da9_Dm2KY,2507
+deltacat/compute/compactor_v2/constants.py,sha256=wvd34d7RGdniGbbiJcMljxRrRas4_uy9F9UaqXfS_Ag,3034
 deltacat/compute/compactor_v2/deletes/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/compute/compactor_v2/deletes/delete_file_envelope.py,sha256=AeuH9JRMwp6mvQf6P2cqL92hUEtResQq6qUTS0kIKac,3111
 deltacat/compute/compactor_v2/deletes/delete_strategy.py,sha256=SMEJOxR-5r92kvKNqtu2w6HmwtmhljcZX1wcNEuS-4w,2833
@@ -69,14 +69,14 @@ deltacat/compute/compactor_v2/private/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCe
 deltacat/compute/compactor_v2/private/compaction_utils.py,sha256=fMWXg1SCIIgjk9p_OFYrcm760dOKNbFO1Lj3_JI3GCY,30929
 deltacat/compute/compactor_v2/steps/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/compute/compactor_v2/steps/hash_bucket.py,sha256=1R5xLUkl7GqL1nY-apAgY1czKDEHjIVYSRi9qLOMass,6726
-deltacat/compute/compactor_v2/steps/merge.py,sha256=LpktsDPfj7Of6RgUw9w1f3Y3OBkPDjvtyXjzFaIDoSo,21771
+deltacat/compute/compactor_v2/steps/merge.py,sha256=qxmb3cmiKvOgfuOzlJT4Q60zOyWNjsiuZSzxdh6KTm8,22909
 deltacat/compute/compactor_v2/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/compute/compactor_v2/utils/content_type_params.py,sha256=1P9CDpuWErsFcTTlRCeuUQHDokVI92he_MsL82uRAdA,7424
-deltacat/compute/compactor_v2/utils/dedupe.py,sha256=62tFCY2iRP7I3-45GCIYs6_SJsQl8C5lBEr8gbNfbsw,1932
+deltacat/compute/compactor_v2/utils/dedupe.py,sha256=Jz1QbBOdZJwT8K1vD9q01eOn7hdLNZ_AF7bJ0wficr0,1949
 deltacat/compute/compactor_v2/utils/delta.py,sha256=I7Yvda8NVbpKXG3nM2Ku1utvR2r2OpHvUMqUL2ja3aw,3626
 deltacat/compute/compactor_v2/utils/io.py,sha256=3m4dorxj-WD6Yu9_3gRE6gz3C-eNJA7nn02sHKwo-J8,6018
 deltacat/compute/compactor_v2/utils/merge.py,sha256=EV_iKhNc3WflgfLW1Q46dXUvyClx8VebWHGtninEfsI,5311
-deltacat/compute/compactor_v2/utils/primary_key_index.py,sha256=QOMwWxGhZ7VWa3oE6InM4thR5pbjmT7ttNXvx_IiKjo,11676
+deltacat/compute/compactor_v2/utils/primary_key_index.py,sha256=SbQ97M1Cxld-zZik2QMSzlj20g6JlENaQx_0PhlCIP8,12034
 deltacat/compute/compactor_v2/utils/task_options.py,sha256=W0jyWIIZ0tcSAGp8mhpnu1G8p3rmX4d3juCPpAJxnDM,12649
 deltacat/compute/merge_on_read/__init__.py,sha256=ckbgngmqPjYBYz_NySsR1vNTOb_hNpeL1sYkZKvBI9M,214
 deltacat/compute/merge_on_read/daft.py,sha256=1oC38u5ig_aTrq7EzyWBo8Ui54rb6yERYMk-vEFbpxM,1400
@@ -136,7 +136,7 @@ deltacat/tests/compute/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3
 deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py,sha256=kWyIJQMyF9oBemvgOp3ngGhMpH9zjkznV-67ewELgHQ,37719
 deltacat/tests/compute/compact_partition_rebase_test_cases.py,sha256=8HVr3EIFYFqNaJoqeCuj9xIBjM4Ch2bx-mJcO4BRrLo,16839
 deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py,sha256=l_6-pAKOsRY3NbtfHsYmEaJEkq6IJueYuLsjyJxNgz4,81564
-deltacat/tests/compute/compact_partition_test_cases.py,sha256=R9eiKvxCLqcoHjAx3iOogdnXZEO9TvLbRf0wA7bcJN4,26170
+deltacat/tests/compute/compact_partition_test_cases.py,sha256=HJ15Xyawv8ImFju8wDwt22fH5okoPhyS-QAygkXDb7Q,27422
 deltacat/tests/compute/test_compact_partition_incremental.py,sha256=lkfAraOJmEmieesf7b1BqlfTS26YjYM5xXOXoTMrsos,14989
 deltacat/tests/compute/test_compact_partition_multiple_rounds.py,sha256=xXBA66TTfARR90m5KQs31nmiokuMy9iGQt7Z9evyG7M,12950
 deltacat/tests/compute/test_compact_partition_params.py,sha256=Dm5eLyHo8oGMeO3XBbpj1rZqHtPZ1hAB7z2qvzc4Lxk,8497
@@ -152,9 +152,10 @@ deltacat/tests/compute/compactor/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JC
 deltacat/tests/compute/compactor/utils/test_io.py,sha256=st5mlU4cVU-eQl7B4mvPgNA3izuNwbVawYOp-NcoyrI,4326
 deltacat/tests/compute/compactor/utils/test_round_completion_file.py,sha256=LAQ4usiRF4oTx4cA85L0eOcBa_Z-febc-CuzUijSGrI,7439
 deltacat/tests/compute/compactor_v2/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-deltacat/tests/compute/compactor_v2/test_compaction_session.py,sha256=0U8Hmu-qLvqXqLPBPS6qENc1ErolWAaAoUlwms2xLe8,23124
+deltacat/tests/compute/compactor_v2/test_compaction_session.py,sha256=y8nNHq9ADHENzUKMQYguB45zOD7F2lZgcBYYTvbTsdM,28957
 deltacat/tests/compute/compactor_v2/test_hashlib.py,sha256=8csF2hFWtBvY2MbX3-6iphCsVXxRp0zP1NTnKhfdmkg,328
 deltacat/tests/compute/compactor_v2/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py,sha256=aFb9rzT_EK9k8qAMHPtpqd5btyEmll1So1loDmZkotQ,1769
 deltacat/tests/compute/compactor_v2/utils/test_task_options.py,sha256=37DkR1u_XwhedV9cGed6FFuJTC0XmuiowHJIa_Op6uA,865
 deltacat/tests/compute/resource_estimation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/tests/compute/resource_estimation/test_delta.py,sha256=HCL2oUnCqm0E26T3HLJjMhoAsHTJIWPYGwIKRgM_H7E,25712
@@ -171,7 +172,7 @@ deltacat/tests/local_deltacat_storage/__init__.py,sha256=5T9ubNIS42-BotEH0yrUiWE
 deltacat/tests/local_deltacat_storage/exceptions.py,sha256=oxZ0psmrEO0M6P2r8gHQ2E8E-Y8UBfUCBUIwfuHcx38,251
 deltacat/tests/test_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/tests/test_utils/constants.py,sha256=UYe--9T_clYjiOpv0M7TtAMGdpje_SMZ-w8n0IeCAjc,214
-deltacat/tests/test_utils/pyarrow.py,sha256=pzTBk07xMaAfykXo3GNGwTqaQxrKnSbr-WO3HBszikI,2828
+deltacat/tests/test_utils/pyarrow.py,sha256=QDdGilzsJ2xUESiGotdNVZde9yD7ja9MvNhhssnox-E,3083
 deltacat/tests/test_utils/storage.py,sha256=93GEn4A5WbMHWk0Ec4Bd7RxeHoSEnBfSarfWhKOSNtM,972
 deltacat/tests/test_utils/utils.py,sha256=a32qEwcSSd1lvRi0aJJ4ZLnc1ZyXmoQF_K95zaQRk2M,455
 deltacat/tests/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -179,7 +180,7 @@ deltacat/tests/utils/test_cloudpickle.py,sha256=J0pnBY3-PxlUh6MamZAN1PuquKQPr2iy
 deltacat/tests/utils/test_daft.py,sha256=kY8lkXoQvyWunok8UvOsh1An297rb3jcnstTuIAyAlc,8232
 deltacat/tests/utils/test_metrics.py,sha256=Ym9nOz1EtB180pLmvugihj1sDTNDMb5opIjjr5Nmcls,16339
 deltacat/tests/utils/test_placement.py,sha256=g61wVOMkHe4YJeR9Oxg_BOVQ6bhHHbC3IBYv8YhUu94,597
-deltacat/tests/utils/test_pyarrow.py,sha256=YDuyFYNjy6thzfA6Z2a0dOytUugsExu1uMUOhmP_aXc,29977
+deltacat/tests/utils/test_pyarrow.py,sha256=fDjDkGPjdRZA3kgjgiQRym9shdeDYgkdDPYU2a7IEUk,30790
 deltacat/tests/utils/test_record_batch_tables.py,sha256=AkG1WyljQmjnl-AxhbFWyo5LnMIKRyLScfgC2B_ES-s,11321
 deltacat/tests/utils/test_resources.py,sha256=HtpvDrfPZQNtGDXUlsIzc_yd7Vf1cDscZ3YbN0oTvO8,2560
 deltacat/tests/utils/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -200,7 +201,7 @@ deltacat/utils/numpy.py,sha256=SpHKKvC-K8NINTWGVfTZ5-gBFTGYqaXjjgKFhsdUjwg,2049
 deltacat/utils/pandas.py,sha256=q99mlRB7tymICMcNbfGLfLqFu_C-feyPZKZm2CWJJVc,9574
 deltacat/utils/performance.py,sha256=7ZLaMkS1ehPSIhT5uOQVBHvjC70iKHzoFquFo-KL0PI,645
 deltacat/utils/placement.py,sha256=Lj20fb-eq8rgMdm_M2MBMfDLwhDM1sS1nJj2DvIK56s,12060
-deltacat/utils/pyarrow.py,sha256=xEZRzbTBU6uj9K4DtvngIPtQkTA8haVgQ4Y4vjwHvtM,34311
+deltacat/utils/pyarrow.py,sha256=R3KkJPenE48rS3VrfFKSkJerX94f4e7X2dUPBQg44DY,34339
 deltacat/utils/resources.py,sha256=Ax1OgLLbZI4oYpp4Ki27OLaST-7I-AJgZwU87FVfY8g,8253
 deltacat/utils/s3fs.py,sha256=PmUJ5Fm1WmD-_zp_M6yd9VbXvIoJuBeK6ApOdJJApLE,662
 deltacat/utils/schema.py,sha256=m4Wm4ZQcpttzOUxex4dVneGlHy1_E36HspTcjNYzvVM,1564
@@ -210,8 +211,8 @@ deltacat/utils/ray_utils/concurrency.py,sha256=JDVwMiQWrmuSlyCWAoiq9ctoJ0XADEfDD
 deltacat/utils/ray_utils/dataset.py,sha256=waHdtH0c835a-2t51HYRHnulfC0_zBxx8mFSAPvPSPM,3274
 deltacat/utils/ray_utils/performance.py,sha256=d7JFM7vTXHzkGx9qNQcZzUWajnqINvYRwaM088_FpsE,464
 deltacat/utils/ray_utils/runtime.py,sha256=rB0A-tU9WZHz0J11LzJdANYtL397YyuemcA1l-K9dAw,5029
-deltacat-1.1.28.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-deltacat-1.1.28.dist-info/METADATA,sha256=-ZnMp9C26vVxi015Il3UtBMm9pHqA8aZmTnNTgr8Tb8,1733
-deltacat-1.1.28.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
-deltacat-1.1.28.dist-info/top_level.txt,sha256=RWdIcid4Bv2i2ozLVh-70kJpyB61xEKXod9XXGpiono,9
-deltacat-1.1.28.dist-info/RECORD,,
+deltacat-1.1.29.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+deltacat-1.1.29.dist-info/METADATA,sha256=ZverlgFUJV4wGJao8tusRCv_sRNX4KJ4RTNAGvBCJes,1733
+deltacat-1.1.29.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+deltacat-1.1.29.dist-info/top_level.txt,sha256=RWdIcid4Bv2i2ozLVh-70kJpyB61xEKXod9XXGpiono,9
+deltacat-1.1.29.dist-info/RECORD,,