deltacat 1.1.28__py3-none-any.whl → 1.1.29__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
deltacat/__init__.py CHANGED
@@ -44,7 +44,7 @@ from deltacat.types.tables import TableWriteMode
 
 deltacat.logs.configure_deltacat_logger(logging.getLogger(__name__))
 
-__version__ = "1.1.28"
+__version__ = "1.1.29"
 
 
 __all__ = [
@@ -1,3 +1,5 @@
+from deltacat.utils.common import env_bool, env_integer
+
 TOTAL_BYTES_IN_SHA1_HASH = 20
 
 PK_DELIMITER = "L6kl7u5f"
@@ -31,7 +33,9 @@ TOTAL_MEMORY_BUFFER_PERCENTAGE = 30
 # The total size of records that will be hash bucketed at once
 # Since, sorting is nlogn, we ensure that is not performed
 # on a very large dataset for best performance.
-MAX_SIZE_OF_RECORD_BATCH_IN_GIB = 2 * 1024 * 1024 * 1024
+MAX_SIZE_OF_RECORD_BATCH_IN_GIB = env_integer(
+    "MAX_SIZE_OF_RECORD_BATCH_IN_GIB", 2 * 1024 * 1024 * 1024
+)
 
 # Whether to drop duplicates during merge.
 DROP_DUPLICATES = True
@@ -78,3 +82,13 @@ COMPACT_PARTITION_METRIC_PREFIX = "compact_partition"
 # Number of rounds to run hash/merge for a single
 # partition. (For large table support)
 DEFAULT_NUM_ROUNDS = 1
+
+# Whether to perform sha1 hashing when required to
+# optimize memory. For example, hashing is always
+# required for bucketing where it's not mandatory
+# when dropping duplicates. Setting this to True
+# will disable sha1 hashing in cases where it isn't
+# mandatory. This flag is False by default.
+SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED = env_bool(
+    "SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED", False
+)
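The two constants above are now resolved from the environment through env_integer and env_bool. A minimal sketch of overriding them in a deployment, assuming both helpers simply read os.environ at module import time and that env_bool treats a string such as "True" as truthy (both assumptions; only the variable names come from this diff):

```python
import os

# Assumption: must be set before deltacat.compute.compactor_v2.constants is
# first imported, since the defaults are resolved when the module loads.
os.environ["MAX_SIZE_OF_RECORD_BATCH_IN_GIB"] = str(4 * 1024 * 1024 * 1024)
# Assumption: env_bool treats the string "True" as truthy.
os.environ["SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED"] = "True"

from deltacat.compute.compactor_v2 import constants

print(constants.MAX_SIZE_OF_RECORD_BATCH_IN_GIB)                # 4294967296
print(constants.SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED)  # True
```

The new test fixture later in this diff takes the other route and monkeypatches the already-imported module attribute, which sidesteps the import-order concern.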
@@ -7,6 +7,7 @@ import ray
 import itertools
 import time
 import pyarrow.compute as pc
+from deltacat.utils.pyarrow import MAX_INT_BYTES
 import deltacat.compute.compactor_v2.utils.merge as merge_utils
 from uuid import uuid4
 from deltacat import logs
@@ -147,10 +148,32 @@ def _merge_tables(
     if compacted_table:
         compacted_table = all_tables[0]
 
+        compacted_pk_hash_str = compacted_table[sc._PK_HASH_STRING_COLUMN_NAME]
+        incremental_pk_hash_str = incremental_table[sc._PK_HASH_STRING_COLUMN_NAME]
+
+        logger.info(
+            f"Size of compacted pk hash={compacted_pk_hash_str.nbytes} "
+            f"and incremental pk hash={incremental_pk_hash_str.nbytes}."
+        )
+
+        if (
+            compacted_table[sc._PK_HASH_STRING_COLUMN_NAME].nbytes >= MAX_INT_BYTES
+            or incremental_table[sc._PK_HASH_STRING_COLUMN_NAME].nbytes >= MAX_INT_BYTES
+        ):
+            logger.info("Casting compacted and incremental pk hash to large_string...")
+            # is_in combines the chunks of the chunked array passed which can cause
+            # ArrowCapacityError if the total size of string array is over 2GB.
+            # Using a large_string would resolve this issue.
+            # The cast here should be zero-copy in most cases.
+            compacted_pk_hash_str = pc.cast(compacted_pk_hash_str, pa.large_string())
+            incremental_pk_hash_str = pc.cast(
+                incremental_pk_hash_str, pa.large_string()
+            )
+
         records_to_keep = pc.invert(
             pc.is_in(
-                compacted_table[sc._PK_HASH_STRING_COLUMN_NAME],
-                incremental_table[sc._PK_HASH_STRING_COLUMN_NAME],
+                compacted_pk_hash_str,
+                incremental_pk_hash_str,
             )
         )
 
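The comment in the added block explains the motivation: is_in combines the chunks of its inputs, and a plain string array cannot hold more than about 2GB of character data, so oversized primary-key hash columns are cast to large_string (64-bit offsets) first. A standalone sketch of the same idea, independent of the compactor types (the array contents below are made up for illustration):

```python
import pyarrow as pa
import pyarrow.compute as pc

# Two chunked string columns standing in for the compacted and incremental
# pk-hash columns; real compaction inputs can exceed 2GB of string data.
compacted = pa.chunked_array([["a", "b", "c"], ["d"]])
incremental = pa.chunked_array([["b"], ["x", "d"]])

# large_string uses 64-bit offsets, so combining chunks inside is_in can no
# longer overflow the 32-bit offset limit of the plain string type.
compacted = pc.cast(compacted, pa.large_string())
incremental = pc.cast(incremental, pa.large_string())

# Keep only compacted rows whose pk hash does not appear in the incremental set.
records_to_keep = pc.invert(pc.is_in(compacted, value_set=incremental))
print(records_to_keep.to_pylist())  # [True, False, True, False]
```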
@@ -492,9 +515,11 @@ def _copy_manifests_from_hash_bucketing(
 def _timed_merge(input: MergeInput) -> MergeResult:
     task_id = get_current_ray_task_id()
     worker_id = get_current_ray_worker_id()
-    with memray.Tracker(
-        f"merge_{worker_id}_{task_id}.bin"
-    ) if input.enable_profiler else nullcontext():
+    with (
+        memray.Tracker(f"merge_{worker_id}_{task_id}.bin")
+        if input.enable_profiler
+        else nullcontext()
+    ):
         total_input_records, total_deduped_records = 0, 0
         total_dropped_records = 0
         materialized_results: List[MaterializeResult] = []
@@ -25,7 +25,7 @@ def _create_chunked_index_array(array: pa.Array) -> pa.Array:
         result[index] = np.arange(cl, dtype="int32")
 
     chunk_lengths = ([0] + chunk_lengths)[:-1]
-    result = pa.chunked_array(result + np.cumsum(chunk_lengths))
+    result = pa.chunked_array(result + np.cumsum(chunk_lengths), type=pa.int32())
     return result
 
 
@@ -10,6 +10,7 @@ from deltacat.compute.compactor_v2.constants import (
     TOTAL_BYTES_IN_SHA1_HASH,
     PK_DELIMITER,
     MAX_SIZE_OF_RECORD_BATCH_IN_GIB,
+    SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED,
 )
 import time
 from deltacat.compute.compactor.model.delta_file_envelope import DeltaFileEnvelope
@@ -48,6 +49,13 @@ def _is_sha1_desired(hash_columns: List[pa.Array]) -> bool:
         f"Found total length of hash column={total_len} and total_size={total_size}"
     )
 
+    if SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED:
+        logger.info(
+            f"SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED is True. "
+            f"Returning False for is_sha1_desired"
+        )
+        return False
+
     return total_size > TOTAL_BYTES_IN_SHA1_HASH * total_len
 
 
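For context on what the new early return short-circuits: _is_sha1_desired compares the raw primary-key bytes against the fixed 20-byte sha1 digest size per row. A worked example with made-up sizes:

```python
# Illustrative numbers only: one million pk rows averaging 64 bytes each.
TOTAL_BYTES_IN_SHA1_HASH = 20          # from compactor_v2 constants
total_len = 1_000_000                  # rows in the hash columns
total_size = 64 * total_len            # raw pk bytes

# Hashing replaces ~64 bytes/row with a fixed 20-byte digest, so it would be
# "desired" here unless the new kill switch above forces False.
print(total_size > TOTAL_BYTES_IN_SHA1_HASH * total_len)  # True
```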
@@ -108,9 +116,10 @@ def _optimized_group_record_batches_by_hash_bucket(
     record_batches = []
     result_len = 0
     for record_batch in table_batches:
-        current_bytes += record_batch.nbytes
-        record_batches.append(record_batch)
-        if current_bytes >= MAX_SIZE_OF_RECORD_BATCH_IN_GIB:
+        if (
+            record_batches
+            and current_bytes + record_batch.nbytes >= MAX_SIZE_OF_RECORD_BATCH_IN_GIB
+        ):
             logger.info(
                 f"Total number of record batches without exceeding {MAX_SIZE_OF_RECORD_BATCH_IN_GIB} "
                 f"is {len(record_batches)} and size {current_bytes}"
@@ -128,6 +137,9 @@ def _optimized_group_record_batches_by_hash_bucket(
             current_bytes = 0
             record_batches.clear()
 
+        current_bytes += record_batch.nbytes
+        record_batches.append(record_batch)
+
     if record_batches:
         appended_len, append_latency = timed_invocation(
             _append_table_by_hash_bucket,
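The reworked loop above flushes the accumulated record batches before appending the incoming batch whenever adding it would push the running total past MAX_SIZE_OF_RECORD_BATCH_IN_GIB, and it never flushes an empty group, so a single oversized batch stays in its own group instead of being lost. A simplified sketch of the same greedy grouping over plain byte sizes (the helper and constant names here are illustrative, not deltacat APIs):

```python
from typing import Iterable, List

MAX_GROUP_BYTES = 2 * 1024 * 1024 * 1024  # mirrors MAX_SIZE_OF_RECORD_BATCH_IN_GIB

def group_by_size(batch_sizes: Iterable[int]) -> List[List[int]]:
    """Greedily group batch sizes so each group stays under MAX_GROUP_BYTES when
    possible; a single batch larger than the cap still forms its own group."""
    groups: List[List[int]] = []
    current: List[int] = []
    current_bytes = 0
    for size in batch_sizes:
        # Flush only a non-empty group, just like the `record_batches and ...` guard.
        if current and current_bytes + size >= MAX_GROUP_BYTES:
            groups.append(current)
            current, current_bytes = [], 0
        current.append(size)
        current_bytes += size
    if current:
        groups.append(current)
    return groups

# One 3 GiB batch is kept alone; the two 0.5 GiB batches are combined.
print(group_by_size([3 * 1024**3, 512 * 1024**2, 512 * 1024**2]))
```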
@@ -601,6 +601,38 @@ INCREMENTAL_TEST_CASES: Dict[str, IncrementalCompactionTestCaseParams] = {
         skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
         assert_compaction_audit=None,
     ),
+    "15-incremental-empty-input-with-single-hash-bucket": IncrementalCompactionTestCaseParams(
+        primary_keys={"pk_col_1"},
+        sort_keys=[SortKey.of(key_name="sk_col_1")],
+        partition_keys=ZERO_VALUED_PARTITION_KEYS_PARAM,
+        partition_values=ZERO_VALUED_PARTITION_VALUES_PARAM,
+        input_deltas=pa.Table.from_arrays(
+            [
+                pa.array([]),
+                pa.array([]),
+            ],
+            names=["pk_col_1", "sk_col_1"],
+        ),
+        input_deltas_delta_type=DeltaType.UPSERT,
+        expected_terminal_compact_partition_result=pa.Table.from_arrays(
+            [
+                pa.array([]),
+                pa.array([]),
+            ],
+            names=["pk_col_1", "sk_col_1"],
+        ),
+        expected_terminal_exception=None,
+        expected_terminal_exception_message=None,
+        do_create_placement_group=False,
+        records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
+        hash_bucket_count=1,
+        read_kwargs_provider=None,
+        drop_duplicates=True,
+        is_inplace=False,
+        add_late_deltas=None,
+        skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
+        assert_compaction_audit=assert_compaction_audit_no_hash_bucket,
+    ),
 }
 
 INCREMENTAL_TEST_CASES = with_compactor_version_func_test_param(INCREMENTAL_TEST_CASES)
@@ -1,6 +1,7 @@
 from typing import Dict, Any
 import ray
 import os
+import pyarrow as pa
 import pytest
 import boto3
 from deltacat.compute.compactor.model.compaction_session_audit_info import (
@@ -76,6 +77,17 @@ def local_deltacat_storage_kwargs(request: pytest.FixtureRequest):
     os.remove(DATABASE_FILE_PATH_VALUE)
 
 
+@pytest.fixture(scope="function")
+def disable_sha1(monkeypatch):
+    import deltacat.compute.compactor_v2.utils.primary_key_index
+
+    monkeypatch.setattr(
+        deltacat.compute.compactor_v2.utils.primary_key_index,
+        "SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED",
+        True,
+    )
+
+
 class TestCompactionSession:
     """
     This class adds specific tests that aren't part of the parametrized test suite.
@@ -556,3 +568,124 @@ class TestCompactionSession:
                 }
             )
         )
+
+    def test_compact_partition_when_incremental_pk_hash_is_over_2gb(
+        self, s3_resource, local_deltacat_storage_kwargs, disable_sha1
+    ):
+        """
+        A test case which ensures the compaction succeeds even if the incremental
+        arrow table size is over 2GB. It is added to prevent ArrowCapacityError
+        when running is_in operation during merge.
+
+        Note that we set SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED to bypass sha1 hashing
+        which truncates the lengths of pk strings when deduping.
+        """
+        # setup
+        staged_source = stage_partition_from_file_paths(
+            self.NAMESPACE, ["source"], **local_deltacat_storage_kwargs
+        )
+        # we create chunked array to avoid ArrowCapacityError
+        chunked_pk_array = pa.chunked_array([["13bytesstring"], ["12bytestring"]])
+        table = pa.table([chunked_pk_array], names=["pk"])
+        source_delta = commit_delta_to_staged_partition(
+            staged_source, pa_table=table, **local_deltacat_storage_kwargs
+        )
+
+        staged_dest = stage_partition_from_file_paths(
+            self.NAMESPACE, ["destination"], **local_deltacat_storage_kwargs
+        )
+        dest_partition = ds.commit_partition(
+            staged_dest, **local_deltacat_storage_kwargs
+        )
+
+        # rebase first
+        rebase_url = compact_partition(
+            CompactPartitionParams.of(
+                {
+                    "compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
+                    "compacted_file_content_type": ContentType.PARQUET,
+                    "dd_max_parallelism_ratio": 1.0,
+                    "deltacat_storage": ds,
+                    "deltacat_storage_kwargs": local_deltacat_storage_kwargs,
+                    "destination_partition_locator": dest_partition.locator,
+                    "drop_duplicates": True,
+                    "hash_bucket_count": 1,
+                    "last_stream_position_to_compact": source_delta.stream_position,
+                    "list_deltas_kwargs": {
+                        **local_deltacat_storage_kwargs,
+                        **{"equivalent_table_types": []},
+                    },
+                    "primary_keys": ["pk"],
+                    "rebase_source_partition_locator": source_delta.partition_locator,
+                    "rebase_source_partition_high_watermark": source_delta.stream_position,
+                    "records_per_compacted_file": 4000,
+                    "s3_client_kwargs": {},
+                    "source_partition_locator": source_delta.partition_locator,
+                    "resource_estimation_method": ResourceEstimationMethod.PREVIOUS_INFLATION,
+                }
+            )
+        )
+
+        rebased_rcf = get_rcf(s3_resource, rebase_url)
+
+        assert rebased_rcf.compacted_pyarrow_write_result.files == 1
+        assert rebased_rcf.compacted_pyarrow_write_result.records == 2
+
+        # Run incremental with a small delta on source
+        chunked_pk_array = pa.chunked_array(
+            [["13bytesstring" * 95_000_000], ["12bytestring" * 95_000_000]]
+        )  # 2.3GB
+        table = pa.table([chunked_pk_array], names=["pk"])
+
+        incremental_source_delta = commit_delta_to_partition(
+            source_delta.partition_locator,
+            pa_table=table,
+            **local_deltacat_storage_kwargs,
+        )
+        assert (
+            incremental_source_delta.partition_locator == source_delta.partition_locator
+        ), "source partition locator should not change"
+        dest_partition = ds.get_partition(
+            dest_partition.stream_locator,
+            dest_partition.partition_values,
+            **local_deltacat_storage_kwargs,
+        )
+
+        assert (
+            dest_partition.locator
+            == rebased_rcf.compacted_delta_locator.partition_locator
+        ), "The new destination partition should be same as compacted partition"
+
+        # Run incremental
+        incremental_url = compact_partition(
+            CompactPartitionParams.of(
+                {
+                    "compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
+                    "compacted_file_content_type": ContentType.PARQUET,
+                    "dd_max_parallelism_ratio": 1.0,
+                    "deltacat_storage": ds,
+                    "deltacat_storage_kwargs": local_deltacat_storage_kwargs,
+                    "destination_partition_locator": dest_partition.locator,
+                    "drop_duplicates": True,
+                    "hash_bucket_count": 1,
+                    "last_stream_position_to_compact": incremental_source_delta.stream_position,
+                    "list_deltas_kwargs": {
+                        **local_deltacat_storage_kwargs,
+                        **{"equivalent_table_types": []},
+                    },
+                    "primary_keys": ["pk"],
+                    "records_per_compacted_file": 4000,
+                    "s3_client_kwargs": {},
+                    "source_partition_locator": incremental_source_delta.partition_locator,
+                    "resource_estimation_method": ResourceEstimationMethod.PREVIOUS_INFLATION,
+                }
+            )
+        )
+
+        incremental_rcf = get_rcf(s3_resource, incremental_url)
+
+        assert incremental_rcf.compacted_pyarrow_write_result.files == 1
+        assert (
+            incremental_rcf.compacted_pyarrow_write_result.pyarrow_bytes >= 2300000000
+        )
+        assert incremental_rcf.compacted_pyarrow_write_result.records == 4
@@ -0,0 +1,45 @@
+import pyarrow as pa
+from deltacat.compute.compactor_v2.utils.primary_key_index import (
+    group_by_pk_hash_bucket,
+)
+
+
+class TestGroupByPkHashBucket:
+    def test_sanity(self):
+        record = pa.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
+        pk = pa.array(["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"])
+        record_batch = pa.RecordBatch.from_arrays([record, pk], names=["record", "pk"])
+        table = pa.Table.from_batches([record_batch])
+        grouped_array = group_by_pk_hash_bucket(table, 3, ["pk"])
+
+        assert len(grouped_array) == 3
+        total_records = 0
+        for arr in grouped_array:
+            if arr is not None:
+                total_records += len(arr[1])
+
+        assert total_records == len(table)
+
+    def test_when_record_batches_exceed_int_max_size(self):
+        record = pa.array(["12bytestring" * 90_000_000])
+        record_batch = pa.RecordBatch.from_arrays([record], names=["pk"])
+        table = pa.Table.from_batches([record_batch, record_batch])
+
+        grouped_array = group_by_pk_hash_bucket(table, 3, ["pk"])
+
+        assert len(grouped_array) == 3
+        # two record batches are preserved as combining them
+        # would exceed 2GB.
+        assert len(grouped_array[2].to_batches()) == 2
+
+    def test_when_record_batches_less_than_int_max_size(self):
+        record = pa.array(["12bytestring" * 90_000])
+        record_batch = pa.RecordBatch.from_arrays([record], names=["pk"])
+        table = pa.Table.from_batches([record_batch, record_batch])
+
+        grouped_array = group_by_pk_hash_bucket(table, 3, ["pk"])
+
+        assert len(grouped_array) == 3
+        # Combined the arrays into one record batch as the size
+        # would not exceed 2GB.
+        assert len(grouped_array[1].to_batches()) == 1
@@ -47,7 +47,8 @@ def stage_partition_from_file_paths(
 
 def commit_delta_to_staged_partition(
     staged_partition,
-    file_paths: List[str],
+    file_paths: List[str] = None,
+    pa_table: pa.Table = None,
     content_type: ContentType = ContentType.PARQUET,
     *args,
     **kwargs,
@@ -57,6 +58,7 @@ def commit_delta_to_staged_partition(
         *args,
        file_paths=file_paths,
         content_type=content_type,
+        pa_table=pa_table,
         **kwargs,
     )
     ds.commit_partition(staged_partition, **kwargs)
@@ -76,23 +78,28 @@ def download_delta(delta_like: Union[Delta, DeltaLocator], *args, **kwargs) -> D
 
 def commit_delta_to_partition(
     partition: Union[Partition, PartitionLocator],
-    file_paths: List[str],
+    file_paths: List[str] = None,
+    pa_table: pa.Table = None,
     content_type: ContentType = ContentType.PARQUET,
     *args,
     **kwargs,
 ) -> Delta:
-    tables = []
 
     if isinstance(partition, PartitionLocator):
        partition = ds.get_partition(
             partition.stream_locator, partition.partition_values, *args, **kwargs
         )
+    if pa_table is None:
+        assert file_paths is not None, "One of pa_table or file_paths must be passed."
+        tables = []
+        for file_path in file_paths:
+            table = pa.csv.read_csv(file_path)
+            tables.append(table)
 
-    for file_path in file_paths:
-        table = pa.csv.read_csv(file_path)
-        tables.append(table)
+        pa_table = pa.concat_tables(tables)
 
-    table = pa.concat_tables(tables)
-    staged_delta = ds.stage_delta(table, partition, content_type=content_type, **kwargs)
+    staged_delta = ds.stage_delta(
+        pa_table, partition, content_type=content_type, **kwargs
+    )
 
     return ds.commit_delta(staged_delta, **kwargs)
@@ -16,6 +16,7 @@ from pyarrow.parquet import ParquetFile
 import pyarrow as pa
 
 PARQUET_FILE_PATH = "deltacat/tests/utils/data/test_file.parquet"
+PARQUET_GZIP_COMPRESSED_FILE_PATH = "deltacat/tests/utils/data/test_file.parquet.gz"
 EMPTY_UTSV_PATH = "deltacat/tests/utils/data/empty.csv"
 NON_EMPTY_VALID_UTSV_PATH = "deltacat/tests/utils/data/non_empty_valid.csv"
 OVERFLOWING_DECIMAL_PRECISION_UTSV_PATH = (
@@ -789,3 +790,25 @@ class TestS3FileToTable(TestCase):
             self.assertEqual(field.name, schema.field(index).name)
 
         self.assertEqual(result.schema.field(1).type, "string")
+
+    def test_s3_file_to_table_when_parquet_gzip(self):
+
+        pa_kwargs_provider = lambda content_type, kwargs: {
+            "reader_type": "pyarrow",
+            **kwargs,
+        }
+
+        result = s3_file_to_table(
+            PARQUET_GZIP_COMPRESSED_FILE_PATH,
+            ContentType.PARQUET.value,
+            ContentEncoding.GZIP.value,
+            ["n_legs", "animal"],
+            ["n_legs"],
+            pa_read_func_kwargs_provider=pa_kwargs_provider,
+        )
+
+        self.assertEqual(len(result), 6)
+        self.assertEqual(len(result.column_names), 1)
+        schema = result.schema
+        schema_index = schema.get_field_index("n_legs")
+        self.assertEqual(schema.field(schema_index).type, "int64")
deltacat/utils/pyarrow.py CHANGED
@@ -58,6 +58,7 @@ RAISE_ON_DECIMAL_OVERFLOW = "raise_on_decimal_overflow"
 # Note the maximum from https://arrow.apache.org/docs/python/generated/pyarrow.Decimal256Type.html#pyarrow.Decimal256Type
 DECIMAL256_DEFAULT_SCALE = 38
 DECIMAL256_MAX_PRECISION = 76
+MAX_INT_BYTES = 2147483646
 
 
 def _filter_schema_for_columns(schema: pa.Schema, columns: List[str]) -> pa.Schema:
@@ -129,9 +130,11 @@ def _read_csv_rounding_decimal_columns_to_fit_scale(
     # conversion to decimal256 isn't implemented as of pyarrow==12.0.1
     new_schema = _new_schema_with_replaced_fields(
         schema,
-        lambda fld: pa.field(fld.name, pa.string(), metadata=fld.metadata)
-        if pa.types.is_decimal128(fld.type) or pa.types.is_decimal256(fld.type)
-        else None,
+        lambda fld: (
+            pa.field(fld.name, pa.string(), metadata=fld.metadata)
+            if pa.types.is_decimal128(fld.type) or pa.types.is_decimal256(fld.type)
+            else None
+        ),
     )
     new_kwargs = sanitize_kwargs_by_supported_kwargs(
         ["read_options", "parse_options", "convert_options", "memory_pool"],
@@ -569,8 +572,8 @@ def s3_file_to_table(
                 **s3_client_kwargs,
             )
 
-        if READER_TYPE_KWARG in kwargs:
-            kwargs.pop(READER_TYPE_KWARG)
+    if READER_TYPE_KWARG in kwargs:
+        kwargs.pop(READER_TYPE_KWARG)
 
     filesystem = io
     if s3_url.startswith("s3://"):
@@ -912,7 +915,6 @@ def sliced_string_cast(array: pa.ChunkedArray) -> pa.ChunkedArray:
     TODO: deprecate this function when pyarrow performs proper ChunkedArray -> ChunkedArray casting
     """
     dtype = array.type
-    MAX_BYTES = 2147483646
     max_str_len = None
     if pa.types.is_integer(dtype):
         max_str_len = _int_max_string_len()
@@ -924,7 +926,7 @@ def sliced_string_cast(array: pa.ChunkedArray) -> pa.ChunkedArray:
         max_str_len = _max_decimal256_string_len()
 
     if max_str_len is not None:
-        max_elems_per_chunk = MAX_BYTES // (2 * max_str_len)  # safety factor of 2
+        max_elems_per_chunk = MAX_INT_BYTES // (2 * max_str_len)  # safety factor of 2
         all_chunks = []
         for chunk in array.chunks:
             if len(chunk) < max_elems_per_chunk:
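sliced_string_cast now takes its per-chunk element budget from the shared MAX_INT_BYTES constant instead of a local literal: each chunk is kept small enough that, even at the worst-case string length, the cast output cannot overflow the 2GB offset limit of a plain string array (with the safety factor of 2 noted in the code). A simplified, illustrative helper in the same spirit, not the deltacat function; the 20-character maximum stands in for helpers like _int_max_string_len:

```python
import pyarrow as pa
import pyarrow.compute as pc

MAX_INT_BYTES = 2147483646

def cast_int_chunk_to_string(chunk: pa.Array, max_str_len: int = 20) -> list:
    """Cast an int64 chunk to string in slices small enough that the resulting
    string data stays under the 2GB offset limit (safety factor of 2)."""
    max_elems_per_slice = MAX_INT_BYTES // (2 * max_str_len)
    slices = []
    offset = 0
    while offset < len(chunk):
        piece = chunk.slice(offset, max_elems_per_slice)
        slices.append(pc.cast(piece, pa.string()))
        offset += max_elems_per_slice
    return slices

chunk = pa.array(range(10), type=pa.int64())
print(pa.chunked_array(cast_int_chunk_to_string(chunk)).to_pylist())
```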
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: deltacat
-Version: 1.1.28
+Version: 1.1.29
 Summary: A scalable, fast, ACID-compliant Data Catalog powered by Ray.
 Home-page: https://github.com/ray-project/deltacat
 Author: Ray Team
@@ -1,4 +1,4 @@
-deltacat/__init__.py,sha256=GPlTQc6AW4ig_nZJ7kMVe-kbZxYfrSVGFN1YEqY8dXU,1778
+deltacat/__init__.py,sha256=DoUiDxmgMh8HUGOEAG7CUY0Q9Ip-S7gePDsL8XQO5kk,1778
 deltacat/constants.py,sha256=TUJLXUJ9xq1Ryil72yLkKR8EDH_Irp5wUg56QstbRNE,2181
 deltacat/exceptions.py,sha256=7sjk3BuMY5Oo-6OvAfHncZx_OcvtEL47BblWr2F7waE,12740
 deltacat/logs.py,sha256=EQSDin1deehzz5xlLV1_TrFJrO_IBZ9Ahp7MdL-4cK8,9363
@@ -51,7 +51,7 @@ deltacat/compute/compactor/utils/sort_key.py,sha256=oK6otg-CSsma6zlGPaKg-KNEvcZR
 deltacat/compute/compactor/utils/system_columns.py,sha256=CNIgAGos0xAGEpdaQIH7KfbSRrGZgjRbItXMararqXQ,9399
 deltacat/compute/compactor_v2/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/compute/compactor_v2/compaction_session.py,sha256=COtol2s63DRPbd-AN9KCiWr4exLX8x5Tvxea_7cOGEQ,8078
-deltacat/compute/compactor_v2/constants.py,sha256=AOvnIxQfKOnLubrUsg4g8OPLgqvOT46LE_da9_Dm2KY,2507
+deltacat/compute/compactor_v2/constants.py,sha256=wvd34d7RGdniGbbiJcMljxRrRas4_uy9F9UaqXfS_Ag,3034
 deltacat/compute/compactor_v2/deletes/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/compute/compactor_v2/deletes/delete_file_envelope.py,sha256=AeuH9JRMwp6mvQf6P2cqL92hUEtResQq6qUTS0kIKac,3111
 deltacat/compute/compactor_v2/deletes/delete_strategy.py,sha256=SMEJOxR-5r92kvKNqtu2w6HmwtmhljcZX1wcNEuS-4w,2833
@@ -69,14 +69,14 @@ deltacat/compute/compactor_v2/private/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCe
 deltacat/compute/compactor_v2/private/compaction_utils.py,sha256=fMWXg1SCIIgjk9p_OFYrcm760dOKNbFO1Lj3_JI3GCY,30929
 deltacat/compute/compactor_v2/steps/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/compute/compactor_v2/steps/hash_bucket.py,sha256=1R5xLUkl7GqL1nY-apAgY1czKDEHjIVYSRi9qLOMass,6726
-deltacat/compute/compactor_v2/steps/merge.py,sha256=LpktsDPfj7Of6RgUw9w1f3Y3OBkPDjvtyXjzFaIDoSo,21771
+deltacat/compute/compactor_v2/steps/merge.py,sha256=qxmb3cmiKvOgfuOzlJT4Q60zOyWNjsiuZSzxdh6KTm8,22909
 deltacat/compute/compactor_v2/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/compute/compactor_v2/utils/content_type_params.py,sha256=1P9CDpuWErsFcTTlRCeuUQHDokVI92he_MsL82uRAdA,7424
-deltacat/compute/compactor_v2/utils/dedupe.py,sha256=62tFCY2iRP7I3-45GCIYs6_SJsQl8C5lBEr8gbNfbsw,1932
+deltacat/compute/compactor_v2/utils/dedupe.py,sha256=Jz1QbBOdZJwT8K1vD9q01eOn7hdLNZ_AF7bJ0wficr0,1949
 deltacat/compute/compactor_v2/utils/delta.py,sha256=I7Yvda8NVbpKXG3nM2Ku1utvR2r2OpHvUMqUL2ja3aw,3626
 deltacat/compute/compactor_v2/utils/io.py,sha256=3m4dorxj-WD6Yu9_3gRE6gz3C-eNJA7nn02sHKwo-J8,6018
 deltacat/compute/compactor_v2/utils/merge.py,sha256=EV_iKhNc3WflgfLW1Q46dXUvyClx8VebWHGtninEfsI,5311
-deltacat/compute/compactor_v2/utils/primary_key_index.py,sha256=QOMwWxGhZ7VWa3oE6InM4thR5pbjmT7ttNXvx_IiKjo,11676
+deltacat/compute/compactor_v2/utils/primary_key_index.py,sha256=SbQ97M1Cxld-zZik2QMSzlj20g6JlENaQx_0PhlCIP8,12034
 deltacat/compute/compactor_v2/utils/task_options.py,sha256=W0jyWIIZ0tcSAGp8mhpnu1G8p3rmX4d3juCPpAJxnDM,12649
 deltacat/compute/merge_on_read/__init__.py,sha256=ckbgngmqPjYBYz_NySsR1vNTOb_hNpeL1sYkZKvBI9M,214
 deltacat/compute/merge_on_read/daft.py,sha256=1oC38u5ig_aTrq7EzyWBo8Ui54rb6yERYMk-vEFbpxM,1400
@@ -136,7 +136,7 @@ deltacat/tests/compute/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3
 deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py,sha256=kWyIJQMyF9oBemvgOp3ngGhMpH9zjkznV-67ewELgHQ,37719
 deltacat/tests/compute/compact_partition_rebase_test_cases.py,sha256=8HVr3EIFYFqNaJoqeCuj9xIBjM4Ch2bx-mJcO4BRrLo,16839
 deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py,sha256=l_6-pAKOsRY3NbtfHsYmEaJEkq6IJueYuLsjyJxNgz4,81564
-deltacat/tests/compute/compact_partition_test_cases.py,sha256=R9eiKvxCLqcoHjAx3iOogdnXZEO9TvLbRf0wA7bcJN4,26170
+deltacat/tests/compute/compact_partition_test_cases.py,sha256=HJ15Xyawv8ImFju8wDwt22fH5okoPhyS-QAygkXDb7Q,27422
 deltacat/tests/compute/test_compact_partition_incremental.py,sha256=lkfAraOJmEmieesf7b1BqlfTS26YjYM5xXOXoTMrsos,14989
 deltacat/tests/compute/test_compact_partition_multiple_rounds.py,sha256=xXBA66TTfARR90m5KQs31nmiokuMy9iGQt7Z9evyG7M,12950
 deltacat/tests/compute/test_compact_partition_params.py,sha256=Dm5eLyHo8oGMeO3XBbpj1rZqHtPZ1hAB7z2qvzc4Lxk,8497
@@ -152,9 +152,10 @@ deltacat/tests/compute/compactor/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JC
 deltacat/tests/compute/compactor/utils/test_io.py,sha256=st5mlU4cVU-eQl7B4mvPgNA3izuNwbVawYOp-NcoyrI,4326
 deltacat/tests/compute/compactor/utils/test_round_completion_file.py,sha256=LAQ4usiRF4oTx4cA85L0eOcBa_Z-febc-CuzUijSGrI,7439
 deltacat/tests/compute/compactor_v2/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-deltacat/tests/compute/compactor_v2/test_compaction_session.py,sha256=0U8Hmu-qLvqXqLPBPS6qENc1ErolWAaAoUlwms2xLe8,23124
+deltacat/tests/compute/compactor_v2/test_compaction_session.py,sha256=y8nNHq9ADHENzUKMQYguB45zOD7F2lZgcBYYTvbTsdM,28957
 deltacat/tests/compute/compactor_v2/test_hashlib.py,sha256=8csF2hFWtBvY2MbX3-6iphCsVXxRp0zP1NTnKhfdmkg,328
 deltacat/tests/compute/compactor_v2/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py,sha256=aFb9rzT_EK9k8qAMHPtpqd5btyEmll1So1loDmZkotQ,1769
 deltacat/tests/compute/compactor_v2/utils/test_task_options.py,sha256=37DkR1u_XwhedV9cGed6FFuJTC0XmuiowHJIa_Op6uA,865
 deltacat/tests/compute/resource_estimation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/tests/compute/resource_estimation/test_delta.py,sha256=HCL2oUnCqm0E26T3HLJjMhoAsHTJIWPYGwIKRgM_H7E,25712
@@ -171,7 +172,7 @@ deltacat/tests/local_deltacat_storage/__init__.py,sha256=5T9ubNIS42-BotEH0yrUiWE
 deltacat/tests/local_deltacat_storage/exceptions.py,sha256=oxZ0psmrEO0M6P2r8gHQ2E8E-Y8UBfUCBUIwfuHcx38,251
 deltacat/tests/test_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/tests/test_utils/constants.py,sha256=UYe--9T_clYjiOpv0M7TtAMGdpje_SMZ-w8n0IeCAjc,214
-deltacat/tests/test_utils/pyarrow.py,sha256=pzTBk07xMaAfykXo3GNGwTqaQxrKnSbr-WO3HBszikI,2828
+deltacat/tests/test_utils/pyarrow.py,sha256=QDdGilzsJ2xUESiGotdNVZde9yD7ja9MvNhhssnox-E,3083
 deltacat/tests/test_utils/storage.py,sha256=93GEn4A5WbMHWk0Ec4Bd7RxeHoSEnBfSarfWhKOSNtM,972
 deltacat/tests/test_utils/utils.py,sha256=a32qEwcSSd1lvRi0aJJ4ZLnc1ZyXmoQF_K95zaQRk2M,455
 deltacat/tests/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -179,7 +180,7 @@ deltacat/tests/utils/test_cloudpickle.py,sha256=J0pnBY3-PxlUh6MamZAN1PuquKQPr2iy
 deltacat/tests/utils/test_daft.py,sha256=kY8lkXoQvyWunok8UvOsh1An297rb3jcnstTuIAyAlc,8232
 deltacat/tests/utils/test_metrics.py,sha256=Ym9nOz1EtB180pLmvugihj1sDTNDMb5opIjjr5Nmcls,16339
 deltacat/tests/utils/test_placement.py,sha256=g61wVOMkHe4YJeR9Oxg_BOVQ6bhHHbC3IBYv8YhUu94,597
-deltacat/tests/utils/test_pyarrow.py,sha256=YDuyFYNjy6thzfA6Z2a0dOytUugsExu1uMUOhmP_aXc,29977
+deltacat/tests/utils/test_pyarrow.py,sha256=fDjDkGPjdRZA3kgjgiQRym9shdeDYgkdDPYU2a7IEUk,30790
 deltacat/tests/utils/test_record_batch_tables.py,sha256=AkG1WyljQmjnl-AxhbFWyo5LnMIKRyLScfgC2B_ES-s,11321
 deltacat/tests/utils/test_resources.py,sha256=HtpvDrfPZQNtGDXUlsIzc_yd7Vf1cDscZ3YbN0oTvO8,2560
 deltacat/tests/utils/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -200,7 +201,7 @@ deltacat/utils/numpy.py,sha256=SpHKKvC-K8NINTWGVfTZ5-gBFTGYqaXjjgKFhsdUjwg,2049
 deltacat/utils/pandas.py,sha256=q99mlRB7tymICMcNbfGLfLqFu_C-feyPZKZm2CWJJVc,9574
 deltacat/utils/performance.py,sha256=7ZLaMkS1ehPSIhT5uOQVBHvjC70iKHzoFquFo-KL0PI,645
 deltacat/utils/placement.py,sha256=Lj20fb-eq8rgMdm_M2MBMfDLwhDM1sS1nJj2DvIK56s,12060
-deltacat/utils/pyarrow.py,sha256=xEZRzbTBU6uj9K4DtvngIPtQkTA8haVgQ4Y4vjwHvtM,34311
+deltacat/utils/pyarrow.py,sha256=R3KkJPenE48rS3VrfFKSkJerX94f4e7X2dUPBQg44DY,34339
 deltacat/utils/resources.py,sha256=Ax1OgLLbZI4oYpp4Ki27OLaST-7I-AJgZwU87FVfY8g,8253
 deltacat/utils/s3fs.py,sha256=PmUJ5Fm1WmD-_zp_M6yd9VbXvIoJuBeK6ApOdJJApLE,662
 deltacat/utils/schema.py,sha256=m4Wm4ZQcpttzOUxex4dVneGlHy1_E36HspTcjNYzvVM,1564
@@ -210,8 +211,8 @@ deltacat/utils/ray_utils/concurrency.py,sha256=JDVwMiQWrmuSlyCWAoiq9ctoJ0XADEfDD
 deltacat/utils/ray_utils/dataset.py,sha256=waHdtH0c835a-2t51HYRHnulfC0_zBxx8mFSAPvPSPM,3274
 deltacat/utils/ray_utils/performance.py,sha256=d7JFM7vTXHzkGx9qNQcZzUWajnqINvYRwaM088_FpsE,464
 deltacat/utils/ray_utils/runtime.py,sha256=rB0A-tU9WZHz0J11LzJdANYtL397YyuemcA1l-K9dAw,5029
-deltacat-1.1.28.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-deltacat-1.1.28.dist-info/METADATA,sha256=-ZnMp9C26vVxi015Il3UtBMm9pHqA8aZmTnNTgr8Tb8,1733
-deltacat-1.1.28.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
-deltacat-1.1.28.dist-info/top_level.txt,sha256=RWdIcid4Bv2i2ozLVh-70kJpyB61xEKXod9XXGpiono,9
-deltacat-1.1.28.dist-info/RECORD,,
+deltacat-1.1.29.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+deltacat-1.1.29.dist-info/METADATA,sha256=ZverlgFUJV4wGJao8tusRCv_sRNX4KJ4RTNAGvBCJes,1733
+deltacat-1.1.29.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+deltacat-1.1.29.dist-info/top_level.txt,sha256=RWdIcid4Bv2i2ozLVh-70kJpyB61xEKXod9XXGpiono,9
+deltacat-1.1.29.dist-info/RECORD,,