deltacat 1.1.27__py3-none-any.whl → 1.1.29__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
deltacat/__init__.py CHANGED
@@ -44,7 +44,7 @@ from deltacat.types.tables import TableWriteMode
44
44
 
45
45
  deltacat.logs.configure_deltacat_logger(logging.getLogger(__name__))
46
46
 
47
- __version__ = "1.1.27"
47
+ __version__ = "1.1.29"
48
48
 
49
49
 
50
50
  __all__ = [
@@ -1,3 +1,5 @@
1
+ from deltacat.utils.common import env_bool, env_integer
2
+
1
3
  TOTAL_BYTES_IN_SHA1_HASH = 20
2
4
 
3
5
  PK_DELIMITER = "L6kl7u5f"
@@ -31,7 +33,9 @@ TOTAL_MEMORY_BUFFER_PERCENTAGE = 30
31
33
  # The total size of records that will be hash bucketed at once
32
34
  # Since sorting is O(n log n), we ensure that it is not performed
33
35
  # on a very large dataset for best performance.
34
- MAX_SIZE_OF_RECORD_BATCH_IN_GIB = 2 * 1024 * 1024 * 1024
36
+ MAX_SIZE_OF_RECORD_BATCH_IN_GIB = env_integer(
37
+ "MAX_SIZE_OF_RECORD_BATCH_IN_GIB", 2 * 1024 * 1024 * 1024
38
+ )
35
39
 
36
40
  # Whether to drop duplicates during merge.
37
41
  DROP_DUPLICATES = True
@@ -78,3 +82,13 @@ COMPACT_PARTITION_METRIC_PREFIX = "compact_partition"
78
82
  # Number of rounds to run hash/merge for a single
79
83
  # partition. (For large table support)
80
84
  DEFAULT_NUM_ROUNDS = 1
85
+
86
+ # Whether to perform sha1 hashing when required to
87
+ # optimize memory. For example, hashing is always
88
+ # required for bucketing, whereas it's not mandatory
89
+ # when dropping duplicates. Setting this to True
90
+ # will disable sha1 hashing in cases where it isn't
91
+ # mandatory. This flag is False by default.
92
+ SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED = env_bool(
93
+ "SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED", False
94
+ )
@@ -7,6 +7,7 @@ import ray
7
7
  import itertools
8
8
  import time
9
9
  import pyarrow.compute as pc
10
+ from deltacat.utils.pyarrow import MAX_INT_BYTES
10
11
  import deltacat.compute.compactor_v2.utils.merge as merge_utils
11
12
  from uuid import uuid4
12
13
  from deltacat import logs
@@ -147,10 +148,32 @@ def _merge_tables(
147
148
  if compacted_table:
148
149
  compacted_table = all_tables[0]
149
150
 
151
+ compacted_pk_hash_str = compacted_table[sc._PK_HASH_STRING_COLUMN_NAME]
152
+ incremental_pk_hash_str = incremental_table[sc._PK_HASH_STRING_COLUMN_NAME]
153
+
154
+ logger.info(
155
+ f"Size of compacted pk hash={compacted_pk_hash_str.nbytes} "
156
+ f"and incremental pk hash={incremental_pk_hash_str.nbytes}."
157
+ )
158
+
159
+ if (
160
+ compacted_table[sc._PK_HASH_STRING_COLUMN_NAME].nbytes >= MAX_INT_BYTES
161
+ or incremental_table[sc._PK_HASH_STRING_COLUMN_NAME].nbytes >= MAX_INT_BYTES
162
+ ):
163
+ logger.info("Casting compacted and incremental pk hash to large_string...")
164
+ # is_in combines the chunks of the chunked array passed which can cause
165
+ # ArrowCapacityError if the total size of string array is over 2GB.
166
+ # Using a large_string would resolve this issue.
167
+ # The cast here should be zero-copy in most cases.
168
+ compacted_pk_hash_str = pc.cast(compacted_pk_hash_str, pa.large_string())
169
+ incremental_pk_hash_str = pc.cast(
170
+ incremental_pk_hash_str, pa.large_string()
171
+ )
172
+
150
173
  records_to_keep = pc.invert(
151
174
  pc.is_in(
152
- compacted_table[sc._PK_HASH_STRING_COLUMN_NAME],
153
- incremental_table[sc._PK_HASH_STRING_COLUMN_NAME],
175
+ compacted_pk_hash_str,
176
+ incremental_pk_hash_str,
154
177
  )
155
178
  )
156
179
 
@@ -492,9 +515,11 @@ def _copy_manifests_from_hash_bucketing(
492
515
  def _timed_merge(input: MergeInput) -> MergeResult:
493
516
  task_id = get_current_ray_task_id()
494
517
  worker_id = get_current_ray_worker_id()
495
- with memray.Tracker(
496
- f"merge_{worker_id}_{task_id}.bin"
497
- ) if input.enable_profiler else nullcontext():
518
+ with (
519
+ memray.Tracker(f"merge_{worker_id}_{task_id}.bin")
520
+ if input.enable_profiler
521
+ else nullcontext()
522
+ ):
498
523
  total_input_records, total_deduped_records = 0, 0
499
524
  total_dropped_records = 0
500
525
  materialized_results: List[MaterializeResult] = []
@@ -25,7 +25,7 @@ def _create_chunked_index_array(array: pa.Array) -> pa.Array:
25
25
  result[index] = np.arange(cl, dtype="int32")
26
26
 
27
27
  chunk_lengths = ([0] + chunk_lengths)[:-1]
28
- result = pa.chunked_array(result + np.cumsum(chunk_lengths))
28
+ result = pa.chunked_array(result + np.cumsum(chunk_lengths), type=pa.int32())
29
29
  return result
30
30
 
31
31
 
@@ -10,6 +10,7 @@ from deltacat.compute.compactor_v2.constants import (
10
10
  TOTAL_BYTES_IN_SHA1_HASH,
11
11
  PK_DELIMITER,
12
12
  MAX_SIZE_OF_RECORD_BATCH_IN_GIB,
13
+ SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED,
13
14
  )
14
15
  import time
15
16
  from deltacat.compute.compactor.model.delta_file_envelope import DeltaFileEnvelope
@@ -48,6 +49,13 @@ def _is_sha1_desired(hash_columns: List[pa.Array]) -> bool:
48
49
  f"Found total length of hash column={total_len} and total_size={total_size}"
49
50
  )
50
51
 
52
+ if SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED:
53
+ logger.info(
54
+ f"SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED is True. "
55
+ f"Returning False for is_sha1_desired"
56
+ )
57
+ return False
58
+
51
59
  return total_size > TOTAL_BYTES_IN_SHA1_HASH * total_len
52
60
 
53
61
 
@@ -108,9 +116,10 @@ def _optimized_group_record_batches_by_hash_bucket(
108
116
  record_batches = []
109
117
  result_len = 0
110
118
  for record_batch in table_batches:
111
- current_bytes += record_batch.nbytes
112
- record_batches.append(record_batch)
113
- if current_bytes >= MAX_SIZE_OF_RECORD_BATCH_IN_GIB:
119
+ if (
120
+ record_batches
121
+ and current_bytes + record_batch.nbytes >= MAX_SIZE_OF_RECORD_BATCH_IN_GIB
122
+ ):
114
123
  logger.info(
115
124
  f"Total number of record batches without exceeding {MAX_SIZE_OF_RECORD_BATCH_IN_GIB} "
116
125
  f"is {len(record_batches)} and size {current_bytes}"
@@ -128,6 +137,9 @@ def _optimized_group_record_batches_by_hash_bucket(
128
137
  current_bytes = 0
129
138
  record_batches.clear()
130
139
 
140
+ current_bytes += record_batch.nbytes
141
+ record_batches.append(record_batch)
142
+
131
143
  if record_batches:
132
144
  appended_len, append_latency = timed_invocation(
133
145
  _append_table_by_hash_bucket,
@@ -601,6 +601,38 @@ INCREMENTAL_TEST_CASES: Dict[str, IncrementalCompactionTestCaseParams] = {
601
601
  skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
602
602
  assert_compaction_audit=None,
603
603
  ),
604
+ "15-incremental-empty-input-with-single-hash-bucket": IncrementalCompactionTestCaseParams(
605
+ primary_keys={"pk_col_1"},
606
+ sort_keys=[SortKey.of(key_name="sk_col_1")],
607
+ partition_keys=ZERO_VALUED_PARTITION_KEYS_PARAM,
608
+ partition_values=ZERO_VALUED_PARTITION_VALUES_PARAM,
609
+ input_deltas=pa.Table.from_arrays(
610
+ [
611
+ pa.array([]),
612
+ pa.array([]),
613
+ ],
614
+ names=["pk_col_1", "sk_col_1"],
615
+ ),
616
+ input_deltas_delta_type=DeltaType.UPSERT,
617
+ expected_terminal_compact_partition_result=pa.Table.from_arrays(
618
+ [
619
+ pa.array([]),
620
+ pa.array([]),
621
+ ],
622
+ names=["pk_col_1", "sk_col_1"],
623
+ ),
624
+ expected_terminal_exception=None,
625
+ expected_terminal_exception_message=None,
626
+ do_create_placement_group=False,
627
+ records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
628
+ hash_bucket_count=1,
629
+ read_kwargs_provider=None,
630
+ drop_duplicates=True,
631
+ is_inplace=False,
632
+ add_late_deltas=None,
633
+ skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
634
+ assert_compaction_audit=assert_compaction_audit_no_hash_bucket,
635
+ ),
604
636
  }
605
637
 
606
638
  INCREMENTAL_TEST_CASES = with_compactor_version_func_test_param(INCREMENTAL_TEST_CASES)
@@ -1,6 +1,7 @@
1
1
  from typing import Dict, Any
2
2
  import ray
3
3
  import os
4
+ import pyarrow as pa
4
5
  import pytest
5
6
  import boto3
6
7
  from deltacat.compute.compactor.model.compaction_session_audit_info import (
@@ -76,6 +77,17 @@ def local_deltacat_storage_kwargs(request: pytest.FixtureRequest):
76
77
  os.remove(DATABASE_FILE_PATH_VALUE)
77
78
 
78
79
 
80
+ @pytest.fixture(scope="function")
81
+ def disable_sha1(monkeypatch):
82
+ import deltacat.compute.compactor_v2.utils.primary_key_index
83
+
84
+ monkeypatch.setattr(
85
+ deltacat.compute.compactor_v2.utils.primary_key_index,
86
+ "SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED",
87
+ True,
88
+ )
89
+
90
+
79
91
  class TestCompactionSession:
80
92
  """
81
93
  This class adds specific tests that aren't part of the parametrized test suite.
@@ -556,3 +568,124 @@ class TestCompactionSession:
556
568
  }
557
569
  )
558
570
  )
571
+
572
+ def test_compact_partition_when_incremental_pk_hash_is_over_2gb(
573
+ self, s3_resource, local_deltacat_storage_kwargs, disable_sha1
574
+ ):
575
+ """
576
+ A test case which ensures the compaction succeeds even if the incremental
577
+ arrow table size is over 2GB. It is added to prevent ArrowCapacityError
578
+ when running is_in operation during merge.
579
+
580
+ Note that we set SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED to bypass sha1 hashing
581
+ which truncates the lengths of pk strings when deduping.
582
+ """
583
+ # setup
584
+ staged_source = stage_partition_from_file_paths(
585
+ self.NAMESPACE, ["source"], **local_deltacat_storage_kwargs
586
+ )
587
+ # we create chunked array to avoid ArrowCapacityError
588
+ chunked_pk_array = pa.chunked_array([["13bytesstring"], ["12bytestring"]])
589
+ table = pa.table([chunked_pk_array], names=["pk"])
590
+ source_delta = commit_delta_to_staged_partition(
591
+ staged_source, pa_table=table, **local_deltacat_storage_kwargs
592
+ )
593
+
594
+ staged_dest = stage_partition_from_file_paths(
595
+ self.NAMESPACE, ["destination"], **local_deltacat_storage_kwargs
596
+ )
597
+ dest_partition = ds.commit_partition(
598
+ staged_dest, **local_deltacat_storage_kwargs
599
+ )
600
+
601
+ # rebase first
602
+ rebase_url = compact_partition(
603
+ CompactPartitionParams.of(
604
+ {
605
+ "compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
606
+ "compacted_file_content_type": ContentType.PARQUET,
607
+ "dd_max_parallelism_ratio": 1.0,
608
+ "deltacat_storage": ds,
609
+ "deltacat_storage_kwargs": local_deltacat_storage_kwargs,
610
+ "destination_partition_locator": dest_partition.locator,
611
+ "drop_duplicates": True,
612
+ "hash_bucket_count": 1,
613
+ "last_stream_position_to_compact": source_delta.stream_position,
614
+ "list_deltas_kwargs": {
615
+ **local_deltacat_storage_kwargs,
616
+ **{"equivalent_table_types": []},
617
+ },
618
+ "primary_keys": ["pk"],
619
+ "rebase_source_partition_locator": source_delta.partition_locator,
620
+ "rebase_source_partition_high_watermark": source_delta.stream_position,
621
+ "records_per_compacted_file": 4000,
622
+ "s3_client_kwargs": {},
623
+ "source_partition_locator": source_delta.partition_locator,
624
+ "resource_estimation_method": ResourceEstimationMethod.PREVIOUS_INFLATION,
625
+ }
626
+ )
627
+ )
628
+
629
+ rebased_rcf = get_rcf(s3_resource, rebase_url)
630
+
631
+ assert rebased_rcf.compacted_pyarrow_write_result.files == 1
632
+ assert rebased_rcf.compacted_pyarrow_write_result.records == 2
633
+
634
+ # Run incremental with a large (~2.3GB) delta on source
635
+ chunked_pk_array = pa.chunked_array(
636
+ [["13bytesstring" * 95_000_000], ["12bytestring" * 95_000_000]]
637
+ ) # 2.3GB
638
+ table = pa.table([chunked_pk_array], names=["pk"])
639
+
640
+ incremental_source_delta = commit_delta_to_partition(
641
+ source_delta.partition_locator,
642
+ pa_table=table,
643
+ **local_deltacat_storage_kwargs,
644
+ )
645
+ assert (
646
+ incremental_source_delta.partition_locator == source_delta.partition_locator
647
+ ), "source partition locator should not change"
648
+ dest_partition = ds.get_partition(
649
+ dest_partition.stream_locator,
650
+ dest_partition.partition_values,
651
+ **local_deltacat_storage_kwargs,
652
+ )
653
+
654
+ assert (
655
+ dest_partition.locator
656
+ == rebased_rcf.compacted_delta_locator.partition_locator
657
+ ), "The new destination partition should be same as compacted partition"
658
+
659
+ # Run incremental
660
+ incremental_url = compact_partition(
661
+ CompactPartitionParams.of(
662
+ {
663
+ "compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
664
+ "compacted_file_content_type": ContentType.PARQUET,
665
+ "dd_max_parallelism_ratio": 1.0,
666
+ "deltacat_storage": ds,
667
+ "deltacat_storage_kwargs": local_deltacat_storage_kwargs,
668
+ "destination_partition_locator": dest_partition.locator,
669
+ "drop_duplicates": True,
670
+ "hash_bucket_count": 1,
671
+ "last_stream_position_to_compact": incremental_source_delta.stream_position,
672
+ "list_deltas_kwargs": {
673
+ **local_deltacat_storage_kwargs,
674
+ **{"equivalent_table_types": []},
675
+ },
676
+ "primary_keys": ["pk"],
677
+ "records_per_compacted_file": 4000,
678
+ "s3_client_kwargs": {},
679
+ "source_partition_locator": incremental_source_delta.partition_locator,
680
+ "resource_estimation_method": ResourceEstimationMethod.PREVIOUS_INFLATION,
681
+ }
682
+ )
683
+ )
684
+
685
+ incremental_rcf = get_rcf(s3_resource, incremental_url)
686
+
687
+ assert incremental_rcf.compacted_pyarrow_write_result.files == 1
688
+ assert (
689
+ incremental_rcf.compacted_pyarrow_write_result.pyarrow_bytes >= 2300000000
690
+ )
691
+ assert incremental_rcf.compacted_pyarrow_write_result.records == 4
@@ -0,0 +1,45 @@
1
+ import pyarrow as pa
2
+ from deltacat.compute.compactor_v2.utils.primary_key_index import (
3
+ group_by_pk_hash_bucket,
4
+ )
5
+
6
+
7
+ class TestGroupByPkHashBucket:
8
+ def test_sanity(self):
9
+ record = pa.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
10
+ pk = pa.array(["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"])
11
+ record_batch = pa.RecordBatch.from_arrays([record, pk], names=["record", "pk"])
12
+ table = pa.Table.from_batches([record_batch])
13
+ grouped_array = group_by_pk_hash_bucket(table, 3, ["pk"])
14
+
15
+ assert len(grouped_array) == 3
16
+ total_records = 0
17
+ for arr in grouped_array:
18
+ if arr is not None:
19
+ total_records += len(arr[1])
20
+
21
+ assert total_records == len(table)
22
+
23
+ def test_when_record_batches_exceed_int_max_size(self):
24
+ record = pa.array(["12bytestring" * 90_000_000])
25
+ record_batch = pa.RecordBatch.from_arrays([record], names=["pk"])
26
+ table = pa.Table.from_batches([record_batch, record_batch])
27
+
28
+ grouped_array = group_by_pk_hash_bucket(table, 3, ["pk"])
29
+
30
+ assert len(grouped_array) == 3
31
+ # two record batches are preserved as combining them
32
+ # would exceed 2GB.
33
+ assert len(grouped_array[2].to_batches()) == 2
34
+
35
+ def test_when_record_batches_less_than_int_max_size(self):
36
+ record = pa.array(["12bytestring" * 90_000])
37
+ record_batch = pa.RecordBatch.from_arrays([record], names=["pk"])
38
+ table = pa.Table.from_batches([record_batch, record_batch])
39
+
40
+ grouped_array = group_by_pk_hash_bucket(table, 3, ["pk"])
41
+
42
+ assert len(grouped_array) == 3
43
+ # Combined the arrays into one record batch as the size
44
+ # would not exceed 2GB.
45
+ assert len(grouped_array[1].to_batches()) == 1
@@ -47,7 +47,8 @@ def stage_partition_from_file_paths(
47
47
 
48
48
  def commit_delta_to_staged_partition(
49
49
  staged_partition,
50
- file_paths: List[str],
50
+ file_paths: List[str] = None,
51
+ pa_table: pa.Table = None,
51
52
  content_type: ContentType = ContentType.PARQUET,
52
53
  *args,
53
54
  **kwargs,
@@ -57,6 +58,7 @@ def commit_delta_to_staged_partition(
57
58
  *args,
58
59
  file_paths=file_paths,
59
60
  content_type=content_type,
61
+ pa_table=pa_table,
60
62
  **kwargs,
61
63
  )
62
64
  ds.commit_partition(staged_partition, **kwargs)
@@ -76,23 +78,28 @@ def download_delta(delta_like: Union[Delta, DeltaLocator], *args, **kwargs) -> D
76
78
 
77
79
  def commit_delta_to_partition(
78
80
  partition: Union[Partition, PartitionLocator],
79
- file_paths: List[str],
81
+ file_paths: List[str] = None,
82
+ pa_table: pa.Table = None,
80
83
  content_type: ContentType = ContentType.PARQUET,
81
84
  *args,
82
85
  **kwargs,
83
86
  ) -> Delta:
84
- tables = []
85
87
 
86
88
  if isinstance(partition, PartitionLocator):
87
89
  partition = ds.get_partition(
88
90
  partition.stream_locator, partition.partition_values, *args, **kwargs
89
91
  )
92
+ if pa_table is None:
93
+ assert file_paths is not None, "One of pa_table or file_paths must be passed."
94
+ tables = []
95
+ for file_path in file_paths:
96
+ table = pa.csv.read_csv(file_path)
97
+ tables.append(table)
90
98
 
91
- for file_path in file_paths:
92
- table = pa.csv.read_csv(file_path)
93
- tables.append(table)
99
+ pa_table = pa.concat_tables(tables)
94
100
 
95
- table = pa.concat_tables(tables)
96
- staged_delta = ds.stage_delta(table, partition, content_type=content_type, **kwargs)
101
+ staged_delta = ds.stage_delta(
102
+ pa_table, partition, content_type=content_type, **kwargs
103
+ )
97
104
 
98
105
  return ds.commit_delta(staged_delta, **kwargs)
@@ -7,15 +7,24 @@ from deltacat.utils.pyarrow import (
7
7
  s3_file_to_table,
8
8
  ReadKwargsProviderPyArrowSchemaOverride,
9
9
  RAISE_ON_EMPTY_CSV_KWARG,
10
+ RAISE_ON_DECIMAL_OVERFLOW,
10
11
  )
12
+ import decimal
11
13
  from deltacat.types.media import ContentEncoding, ContentType
12
14
  from deltacat.types.partial_download import PartialParquetParameters
13
15
  from pyarrow.parquet import ParquetFile
14
16
  import pyarrow as pa
15
17
 
16
18
  PARQUET_FILE_PATH = "deltacat/tests/utils/data/test_file.parquet"
19
+ PARQUET_GZIP_COMPRESSED_FILE_PATH = "deltacat/tests/utils/data/test_file.parquet.gz"
17
20
  EMPTY_UTSV_PATH = "deltacat/tests/utils/data/empty.csv"
18
21
  NON_EMPTY_VALID_UTSV_PATH = "deltacat/tests/utils/data/non_empty_valid.csv"
22
+ OVERFLOWING_DECIMAL_PRECISION_UTSV_PATH = (
23
+ "deltacat/tests/utils/data/overflowing_decimal_precision.csv"
24
+ )
25
+ OVERFLOWING_DECIMAL_SCALE_UTSV_PATH = (
26
+ "deltacat/tests/utils/data/overflowing_decimal_scale.csv"
27
+ )
19
28
  GZIP_COMPRESSED_FILE_UTSV_PATH = "deltacat/tests/utils/data/non_empty_compressed.gz"
20
29
  BZ2_COMPRESSED_FILE_UTSV_PATH = "deltacat/tests/utils/data/non_empty_compressed.bz2"
21
30
 
@@ -407,6 +416,253 @@ class TestReadCSV(TestCase):
407
416
  ),
408
417
  )
409
418
 
419
+ def test_read_csv_when_decimal_precision_overflows_and_raise_kwarg_specified(self):
420
+ schema = pa.schema(
421
+ [("is_active", pa.string()), ("decimal_value", pa.decimal128(4, 2))]
422
+ )
423
+ kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
424
+ _add_column_kwargs(
425
+ ContentType.UNESCAPED_TSV.value,
426
+ ["is_active", "decimal_value"],
427
+ ["is_active", "decimal_value"],
428
+ kwargs,
429
+ )
430
+ read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
431
+
432
+ kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
433
+ self.assertRaises(
434
+ pa.lib.ArrowInvalid,
435
+ lambda: pyarrow_read_csv(
436
+ OVERFLOWING_DECIMAL_PRECISION_UTSV_PATH,
437
+ **{**kwargs, RAISE_ON_DECIMAL_OVERFLOW: True}
438
+ ),
439
+ )
440
+
441
+ def test_read_csv_when_decimal_precision_overflows_sanity(self):
442
+ schema = pa.schema(
443
+ [("is_active", pa.string()), ("decimal_value", pa.decimal128(4, 2))]
444
+ )
445
+ kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
446
+ _add_column_kwargs(
447
+ ContentType.UNESCAPED_TSV.value,
448
+ ["is_active", "decimal_value"],
449
+ ["is_active", "decimal_value"],
450
+ kwargs,
451
+ )
452
+
453
+ read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
454
+
455
+ kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
456
+
457
+ self.assertRaises(
458
+ pa.lib.ArrowInvalid,
459
+ lambda: pyarrow_read_csv(OVERFLOWING_DECIMAL_PRECISION_UTSV_PATH, **kwargs),
460
+ )
461
+
462
+ def test_read_csv_when_decimal_scale_overflows_and_raise_kwarg_specified(self):
463
+ schema = pa.schema(
464
+ [("is_active", pa.string()), ("decimal_value", pa.decimal128(20, 2))]
465
+ )
466
+ kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
467
+ _add_column_kwargs(
468
+ ContentType.UNESCAPED_TSV.value,
469
+ ["is_active", "decimal_value"],
470
+ ["is_active", "decimal_value"],
471
+ kwargs,
472
+ )
473
+ read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
474
+
475
+ kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
476
+
477
+ self.assertRaises(
478
+ pa.lib.ArrowInvalid,
479
+ lambda: pyarrow_read_csv(
480
+ OVERFLOWING_DECIMAL_SCALE_UTSV_PATH,
481
+ **{**kwargs, RAISE_ON_DECIMAL_OVERFLOW: True}
482
+ ),
483
+ )
484
+
485
+ def test_read_csv_when_decimal_scale_overflows_sanity(self):
486
+ schema = pa.schema(
487
+ [("is_active", pa.string()), ("decimal_value", pa.decimal128(20, 2))]
488
+ )
489
+ kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
490
+ _add_column_kwargs(
491
+ ContentType.UNESCAPED_TSV.value,
492
+ ["is_active", "decimal_value"],
493
+ ["is_active", "decimal_value"],
494
+ kwargs,
495
+ )
496
+
497
+ read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
498
+
499
+ kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
500
+
501
+ result = pyarrow_read_csv(OVERFLOWING_DECIMAL_SCALE_UTSV_PATH, **kwargs)
502
+
503
+ self.assertEqual(len(result), 3)
504
+ self.assertEqual(
505
+ result[1][0].as_py(), decimal.Decimal("322236.66")
506
+ ) # rounding decimal
507
+ self.assertEqual(result[1][1].as_py(), decimal.Decimal("32.33")) # not rounded
508
+ self.assertEqual(len(result.column_names), 2)
509
+ result_schema = result.schema
510
+ self.assertEqual(result_schema.field(0).type, "string")
511
+ self.assertEqual(result_schema.field(1).type, pa.decimal128(20, 2))
512
+
513
+ def test_read_csv_when_decimal_scale_overflows_and_negative_scale(self):
514
+ schema = pa.schema(
515
+ [("is_active", pa.string()), ("decimal_value", pa.decimal128(20, -2))]
516
+ )
517
+ kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
518
+ _add_column_kwargs(
519
+ ContentType.UNESCAPED_TSV.value,
520
+ ["is_active", "decimal_value"],
521
+ ["is_active", "decimal_value"],
522
+ kwargs,
523
+ )
524
+
525
+ read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
526
+
527
+ kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
528
+
529
+ result = pyarrow_read_csv(OVERFLOWING_DECIMAL_SCALE_UTSV_PATH, **kwargs)
530
+
531
+ self.assertEqual(len(result), 3)
532
+ self.assertEqual(
533
+ result[1][0].as_py(),
534
+ decimal.Decimal("322200"), # consequence of negative scale
535
+ ) # rounding decimal
536
+ self.assertEqual(result[1][1].as_py(), decimal.Decimal("00"))
537
+ self.assertEqual(len(result.column_names), 2)
538
+ result_schema = result.schema
539
+ self.assertEqual(result_schema.field(0).type, "string")
540
+ self.assertEqual(result_schema.field(1).type, pa.decimal128(20, -2))
541
+
542
+ def test_read_csv_when_decimal_scale_overflows_with_decimal256(self):
543
+ schema = pa.schema(
544
+ [("is_active", pa.string()), ("decimal_value", pa.decimal256(20, 2))]
545
+ )
546
+ kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
547
+ _add_column_kwargs(
548
+ ContentType.UNESCAPED_TSV.value,
549
+ ["is_active", "decimal_value"],
550
+ ["is_active", "decimal_value"],
551
+ kwargs,
552
+ )
553
+
554
+ read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
555
+
556
+ kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
557
+
558
+ result = pyarrow_read_csv(OVERFLOWING_DECIMAL_SCALE_UTSV_PATH, **kwargs)
559
+
560
+ self.assertEqual(len(result), 3)
561
+ self.assertEqual(
562
+ result[1][0].as_py(), decimal.Decimal("322236.66")
563
+ ) # rounding decimal
564
+ self.assertEqual(result[1][1].as_py(), decimal.Decimal("32.33")) # not rounded
565
+ self.assertEqual(len(result.column_names), 2)
566
+ result_schema = result.schema
567
+ self.assertEqual(result_schema.field(0).type, "string")
568
+ self.assertEqual(result_schema.field(1).type, pa.decimal256(20, 2))
569
+
570
+ def test_read_csv_when_decimal_scale_overflows_with_decimal256_and_raise_on_overflow(
571
+ self,
572
+ ):
573
+ schema = pa.schema(
574
+ [("is_active", pa.string()), ("decimal_value", pa.decimal256(20, 2))]
575
+ )
576
+ kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
577
+ _add_column_kwargs(
578
+ ContentType.UNESCAPED_TSV.value,
579
+ ["is_active", "decimal_value"],
580
+ ["is_active", "decimal_value"],
581
+ kwargs,
582
+ )
583
+
584
+ read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
585
+
586
+ kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
587
+
588
+ self.assertRaises(
589
+ pa.lib.ArrowNotImplementedError,
590
+ lambda: pyarrow_read_csv(
591
+ OVERFLOWING_DECIMAL_SCALE_UTSV_PATH,
592
+ **{**kwargs, RAISE_ON_DECIMAL_OVERFLOW: True}
593
+ ),
594
+ )
595
+
596
+ def test_read_csv_when_decimal_scale_overflows_without_any_schema_then_infers(self):
597
+ kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
598
+ read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=None)
599
+ kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
600
+
601
+ result = pyarrow_read_csv(OVERFLOWING_DECIMAL_SCALE_UTSV_PATH, **kwargs)
602
+
603
+ # The default behavior of pyarrow is to skip invalid rows
604
+ self.assertEqual(len(result), 2)
605
+ self.assertEqual(result[1][0].as_py(), 32.33) # rounding decimal
606
+ self.assertEqual(result[1][1].as_py(), 0.4) # not rounded
607
+ self.assertEqual(len(result.column_names), 2)
608
+ result_schema = result.schema
609
+ self.assertEqual(result_schema.field(0).type, "string")
610
+ self.assertEqual(result_schema.field(1).type, pa.float64())
611
+
612
+ def test_read_csv_when_decimal_scale_and_precision_overflow_and_raise_on_overflow(
613
+ self,
614
+ ):
615
+ schema = pa.schema(
616
+ [("is_active", pa.string()), ("decimal_value", pa.decimal128(5, 2))]
617
+ )
618
+ kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
619
+ _add_column_kwargs(
620
+ ContentType.UNESCAPED_TSV.value,
621
+ ["is_active", "decimal_value"],
622
+ ["is_active", "decimal_value"],
623
+ kwargs,
624
+ )
625
+
626
+ read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
627
+
628
+ kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
629
+
630
+ self.assertRaises(
631
+ pa.lib.ArrowInvalid,
632
+ lambda: pyarrow_read_csv(OVERFLOWING_DECIMAL_SCALE_UTSV_PATH, **kwargs),
633
+ )
634
+
635
+ def test_read_csv_when_decimal_scale_overflow_and_file_like_obj_passed(self):
636
+ schema = pa.schema(
637
+ [("is_active", pa.string()), ("decimal_value", pa.decimal128(15, 2))]
638
+ )
639
+ kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
640
+ _add_column_kwargs(
641
+ ContentType.UNESCAPED_TSV.value,
642
+ ["is_active", "decimal_value"],
643
+ ["is_active", "decimal_value"],
644
+ kwargs,
645
+ )
646
+
647
+ read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
648
+
649
+ kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
650
+
651
+ with open(OVERFLOWING_DECIMAL_SCALE_UTSV_PATH, "rb") as file:
652
+ result = pyarrow_read_csv(file, **kwargs)
653
+
654
+ self.assertEqual(len(result), 3)
655
+ self.assertEqual(
656
+ result[1][0].as_py(), decimal.Decimal("322236.66")
657
+ ) # rounding decimal
658
+ self.assertEqual(
659
+ result[1][1].as_py(), decimal.Decimal("32.33")
660
+ ) # not rounded
661
+ self.assertEqual(len(result.column_names), 2)
662
+ result_schema = result.schema
663
+ self.assertEqual(result_schema.field(0).type, "string")
664
+ self.assertEqual(result_schema.field(1).type, pa.decimal128(15, 2))
665
+
410
666
 
411
667
  class TestS3FileToTable(TestCase):
412
668
  def test_s3_file_to_table_identity_sanity(self):
@@ -534,3 +790,25 @@ class TestS3FileToTable(TestCase):
534
790
  self.assertEqual(field.name, schema.field(index).name)
535
791
 
536
792
  self.assertEqual(result.schema.field(1).type, "string")
793
+
794
+ def test_s3_file_to_table_when_parquet_gzip(self):
795
+
796
+ pa_kwargs_provider = lambda content_type, kwargs: {
797
+ "reader_type": "pyarrow",
798
+ **kwargs,
799
+ }
800
+
801
+ result = s3_file_to_table(
802
+ PARQUET_GZIP_COMPRESSED_FILE_PATH,
803
+ ContentType.PARQUET.value,
804
+ ContentEncoding.GZIP.value,
805
+ ["n_legs", "animal"],
806
+ ["n_legs"],
807
+ pa_read_func_kwargs_provider=pa_kwargs_provider,
808
+ )
809
+
810
+ self.assertEqual(len(result), 6)
811
+ self.assertEqual(len(result.column_names), 1)
812
+ schema = result.schema
813
+ schema_index = schema.get_field_index("n_legs")
814
+ self.assertEqual(schema.field(schema_index).type, "int64")
deltacat/utils/pyarrow.py CHANGED
@@ -1,6 +1,7 @@
1
1
  # Allow classes to use self-referencing Type hints in Python 3.7.
2
2
  from __future__ import annotations
3
3
 
4
+ import copy
4
5
  import bz2
5
6
  import gzip
6
7
  import io
@@ -47,6 +48,18 @@ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
47
48
  RAISE_ON_EMPTY_CSV_KWARG = "raise_on_empty_csv"
48
49
  READER_TYPE_KWARG = "reader_type"
49
50
 
51
+ """
52
+ By default, round decimal values using half_to_even round mode when
53
+ rescaling a decimal to the given scale and precision in the schema would cause
54
+ data loss. Setting any non-null value of this argument will result
55
+ in an error instead.
56
+ """
57
+ RAISE_ON_DECIMAL_OVERFLOW = "raise_on_decimal_overflow"
58
+ # Note the maximum from https://arrow.apache.org/docs/python/generated/pyarrow.Decimal256Type.html#pyarrow.Decimal256Type
59
+ DECIMAL256_DEFAULT_SCALE = 38
60
+ DECIMAL256_MAX_PRECISION = 76
61
+ MAX_INT_BYTES = 2147483646
62
+
50
63
 
51
64
  def _filter_schema_for_columns(schema: pa.Schema, columns: List[str]) -> pa.Schema:
52
65
 
@@ -64,45 +77,164 @@ def _filter_schema_for_columns(schema: pa.Schema, columns: List[str]) -> pa.Sche
64
77
  return target_schema
65
78
 
66
79
 
67
- def pyarrow_read_csv(*args, **kwargs) -> pa.Table:
68
- try:
69
- new_kwargs = sanitize_kwargs_by_supported_kwargs(
70
- ["read_options", "parse_options", "convert_options", "memory_pool"], kwargs
80
+ def _extract_arrow_schema_from_read_csv_kwargs(kwargs: Dict[str, Any]) -> pa.Schema:
81
+ schema = None
82
+ if (
83
+ "convert_options" in kwargs
84
+ and kwargs["convert_options"].column_types is not None
85
+ ):
86
+ schema = kwargs["convert_options"].column_types
87
+ if not isinstance(schema, pa.Schema):
88
+ schema = pa.schema(schema)
89
+ if kwargs["convert_options"].include_columns:
90
+ schema = _filter_schema_for_columns(
91
+ schema, kwargs["convert_options"].include_columns
92
+ )
93
+ elif (
94
+ kwargs.get("read_options") is not None
95
+ and kwargs["read_options"].column_names
96
+ ):
97
+ schema = _filter_schema_for_columns(
98
+ schema, kwargs["read_options"].column_names
99
+ )
100
+ else:
101
+ logger.debug(
102
+ "Schema not specified in the kwargs."
103
+ " Hence, schema could not be inferred from the empty CSV."
71
104
  )
105
+
106
+ return schema
107
+
108
+
109
+ def _new_schema_with_replaced_fields(
110
+ schema: pa.Schema, field_to_replace: Callable[[pa.Field], Optional[pa.Field]]
111
+ ) -> pa.Schema:
112
+ if schema is None:
113
+ return None
114
+
115
+ new_schema_fields = []
116
+ for field in schema:
117
+ new_field = field_to_replace(field)
118
+ if new_field is not None:
119
+ new_schema_fields.append(new_field)
120
+ else:
121
+ new_schema_fields.append(field)
122
+
123
+ return pa.schema(new_schema_fields, metadata=schema.metadata)
124
+
125
+
126
+ def _read_csv_rounding_decimal_columns_to_fit_scale(
127
+ schema: pa.Schema, reader_args: List[Any], reader_kwargs: Dict[str, Any]
128
+ ) -> pa.Table:
129
+ # Note: We read decimals as strings first because CSV
130
+ # conversion to decimal256 isn't implemented as of pyarrow==12.0.1
131
+ new_schema = _new_schema_with_replaced_fields(
132
+ schema,
133
+ lambda fld: (
134
+ pa.field(fld.name, pa.string(), metadata=fld.metadata)
135
+ if pa.types.is_decimal128(fld.type) or pa.types.is_decimal256(fld.type)
136
+ else None
137
+ ),
138
+ )
139
+ new_kwargs = sanitize_kwargs_by_supported_kwargs(
140
+ ["read_options", "parse_options", "convert_options", "memory_pool"],
141
+ reader_kwargs,
142
+ )
143
+ # Creating a shallow copy for efficiency
144
+ new_convert_options = copy.copy(new_kwargs["convert_options"])
145
+ new_convert_options.column_types = new_schema
146
+ new_reader_kwargs = {**new_kwargs, "convert_options": new_convert_options}
147
+ arrow_table = pacsv.read_csv(*reader_args, **new_reader_kwargs)
148
+
149
+ for column_index, field in enumerate(schema):
150
+ if pa.types.is_decimal128(field.type) or pa.types.is_decimal256(field.type):
151
+ column_array = arrow_table[field.name]
152
+ # We always cast to decimal256 to accommodate fixed scale of 38
153
+ cast_to_type = pa.decimal256(
154
+ DECIMAL256_MAX_PRECISION, DECIMAL256_DEFAULT_SCALE
155
+ )
156
+ casted_decimal_array = pc.cast(column_array, cast_to_type)
157
+ # Note that scale can be negative
158
+ rounded_column_array = pc.round(
159
+ casted_decimal_array, ndigits=field.type.scale
160
+ )
161
+ final_decimal_array = pc.cast(rounded_column_array, field.type)
162
+ arrow_table = arrow_table.set_column(
163
+ column_index,
164
+ field,
165
+ final_decimal_array,
166
+ )
167
+ logger.debug(
168
+ f"Rounded decimal column: {field.name} to {field.type.scale} scale and"
169
+ f" {field.type.precision} precision"
170
+ )
171
+
172
+ return arrow_table
173
+
174
+
175
+ def pyarrow_read_csv_default(*args, **kwargs):
176
+ new_kwargs = sanitize_kwargs_by_supported_kwargs(
177
+ ["read_options", "parse_options", "convert_options", "memory_pool"], kwargs
178
+ )
179
+
180
+ try:
72
181
  return pacsv.read_csv(*args, **new_kwargs)
73
182
  except pa.lib.ArrowInvalid as e:
74
- if e.__str__() == "Empty CSV file" and not kwargs.get(RAISE_ON_EMPTY_CSV_KWARG):
75
- schema = None
76
- if (
77
- "convert_options" in kwargs
78
- and kwargs["convert_options"].column_types is not None
79
- ):
80
- schema = kwargs["convert_options"].column_types
81
- if not isinstance(schema, pa.Schema):
82
- schema = pa.schema(schema)
83
- if kwargs["convert_options"].include_columns:
84
- schema = _filter_schema_for_columns(
85
- schema, kwargs["convert_options"].include_columns
86
- )
87
- elif (
88
- kwargs.get("read_options") is not None
89
- and kwargs["read_options"].column_names
183
+ error_str = e.__str__()
184
+ schema = _extract_arrow_schema_from_read_csv_kwargs(kwargs)
185
+
186
+ if error_str == "Empty CSV file" and not kwargs.get(RAISE_ON_EMPTY_CSV_KWARG):
187
+ logger.debug(f"Read CSV empty schema being used: {schema}")
188
+ return pa.Table.from_pylist([], schema=schema)
189
+ if not kwargs.get(RAISE_ON_DECIMAL_OVERFLOW):
190
+ # Note, this logic requires expensive casting. To prevent degrading performance
191
+ # for happy path reads, we are handling this case in response to an error.
192
+ logger.warning(
193
+ "Rescaling Decimal to the given scale in the schema. "
194
+ f"Original error: {error_str}"
195
+ )
196
+
197
+ if schema is not None and "convert_options" in kwargs:
198
+ if (
199
+ "Rescaling Decimal" in error_str
200
+ and "value would cause data loss" in error_str
90
201
  ):
91
- schema = _filter_schema_for_columns(
92
- schema, kwargs["read_options"].column_names
202
+ logger.debug(f"Checking if the file: {args[0]}...")
203
+ # Since we are re-reading the file, we have to seek to beginning
204
+ if isinstance(args[0], io.IOBase) and args[0].seekable():
205
+ logger.debug(f"Seeking to the beginning of the file {args[0]}")
206
+ args[0].seek(0)
207
+ return _read_csv_rounding_decimal_columns_to_fit_scale(
208
+ schema=schema, reader_args=args, reader_kwargs=kwargs
93
209
  )
94
-
95
210
  else:
96
211
  logger.debug(
97
- "Schema not specified in the kwargs."
98
- " Hence, schema could not be inferred from the empty CSV."
212
+ "Schema is None when trying to adjust decimal values. "
213
+ "Hence, bubbling up exception..."
99
214
  )
100
215
 
101
- logger.debug(f"Read CSV empty schema being used: {schema}")
102
- return pa.Table.from_pylist([], schema=schema)
103
216
  raise e
104
217
 
105
218
 
219
+ def pyarrow_read_csv(*args, **kwargs) -> pa.Table:
220
+ schema = _extract_arrow_schema_from_read_csv_kwargs(kwargs)
221
+
222
+ # CSV conversion to decimal256 isn't supported as of pyarrow=12.0.1
223
+ # Below ensures decimal256 is cast properly.
224
+ schema_includes_decimal256 = (
225
+ (True if any([pa.types.is_decimal256(x.type) for x in schema]) else False)
226
+ if schema is not None
227
+ else None
228
+ )
229
+ if schema_includes_decimal256 and not kwargs.get(RAISE_ON_DECIMAL_OVERFLOW):
230
+ # falling back to expensive method of reading CSV
231
+ return _read_csv_rounding_decimal_columns_to_fit_scale(
232
+ schema, reader_args=args, reader_kwargs=kwargs
233
+ )
234
+ else:
235
+ return pyarrow_read_csv_default(*args, **kwargs)
236
+
237
+
106
238
  CONTENT_TYPE_TO_PA_READ_FUNC: Dict[str, Callable] = {
107
239
  ContentType.UNESCAPED_TSV.value: pyarrow_read_csv,
108
240
  ContentType.TSV.value: pyarrow_read_csv,
@@ -440,8 +572,8 @@ def s3_file_to_table(
440
572
  **s3_client_kwargs,
441
573
  )
442
574
 
443
- if READER_TYPE_KWARG in kwargs:
444
- kwargs.pop(READER_TYPE_KWARG)
575
+ if READER_TYPE_KWARG in kwargs:
576
+ kwargs.pop(READER_TYPE_KWARG)
445
577
 
446
578
  filesystem = io
447
579
  if s3_url.startswith("s3://"):
@@ -783,7 +915,6 @@ def sliced_string_cast(array: pa.ChunkedArray) -> pa.ChunkedArray:
783
915
  TODO: deprecate this function when pyarrow performs proper ChunkedArray -> ChunkedArray casting
784
916
  """
785
917
  dtype = array.type
786
- MAX_BYTES = 2147483646
787
918
  max_str_len = None
788
919
  if pa.types.is_integer(dtype):
789
920
  max_str_len = _int_max_string_len()
@@ -795,7 +926,7 @@ def sliced_string_cast(array: pa.ChunkedArray) -> pa.ChunkedArray:
795
926
  max_str_len = _max_decimal256_string_len()
796
927
 
797
928
  if max_str_len is not None:
798
- max_elems_per_chunk = MAX_BYTES // (2 * max_str_len) # safety factor of 2
929
+ max_elems_per_chunk = MAX_INT_BYTES // (2 * max_str_len) # safety factor of 2
799
930
  all_chunks = []
800
931
  for chunk in array.chunks:
801
932
  if len(chunk) < max_elems_per_chunk:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: deltacat
3
- Version: 1.1.27
3
+ Version: 1.1.29
4
4
  Summary: A scalable, fast, ACID-compliant Data Catalog powered by Ray.
5
5
  Home-page: https://github.com/ray-project/deltacat
6
6
  Author: Ray Team
@@ -1,4 +1,4 @@
1
- deltacat/__init__.py,sha256=NNgt1N6a4dwztCKl6C7klF3mQEn-S-sBHNZPKPqRHko,1778
1
+ deltacat/__init__.py,sha256=DoUiDxmgMh8HUGOEAG7CUY0Q9Ip-S7gePDsL8XQO5kk,1778
2
2
  deltacat/constants.py,sha256=TUJLXUJ9xq1Ryil72yLkKR8EDH_Irp5wUg56QstbRNE,2181
3
3
  deltacat/exceptions.py,sha256=7sjk3BuMY5Oo-6OvAfHncZx_OcvtEL47BblWr2F7waE,12740
4
4
  deltacat/logs.py,sha256=EQSDin1deehzz5xlLV1_TrFJrO_IBZ9Ahp7MdL-4cK8,9363
@@ -51,7 +51,7 @@ deltacat/compute/compactor/utils/sort_key.py,sha256=oK6otg-CSsma6zlGPaKg-KNEvcZR
51
51
  deltacat/compute/compactor/utils/system_columns.py,sha256=CNIgAGos0xAGEpdaQIH7KfbSRrGZgjRbItXMararqXQ,9399
52
52
  deltacat/compute/compactor_v2/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
53
53
  deltacat/compute/compactor_v2/compaction_session.py,sha256=COtol2s63DRPbd-AN9KCiWr4exLX8x5Tvxea_7cOGEQ,8078
54
- deltacat/compute/compactor_v2/constants.py,sha256=AOvnIxQfKOnLubrUsg4g8OPLgqvOT46LE_da9_Dm2KY,2507
54
+ deltacat/compute/compactor_v2/constants.py,sha256=wvd34d7RGdniGbbiJcMljxRrRas4_uy9F9UaqXfS_Ag,3034
55
55
  deltacat/compute/compactor_v2/deletes/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
56
56
  deltacat/compute/compactor_v2/deletes/delete_file_envelope.py,sha256=AeuH9JRMwp6mvQf6P2cqL92hUEtResQq6qUTS0kIKac,3111
57
57
  deltacat/compute/compactor_v2/deletes/delete_strategy.py,sha256=SMEJOxR-5r92kvKNqtu2w6HmwtmhljcZX1wcNEuS-4w,2833
@@ -69,14 +69,14 @@ deltacat/compute/compactor_v2/private/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCe
69
69
  deltacat/compute/compactor_v2/private/compaction_utils.py,sha256=fMWXg1SCIIgjk9p_OFYrcm760dOKNbFO1Lj3_JI3GCY,30929
70
70
  deltacat/compute/compactor_v2/steps/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
71
71
  deltacat/compute/compactor_v2/steps/hash_bucket.py,sha256=1R5xLUkl7GqL1nY-apAgY1czKDEHjIVYSRi9qLOMass,6726
72
- deltacat/compute/compactor_v2/steps/merge.py,sha256=LpktsDPfj7Of6RgUw9w1f3Y3OBkPDjvtyXjzFaIDoSo,21771
72
+ deltacat/compute/compactor_v2/steps/merge.py,sha256=qxmb3cmiKvOgfuOzlJT4Q60zOyWNjsiuZSzxdh6KTm8,22909
73
73
  deltacat/compute/compactor_v2/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
74
74
  deltacat/compute/compactor_v2/utils/content_type_params.py,sha256=1P9CDpuWErsFcTTlRCeuUQHDokVI92he_MsL82uRAdA,7424
75
- deltacat/compute/compactor_v2/utils/dedupe.py,sha256=62tFCY2iRP7I3-45GCIYs6_SJsQl8C5lBEr8gbNfbsw,1932
75
+ deltacat/compute/compactor_v2/utils/dedupe.py,sha256=Jz1QbBOdZJwT8K1vD9q01eOn7hdLNZ_AF7bJ0wficr0,1949
76
76
  deltacat/compute/compactor_v2/utils/delta.py,sha256=I7Yvda8NVbpKXG3nM2Ku1utvR2r2OpHvUMqUL2ja3aw,3626
77
77
  deltacat/compute/compactor_v2/utils/io.py,sha256=3m4dorxj-WD6Yu9_3gRE6gz3C-eNJA7nn02sHKwo-J8,6018
78
78
  deltacat/compute/compactor_v2/utils/merge.py,sha256=EV_iKhNc3WflgfLW1Q46dXUvyClx8VebWHGtninEfsI,5311
79
- deltacat/compute/compactor_v2/utils/primary_key_index.py,sha256=QOMwWxGhZ7VWa3oE6InM4thR5pbjmT7ttNXvx_IiKjo,11676
79
+ deltacat/compute/compactor_v2/utils/primary_key_index.py,sha256=SbQ97M1Cxld-zZik2QMSzlj20g6JlENaQx_0PhlCIP8,12034
80
80
  deltacat/compute/compactor_v2/utils/task_options.py,sha256=W0jyWIIZ0tcSAGp8mhpnu1G8p3rmX4d3juCPpAJxnDM,12649
81
81
  deltacat/compute/merge_on_read/__init__.py,sha256=ckbgngmqPjYBYz_NySsR1vNTOb_hNpeL1sYkZKvBI9M,214
82
82
  deltacat/compute/merge_on_read/daft.py,sha256=1oC38u5ig_aTrq7EzyWBo8Ui54rb6yERYMk-vEFbpxM,1400
@@ -136,7 +136,7 @@ deltacat/tests/compute/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3
136
136
  deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py,sha256=kWyIJQMyF9oBemvgOp3ngGhMpH9zjkznV-67ewELgHQ,37719
137
137
  deltacat/tests/compute/compact_partition_rebase_test_cases.py,sha256=8HVr3EIFYFqNaJoqeCuj9xIBjM4Ch2bx-mJcO4BRrLo,16839
138
138
  deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py,sha256=l_6-pAKOsRY3NbtfHsYmEaJEkq6IJueYuLsjyJxNgz4,81564
139
- deltacat/tests/compute/compact_partition_test_cases.py,sha256=R9eiKvxCLqcoHjAx3iOogdnXZEO9TvLbRf0wA7bcJN4,26170
139
+ deltacat/tests/compute/compact_partition_test_cases.py,sha256=HJ15Xyawv8ImFju8wDwt22fH5okoPhyS-QAygkXDb7Q,27422
140
140
  deltacat/tests/compute/test_compact_partition_incremental.py,sha256=lkfAraOJmEmieesf7b1BqlfTS26YjYM5xXOXoTMrsos,14989
141
141
  deltacat/tests/compute/test_compact_partition_multiple_rounds.py,sha256=xXBA66TTfARR90m5KQs31nmiokuMy9iGQt7Z9evyG7M,12950
142
142
  deltacat/tests/compute/test_compact_partition_params.py,sha256=Dm5eLyHo8oGMeO3XBbpj1rZqHtPZ1hAB7z2qvzc4Lxk,8497
@@ -152,9 +152,10 @@ deltacat/tests/compute/compactor/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JC
152
152
  deltacat/tests/compute/compactor/utils/test_io.py,sha256=st5mlU4cVU-eQl7B4mvPgNA3izuNwbVawYOp-NcoyrI,4326
153
153
  deltacat/tests/compute/compactor/utils/test_round_completion_file.py,sha256=LAQ4usiRF4oTx4cA85L0eOcBa_Z-febc-CuzUijSGrI,7439
154
154
  deltacat/tests/compute/compactor_v2/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
155
- deltacat/tests/compute/compactor_v2/test_compaction_session.py,sha256=0U8Hmu-qLvqXqLPBPS6qENc1ErolWAaAoUlwms2xLe8,23124
155
+ deltacat/tests/compute/compactor_v2/test_compaction_session.py,sha256=y8nNHq9ADHENzUKMQYguB45zOD7F2lZgcBYYTvbTsdM,28957
156
156
  deltacat/tests/compute/compactor_v2/test_hashlib.py,sha256=8csF2hFWtBvY2MbX3-6iphCsVXxRp0zP1NTnKhfdmkg,328
157
157
  deltacat/tests/compute/compactor_v2/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
158
+ deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py,sha256=aFb9rzT_EK9k8qAMHPtpqd5btyEmll1So1loDmZkotQ,1769
158
159
  deltacat/tests/compute/compactor_v2/utils/test_task_options.py,sha256=37DkR1u_XwhedV9cGed6FFuJTC0XmuiowHJIa_Op6uA,865
159
160
  deltacat/tests/compute/resource_estimation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
160
161
  deltacat/tests/compute/resource_estimation/test_delta.py,sha256=HCL2oUnCqm0E26T3HLJjMhoAsHTJIWPYGwIKRgM_H7E,25712
@@ -171,7 +172,7 @@ deltacat/tests/local_deltacat_storage/__init__.py,sha256=5T9ubNIS42-BotEH0yrUiWE
171
172
  deltacat/tests/local_deltacat_storage/exceptions.py,sha256=oxZ0psmrEO0M6P2r8gHQ2E8E-Y8UBfUCBUIwfuHcx38,251
172
173
  deltacat/tests/test_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
173
174
  deltacat/tests/test_utils/constants.py,sha256=UYe--9T_clYjiOpv0M7TtAMGdpje_SMZ-w8n0IeCAjc,214
174
- deltacat/tests/test_utils/pyarrow.py,sha256=pzTBk07xMaAfykXo3GNGwTqaQxrKnSbr-WO3HBszikI,2828
175
+ deltacat/tests/test_utils/pyarrow.py,sha256=QDdGilzsJ2xUESiGotdNVZde9yD7ja9MvNhhssnox-E,3083
175
176
  deltacat/tests/test_utils/storage.py,sha256=93GEn4A5WbMHWk0Ec4Bd7RxeHoSEnBfSarfWhKOSNtM,972
176
177
  deltacat/tests/test_utils/utils.py,sha256=a32qEwcSSd1lvRi0aJJ4ZLnc1ZyXmoQF_K95zaQRk2M,455
177
178
  deltacat/tests/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -179,7 +180,7 @@ deltacat/tests/utils/test_cloudpickle.py,sha256=J0pnBY3-PxlUh6MamZAN1PuquKQPr2iy
179
180
  deltacat/tests/utils/test_daft.py,sha256=kY8lkXoQvyWunok8UvOsh1An297rb3jcnstTuIAyAlc,8232
180
181
  deltacat/tests/utils/test_metrics.py,sha256=Ym9nOz1EtB180pLmvugihj1sDTNDMb5opIjjr5Nmcls,16339
181
182
  deltacat/tests/utils/test_placement.py,sha256=g61wVOMkHe4YJeR9Oxg_BOVQ6bhHHbC3IBYv8YhUu94,597
182
- deltacat/tests/utils/test_pyarrow.py,sha256=AWx0DTsBA1PkQag48w_HeQdz7tlBzJsm9v7Nd6-dhEY,19607
183
+ deltacat/tests/utils/test_pyarrow.py,sha256=fDjDkGPjdRZA3kgjgiQRym9shdeDYgkdDPYU2a7IEUk,30790
183
184
  deltacat/tests/utils/test_record_batch_tables.py,sha256=AkG1WyljQmjnl-AxhbFWyo5LnMIKRyLScfgC2B_ES-s,11321
184
185
  deltacat/tests/utils/test_resources.py,sha256=HtpvDrfPZQNtGDXUlsIzc_yd7Vf1cDscZ3YbN0oTvO8,2560
185
186
  deltacat/tests/utils/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -200,7 +201,7 @@ deltacat/utils/numpy.py,sha256=SpHKKvC-K8NINTWGVfTZ5-gBFTGYqaXjjgKFhsdUjwg,2049
200
201
  deltacat/utils/pandas.py,sha256=q99mlRB7tymICMcNbfGLfLqFu_C-feyPZKZm2CWJJVc,9574
201
202
  deltacat/utils/performance.py,sha256=7ZLaMkS1ehPSIhT5uOQVBHvjC70iKHzoFquFo-KL0PI,645
202
203
  deltacat/utils/placement.py,sha256=Lj20fb-eq8rgMdm_M2MBMfDLwhDM1sS1nJj2DvIK56s,12060
203
- deltacat/utils/pyarrow.py,sha256=nW_eD6fWAlbyHUzPj1rOOfnUbpP3RnAgNSuuVNyvhZ4,29174
204
+ deltacat/utils/pyarrow.py,sha256=R3KkJPenE48rS3VrfFKSkJerX94f4e7X2dUPBQg44DY,34339
204
205
  deltacat/utils/resources.py,sha256=Ax1OgLLbZI4oYpp4Ki27OLaST-7I-AJgZwU87FVfY8g,8253
205
206
  deltacat/utils/s3fs.py,sha256=PmUJ5Fm1WmD-_zp_M6yd9VbXvIoJuBeK6ApOdJJApLE,662
206
207
  deltacat/utils/schema.py,sha256=m4Wm4ZQcpttzOUxex4dVneGlHy1_E36HspTcjNYzvVM,1564
@@ -210,8 +211,8 @@ deltacat/utils/ray_utils/concurrency.py,sha256=JDVwMiQWrmuSlyCWAoiq9ctoJ0XADEfDD
210
211
  deltacat/utils/ray_utils/dataset.py,sha256=waHdtH0c835a-2t51HYRHnulfC0_zBxx8mFSAPvPSPM,3274
211
212
  deltacat/utils/ray_utils/performance.py,sha256=d7JFM7vTXHzkGx9qNQcZzUWajnqINvYRwaM088_FpsE,464
212
213
  deltacat/utils/ray_utils/runtime.py,sha256=rB0A-tU9WZHz0J11LzJdANYtL397YyuemcA1l-K9dAw,5029
213
- deltacat-1.1.27.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
214
- deltacat-1.1.27.dist-info/METADATA,sha256=VL7sWG3lO3cV3tzwTiCTgpm7h0K5Dh3GtKiqojgSgHI,1733
215
- deltacat-1.1.27.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
216
- deltacat-1.1.27.dist-info/top_level.txt,sha256=RWdIcid4Bv2i2ozLVh-70kJpyB61xEKXod9XXGpiono,9
217
- deltacat-1.1.27.dist-info/RECORD,,
214
+ deltacat-1.1.29.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
215
+ deltacat-1.1.29.dist-info/METADATA,sha256=ZverlgFUJV4wGJao8tusRCv_sRNX4KJ4RTNAGvBCJes,1733
216
+ deltacat-1.1.29.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
217
+ deltacat-1.1.29.dist-info/top_level.txt,sha256=RWdIcid4Bv2i2ozLVh-70kJpyB61xEKXod9XXGpiono,9
218
+ deltacat-1.1.29.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: bdist_wheel (0.44.0)
2
+ Generator: bdist_wheel (0.45.1)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5