deltacat 1.1.26__py3-none-any.whl → 1.1.28__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
deltacat/__init__.py CHANGED
@@ -44,7 +44,7 @@ from deltacat.types.tables import TableWriteMode
 
 deltacat.logs.configure_deltacat_logger(logging.getLogger(__name__))
 
-__version__ = "1.1.26"
+__version__ = "1.1.28"
 
 
 __all__ = [
@@ -584,8 +584,11 @@ def _process_merge_results(
                 f"Duplicate record count ({duplicate_hash_bucket_mat_results}) is as large "
                 f"as or greater than params.num_rounds, which is {params.num_rounds}"
             )
+        # ensure start index is the first file index if task index is same
         hb_id_to_entry_indices_range[str(mat_result.task_index)] = (
-            file_index,
+            hb_id_to_entry_indices_range.get(str(mat_result.task_index), [file_index])[
+                0
+            ],
             file_index + mat_result.pyarrow_write_result.files,
         )
 
@@ -188,7 +188,7 @@ def _estimate_resources_required_to_process_delta_using_file_sampling(
         sampled_on_disk_size += delta.manifest.entries[entry_index].meta.content_length
         sampled_num_rows += len(tbl)
 
-    if not sampled_on_disk_size:
+    if not sampled_on_disk_size or not sampled_in_memory_size:
         return EstimatedResources.of(
             memory_bytes=0,
             statistics=Statistics.of(
@@ -437,6 +437,43 @@ class TestEstimateResourcesRequiredToProcessDelta:
             == parquet_delta_with_manifest.meta.content_length
         )
 
+    def test_parquet_delta_when_file_sampling_and_arrow_size_zero(
+        self,
+        local_deltacat_storage_kwargs,
+        parquet_delta_with_manifest: Delta,
+        monkeypatch,
+    ):
+        params = EstimateResourcesParams.of(
+            resource_estimation_method=ResourceEstimationMethod.FILE_SAMPLING,
+            max_files_to_sample=2,
+        )
+
+        def mock_func(*args, **kwargs):
+            class MockedValue:
+                nbytes = 0
+
+                def __len__(self):
+                    return 0
+
+            return MockedValue()
+
+        monkeypatch.setattr(ds, "download_delta_manifest_entry", mock_func)
+
+        result = estimate_resources_required_to_process_delta(
+            delta=parquet_delta_with_manifest,
+            operation_type=OperationType.PYARROW_DOWNLOAD,
+            deltacat_storage=ds,
+            deltacat_storage_kwargs=local_deltacat_storage_kwargs,
+            estimate_resources_params=params,
+        )
+
+        assert parquet_delta_with_manifest.manifest is not None
+        assert result.memory_bytes == 0
+        assert (
+            result.statistics.on_disk_size_bytes
+            == parquet_delta_with_manifest.meta.content_length
+        )
+
     def test_delta_manifest_utsv_when_file_sampling(
         self, local_deltacat_storage_kwargs, utsv_delta_with_manifest: Delta
     ):
@@ -328,6 +328,16 @@ def test_compact_partition_incremental(
         **compaction_audit_obj
     )
 
+    # assert if RCF covers all files
+    if compactor_version != CompactorVersion.V1.value:
+        previous_end = None
+        for start, end in round_completion_info.hb_index_to_entry_range.values():
+            assert (previous_end is None and start == 0) or start == previous_end
+            previous_end = end
+        assert (
+            previous_end == round_completion_info.compacted_pyarrow_write_result.files
+        )
+
     tables = ds.download_delta(
         compacted_delta_locator, storage_type=StorageType.LOCAL, **ds_mock_kwargs
     )
@@ -309,6 +309,16 @@ def test_compact_partition_rebase_multiple_rounds_same_source_and_destination(
         **compaction_audit_obj
     )
 
+    # assert if RCF covers all files
+    # multiple rounds feature is only supported in V2 compactor
+    previous_end = None
+    for start, end in round_completion_info.hb_index_to_entry_range.values():
+        assert (previous_end is None and start == 0) or start == previous_end
+        previous_end = end
+    assert (
+        previous_end == round_completion_info.compacted_pyarrow_write_result.files
+    )
+
     # Assert not in-place compacted
     assert (
         execute_compaction_result_spy.call_args.args[-1] is False
@@ -299,6 +299,17 @@ def test_compact_partition_rebase_same_source_and_destination(
         round_completion_info.compaction_audit_url
     )
 
+    # assert if RCF covers all files
+    if compactor_version != CompactorVersion.V1.value:
+        previous_end = None
+        for start, end in round_completion_info.hb_index_to_entry_range.values():
+            assert (previous_end is None and start == 0) or start == previous_end
+            previous_end = end
+        assert (
+            previous_end
+            == round_completion_info.compacted_pyarrow_write_result.files
+        )
+
     compaction_audit_obj: Dict[str, Any] = read_s3_contents(
         s3_resource, audit_bucket, audit_key
     )
@@ -355,6 +355,16 @@ def test_compact_partition_rebase_then_incremental(
     compacted_delta_locator_incremental: DeltaLocator = (
         round_completion_info.compacted_delta_locator
     )
+    # assert if RCF covers all files
+    if compactor_version != CompactorVersion.V1.value:
+        previous_end = None
+        for start, end in round_completion_info.hb_index_to_entry_range.values():
+            assert (previous_end is None and start == 0) or start == previous_end
+            previous_end = end
+        assert (
+            previous_end == round_completion_info.compacted_pyarrow_write_result.files
+        )
+
     audit_bucket, audit_key = round_completion_info.compaction_audit_url.replace(
         "s3://", ""
     ).split("/", 1)
@@ -7,7 +7,9 @@ from deltacat.utils.pyarrow import (
     s3_file_to_table,
     ReadKwargsProviderPyArrowSchemaOverride,
     RAISE_ON_EMPTY_CSV_KWARG,
+    RAISE_ON_DECIMAL_OVERFLOW,
 )
+import decimal
 from deltacat.types.media import ContentEncoding, ContentType
 from deltacat.types.partial_download import PartialParquetParameters
 from pyarrow.parquet import ParquetFile
@@ -16,6 +18,12 @@ import pyarrow as pa
 PARQUET_FILE_PATH = "deltacat/tests/utils/data/test_file.parquet"
 EMPTY_UTSV_PATH = "deltacat/tests/utils/data/empty.csv"
 NON_EMPTY_VALID_UTSV_PATH = "deltacat/tests/utils/data/non_empty_valid.csv"
+OVERFLOWING_DECIMAL_PRECISION_UTSV_PATH = (
+    "deltacat/tests/utils/data/overflowing_decimal_precision.csv"
+)
+OVERFLOWING_DECIMAL_SCALE_UTSV_PATH = (
+    "deltacat/tests/utils/data/overflowing_decimal_scale.csv"
+)
 GZIP_COMPRESSED_FILE_UTSV_PATH = "deltacat/tests/utils/data/non_empty_compressed.gz"
 BZ2_COMPRESSED_FILE_UTSV_PATH = "deltacat/tests/utils/data/non_empty_compressed.bz2"
 
@@ -407,6 +415,253 @@ class TestReadCSV(TestCase):
             ),
         )
 
+    def test_read_csv_when_decimal_precision_overflows_and_raise_kwarg_specified(self):
+        schema = pa.schema(
+            [("is_active", pa.string()), ("decimal_value", pa.decimal128(4, 2))]
+        )
+        kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
+        _add_column_kwargs(
+            ContentType.UNESCAPED_TSV.value,
+            ["is_active", "decimal_value"],
+            ["is_active", "decimal_value"],
+            kwargs,
+        )
+        read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
+
+        kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
+        self.assertRaises(
+            pa.lib.ArrowInvalid,
+            lambda: pyarrow_read_csv(
+                OVERFLOWING_DECIMAL_PRECISION_UTSV_PATH,
+                **{**kwargs, RAISE_ON_DECIMAL_OVERFLOW: True}
+            ),
+        )
+
+    def test_read_csv_when_decimal_precision_overflows_sanity(self):
+        schema = pa.schema(
+            [("is_active", pa.string()), ("decimal_value", pa.decimal128(4, 2))]
+        )
+        kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
+        _add_column_kwargs(
+            ContentType.UNESCAPED_TSV.value,
+            ["is_active", "decimal_value"],
+            ["is_active", "decimal_value"],
+            kwargs,
+        )
+
+        read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
+
+        kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
+
+        self.assertRaises(
+            pa.lib.ArrowInvalid,
+            lambda: pyarrow_read_csv(OVERFLOWING_DECIMAL_PRECISION_UTSV_PATH, **kwargs),
+        )
+
+    def test_read_csv_when_decimal_scale_overflows_and_raise_kwarg_specified(self):
+        schema = pa.schema(
+            [("is_active", pa.string()), ("decimal_value", pa.decimal128(20, 2))]
+        )
+        kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
+        _add_column_kwargs(
+            ContentType.UNESCAPED_TSV.value,
+            ["is_active", "decimal_value"],
+            ["is_active", "decimal_value"],
+            kwargs,
+        )
+        read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
+
+        kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
+
+        self.assertRaises(
+            pa.lib.ArrowInvalid,
+            lambda: pyarrow_read_csv(
+                OVERFLOWING_DECIMAL_SCALE_UTSV_PATH,
+                **{**kwargs, RAISE_ON_DECIMAL_OVERFLOW: True}
+            ),
+        )
+
+    def test_read_csv_when_decimal_scale_overflows_sanity(self):
+        schema = pa.schema(
+            [("is_active", pa.string()), ("decimal_value", pa.decimal128(20, 2))]
+        )
+        kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
+        _add_column_kwargs(
+            ContentType.UNESCAPED_TSV.value,
+            ["is_active", "decimal_value"],
+            ["is_active", "decimal_value"],
+            kwargs,
+        )
+
+        read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
+
+        kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
+
+        result = pyarrow_read_csv(OVERFLOWING_DECIMAL_SCALE_UTSV_PATH, **kwargs)
+
+        self.assertEqual(len(result), 3)
+        self.assertEqual(
+            result[1][0].as_py(), decimal.Decimal("322236.66")
+        )  # rounding decimal
+        self.assertEqual(result[1][1].as_py(), decimal.Decimal("32.33"))  # not rounded
+        self.assertEqual(len(result.column_names), 2)
+        result_schema = result.schema
+        self.assertEqual(result_schema.field(0).type, "string")
+        self.assertEqual(result_schema.field(1).type, pa.decimal128(20, 2))
+
+    def test_read_csv_when_decimal_scale_overflows_and_negative_scale(self):
+        schema = pa.schema(
+            [("is_active", pa.string()), ("decimal_value", pa.decimal128(20, -2))]
+        )
+        kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
+        _add_column_kwargs(
+            ContentType.UNESCAPED_TSV.value,
+            ["is_active", "decimal_value"],
+            ["is_active", "decimal_value"],
+            kwargs,
+        )
+
+        read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
+
+        kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
+
+        result = pyarrow_read_csv(OVERFLOWING_DECIMAL_SCALE_UTSV_PATH, **kwargs)
+
+        self.assertEqual(len(result), 3)
+        self.assertEqual(
+            result[1][0].as_py(),
+            decimal.Decimal("322200"),  # consequence of negative scale
+        )  # rounding decimal
+        self.assertEqual(result[1][1].as_py(), decimal.Decimal("00"))
+        self.assertEqual(len(result.column_names), 2)
+        result_schema = result.schema
+        self.assertEqual(result_schema.field(0).type, "string")
+        self.assertEqual(result_schema.field(1).type, pa.decimal128(20, -2))
+
+    def test_read_csv_when_decimal_scale_overflows_with_decimal256(self):
+        schema = pa.schema(
+            [("is_active", pa.string()), ("decimal_value", pa.decimal256(20, 2))]
+        )
+        kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
+        _add_column_kwargs(
+            ContentType.UNESCAPED_TSV.value,
+            ["is_active", "decimal_value"],
+            ["is_active", "decimal_value"],
+            kwargs,
+        )
+
+        read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
+
+        kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
+
+        result = pyarrow_read_csv(OVERFLOWING_DECIMAL_SCALE_UTSV_PATH, **kwargs)
+
+        self.assertEqual(len(result), 3)
+        self.assertEqual(
+            result[1][0].as_py(), decimal.Decimal("322236.66")
+        )  # rounding decimal
+        self.assertEqual(result[1][1].as_py(), decimal.Decimal("32.33"))  # not rounded
+        self.assertEqual(len(result.column_names), 2)
+        result_schema = result.schema
+        self.assertEqual(result_schema.field(0).type, "string")
+        self.assertEqual(result_schema.field(1).type, pa.decimal256(20, 2))
+
+    def test_read_csv_when_decimal_scale_overflows_with_decimal256_and_raise_on_overflow(
+        self,
+    ):
+        schema = pa.schema(
+            [("is_active", pa.string()), ("decimal_value", pa.decimal256(20, 2))]
+        )
+        kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
+        _add_column_kwargs(
+            ContentType.UNESCAPED_TSV.value,
+            ["is_active", "decimal_value"],
+            ["is_active", "decimal_value"],
+            kwargs,
+        )
+
+        read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
+
+        kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
+
+        self.assertRaises(
+            pa.lib.ArrowNotImplementedError,
+            lambda: pyarrow_read_csv(
+                OVERFLOWING_DECIMAL_SCALE_UTSV_PATH,
+                **{**kwargs, RAISE_ON_DECIMAL_OVERFLOW: True}
+            ),
+        )
+
+    def test_read_csv_when_decimal_scale_overflows_without_any_schema_then_infers(self):
+        kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
+        read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=None)
+        kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
+
+        result = pyarrow_read_csv(OVERFLOWING_DECIMAL_SCALE_UTSV_PATH, **kwargs)
+
+        # The default behavior of pyarrow is to skip invalid rows
+        self.assertEqual(len(result), 2)
+        self.assertEqual(result[1][0].as_py(), 32.33)  # rounding decimal
+        self.assertEqual(result[1][1].as_py(), 0.4)  # not rounded
+        self.assertEqual(len(result.column_names), 2)
+        result_schema = result.schema
+        self.assertEqual(result_schema.field(0).type, "string")
+        self.assertEqual(result_schema.field(1).type, pa.float64())
+
+    def test_read_csv_when_decimal_scale_and_precision_overflow_and_raise_on_overflow(
+        self,
+    ):
+        schema = pa.schema(
+            [("is_active", pa.string()), ("decimal_value", pa.decimal128(5, 2))]
+        )
+        kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
+        _add_column_kwargs(
+            ContentType.UNESCAPED_TSV.value,
+            ["is_active", "decimal_value"],
+            ["is_active", "decimal_value"],
+            kwargs,
+        )
+
+        read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
+
+        kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
+
+        self.assertRaises(
+            pa.lib.ArrowInvalid,
+            lambda: pyarrow_read_csv(OVERFLOWING_DECIMAL_SCALE_UTSV_PATH, **kwargs),
+        )
+
+    def test_read_csv_when_decimal_scale_overflow_and_file_like_obj_passed(self):
+        schema = pa.schema(
+            [("is_active", pa.string()), ("decimal_value", pa.decimal128(15, 2))]
+        )
+        kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
+        _add_column_kwargs(
+            ContentType.UNESCAPED_TSV.value,
+            ["is_active", "decimal_value"],
+            ["is_active", "decimal_value"],
+            kwargs,
+        )
+
+        read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
+
+        kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
+
+        with open(OVERFLOWING_DECIMAL_SCALE_UTSV_PATH, "rb") as file:
+            result = pyarrow_read_csv(file, **kwargs)
+
+        self.assertEqual(len(result), 3)
+        self.assertEqual(
+            result[1][0].as_py(), decimal.Decimal("322236.66")
+        )  # rounding decimal
+        self.assertEqual(
+            result[1][1].as_py(), decimal.Decimal("32.33")
+        )  # not rounded
+        self.assertEqual(len(result.column_names), 2)
+        result_schema = result.schema
+        self.assertEqual(result_schema.field(0).type, "string")
+        self.assertEqual(result_schema.field(1).type, pa.decimal128(15, 2))
+
 
 class TestS3FileToTable(TestCase):
     def test_s3_file_to_table_identity_sanity(self):
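
The expected values in these tests are consistent with half_to_even ("banker's") rounding at the declared scale, which is the mode the new module docstring in deltacat/utils/pyarrow.py describes. A small illustration with Python's decimal module; the overflowing literal below is a hypothetical input, not the contents of the fixture files, which are not shown in this diff:

import decimal

# Hypothetical overflowing value with more fractional digits than scale=2 allows.
overflowing = decimal.Decimal("322236.665")
rounded = overflowing.quantize(decimal.Decimal("0.01"), rounding=decimal.ROUND_HALF_EVEN)
assert rounded == decimal.Decimal("322236.66")  # the tie rounds toward the even digit

# A value that already fits the declared scale is returned unchanged.
assert decimal.Decimal("32.33").quantize(decimal.Decimal("0.01")) == decimal.Decimal("32.33")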
deltacat/utils/pyarrow.py CHANGED
@@ -1,6 +1,7 @@
 # Allow classes to use self-referencing Type hints in Python 3.7.
 from __future__ import annotations
 
+import copy
 import bz2
 import gzip
 import io
@@ -47,6 +48,17 @@ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 RAISE_ON_EMPTY_CSV_KWARG = "raise_on_empty_csv"
 READER_TYPE_KWARG = "reader_type"
 
+"""
+By default, round decimal values using half_to_even round mode when
+rescaling a decimal to the given scale and precision in the schema would cause
+data loss. Setting any non null value of this argument will result
+in an error instead.
+"""
+RAISE_ON_DECIMAL_OVERFLOW = "raise_on_decimal_overflow"
+# Note the maximum from https://arrow.apache.org/docs/python/generated/pyarrow.Decimal256Type.html#pyarrow.Decimal256Type
+DECIMAL256_DEFAULT_SCALE = 38
+DECIMAL256_MAX_PRECISION = 76
+
 
 
 def _filter_schema_for_columns(schema: pa.Schema, columns: List[str]) -> pa.Schema:
@@ -64,45 +76,162 @@ def _filter_schema_for_columns(schema: pa.Schema, columns: List[str]) -> pa.Sche
     return target_schema
 
 
-def pyarrow_read_csv(*args, **kwargs) -> pa.Table:
-    try:
-        new_kwargs = sanitize_kwargs_by_supported_kwargs(
-            ["read_options", "parse_options", "convert_options", "memory_pool"], kwargs
+def _extract_arrow_schema_from_read_csv_kwargs(kwargs: Dict[str, Any]) -> pa.Schema:
+    schema = None
+    if (
+        "convert_options" in kwargs
+        and kwargs["convert_options"].column_types is not None
+    ):
+        schema = kwargs["convert_options"].column_types
+        if not isinstance(schema, pa.Schema):
+            schema = pa.schema(schema)
+        if kwargs["convert_options"].include_columns:
+            schema = _filter_schema_for_columns(
+                schema, kwargs["convert_options"].include_columns
+            )
+    elif (
+        kwargs.get("read_options") is not None
+        and kwargs["read_options"].column_names
+    ):
+        schema = _filter_schema_for_columns(
+            schema, kwargs["read_options"].column_names
+        )
+    else:
+        logger.debug(
+            "Schema not specified in the kwargs."
+            " Hence, schema could not be inferred from the empty CSV."
         )
+
+    return schema
+
+
+def _new_schema_with_replaced_fields(
+    schema: pa.Schema, field_to_replace: Callable[[pa.Field], Optional[pa.Field]]
+) -> pa.Schema:
+    if schema is None:
+        return None
+
+    new_schema_fields = []
+    for field in schema:
+        new_field = field_to_replace(field)
+        if new_field is not None:
+            new_schema_fields.append(new_field)
+        else:
+            new_schema_fields.append(field)
+
+    return pa.schema(new_schema_fields, metadata=schema.metadata)
+
+
+def _read_csv_rounding_decimal_columns_to_fit_scale(
+    schema: pa.Schema, reader_args: List[Any], reader_kwargs: Dict[str, Any]
+) -> pa.Table:
+    # Note: We read decimals as strings first because CSV
+    # conversion to decimal256 isn't implemented as of pyarrow==12.0.1
+    new_schema = _new_schema_with_replaced_fields(
+        schema,
+        lambda fld: pa.field(fld.name, pa.string(), metadata=fld.metadata)
+        if pa.types.is_decimal128(fld.type) or pa.types.is_decimal256(fld.type)
+        else None,
+    )
+    new_kwargs = sanitize_kwargs_by_supported_kwargs(
+        ["read_options", "parse_options", "convert_options", "memory_pool"],
+        reader_kwargs,
+    )
+    # Creating a shallow copy for efficiency
+    new_convert_options = copy.copy(new_kwargs["convert_options"])
+    new_convert_options.column_types = new_schema
+    new_reader_kwargs = {**new_kwargs, "convert_options": new_convert_options}
+    arrow_table = pacsv.read_csv(*reader_args, **new_reader_kwargs)
+
+    for column_index, field in enumerate(schema):
+        if pa.types.is_decimal128(field.type) or pa.types.is_decimal256(field.type):
+            column_array = arrow_table[field.name]
+            # We always cast to decimal256 to accommodate the fixed scale of 38
+            cast_to_type = pa.decimal256(
+                DECIMAL256_MAX_PRECISION, DECIMAL256_DEFAULT_SCALE
+            )
+            casted_decimal_array = pc.cast(column_array, cast_to_type)
+            # Note that scale can be negative
+            rounded_column_array = pc.round(
+                casted_decimal_array, ndigits=field.type.scale
+            )
+            final_decimal_array = pc.cast(rounded_column_array, field.type)
+            arrow_table = arrow_table.set_column(
+                column_index,
+                field,
+                final_decimal_array,
+            )
+            logger.debug(
+                f"Rounded decimal column: {field.name} to {field.type.scale} scale and"
+                f" {field.type.precision} precision"
+            )
+
+    return arrow_table
+
+
+def pyarrow_read_csv_default(*args, **kwargs):
+    new_kwargs = sanitize_kwargs_by_supported_kwargs(
+        ["read_options", "parse_options", "convert_options", "memory_pool"], kwargs
+    )
+
+    try:
         return pacsv.read_csv(*args, **new_kwargs)
     except pa.lib.ArrowInvalid as e:
-        if e.__str__() == "Empty CSV file" and not kwargs.get(RAISE_ON_EMPTY_CSV_KWARG):
-            schema = None
-            if (
-                "convert_options" in kwargs
-                and kwargs["convert_options"].column_types is not None
-            ):
-                schema = kwargs["convert_options"].column_types
-                if not isinstance(schema, pa.Schema):
-                    schema = pa.schema(schema)
-                if kwargs["convert_options"].include_columns:
-                    schema = _filter_schema_for_columns(
-                        schema, kwargs["convert_options"].include_columns
-                    )
-            elif (
-                kwargs.get("read_options") is not None
-                and kwargs["read_options"].column_names
+        error_str = e.__str__()
+        schema = _extract_arrow_schema_from_read_csv_kwargs(kwargs)
+
+        if error_str == "Empty CSV file" and not kwargs.get(RAISE_ON_EMPTY_CSV_KWARG):
+            logger.debug(f"Read CSV empty schema being used: {schema}")
+            return pa.Table.from_pylist([], schema=schema)
+        if not kwargs.get(RAISE_ON_DECIMAL_OVERFLOW):
+            # Note, this logic requires expensive casting. To prevent downgrading performance
+            # for happy path reads, we are handling this case in response to an error.
+            logger.warning(
+                "Rescaling Decimal to the given scale in the schema. "
+                f"Original error: {error_str}"
+            )
+
+            if schema is not None and "convert_options" in kwargs:
+                if (
+                    "Rescaling Decimal" in error_str
+                    and "value would cause data loss" in error_str
                 ):
-                schema = _filter_schema_for_columns(
-                    schema, kwargs["read_options"].column_names
+                    logger.debug(f"Checking if the file: {args[0]}...")
+                    # Since we are re-reading the file, we have to seek to beginning
+                    if isinstance(args[0], io.IOBase) and args[0].seekable():
+                        logger.debug(f"Seeking to the beginning of the file {args[0]}")
+                        args[0].seek(0)
+                    return _read_csv_rounding_decimal_columns_to_fit_scale(
+                        schema=schema, reader_args=args, reader_kwargs=kwargs
                     )
-
             else:
                 logger.debug(
-                    "Schema not specified in the kwargs."
-                    " Hence, schema could not be inferred from the empty CSV."
+                    "Schema is None when trying to adjust decimal values. "
+                    "Hence, bubbling up exception..."
                 )
 
-        logger.debug(f"Read CSV empty schema being used: {schema}")
-        return pa.Table.from_pylist([], schema=schema)
         raise e
 
 
+def pyarrow_read_csv(*args, **kwargs) -> pa.Table:
+    schema = _extract_arrow_schema_from_read_csv_kwargs(kwargs)
+
+    # CSV conversion to decimal256 isn't supported as of pyarrow=12.0.1
+    # Below ensures decimal256 is casted properly.
+    schema_includes_decimal256 = (
+        (True if any([pa.types.is_decimal256(x.type) for x in schema]) else False)
+        if schema is not None
+        else None
+    )
+    if schema_includes_decimal256 and not kwargs.get(RAISE_ON_DECIMAL_OVERFLOW):
+        # falling back to expensive method of reading CSV
+        return _read_csv_rounding_decimal_columns_to_fit_scale(
+            schema, reader_args=args, reader_kwargs=kwargs
+        )
+    else:
+        return pyarrow_read_csv_default(*args, **kwargs)
+
+
 CONTENT_TYPE_TO_PA_READ_FUNC: Dict[str, Callable] = {
     ContentType.UNESCAPED_TSV.value: pyarrow_read_csv,
     ContentType.TSV.value: pyarrow_read_csv,
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: deltacat
-Version: 1.1.26
+Version: 1.1.28
 Summary: A scalable, fast, ACID-compliant Data Catalog powered by Ray.
 Home-page: https://github.com/ray-project/deltacat
 Author: Ray Team
@@ -1,4 +1,4 @@
-deltacat/__init__.py,sha256=N7LrDYFJUaYdchJUVZ8VN_9QUJzuETzkz-oT833iEr4,1778
+deltacat/__init__.py,sha256=GPlTQc6AW4ig_nZJ7kMVe-kbZxYfrSVGFN1YEqY8dXU,1778
 deltacat/constants.py,sha256=TUJLXUJ9xq1Ryil72yLkKR8EDH_Irp5wUg56QstbRNE,2181
 deltacat/exceptions.py,sha256=7sjk3BuMY5Oo-6OvAfHncZx_OcvtEL47BblWr2F7waE,12740
 deltacat/logs.py,sha256=EQSDin1deehzz5xlLV1_TrFJrO_IBZ9Ahp7MdL-4cK8,9363
@@ -66,7 +66,7 @@ deltacat/compute/compactor_v2/model/merge_file_group.py,sha256=1o86t9lc3K6ZvtViV
 deltacat/compute/compactor_v2/model/merge_input.py,sha256=-SxTE0e67z2V7MiMEVz5aMu4E0k8h3-vqohvUUOC0do,5659
 deltacat/compute/compactor_v2/model/merge_result.py,sha256=_IZTCStpb4UKiRCJYA3g6EhAqjrw0t9vmoDAN8kIK-Y,436
 deltacat/compute/compactor_v2/private/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-deltacat/compute/compactor_v2/private/compaction_utils.py,sha256=e8pZFobq6KBCy67ZRn2z1CAwNVjPIJnAiD4HHDmDbCk,30757
+deltacat/compute/compactor_v2/private/compaction_utils.py,sha256=fMWXg1SCIIgjk9p_OFYrcm760dOKNbFO1Lj3_JI3GCY,30929
 deltacat/compute/compactor_v2/steps/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/compute/compactor_v2/steps/hash_bucket.py,sha256=1R5xLUkl7GqL1nY-apAgY1czKDEHjIVYSRi9qLOMass,6726
 deltacat/compute/compactor_v2/steps/merge.py,sha256=LpktsDPfj7Of6RgUw9w1f3Y3OBkPDjvtyXjzFaIDoSo,21771
@@ -85,7 +85,7 @@ deltacat/compute/merge_on_read/model/merge_on_read_params.py,sha256=Q51znagh8PtL
 deltacat/compute/merge_on_read/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/compute/merge_on_read/utils/delta.py,sha256=e4BtOHa5XPpUnR4r0HqBKjXckBsTI8qBwdUWwpJfkWQ,1367
 deltacat/compute/resource_estimation/__init__.py,sha256=4bfBXcq-VAt9JCmjvj3yAmn0lEHVGdGsUCCoMGxjEqA,799
-deltacat/compute/resource_estimation/delta.py,sha256=Ei4v9UYhtcT5P-wNEMAg0E4mYl0z5FpSkaTufVoGD18,9492
+deltacat/compute/resource_estimation/delta.py,sha256=8oRy1rgGUimwMqPB5At81AS-AsjPHdcvLHzJ9TW8RpM,9522
 deltacat/compute/resource_estimation/manifest.py,sha256=gSqOyIda-pYq3vRsKFq3IiZvwhV3mMqrWPtsmUH9dD8,13035
 deltacat/compute/resource_estimation/model.py,sha256=psyagFXdpLGt8DfDqy7c8DWiuXCacr0Swe5f0M7DdO4,5465
 deltacat/compute/resource_estimation/parquet.py,sha256=5_apma4EKbKcm-nfV73-qN2nfnCeyhFW23ZHX3jz0Kw,3158
@@ -137,11 +137,11 @@ deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py,sha256=kW
 deltacat/tests/compute/compact_partition_rebase_test_cases.py,sha256=8HVr3EIFYFqNaJoqeCuj9xIBjM4Ch2bx-mJcO4BRrLo,16839
 deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py,sha256=l_6-pAKOsRY3NbtfHsYmEaJEkq6IJueYuLsjyJxNgz4,81564
 deltacat/tests/compute/compact_partition_test_cases.py,sha256=R9eiKvxCLqcoHjAx3iOogdnXZEO9TvLbRf0wA7bcJN4,26170
-deltacat/tests/compute/test_compact_partition_incremental.py,sha256=Z0hyQGhMZjCaOn1Vk4qUbgDiS7HDhtdNeFQyG1PJhqA,14559
-deltacat/tests/compute/test_compact_partition_multiple_rounds.py,sha256=Qw74ajnKf41C3MCMvf4bIPXA6-ucKlPj_IeEqDm8rCg,12503
+deltacat/tests/compute/test_compact_partition_incremental.py,sha256=lkfAraOJmEmieesf7b1BqlfTS26YjYM5xXOXoTMrsos,14989
+deltacat/tests/compute/test_compact_partition_multiple_rounds.py,sha256=xXBA66TTfARR90m5KQs31nmiokuMy9iGQt7Z9evyG7M,12950
 deltacat/tests/compute/test_compact_partition_params.py,sha256=Dm5eLyHo8oGMeO3XBbpj1rZqHtPZ1hAB7z2qvzc4Lxk,8497
-deltacat/tests/compute/test_compact_partition_rebase.py,sha256=ztSiLgC2OpU4yz81vz-4xWzvZyrLGojtzomsW4q7Bl8,12626
-deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py,sha256=CHHfNFEJW8S1We7NE1Gg6EaoKEWnaOMRxWrLyirrahc,14643
+deltacat/tests/compute/test_compact_partition_rebase.py,sha256=DNcpmnBo5QoZ23BiIhJCC3zaDK0xClZLUb2-ZEEp5s4,13108
+deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py,sha256=Rxen3QGIaxVPa8lcO7NDMRxQ0aBjrOKn46LK5ZsfQTo,15073
 deltacat/tests/compute/test_util_common.py,sha256=0mEHo38bgH64y0XZ_zgUL_aZgQMgJOSTlOYvIJxG_MM,11825
 deltacat/tests/compute/test_util_constant.py,sha256=4o-W3E7r7jhFl1A3OFLLrdKnwcF46zx4lEIDY8ONJ3c,929
 deltacat/tests/compute/test_util_create_table_deltas_repo.py,sha256=Q3HJj1fjoe2JwRUOW8KEjbTqPIIoP2o_T3ZGH6SJnCM,13244
@@ -157,7 +157,7 @@ deltacat/tests/compute/compactor_v2/test_hashlib.py,sha256=8csF2hFWtBvY2MbX3-6ip
 deltacat/tests/compute/compactor_v2/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/tests/compute/compactor_v2/utils/test_task_options.py,sha256=37DkR1u_XwhedV9cGed6FFuJTC0XmuiowHJIa_Op6uA,865
 deltacat/tests/compute/resource_estimation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-deltacat/tests/compute/resource_estimation/test_delta.py,sha256=LyzRitBrasQa35Bq7rHTQInaOelSWOSoC0_dyjgpNuE,24505
+deltacat/tests/compute/resource_estimation/test_delta.py,sha256=HCL2oUnCqm0E26T3HLJjMhoAsHTJIWPYGwIKRgM_H7E,25712
 deltacat/tests/compute/resource_estimation/test_manifest.py,sha256=yrMvqDjolExdRf6Vtg5XaKDuaKz9ok15PCZ7_aJOYrI,32893
 deltacat/tests/compute/resource_estimation/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/tests/io/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -179,7 +179,7 @@ deltacat/tests/utils/test_cloudpickle.py,sha256=J0pnBY3-PxlUh6MamZAN1PuquKQPr2iy
 deltacat/tests/utils/test_daft.py,sha256=kY8lkXoQvyWunok8UvOsh1An297rb3jcnstTuIAyAlc,8232
 deltacat/tests/utils/test_metrics.py,sha256=Ym9nOz1EtB180pLmvugihj1sDTNDMb5opIjjr5Nmcls,16339
 deltacat/tests/utils/test_placement.py,sha256=g61wVOMkHe4YJeR9Oxg_BOVQ6bhHHbC3IBYv8YhUu94,597
-deltacat/tests/utils/test_pyarrow.py,sha256=AWx0DTsBA1PkQag48w_HeQdz7tlBzJsm9v7Nd6-dhEY,19607
+deltacat/tests/utils/test_pyarrow.py,sha256=YDuyFYNjy6thzfA6Z2a0dOytUugsExu1uMUOhmP_aXc,29977
 deltacat/tests/utils/test_record_batch_tables.py,sha256=AkG1WyljQmjnl-AxhbFWyo5LnMIKRyLScfgC2B_ES-s,11321
 deltacat/tests/utils/test_resources.py,sha256=HtpvDrfPZQNtGDXUlsIzc_yd7Vf1cDscZ3YbN0oTvO8,2560
 deltacat/tests/utils/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -200,7 +200,7 @@ deltacat/utils/numpy.py,sha256=SpHKKvC-K8NINTWGVfTZ5-gBFTGYqaXjjgKFhsdUjwg,2049
 deltacat/utils/pandas.py,sha256=q99mlRB7tymICMcNbfGLfLqFu_C-feyPZKZm2CWJJVc,9574
 deltacat/utils/performance.py,sha256=7ZLaMkS1ehPSIhT5uOQVBHvjC70iKHzoFquFo-KL0PI,645
 deltacat/utils/placement.py,sha256=Lj20fb-eq8rgMdm_M2MBMfDLwhDM1sS1nJj2DvIK56s,12060
-deltacat/utils/pyarrow.py,sha256=nW_eD6fWAlbyHUzPj1rOOfnUbpP3RnAgNSuuVNyvhZ4,29174
+deltacat/utils/pyarrow.py,sha256=xEZRzbTBU6uj9K4DtvngIPtQkTA8haVgQ4Y4vjwHvtM,34311
 deltacat/utils/resources.py,sha256=Ax1OgLLbZI4oYpp4Ki27OLaST-7I-AJgZwU87FVfY8g,8253
 deltacat/utils/s3fs.py,sha256=PmUJ5Fm1WmD-_zp_M6yd9VbXvIoJuBeK6ApOdJJApLE,662
 deltacat/utils/schema.py,sha256=m4Wm4ZQcpttzOUxex4dVneGlHy1_E36HspTcjNYzvVM,1564
@@ -210,8 +210,8 @@ deltacat/utils/ray_utils/concurrency.py,sha256=JDVwMiQWrmuSlyCWAoiq9ctoJ0XADEfDD
 deltacat/utils/ray_utils/dataset.py,sha256=waHdtH0c835a-2t51HYRHnulfC0_zBxx8mFSAPvPSPM,3274
 deltacat/utils/ray_utils/performance.py,sha256=d7JFM7vTXHzkGx9qNQcZzUWajnqINvYRwaM088_FpsE,464
 deltacat/utils/ray_utils/runtime.py,sha256=rB0A-tU9WZHz0J11LzJdANYtL397YyuemcA1l-K9dAw,5029
-deltacat-1.1.26.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-deltacat-1.1.26.dist-info/METADATA,sha256=5p2qZYAkOXBNT_rc9PyfGJ5Id3zKfbTp3KhiqZWNxas,1733
-deltacat-1.1.26.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
-deltacat-1.1.26.dist-info/top_level.txt,sha256=RWdIcid4Bv2i2ozLVh-70kJpyB61xEKXod9XXGpiono,9
-deltacat-1.1.26.dist-info/RECORD,,
+deltacat-1.1.28.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+deltacat-1.1.28.dist-info/METADATA,sha256=-ZnMp9C26vVxi015Il3UtBMm9pHqA8aZmTnNTgr8Tb8,1733
+deltacat-1.1.28.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+deltacat-1.1.28.dist-info/top_level.txt,sha256=RWdIcid4Bv2i2ozLVh-70kJpyB61xEKXod9XXGpiono,9
+deltacat-1.1.28.dist-info/RECORD,,
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: bdist_wheel (0.44.0)
+Generator: bdist_wheel (0.45.1)
 Root-Is-Purelib: true
 Tag: py3-none-any
 