deltacat 1.1.26__py3-none-any.whl → 1.1.28__py3-none-any.whl

deltacat/__init__.py CHANGED
@@ -44,7 +44,7 @@ from deltacat.types.tables import TableWriteMode
 
  deltacat.logs.configure_deltacat_logger(logging.getLogger(__name__))
 
- __version__ = "1.1.26"
+ __version__ = "1.1.28"
 
 
  __all__ = [
@@ -584,8 +584,11 @@ def _process_merge_results(
          f"Duplicate record count ({duplicate_hash_bucket_mat_results}) is as large "
          f"as or greater than params.num_rounds, which is {params.num_rounds}"
      )
+     # ensure the start index is the first file index if the task index is the same
      hb_id_to_entry_indices_range[str(mat_result.task_index)] = (
-         file_index,
+         hb_id_to_entry_indices_range.get(str(mat_result.task_index), [file_index])[
+             0
+         ],
          file_index + mat_result.pyarrow_write_result.files,
      )
 
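The three-line replacement above is what makes multi-round bookkeeping correct: a merge task with the same task index can materialize files across several rounds, and the entry-index range recorded for its hash bucket must keep the start index from the first round while the end index advances. A minimal sketch of that accumulation pattern, with made-up task and file indices:

hb_id_to_entry_indices_range = {}

def record_round(task_index: int, file_index: int, files_written: int) -> None:
    # On the first round the dict lookup falls back to the current
    # file_index; on later rounds it returns the previously stored start,
    # so only the end of the range moves forward.
    key = str(task_index)
    start = hb_id_to_entry_indices_range.get(key, [file_index])[0]
    hb_id_to_entry_indices_range[key] = (start, file_index + files_written)

record_round(0, file_index=0, files_written=3)  # {"0": (0, 3)}
record_round(0, file_index=3, files_written=2)  # {"0": (0, 5)}: start preserved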
@@ -188,7 +188,7 @@ def _estimate_resources_required_to_process_delta_using_file_sampling(
          sampled_on_disk_size += delta.manifest.entries[entry_index].meta.content_length
          sampled_num_rows += len(tbl)
 
-     if not sampled_on_disk_size:
+     if not sampled_on_disk_size or not sampled_in_memory_size:
          return EstimatedResources.of(
              memory_bytes=0,
              statistics=Statistics.of(
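For context, the file-sampling estimator extrapolates a delta's total in-memory size from the sizes observed in the sample, so a sampled in-memory size of zero (every sampled entry downloaded as an empty table) is just as degenerate as a zero on-disk size. A hedged sketch of the failure mode the widened guard avoids; the function and formula below are illustrative, not deltacat's actual implementation:

def estimate_memory_bytes(total_on_disk, sampled_on_disk, sampled_in_memory):
    # With either sample total at zero there is no meaningful inflation
    # ratio to extrapolate from, so report zero up front instead of
    # producing a nonsense estimate (or dividing by zero elsewhere).
    if not sampled_on_disk or not sampled_in_memory:
        return 0
    inflation = sampled_in_memory / sampled_on_disk
    return int(total_on_disk * inflation)

assert estimate_memory_bytes(10_000, 2_000, 0) == 0  # the newly guarded case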
@@ -437,6 +437,43 @@ class TestEstimateResourcesRequiredToProcessDelta:
          == parquet_delta_with_manifest.meta.content_length
      )
 
+     def test_parquet_delta_when_file_sampling_and_arrow_size_zero(
+         self,
+         local_deltacat_storage_kwargs,
+         parquet_delta_with_manifest: Delta,
+         monkeypatch,
+     ):
+         params = EstimateResourcesParams.of(
+             resource_estimation_method=ResourceEstimationMethod.FILE_SAMPLING,
+             max_files_to_sample=2,
+         )
+
+         def mock_func(*args, **kwargs):
+             class MockedValue:
+                 nbytes = 0
+
+                 def __len__(self):
+                     return 0
+
+             return MockedValue()
+
+         monkeypatch.setattr(ds, "download_delta_manifest_entry", mock_func)
+
+         result = estimate_resources_required_to_process_delta(
+             delta=parquet_delta_with_manifest,
+             operation_type=OperationType.PYARROW_DOWNLOAD,
+             deltacat_storage=ds,
+             deltacat_storage_kwargs=local_deltacat_storage_kwargs,
+             estimate_resources_params=params,
+         )
+
+         assert parquet_delta_with_manifest.manifest is not None
+         assert result.memory_bytes == 0
+         assert (
+             result.statistics.on_disk_size_bytes
+             == parquet_delta_with_manifest.meta.content_length
+         )
+
      def test_delta_manifest_utsv_when_file_sampling(
          self, local_deltacat_storage_kwargs, utsv_delta_with_manifest: Delta
      ):
@@ -328,6 +328,16 @@ def test_compact_partition_incremental(
          **compaction_audit_obj
      )
 
+     # assert that the RCF covers all files
+     if compactor_version != CompactorVersion.V1.value:
+         previous_end = None
+         for start, end in round_completion_info.hb_index_to_entry_range.values():
+             assert (previous_end is None and start == 0) or start == previous_end
+             previous_end = end
+         assert (
+             previous_end == round_completion_info.compacted_pyarrow_write_result.files
+         )
+
      tables = ds.download_delta(
          compacted_delta_locator, storage_type=StorageType.LOCAL, **ds_mock_kwargs
      )
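The assertion block above (repeated in the rebase and multi-round tests that follow) encodes a tiling invariant for the round completion file (RCF): iterated in order, the per-hash-bucket entry ranges must start at 0, chain end-to-start with no gaps or overlaps, and finish at the total compacted file count. A toy illustration with made-up ranges:

# Each hash bucket maps to a half-open [start, end) range of entry indices.
hb_index_to_entry_range = {"0": (0, 3), "1": (3, 7), "2": (7, 10)}
total_files = 10  # stands in for compacted_pyarrow_write_result.files

previous_end = None
for start, end in hb_index_to_entry_range.values():
    # The first range must start at 0; each later range must begin exactly
    # where the previous one ended.
    assert (previous_end is None and start == 0) or start == previous_end
    previous_end = end
# Together the ranges must cover every compacted file.
assert previous_end == total_files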
@@ -309,6 +309,16 @@ def test_compact_partition_rebase_multiple_rounds_same_source_and_destination(
          **compaction_audit_obj
      )
 
+     # assert that the RCF covers all files
+     # the multiple rounds feature is only supported by the V2 compactor
+     previous_end = None
+     for start, end in round_completion_info.hb_index_to_entry_range.values():
+         assert (previous_end is None and start == 0) or start == previous_end
+         previous_end = end
+     assert (
+         previous_end == round_completion_info.compacted_pyarrow_write_result.files
+     )
+
      # Assert not in-place compacted
      assert (
          execute_compaction_result_spy.call_args.args[-1] is False
@@ -299,6 +299,17 @@ def test_compact_partition_rebase_same_source_and_destination(
          round_completion_info.compaction_audit_url
      )
 
+     # assert that the RCF covers all files
+     if compactor_version != CompactorVersion.V1.value:
+         previous_end = None
+         for start, end in round_completion_info.hb_index_to_entry_range.values():
+             assert (previous_end is None and start == 0) or start == previous_end
+             previous_end = end
+         assert (
+             previous_end
+             == round_completion_info.compacted_pyarrow_write_result.files
+         )
+
      compaction_audit_obj: Dict[str, Any] = read_s3_contents(
          s3_resource, audit_bucket, audit_key
      )
@@ -355,6 +355,16 @@ def test_compact_partition_rebase_then_incremental(
      compacted_delta_locator_incremental: DeltaLocator = (
          round_completion_info.compacted_delta_locator
      )
+     # assert that the RCF covers all files
+     if compactor_version != CompactorVersion.V1.value:
+         previous_end = None
+         for start, end in round_completion_info.hb_index_to_entry_range.values():
+             assert (previous_end is None and start == 0) or start == previous_end
+             previous_end = end
+         assert (
+             previous_end == round_completion_info.compacted_pyarrow_write_result.files
+         )
+
      audit_bucket, audit_key = round_completion_info.compaction_audit_url.replace(
          "s3://", ""
      ).split("/", 1)
@@ -7,7 +7,9 @@ from deltacat.utils.pyarrow import (
      s3_file_to_table,
      ReadKwargsProviderPyArrowSchemaOverride,
      RAISE_ON_EMPTY_CSV_KWARG,
+     RAISE_ON_DECIMAL_OVERFLOW,
  )
+ import decimal
  from deltacat.types.media import ContentEncoding, ContentType
  from deltacat.types.partial_download import PartialParquetParameters
  from pyarrow.parquet import ParquetFile
@@ -16,6 +18,12 @@ import pyarrow as pa
  PARQUET_FILE_PATH = "deltacat/tests/utils/data/test_file.parquet"
  EMPTY_UTSV_PATH = "deltacat/tests/utils/data/empty.csv"
  NON_EMPTY_VALID_UTSV_PATH = "deltacat/tests/utils/data/non_empty_valid.csv"
+ OVERFLOWING_DECIMAL_PRECISION_UTSV_PATH = (
+     "deltacat/tests/utils/data/overflowing_decimal_precision.csv"
+ )
+ OVERFLOWING_DECIMAL_SCALE_UTSV_PATH = (
+     "deltacat/tests/utils/data/overflowing_decimal_scale.csv"
+ )
  GZIP_COMPRESSED_FILE_UTSV_PATH = "deltacat/tests/utils/data/non_empty_compressed.gz"
  BZ2_COMPRESSED_FILE_UTSV_PATH = "deltacat/tests/utils/data/non_empty_compressed.bz2"
 
@@ -407,6 +415,253 @@ class TestReadCSV(TestCase):
          ),
      )
 
+     def test_read_csv_when_decimal_precision_overflows_and_raise_kwarg_specified(self):
+         schema = pa.schema(
+             [("is_active", pa.string()), ("decimal_value", pa.decimal128(4, 2))]
+         )
+         kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
+         _add_column_kwargs(
+             ContentType.UNESCAPED_TSV.value,
+             ["is_active", "decimal_value"],
+             ["is_active", "decimal_value"],
+             kwargs,
+         )
+         read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
+
+         kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
+         self.assertRaises(
+             pa.lib.ArrowInvalid,
+             lambda: pyarrow_read_csv(
+                 OVERFLOWING_DECIMAL_PRECISION_UTSV_PATH,
+                 **{**kwargs, RAISE_ON_DECIMAL_OVERFLOW: True}
+             ),
+         )
+
+     def test_read_csv_when_decimal_precision_overflows_sanity(self):
+         schema = pa.schema(
+             [("is_active", pa.string()), ("decimal_value", pa.decimal128(4, 2))]
+         )
+         kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
+         _add_column_kwargs(
+             ContentType.UNESCAPED_TSV.value,
+             ["is_active", "decimal_value"],
+             ["is_active", "decimal_value"],
+             kwargs,
+         )
+
+         read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
+
+         kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
+
+         self.assertRaises(
+             pa.lib.ArrowInvalid,
+             lambda: pyarrow_read_csv(OVERFLOWING_DECIMAL_PRECISION_UTSV_PATH, **kwargs),
+         )
+
+     def test_read_csv_when_decimal_scale_overflows_and_raise_kwarg_specified(self):
+         schema = pa.schema(
+             [("is_active", pa.string()), ("decimal_value", pa.decimal128(20, 2))]
+         )
+         kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
+         _add_column_kwargs(
+             ContentType.UNESCAPED_TSV.value,
+             ["is_active", "decimal_value"],
+             ["is_active", "decimal_value"],
+             kwargs,
+         )
+         read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
+
+         kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
+
+         self.assertRaises(
+             pa.lib.ArrowInvalid,
+             lambda: pyarrow_read_csv(
+                 OVERFLOWING_DECIMAL_SCALE_UTSV_PATH,
+                 **{**kwargs, RAISE_ON_DECIMAL_OVERFLOW: True}
+             ),
+         )
+
+     def test_read_csv_when_decimal_scale_overflows_sanity(self):
+         schema = pa.schema(
+             [("is_active", pa.string()), ("decimal_value", pa.decimal128(20, 2))]
+         )
+         kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
+         _add_column_kwargs(
+             ContentType.UNESCAPED_TSV.value,
+             ["is_active", "decimal_value"],
+             ["is_active", "decimal_value"],
+             kwargs,
+         )
+
+         read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
+
+         kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
+
+         result = pyarrow_read_csv(OVERFLOWING_DECIMAL_SCALE_UTSV_PATH, **kwargs)
+
+         self.assertEqual(len(result), 3)
+         self.assertEqual(
+             result[1][0].as_py(), decimal.Decimal("322236.66")
+         )  # rounded decimal
+         self.assertEqual(result[1][1].as_py(), decimal.Decimal("32.33"))  # not rounded
+         self.assertEqual(len(result.column_names), 2)
+         result_schema = result.schema
+         self.assertEqual(result_schema.field(0).type, "string")
+         self.assertEqual(result_schema.field(1).type, pa.decimal128(20, 2))
+
+     def test_read_csv_when_decimal_scale_overflows_and_negative_scale(self):
+         schema = pa.schema(
+             [("is_active", pa.string()), ("decimal_value", pa.decimal128(20, -2))]
+         )
+         kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
+         _add_column_kwargs(
+             ContentType.UNESCAPED_TSV.value,
+             ["is_active", "decimal_value"],
+             ["is_active", "decimal_value"],
+             kwargs,
+         )
+
+         read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
+
+         kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
+
+         result = pyarrow_read_csv(OVERFLOWING_DECIMAL_SCALE_UTSV_PATH, **kwargs)
+
+         self.assertEqual(len(result), 3)
+         self.assertEqual(
+             result[1][0].as_py(),
+             decimal.Decimal("322200"),  # consequence of negative scale
+         )  # rounded decimal
+         self.assertEqual(result[1][1].as_py(), decimal.Decimal("00"))
+         self.assertEqual(len(result.column_names), 2)
+         result_schema = result.schema
+         self.assertEqual(result_schema.field(0).type, "string")
+         self.assertEqual(result_schema.field(1).type, pa.decimal128(20, -2))
+
+     def test_read_csv_when_decimal_scale_overflows_with_decimal256(self):
+         schema = pa.schema(
+             [("is_active", pa.string()), ("decimal_value", pa.decimal256(20, 2))]
+         )
+         kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
+         _add_column_kwargs(
+             ContentType.UNESCAPED_TSV.value,
+             ["is_active", "decimal_value"],
+             ["is_active", "decimal_value"],
+             kwargs,
+         )
+
+         read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
+
+         kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
+
+         result = pyarrow_read_csv(OVERFLOWING_DECIMAL_SCALE_UTSV_PATH, **kwargs)
+
+         self.assertEqual(len(result), 3)
+         self.assertEqual(
+             result[1][0].as_py(), decimal.Decimal("322236.66")
+         )  # rounded decimal
+         self.assertEqual(result[1][1].as_py(), decimal.Decimal("32.33"))  # not rounded
+         self.assertEqual(len(result.column_names), 2)
+         result_schema = result.schema
+         self.assertEqual(result_schema.field(0).type, "string")
+         self.assertEqual(result_schema.field(1).type, pa.decimal256(20, 2))
+
+     def test_read_csv_when_decimal_scale_overflows_with_decimal256_and_raise_on_overflow(
+         self,
+     ):
+         schema = pa.schema(
+             [("is_active", pa.string()), ("decimal_value", pa.decimal256(20, 2))]
+         )
+         kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
+         _add_column_kwargs(
+             ContentType.UNESCAPED_TSV.value,
+             ["is_active", "decimal_value"],
+             ["is_active", "decimal_value"],
+             kwargs,
+         )
+
+         read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
+
+         kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
+
+         self.assertRaises(
+             pa.lib.ArrowNotImplementedError,
+             lambda: pyarrow_read_csv(
+                 OVERFLOWING_DECIMAL_SCALE_UTSV_PATH,
+                 **{**kwargs, RAISE_ON_DECIMAL_OVERFLOW: True}
+             ),
+         )
+
+     def test_read_csv_when_decimal_scale_overflows_without_any_schema_then_infers(self):
+         kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
+         read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=None)
+         kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
+
+         result = pyarrow_read_csv(OVERFLOWING_DECIMAL_SCALE_UTSV_PATH, **kwargs)
+
+         # The default behavior of pyarrow is to skip invalid rows
+         self.assertEqual(len(result), 2)
+         self.assertEqual(result[1][0].as_py(), 32.33)  # rounded decimal
+         self.assertEqual(result[1][1].as_py(), 0.4)  # not rounded
+         self.assertEqual(len(result.column_names), 2)
+         result_schema = result.schema
+         self.assertEqual(result_schema.field(0).type, "string")
+         self.assertEqual(result_schema.field(1).type, pa.float64())
+
+     def test_read_csv_when_decimal_scale_and_precision_overflow_and_raise_on_overflow(
+         self,
+     ):
+         schema = pa.schema(
+             [("is_active", pa.string()), ("decimal_value", pa.decimal128(5, 2))]
+         )
+         kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
+         _add_column_kwargs(
+             ContentType.UNESCAPED_TSV.value,
+             ["is_active", "decimal_value"],
+             ["is_active", "decimal_value"],
+             kwargs,
+         )
+
+         read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
+
+         kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
+
+         self.assertRaises(
+             pa.lib.ArrowInvalid,
+             lambda: pyarrow_read_csv(OVERFLOWING_DECIMAL_SCALE_UTSV_PATH, **kwargs),
+         )
+
+     def test_read_csv_when_decimal_scale_overflow_and_file_like_obj_passed(self):
+         schema = pa.schema(
+             [("is_active", pa.string()), ("decimal_value", pa.decimal128(15, 2))]
+         )
+         kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
+         _add_column_kwargs(
+             ContentType.UNESCAPED_TSV.value,
+             ["is_active", "decimal_value"],
+             ["is_active", "decimal_value"],
+             kwargs,
+         )
+
+         read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
+
+         kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
+
+         with open(OVERFLOWING_DECIMAL_SCALE_UTSV_PATH, "rb") as file:
+             result = pyarrow_read_csv(file, **kwargs)
+
+         self.assertEqual(len(result), 3)
+         self.assertEqual(
+             result[1][0].as_py(), decimal.Decimal("322236.66")
+         )  # rounded decimal
+         self.assertEqual(
+             result[1][1].as_py(), decimal.Decimal("32.33")
+         )  # not rounded
+         self.assertEqual(len(result.column_names), 2)
+         result_schema = result.schema
+         self.assertEqual(result_schema.field(0).type, "string")
+         self.assertEqual(result_schema.field(1).type, pa.decimal128(15, 2))
+
 
  class TestS3FileToTable(TestCase):
      def test_s3_file_to_table_identity_sanity(self):
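One detail worth noting in the decimal256 tests above: when RAISE_ON_DECIMAL_OVERFLOW is set, the read-as-string fallback is disabled and the decimal256 column type is handed straight to pyarrow's CSV reader, which has no CSV-to-decimal256 conversion as of pyarrow 12; that is why the expected error switches from ArrowInvalid to ArrowNotImplementedError. A standalone reproduction of that pyarrow limitation, independent of deltacat:

import io

import pyarrow as pa
import pyarrow.csv as pacsv

data = io.BytesIO(b"decimal_value\n1.23\n")
try:
    pacsv.read_csv(
        data,
        convert_options=pacsv.ConvertOptions(
            column_types={"decimal_value": pa.decimal256(20, 2)}
        ),
    )
except pa.lib.ArrowNotImplementedError as e:
    print(type(e).__name__)  # CSV conversion to decimal256 is unsupported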
deltacat/utils/pyarrow.py CHANGED
@@ -1,6 +1,7 @@
  # Allow classes to use self-referencing Type hints in Python 3.7.
  from __future__ import annotations
 
+ import copy
  import bz2
  import gzip
  import io
@@ -47,6 +48,17 @@ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
  RAISE_ON_EMPTY_CSV_KWARG = "raise_on_empty_csv"
  READER_TYPE_KWARG = "reader_type"
 
+ """
+ By default, round decimal values using the half_to_even round mode when
+ rescaling a decimal to the scale and precision given in the schema would
+ cause data loss. Setting any non-null value for this argument will result
+ in an error instead.
+ """
+ RAISE_ON_DECIMAL_OVERFLOW = "raise_on_decimal_overflow"
+ # Note the maximum from https://arrow.apache.org/docs/python/generated/pyarrow.Decimal256Type.html#pyarrow.Decimal256Type
+ DECIMAL256_DEFAULT_SCALE = 38
+ DECIMAL256_MAX_PRECISION = 76
+
 
  def _filter_schema_for_columns(schema: pa.Schema, columns: List[str]) -> pa.Schema:
 
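The half_to_even mode referenced in the docstring is banker's rounding, which is also pyarrow's default rounding mode, and decimal256(76, 38) is the widest type these constants describe. A small sketch of the cast-then-round rescaling those constants support, mirroring the fallback path added further down (input values are illustrative):

import pyarrow as pa
import pyarrow.compute as pc

# Parse as strings, widen to the maximal decimal256 type, round to the
# target scale (half_to_even is the default round mode), then narrow to
# the declared type.
values = pa.array(["322236.6666", "1.005"]).cast(pa.decimal256(76, 38))
rounded = pc.round(values, ndigits=2)
print(rounded.cast(pa.decimal256(20, 2)))  # [322236.67, 1.00]; ties go to even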
@@ -64,45 +76,162 @@ def _filter_schema_for_columns(schema: pa.Schema, columns: List[str]) -> pa.Sche
      return target_schema
 
 
- def pyarrow_read_csv(*args, **kwargs) -> pa.Table:
-     try:
-         new_kwargs = sanitize_kwargs_by_supported_kwargs(
-             ["read_options", "parse_options", "convert_options", "memory_pool"], kwargs
+ def _extract_arrow_schema_from_read_csv_kwargs(kwargs: Dict[str, Any]) -> pa.Schema:
+     schema = None
+     if (
+         "convert_options" in kwargs
+         and kwargs["convert_options"].column_types is not None
+     ):
+         schema = kwargs["convert_options"].column_types
+         if not isinstance(schema, pa.Schema):
+             schema = pa.schema(schema)
+         if kwargs["convert_options"].include_columns:
+             schema = _filter_schema_for_columns(
+                 schema, kwargs["convert_options"].include_columns
+             )
+         elif (
+             kwargs.get("read_options") is not None
+             and kwargs["read_options"].column_names
+         ):
+             schema = _filter_schema_for_columns(
+                 schema, kwargs["read_options"].column_names
+             )
+     else:
+         logger.debug(
+             "Schema not specified in the kwargs."
+             " Hence, schema could not be inferred from the empty CSV."
          )
+
+     return schema
+
+
+ def _new_schema_with_replaced_fields(
+     schema: pa.Schema, field_to_replace: Callable[[pa.Field], Optional[pa.Field]]
+ ) -> pa.Schema:
+     if schema is None:
+         return None
+
+     new_schema_fields = []
+     for field in schema:
+         new_field = field_to_replace(field)
+         if new_field is not None:
+             new_schema_fields.append(new_field)
+         else:
+             new_schema_fields.append(field)
+
+     return pa.schema(new_schema_fields, metadata=schema.metadata)
+
+
+ def _read_csv_rounding_decimal_columns_to_fit_scale(
+     schema: pa.Schema, reader_args: List[Any], reader_kwargs: Dict[str, Any]
+ ) -> pa.Table:
+     # Note: We read decimals as strings first because CSV
+     # conversion to decimal256 isn't implemented as of pyarrow==12.0.1
+     new_schema = _new_schema_with_replaced_fields(
+         schema,
+         lambda fld: pa.field(fld.name, pa.string(), metadata=fld.metadata)
+         if pa.types.is_decimal128(fld.type) or pa.types.is_decimal256(fld.type)
+         else None,
+     )
+     new_kwargs = sanitize_kwargs_by_supported_kwargs(
+         ["read_options", "parse_options", "convert_options", "memory_pool"],
+         reader_kwargs,
+     )
+     # Creating a shallow copy for efficiency
+     new_convert_options = copy.copy(new_kwargs["convert_options"])
+     new_convert_options.column_types = new_schema
+     new_reader_kwargs = {**new_kwargs, "convert_options": new_convert_options}
+     arrow_table = pacsv.read_csv(*reader_args, **new_reader_kwargs)
+
+     for column_index, field in enumerate(schema):
+         if pa.types.is_decimal128(field.type) or pa.types.is_decimal256(field.type):
+             column_array = arrow_table[field.name]
+             # We always cast to decimal256 to accommodate the fixed scale of 38
+             cast_to_type = pa.decimal256(
+                 DECIMAL256_MAX_PRECISION, DECIMAL256_DEFAULT_SCALE
+             )
+             casted_decimal_array = pc.cast(column_array, cast_to_type)
+             # Note that scale can be negative
+             rounded_column_array = pc.round(
+                 casted_decimal_array, ndigits=field.type.scale
+             )
+             final_decimal_array = pc.cast(rounded_column_array, field.type)
+             arrow_table = arrow_table.set_column(
+                 column_index,
+                 field,
+                 final_decimal_array,
+             )
+             logger.debug(
+                 f"Rounded decimal column: {field.name} to {field.type.scale} scale and"
+                 f" {field.type.precision} precision"
+             )
+
+     return arrow_table
+
+
+ def pyarrow_read_csv_default(*args, **kwargs):
+     new_kwargs = sanitize_kwargs_by_supported_kwargs(
+         ["read_options", "parse_options", "convert_options", "memory_pool"], kwargs
+     )
+
+     try:
          return pacsv.read_csv(*args, **new_kwargs)
      except pa.lib.ArrowInvalid as e:
-         if e.__str__() == "Empty CSV file" and not kwargs.get(RAISE_ON_EMPTY_CSV_KWARG):
-             schema = None
-             if (
-                 "convert_options" in kwargs
-                 and kwargs["convert_options"].column_types is not None
-             ):
-                 schema = kwargs["convert_options"].column_types
-                 if not isinstance(schema, pa.Schema):
-                     schema = pa.schema(schema)
-                 if kwargs["convert_options"].include_columns:
-                     schema = _filter_schema_for_columns(
-                         schema, kwargs["convert_options"].include_columns
-                     )
-                 elif (
-                     kwargs.get("read_options") is not None
-                     and kwargs["read_options"].column_names
+         error_str = e.__str__()
+         schema = _extract_arrow_schema_from_read_csv_kwargs(kwargs)
+
+         if error_str == "Empty CSV file" and not kwargs.get(RAISE_ON_EMPTY_CSV_KWARG):
+             logger.debug(f"Read CSV empty schema being used: {schema}")
+             return pa.Table.from_pylist([], schema=schema)
+         if not kwargs.get(RAISE_ON_DECIMAL_OVERFLOW):
+             # Note, this logic requires expensive casting. To prevent downgrading performance
+             # for happy path reads, we are handling this case in response to an error.
+             logger.warning(
+                 "Rescaling Decimal to the given scale in the schema. "
+                 f"Original error: {error_str}"
+             )
+
+             if schema is not None and "convert_options" in kwargs:
+                 if (
+                     "Rescaling Decimal" in error_str
+                     and "value would cause data loss" in error_str
                  ):
-                     schema = _filter_schema_for_columns(
-                         schema, kwargs["read_options"].column_names
+                     logger.debug(f"Checking if the file: {args[0]}...")
+                     # Since we are re-reading the file, we have to seek to the beginning
+                     if isinstance(args[0], io.IOBase) and args[0].seekable():
+                         logger.debug(f"Seeking to the beginning of the file {args[0]}")
+                         args[0].seek(0)
+                     return _read_csv_rounding_decimal_columns_to_fit_scale(
+                         schema=schema, reader_args=args, reader_kwargs=kwargs
                      )
-
              else:
                  logger.debug(
-                     "Schema not specified in the kwargs."
-                     " Hence, schema could not be inferred from the empty CSV."
+                     "Schema is None when trying to adjust decimal values. "
+                     "Hence, bubbling up exception..."
                  )
 
-             logger.debug(f"Read CSV empty schema being used: {schema}")
-             return pa.Table.from_pylist([], schema=schema)
          raise e
 
 
+ def pyarrow_read_csv(*args, **kwargs) -> pa.Table:
+     schema = _extract_arrow_schema_from_read_csv_kwargs(kwargs)
+
+     # CSV conversion to decimal256 isn't supported as of pyarrow==12.0.1
+     # Below ensures decimal256 is cast properly.
+     schema_includes_decimal256 = (
+         (True if any([pa.types.is_decimal256(x.type) for x in schema]) else False)
+         if schema is not None
+         else None
+     )
+     if schema_includes_decimal256 and not kwargs.get(RAISE_ON_DECIMAL_OVERFLOW):
+         # falling back to the expensive method of reading the CSV
+         return _read_csv_rounding_decimal_columns_to_fit_scale(
+             schema, reader_args=args, reader_kwargs=kwargs
+         )
+     else:
+         return pyarrow_read_csv_default(*args, **kwargs)
+
+
  CONTENT_TYPE_TO_PA_READ_FUNC: Dict[str, Callable] = {
      ContentType.UNESCAPED_TSV.value: pyarrow_read_csv,
      ContentType.TSV.value: pyarrow_read_csv,
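Taken together, the rewrite splits the old single function into three parts: schema extraction from the reader kwargs, a slow path that re-reads decimal columns as strings and rounds them to fit their declared scale, and a default path that only falls back to the slow path in response to a rescale error (or up front, when the schema contains decimal256). A hypothetical caller showing both behaviors; the file name and schema are invented:

import pyarrow as pa
import pyarrow.csv as pacsv

convert_options = pacsv.ConvertOptions(
    column_types=pa.schema([("amount", pa.decimal128(10, 2))])
)

# Default: an "amount" value with too many fractional digits is re-read as
# a string, rounded half_to_even to scale 2, and cast back to decimal128(10, 2).
table = pyarrow_read_csv("prices.csv", convert_options=convert_options)

# Opt out of the lossy fallback and surface the original pa.lib.ArrowInvalid:
table = pyarrow_read_csv(
    "prices.csv",
    convert_options=convert_options,
    **{RAISE_ON_DECIMAL_OVERFLOW: True},
)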
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: deltacat
- Version: 1.1.26
+ Version: 1.1.28
  Summary: A scalable, fast, ACID-compliant Data Catalog powered by Ray.
  Home-page: https://github.com/ray-project/deltacat
  Author: Ray Team
@@ -1,4 +1,4 @@
- deltacat/__init__.py,sha256=N7LrDYFJUaYdchJUVZ8VN_9QUJzuETzkz-oT833iEr4,1778
+ deltacat/__init__.py,sha256=GPlTQc6AW4ig_nZJ7kMVe-kbZxYfrSVGFN1YEqY8dXU,1778
  deltacat/constants.py,sha256=TUJLXUJ9xq1Ryil72yLkKR8EDH_Irp5wUg56QstbRNE,2181
  deltacat/exceptions.py,sha256=7sjk3BuMY5Oo-6OvAfHncZx_OcvtEL47BblWr2F7waE,12740
  deltacat/logs.py,sha256=EQSDin1deehzz5xlLV1_TrFJrO_IBZ9Ahp7MdL-4cK8,9363
@@ -66,7 +66,7 @@ deltacat/compute/compactor_v2/model/merge_file_group.py,sha256=1o86t9lc3K6ZvtViV
  deltacat/compute/compactor_v2/model/merge_input.py,sha256=-SxTE0e67z2V7MiMEVz5aMu4E0k8h3-vqohvUUOC0do,5659
  deltacat/compute/compactor_v2/model/merge_result.py,sha256=_IZTCStpb4UKiRCJYA3g6EhAqjrw0t9vmoDAN8kIK-Y,436
  deltacat/compute/compactor_v2/private/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- deltacat/compute/compactor_v2/private/compaction_utils.py,sha256=e8pZFobq6KBCy67ZRn2z1CAwNVjPIJnAiD4HHDmDbCk,30757
+ deltacat/compute/compactor_v2/private/compaction_utils.py,sha256=fMWXg1SCIIgjk9p_OFYrcm760dOKNbFO1Lj3_JI3GCY,30929
  deltacat/compute/compactor_v2/steps/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  deltacat/compute/compactor_v2/steps/hash_bucket.py,sha256=1R5xLUkl7GqL1nY-apAgY1czKDEHjIVYSRi9qLOMass,6726
  deltacat/compute/compactor_v2/steps/merge.py,sha256=LpktsDPfj7Of6RgUw9w1f3Y3OBkPDjvtyXjzFaIDoSo,21771
@@ -85,7 +85,7 @@ deltacat/compute/merge_on_read/model/merge_on_read_params.py,sha256=Q51znagh8PtL
  deltacat/compute/merge_on_read/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  deltacat/compute/merge_on_read/utils/delta.py,sha256=e4BtOHa5XPpUnR4r0HqBKjXckBsTI8qBwdUWwpJfkWQ,1367
  deltacat/compute/resource_estimation/__init__.py,sha256=4bfBXcq-VAt9JCmjvj3yAmn0lEHVGdGsUCCoMGxjEqA,799
- deltacat/compute/resource_estimation/delta.py,sha256=Ei4v9UYhtcT5P-wNEMAg0E4mYl0z5FpSkaTufVoGD18,9492
+ deltacat/compute/resource_estimation/delta.py,sha256=8oRy1rgGUimwMqPB5At81AS-AsjPHdcvLHzJ9TW8RpM,9522
  deltacat/compute/resource_estimation/manifest.py,sha256=gSqOyIda-pYq3vRsKFq3IiZvwhV3mMqrWPtsmUH9dD8,13035
  deltacat/compute/resource_estimation/model.py,sha256=psyagFXdpLGt8DfDqy7c8DWiuXCacr0Swe5f0M7DdO4,5465
  deltacat/compute/resource_estimation/parquet.py,sha256=5_apma4EKbKcm-nfV73-qN2nfnCeyhFW23ZHX3jz0Kw,3158
@@ -137,11 +137,11 @@ deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py,sha256=kW
  deltacat/tests/compute/compact_partition_rebase_test_cases.py,sha256=8HVr3EIFYFqNaJoqeCuj9xIBjM4Ch2bx-mJcO4BRrLo,16839
  deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py,sha256=l_6-pAKOsRY3NbtfHsYmEaJEkq6IJueYuLsjyJxNgz4,81564
  deltacat/tests/compute/compact_partition_test_cases.py,sha256=R9eiKvxCLqcoHjAx3iOogdnXZEO9TvLbRf0wA7bcJN4,26170
- deltacat/tests/compute/test_compact_partition_incremental.py,sha256=Z0hyQGhMZjCaOn1Vk4qUbgDiS7HDhtdNeFQyG1PJhqA,14559
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py,sha256=Qw74ajnKf41C3MCMvf4bIPXA6-ucKlPj_IeEqDm8rCg,12503
+ deltacat/tests/compute/test_compact_partition_incremental.py,sha256=lkfAraOJmEmieesf7b1BqlfTS26YjYM5xXOXoTMrsos,14989
+ deltacat/tests/compute/test_compact_partition_multiple_rounds.py,sha256=xXBA66TTfARR90m5KQs31nmiokuMy9iGQt7Z9evyG7M,12950
  deltacat/tests/compute/test_compact_partition_params.py,sha256=Dm5eLyHo8oGMeO3XBbpj1rZqHtPZ1hAB7z2qvzc4Lxk,8497
- deltacat/tests/compute/test_compact_partition_rebase.py,sha256=ztSiLgC2OpU4yz81vz-4xWzvZyrLGojtzomsW4q7Bl8,12626
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py,sha256=CHHfNFEJW8S1We7NE1Gg6EaoKEWnaOMRxWrLyirrahc,14643
+ deltacat/tests/compute/test_compact_partition_rebase.py,sha256=DNcpmnBo5QoZ23BiIhJCC3zaDK0xClZLUb2-ZEEp5s4,13108
+ deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py,sha256=Rxen3QGIaxVPa8lcO7NDMRxQ0aBjrOKn46LK5ZsfQTo,15073
  deltacat/tests/compute/test_util_common.py,sha256=0mEHo38bgH64y0XZ_zgUL_aZgQMgJOSTlOYvIJxG_MM,11825
  deltacat/tests/compute/test_util_constant.py,sha256=4o-W3E7r7jhFl1A3OFLLrdKnwcF46zx4lEIDY8ONJ3c,929
  deltacat/tests/compute/test_util_create_table_deltas_repo.py,sha256=Q3HJj1fjoe2JwRUOW8KEjbTqPIIoP2o_T3ZGH6SJnCM,13244
@@ -157,7 +157,7 @@ deltacat/tests/compute/compactor_v2/test_hashlib.py,sha256=8csF2hFWtBvY2MbX3-6ip
  deltacat/tests/compute/compactor_v2/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  deltacat/tests/compute/compactor_v2/utils/test_task_options.py,sha256=37DkR1u_XwhedV9cGed6FFuJTC0XmuiowHJIa_Op6uA,865
  deltacat/tests/compute/resource_estimation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- deltacat/tests/compute/resource_estimation/test_delta.py,sha256=LyzRitBrasQa35Bq7rHTQInaOelSWOSoC0_dyjgpNuE,24505
+ deltacat/tests/compute/resource_estimation/test_delta.py,sha256=HCL2oUnCqm0E26T3HLJjMhoAsHTJIWPYGwIKRgM_H7E,25712
  deltacat/tests/compute/resource_estimation/test_manifest.py,sha256=yrMvqDjolExdRf6Vtg5XaKDuaKz9ok15PCZ7_aJOYrI,32893
  deltacat/tests/compute/resource_estimation/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  deltacat/tests/io/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -179,7 +179,7 @@ deltacat/tests/utils/test_cloudpickle.py,sha256=J0pnBY3-PxlUh6MamZAN1PuquKQPr2iy
  deltacat/tests/utils/test_daft.py,sha256=kY8lkXoQvyWunok8UvOsh1An297rb3jcnstTuIAyAlc,8232
  deltacat/tests/utils/test_metrics.py,sha256=Ym9nOz1EtB180pLmvugihj1sDTNDMb5opIjjr5Nmcls,16339
  deltacat/tests/utils/test_placement.py,sha256=g61wVOMkHe4YJeR9Oxg_BOVQ6bhHHbC3IBYv8YhUu94,597
- deltacat/tests/utils/test_pyarrow.py,sha256=AWx0DTsBA1PkQag48w_HeQdz7tlBzJsm9v7Nd6-dhEY,19607
+ deltacat/tests/utils/test_pyarrow.py,sha256=YDuyFYNjy6thzfA6Z2a0dOytUugsExu1uMUOhmP_aXc,29977
  deltacat/tests/utils/test_record_batch_tables.py,sha256=AkG1WyljQmjnl-AxhbFWyo5LnMIKRyLScfgC2B_ES-s,11321
  deltacat/tests/utils/test_resources.py,sha256=HtpvDrfPZQNtGDXUlsIzc_yd7Vf1cDscZ3YbN0oTvO8,2560
  deltacat/tests/utils/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -200,7 +200,7 @@ deltacat/utils/numpy.py,sha256=SpHKKvC-K8NINTWGVfTZ5-gBFTGYqaXjjgKFhsdUjwg,2049
  deltacat/utils/pandas.py,sha256=q99mlRB7tymICMcNbfGLfLqFu_C-feyPZKZm2CWJJVc,9574
  deltacat/utils/performance.py,sha256=7ZLaMkS1ehPSIhT5uOQVBHvjC70iKHzoFquFo-KL0PI,645
  deltacat/utils/placement.py,sha256=Lj20fb-eq8rgMdm_M2MBMfDLwhDM1sS1nJj2DvIK56s,12060
- deltacat/utils/pyarrow.py,sha256=nW_eD6fWAlbyHUzPj1rOOfnUbpP3RnAgNSuuVNyvhZ4,29174
+ deltacat/utils/pyarrow.py,sha256=xEZRzbTBU6uj9K4DtvngIPtQkTA8haVgQ4Y4vjwHvtM,34311
  deltacat/utils/resources.py,sha256=Ax1OgLLbZI4oYpp4Ki27OLaST-7I-AJgZwU87FVfY8g,8253
  deltacat/utils/s3fs.py,sha256=PmUJ5Fm1WmD-_zp_M6yd9VbXvIoJuBeK6ApOdJJApLE,662
  deltacat/utils/schema.py,sha256=m4Wm4ZQcpttzOUxex4dVneGlHy1_E36HspTcjNYzvVM,1564
@@ -210,8 +210,8 @@ deltacat/utils/ray_utils/concurrency.py,sha256=JDVwMiQWrmuSlyCWAoiq9ctoJ0XADEfDD
  deltacat/utils/ray_utils/dataset.py,sha256=waHdtH0c835a-2t51HYRHnulfC0_zBxx8mFSAPvPSPM,3274
  deltacat/utils/ray_utils/performance.py,sha256=d7JFM7vTXHzkGx9qNQcZzUWajnqINvYRwaM088_FpsE,464
  deltacat/utils/ray_utils/runtime.py,sha256=rB0A-tU9WZHz0J11LzJdANYtL397YyuemcA1l-K9dAw,5029
- deltacat-1.1.26.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
- deltacat-1.1.26.dist-info/METADATA,sha256=5p2qZYAkOXBNT_rc9PyfGJ5Id3zKfbTp3KhiqZWNxas,1733
- deltacat-1.1.26.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
- deltacat-1.1.26.dist-info/top_level.txt,sha256=RWdIcid4Bv2i2ozLVh-70kJpyB61xEKXod9XXGpiono,9
- deltacat-1.1.26.dist-info/RECORD,,
+ deltacat-1.1.28.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ deltacat-1.1.28.dist-info/METADATA,sha256=-ZnMp9C26vVxi015Il3UtBMm9pHqA8aZmTnNTgr8Tb8,1733
+ deltacat-1.1.28.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+ deltacat-1.1.28.dist-info/top_level.txt,sha256=RWdIcid4Bv2i2ozLVh-70kJpyB61xEKXod9XXGpiono,9
+ deltacat-1.1.28.dist-info/RECORD,,
@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: bdist_wheel (0.44.0)
+ Generator: bdist_wheel (0.45.1)
  Root-Is-Purelib: true
  Tag: py3-none-any
 