deltacat 1.1.35__py3-none-any.whl → 2.0__py3-none-any.whl
This diff compares the contents of two publicly released versions of this package, as published to a supported public registry. It is provided for informational purposes only.
- deltacat/__init__.py +42 -3
- deltacat/annotations.py +36 -0
- deltacat/api.py +168 -0
- deltacat/aws/s3u.py +4 -4
- deltacat/benchmarking/benchmark_engine.py +82 -0
- deltacat/benchmarking/benchmark_report.py +86 -0
- deltacat/benchmarking/benchmark_suite.py +11 -0
- deltacat/benchmarking/conftest.py +21 -0
- deltacat/benchmarking/data/random_row_generator.py +94 -0
- deltacat/benchmarking/data/row_generator.py +10 -0
- deltacat/benchmarking/test_benchmark_pipeline.py +106 -0
- deltacat/catalog/__init__.py +14 -0
- deltacat/catalog/delegate.py +199 -106
- deltacat/catalog/iceberg/__init__.py +4 -0
- deltacat/catalog/iceberg/iceberg_catalog_config.py +26 -0
- deltacat/catalog/iceberg/impl.py +368 -0
- deltacat/catalog/iceberg/overrides.py +74 -0
- deltacat/catalog/interface.py +273 -76
- deltacat/catalog/main/impl.py +720 -0
- deltacat/catalog/model/catalog.py +227 -20
- deltacat/catalog/model/properties.py +116 -0
- deltacat/catalog/model/table_definition.py +32 -1
- deltacat/compute/compactor/model/compaction_session_audit_info.py +7 -3
- deltacat/compute/compactor/model/delta_annotated.py +3 -3
- deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
- deltacat/compute/compactor/model/delta_file_locator.py +3 -1
- deltacat/compute/compactor/model/round_completion_info.py +5 -5
- deltacat/compute/compactor/model/table_object_store.py +3 -2
- deltacat/compute/compactor/repartition_session.py +1 -1
- deltacat/compute/compactor/steps/dedupe.py +11 -4
- deltacat/compute/compactor/steps/hash_bucket.py +1 -1
- deltacat/compute/compactor/steps/materialize.py +6 -2
- deltacat/compute/compactor/utils/io.py +1 -1
- deltacat/compute/compactor/utils/sort_key.py +9 -2
- deltacat/compute/compactor_v2/compaction_session.py +2 -3
- deltacat/compute/compactor_v2/constants.py +1 -30
- deltacat/compute/compactor_v2/deletes/utils.py +3 -3
- deltacat/compute/compactor_v2/model/merge_input.py +1 -1
- deltacat/compute/compactor_v2/private/compaction_utils.py +5 -5
- deltacat/compute/compactor_v2/steps/merge.py +11 -80
- deltacat/compute/compactor_v2/utils/content_type_params.py +0 -17
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/io.py +1 -1
- deltacat/compute/compactor_v2/utils/primary_key_index.py +3 -15
- deltacat/compute/compactor_v2/utils/task_options.py +23 -43
- deltacat/compute/converter/constants.py +4 -0
- deltacat/compute/converter/converter_session.py +143 -0
- deltacat/compute/converter/model/convert_input.py +69 -0
- deltacat/compute/converter/model/convert_input_files.py +61 -0
- deltacat/compute/converter/model/converter_session_params.py +99 -0
- deltacat/compute/converter/pyiceberg/__init__.py +0 -0
- deltacat/compute/converter/pyiceberg/catalog.py +75 -0
- deltacat/compute/converter/pyiceberg/overrides.py +135 -0
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +251 -0
- deltacat/compute/converter/steps/__init__.py +0 -0
- deltacat/compute/converter/steps/convert.py +211 -0
- deltacat/compute/converter/steps/dedupe.py +60 -0
- deltacat/compute/converter/utils/__init__.py +0 -0
- deltacat/compute/converter/utils/convert_task_options.py +88 -0
- deltacat/compute/converter/utils/converter_session_utils.py +109 -0
- deltacat/compute/converter/utils/iceberg_columns.py +82 -0
- deltacat/compute/converter/utils/io.py +43 -0
- deltacat/compute/converter/utils/s3u.py +133 -0
- deltacat/compute/resource_estimation/delta.py +1 -19
- deltacat/constants.py +47 -1
- deltacat/env.py +51 -0
- deltacat/examples/__init__.py +0 -0
- deltacat/examples/basic_logging.py +101 -0
- deltacat/examples/common/__init__.py +0 -0
- deltacat/examples/common/fixtures.py +15 -0
- deltacat/examples/hello_world.py +27 -0
- deltacat/examples/iceberg/__init__.py +0 -0
- deltacat/examples/iceberg/iceberg_bucket_writer.py +139 -0
- deltacat/examples/iceberg/iceberg_reader.py +149 -0
- deltacat/exceptions.py +51 -9
- deltacat/logs.py +4 -1
- deltacat/storage/__init__.py +118 -28
- deltacat/storage/iceberg/__init__.py +0 -0
- deltacat/storage/iceberg/iceberg_scan_planner.py +28 -0
- deltacat/storage/iceberg/impl.py +737 -0
- deltacat/storage/iceberg/model.py +709 -0
- deltacat/storage/interface.py +217 -134
- deltacat/storage/main/__init__.py +0 -0
- deltacat/storage/main/impl.py +2077 -0
- deltacat/storage/model/delta.py +118 -71
- deltacat/storage/model/interop.py +24 -0
- deltacat/storage/model/list_result.py +8 -0
- deltacat/storage/model/locator.py +93 -3
- deltacat/{aws/redshift → storage}/model/manifest.py +122 -98
- deltacat/storage/model/metafile.py +1316 -0
- deltacat/storage/model/namespace.py +34 -18
- deltacat/storage/model/partition.py +362 -37
- deltacat/storage/model/scan/__init__.py +0 -0
- deltacat/storage/model/scan/push_down.py +19 -0
- deltacat/storage/model/scan/scan_plan.py +10 -0
- deltacat/storage/model/scan/scan_task.py +34 -0
- deltacat/storage/model/schema.py +892 -0
- deltacat/storage/model/shard.py +47 -0
- deltacat/storage/model/sort_key.py +170 -13
- deltacat/storage/model/stream.py +208 -80
- deltacat/storage/model/table.py +123 -29
- deltacat/storage/model/table_version.py +322 -46
- deltacat/storage/model/transaction.py +757 -0
- deltacat/storage/model/transform.py +198 -61
- deltacat/storage/model/types.py +111 -13
- deltacat/storage/rivulet/__init__.py +11 -0
- deltacat/storage/rivulet/arrow/__init__.py +0 -0
- deltacat/storage/rivulet/arrow/serializer.py +75 -0
- deltacat/storage/rivulet/dataset.py +744 -0
- deltacat/storage/rivulet/dataset_executor.py +87 -0
- deltacat/storage/rivulet/feather/__init__.py +5 -0
- deltacat/storage/rivulet/feather/file_reader.py +136 -0
- deltacat/storage/rivulet/feather/serializer.py +35 -0
- deltacat/storage/rivulet/fs/__init__.py +0 -0
- deltacat/storage/rivulet/fs/file_provider.py +105 -0
- deltacat/storage/rivulet/fs/file_store.py +130 -0
- deltacat/storage/rivulet/fs/input_file.py +76 -0
- deltacat/storage/rivulet/fs/output_file.py +86 -0
- deltacat/storage/rivulet/logical_plan.py +105 -0
- deltacat/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/storage/rivulet/metastore/delta.py +190 -0
- deltacat/storage/rivulet/metastore/json_sst.py +105 -0
- deltacat/storage/rivulet/metastore/sst.py +82 -0
- deltacat/storage/rivulet/metastore/sst_interval_tree.py +260 -0
- deltacat/storage/rivulet/mvp/Table.py +101 -0
- deltacat/storage/rivulet/mvp/__init__.py +5 -0
- deltacat/storage/rivulet/parquet/__init__.py +5 -0
- deltacat/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/storage/rivulet/parquet/file_reader.py +127 -0
- deltacat/storage/rivulet/parquet/serializer.py +37 -0
- deltacat/storage/rivulet/reader/__init__.py +0 -0
- deltacat/storage/rivulet/reader/block_scanner.py +378 -0
- deltacat/storage/rivulet/reader/data_reader.py +136 -0
- deltacat/storage/rivulet/reader/data_scan.py +63 -0
- deltacat/storage/rivulet/reader/dataset_metastore.py +178 -0
- deltacat/storage/rivulet/reader/dataset_reader.py +156 -0
- deltacat/storage/rivulet/reader/pyarrow_data_reader.py +121 -0
- deltacat/storage/rivulet/reader/query_expression.py +99 -0
- deltacat/storage/rivulet/reader/reader_type_registrar.py +84 -0
- deltacat/storage/rivulet/schema/__init__.py +0 -0
- deltacat/storage/rivulet/schema/datatype.py +128 -0
- deltacat/storage/rivulet/schema/schema.py +251 -0
- deltacat/storage/rivulet/serializer.py +40 -0
- deltacat/storage/rivulet/serializer_factory.py +42 -0
- deltacat/storage/rivulet/writer/__init__.py +0 -0
- deltacat/storage/rivulet/writer/dataset_writer.py +29 -0
- deltacat/storage/rivulet/writer/memtable_dataset_writer.py +294 -0
- deltacat/tests/_io/__init__.py +1 -0
- deltacat/tests/catalog/test_catalogs.py +324 -0
- deltacat/tests/catalog/test_default_catalog_impl.py +16 -8
- deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
- deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
- deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
- deltacat/tests/compute/compact_partition_test_cases.py +19 -53
- deltacat/tests/compute/compactor/steps/test_repartition.py +2 -2
- deltacat/tests/compute/compactor/utils/test_io.py +6 -8
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +0 -466
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +1 -273
- deltacat/tests/compute/conftest.py +75 -0
- deltacat/tests/compute/converter/__init__.py +0 -0
- deltacat/tests/compute/converter/conftest.py +80 -0
- deltacat/tests/compute/converter/test_convert_session.py +478 -0
- deltacat/tests/compute/converter/utils.py +123 -0
- deltacat/tests/compute/resource_estimation/test_delta.py +0 -16
- deltacat/tests/compute/test_compact_partition_incremental.py +2 -42
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +5 -46
- deltacat/tests/compute/test_compact_partition_params.py +3 -3
- deltacat/tests/compute/test_compact_partition_rebase.py +1 -46
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +5 -46
- deltacat/tests/compute/test_util_common.py +19 -12
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +13 -22
- deltacat/tests/local_deltacat_storage/__init__.py +76 -103
- deltacat/tests/storage/__init__.py +0 -0
- deltacat/tests/storage/conftest.py +25 -0
- deltacat/tests/storage/main/__init__.py +0 -0
- deltacat/tests/storage/main/test_main_storage.py +1399 -0
- deltacat/tests/storage/model/__init__.py +0 -0
- deltacat/tests/storage/model/test_delete_parameters.py +21 -0
- deltacat/tests/storage/model/test_metafile_io.py +2535 -0
- deltacat/tests/storage/model/test_schema.py +308 -0
- deltacat/tests/storage/model/test_shard.py +22 -0
- deltacat/tests/storage/model/test_table_version.py +110 -0
- deltacat/tests/storage/model/test_transaction.py +308 -0
- deltacat/tests/storage/rivulet/__init__.py +0 -0
- deltacat/tests/storage/rivulet/conftest.py +149 -0
- deltacat/tests/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +93 -0
- deltacat/tests/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/storage/rivulet/schema/test_schema.py +241 -0
- deltacat/tests/storage/rivulet/test_dataset.py +406 -0
- deltacat/tests/storage/rivulet/test_manifest.py +67 -0
- deltacat/tests/storage/rivulet/test_sst_interval_tree.py +232 -0
- deltacat/tests/storage/rivulet/test_utils.py +122 -0
- deltacat/tests/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/storage/rivulet/writer/test_dataset_write_then_read.py +341 -0
- deltacat/tests/storage/rivulet/writer/test_dataset_writer.py +79 -0
- deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
- deltacat/tests/test_deltacat_api.py +39 -0
- deltacat/tests/test_utils/filesystem.py +14 -0
- deltacat/tests/test_utils/message_pack_utils.py +54 -0
- deltacat/tests/test_utils/pyarrow.py +8 -15
- deltacat/tests/test_utils/storage.py +266 -3
- deltacat/tests/utils/test_daft.py +3 -3
- deltacat/tests/utils/test_pyarrow.py +0 -432
- deltacat/types/partial_download.py +1 -1
- deltacat/types/tables.py +1 -1
- deltacat/utils/export.py +59 -0
- deltacat/utils/filesystem.py +320 -0
- deltacat/utils/metafile_locator.py +73 -0
- deltacat/utils/pyarrow.py +36 -183
- deltacat-2.0.dist-info/METADATA +65 -0
- deltacat-2.0.dist-info/RECORD +347 -0
- deltacat/aws/redshift/__init__.py +0 -19
- deltacat/catalog/default_catalog_impl/__init__.py +0 -369
- deltacat/io/dataset.py +0 -73
- deltacat/io/read_api.py +0 -143
- deltacat/storage/model/delete_parameters.py +0 -40
- deltacat/storage/model/partition_spec.py +0 -71
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +0 -253
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +0 -45
- deltacat-1.1.35.dist-info/METADATA +0 -64
- deltacat-1.1.35.dist-info/RECORD +0 -219
- /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
- /deltacat/{io/aws → catalog/main}/__init__.py +0 -0
- /deltacat/{io/aws/redshift → compute/converter}/__init__.py +0 -0
- /deltacat/{tests/io → compute/converter/model}/__init__.py +0 -0
- /deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +0 -0
- /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
- {deltacat-1.1.35.dist-info → deltacat-2.0.dist-info}/LICENSE +0 -0
- {deltacat-1.1.35.dist-info → deltacat-2.0.dist-info}/WHEEL +0 -0
- {deltacat-1.1.35.dist-info → deltacat-2.0.dist-info}/top_level.txt +0 -0
deltacat/tests/utils/test_pyarrow.py
CHANGED
@@ -2,33 +2,20 @@ from unittest import TestCase
 from deltacat.utils.pyarrow import (
     s3_partial_parquet_file_to_table,
     pyarrow_read_csv,
-    ContentTypeValidationError,
     content_type_to_reader_kwargs,
     _add_column_kwargs,
-    logger,
     s3_file_to_table,
-    s3_file_to_parquet,
     ReadKwargsProviderPyArrowSchemaOverride,
     RAISE_ON_EMPTY_CSV_KWARG,
-    RAISE_ON_DECIMAL_OVERFLOW,
-    OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG,
 )
-import decimal
 from deltacat.types.media import ContentEncoding, ContentType
 from deltacat.types.partial_download import PartialParquetParameters
 from pyarrow.parquet import ParquetFile
 import pyarrow as pa
 
 PARQUET_FILE_PATH = "deltacat/tests/utils/data/test_file.parquet"
-PARQUET_GZIP_COMPRESSED_FILE_PATH = "deltacat/tests/utils/data/test_file.parquet.gz"
 EMPTY_UTSV_PATH = "deltacat/tests/utils/data/empty.csv"
 NON_EMPTY_VALID_UTSV_PATH = "deltacat/tests/utils/data/non_empty_valid.csv"
-OVERFLOWING_DECIMAL_PRECISION_UTSV_PATH = (
-    "deltacat/tests/utils/data/overflowing_decimal_precision.csv"
-)
-OVERFLOWING_DECIMAL_SCALE_UTSV_PATH = (
-    "deltacat/tests/utils/data/overflowing_decimal_scale.csv"
-)
 GZIP_COMPRESSED_FILE_UTSV_PATH = "deltacat/tests/utils/data/non_empty_compressed.gz"
 BZ2_COMPRESSED_FILE_UTSV_PATH = "deltacat/tests/utils/data/non_empty_compressed.bz2"
 
@@ -420,253 +407,6 @@ class TestReadCSV(TestCase):
             ),
         )
 
-    def test_read_csv_when_decimal_precision_overflows_and_raise_kwarg_specified(self):
-        schema = pa.schema(
-            [("is_active", pa.string()), ("decimal_value", pa.decimal128(4, 2))]
-        )
-        kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
-        _add_column_kwargs(
-            ContentType.UNESCAPED_TSV.value,
-            ["is_active", "decimal_value"],
-            ["is_active", "decimal_value"],
-            kwargs,
-        )
-        read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
-
-        kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
-        self.assertRaises(
-            pa.lib.ArrowInvalid,
-            lambda: pyarrow_read_csv(
-                OVERFLOWING_DECIMAL_PRECISION_UTSV_PATH,
-                **{**kwargs, RAISE_ON_DECIMAL_OVERFLOW: True},
-            ),
-        )
-
-    def test_read_csv_when_decimal_precision_overflows_sanity(self):
-        schema = pa.schema(
-            [("is_active", pa.string()), ("decimal_value", pa.decimal128(4, 2))]
-        )
-        kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
-        _add_column_kwargs(
-            ContentType.UNESCAPED_TSV.value,
-            ["is_active", "decimal_value"],
-            ["is_active", "decimal_value"],
-            kwargs,
-        )
-
-        read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
-
-        kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
-
-        self.assertRaises(
-            pa.lib.ArrowInvalid,
-            lambda: pyarrow_read_csv(OVERFLOWING_DECIMAL_PRECISION_UTSV_PATH, **kwargs),
-        )
-
-    def test_read_csv_when_decimal_scale_overflows_and_raise_kwarg_specified(self):
-        schema = pa.schema(
-            [("is_active", pa.string()), ("decimal_value", pa.decimal128(20, 2))]
-        )
-        kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
-        _add_column_kwargs(
-            ContentType.UNESCAPED_TSV.value,
-            ["is_active", "decimal_value"],
-            ["is_active", "decimal_value"],
-            kwargs,
-        )
-        read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
-
-        kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
-
-        self.assertRaises(
-            pa.lib.ArrowInvalid,
-            lambda: pyarrow_read_csv(
-                OVERFLOWING_DECIMAL_SCALE_UTSV_PATH,
-                **{**kwargs, RAISE_ON_DECIMAL_OVERFLOW: True},
-            ),
-        )
-
-    def test_read_csv_when_decimal_scale_overflows_sanity(self):
-        schema = pa.schema(
-            [("is_active", pa.string()), ("decimal_value", pa.decimal128(20, 2))]
-        )
-        kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
-        _add_column_kwargs(
-            ContentType.UNESCAPED_TSV.value,
-            ["is_active", "decimal_value"],
-            ["is_active", "decimal_value"],
-            kwargs,
-        )
-
-        read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
-
-        kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
-
-        result = pyarrow_read_csv(OVERFLOWING_DECIMAL_SCALE_UTSV_PATH, **kwargs)
-
-        self.assertEqual(len(result), 3)
-        self.assertEqual(
-            result[1][0].as_py(), decimal.Decimal("322236.66")
-        )  # rounding decimal
-        self.assertEqual(result[1][1].as_py(), decimal.Decimal("32.33"))  # not rounded
-        self.assertEqual(len(result.column_names), 2)
-        result_schema = result.schema
-        self.assertEqual(result_schema.field(0).type, "string")
-        self.assertEqual(result_schema.field(1).type, pa.decimal128(20, 2))
-
-    def test_read_csv_when_decimal_scale_overflows_and_negative_scale(self):
-        schema = pa.schema(
-            [("is_active", pa.string()), ("decimal_value", pa.decimal128(20, -2))]
-        )
-        kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
-        _add_column_kwargs(
-            ContentType.UNESCAPED_TSV.value,
-            ["is_active", "decimal_value"],
-            ["is_active", "decimal_value"],
-            kwargs,
-        )
-
-        read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
-
-        kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
-
-        result = pyarrow_read_csv(OVERFLOWING_DECIMAL_SCALE_UTSV_PATH, **kwargs)
-
-        self.assertEqual(len(result), 3)
-        self.assertEqual(
-            result[1][0].as_py(),
-            decimal.Decimal("322200"),  # consequence of negative scale
-        )  # rounding decimal
-        self.assertEqual(result[1][1].as_py(), decimal.Decimal("00"))
-        self.assertEqual(len(result.column_names), 2)
-        result_schema = result.schema
-        self.assertEqual(result_schema.field(0).type, "string")
-        self.assertEqual(result_schema.field(1).type, pa.decimal128(20, -2))
-
-    def test_read_csv_when_decimal_scale_overflows_with_decimal256(self):
-        schema = pa.schema(
-            [("is_active", pa.string()), ("decimal_value", pa.decimal256(20, 2))]
-        )
-        kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
-        _add_column_kwargs(
-            ContentType.UNESCAPED_TSV.value,
-            ["is_active", "decimal_value"],
-            ["is_active", "decimal_value"],
-            kwargs,
-        )
-
-        read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
-
-        kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
-
-        result = pyarrow_read_csv(OVERFLOWING_DECIMAL_SCALE_UTSV_PATH, **kwargs)
-
-        self.assertEqual(len(result), 3)
-        self.assertEqual(
-            result[1][0].as_py(), decimal.Decimal("322236.66")
-        )  # rounding decimal
-        self.assertEqual(result[1][1].as_py(), decimal.Decimal("32.33"))  # not rounded
-        self.assertEqual(len(result.column_names), 2)
-        result_schema = result.schema
-        self.assertEqual(result_schema.field(0).type, "string")
-        self.assertEqual(result_schema.field(1).type, pa.decimal256(20, 2))
-
-    def test_read_csv_when_decimal_scale_overflows_with_decimal256_and_raise_on_overflow(
-        self,
-    ):
-        schema = pa.schema(
-            [("is_active", pa.string()), ("decimal_value", pa.decimal256(20, 2))]
-        )
-        kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
-        _add_column_kwargs(
-            ContentType.UNESCAPED_TSV.value,
-            ["is_active", "decimal_value"],
-            ["is_active", "decimal_value"],
-            kwargs,
-        )
-
-        read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
-
-        kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
-
-        self.assertRaises(
-            pa.lib.ArrowNotImplementedError,
-            lambda: pyarrow_read_csv(
-                OVERFLOWING_DECIMAL_SCALE_UTSV_PATH,
-                **{**kwargs, RAISE_ON_DECIMAL_OVERFLOW: True},
-            ),
-        )
-
-    def test_read_csv_when_decimal_scale_overflows_without_any_schema_then_infers(self):
-        kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
-        read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=None)
-        kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
-
-        result = pyarrow_read_csv(OVERFLOWING_DECIMAL_SCALE_UTSV_PATH, **kwargs)
-
-        # The default behavior of pyarrow is to skip invalid rows
-        self.assertEqual(len(result), 2)
-        self.assertEqual(result[1][0].as_py(), 32.33)  # rounding decimal
-        self.assertEqual(result[1][1].as_py(), 0.4)  # not rounded
-        self.assertEqual(len(result.column_names), 2)
-        result_schema = result.schema
-        self.assertEqual(result_schema.field(0).type, "string")
-        self.assertEqual(result_schema.field(1).type, pa.float64())
-
-    def test_read_csv_when_decimal_scale_and_precision_overflow_and_raise_on_overflow(
-        self,
-    ):
-        schema = pa.schema(
-            [("is_active", pa.string()), ("decimal_value", pa.decimal128(5, 2))]
-        )
-        kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
-        _add_column_kwargs(
-            ContentType.UNESCAPED_TSV.value,
-            ["is_active", "decimal_value"],
-            ["is_active", "decimal_value"],
-            kwargs,
-        )
-
-        read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
-
-        kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
-
-        self.assertRaises(
-            pa.lib.ArrowInvalid,
-            lambda: pyarrow_read_csv(OVERFLOWING_DECIMAL_SCALE_UTSV_PATH, **kwargs),
-        )
-
-    def test_read_csv_when_decimal_scale_overflow_and_file_like_obj_passed(self):
-        schema = pa.schema(
-            [("is_active", pa.string()), ("decimal_value", pa.decimal128(15, 2))]
-        )
-        kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
-        _add_column_kwargs(
-            ContentType.UNESCAPED_TSV.value,
-            ["is_active", "decimal_value"],
-            ["is_active", "decimal_value"],
-            kwargs,
-        )
-
-        read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
-
-        kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
-
-        with open(OVERFLOWING_DECIMAL_SCALE_UTSV_PATH, "rb") as file:
-            result = pyarrow_read_csv(file, **kwargs)
-
-        self.assertEqual(len(result), 3)
-        self.assertEqual(
-            result[1][0].as_py(), decimal.Decimal("322236.66")
-        )  # rounding decimal
-        self.assertEqual(
-            result[1][1].as_py(), decimal.Decimal("32.33")
-        )  # not rounded
-        self.assertEqual(len(result.column_names), 2)
-        result_schema = result.schema
-        self.assertEqual(result_schema.field(0).type, "string")
-        self.assertEqual(result_schema.field(1).type, pa.decimal128(15, 2))
-
 
 class TestS3FileToTable(TestCase):
     def test_s3_file_to_table_identity_sanity(self):
@@ -794,175 +534,3 @@ class TestS3FileToTable(TestCase):
             self.assertEqual(field.name, schema.field(index).name)
 
         self.assertEqual(result.schema.field(1).type, "string")
-
-    def test_s3_file_to_table_when_parquet_gzip(self):
-
-        pa_kwargs_provider = lambda content_type, kwargs: {
-            "reader_type": "pyarrow",
-            **kwargs,
-        }
-
-        result = s3_file_to_table(
-            PARQUET_GZIP_COMPRESSED_FILE_PATH,
-            ContentType.PARQUET.value,
-            ContentEncoding.GZIP.value,
-            ["n_legs", "animal"],
-            ["n_legs"],
-            pa_read_func_kwargs_provider=pa_kwargs_provider,
-        )
-
-        self.assertEqual(len(result), 6)
-        self.assertEqual(len(result.column_names), 1)
-        schema = result.schema
-        schema_index = schema.get_field_index("n_legs")
-        self.assertEqual(schema.field(schema_index).type, "int64")
-
-    def test_s3_file_to_table_when_utsv_gzip_and_content_type_overridden(self):
-        schema = pa.schema(
-            [("is_active", pa.string()), ("ship_datetime_utc", pa.timestamp("us"))]
-        )
-        # OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG has no effect on uTSV files
-        pa_kwargs_provider = lambda content_type, kwargs: {
-            "reader_type": "pyarrow",
-            **kwargs,
-        }
-        pa_kwargs_provider = lambda content_type, kwargs: {
-            "reader_type": "pyarrow",
-            OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG: ContentEncoding.IDENTITY.value,
-            **kwargs,
-        }
-
-        result = s3_file_to_table(
-            GZIP_COMPRESSED_FILE_UTSV_PATH,
-            ContentType.UNESCAPED_TSV.value,
-            ContentEncoding.GZIP.value,
-            ["is_active", "ship_datetime_utc"],
-            None,
-            pa_read_func_kwargs_provider=pa_kwargs_provider,
-        )
-
-        self.assertEqual(len(result), 3)
-        self.assertEqual(len(result.column_names), 2)
-        result_schema = result.schema
-        for index, field in enumerate(result_schema):
-            self.assertEqual(field.name, schema.field(index).name)
-
-        self.assertEqual(result.schema.field(0).type, "string")
-
-    def test_s3_file_to_table_when_parquet_gzip_and_encoding_overridden(self):
-        pa_kwargs_provider = lambda content_type, kwargs: {
-            "reader_type": "pyarrow",
-            OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG: ContentEncoding.IDENTITY.value,
-            **kwargs,
-        }
-
-        result = s3_file_to_table(
-            PARQUET_FILE_PATH,
-            ContentType.PARQUET.value,
-            ContentEncoding.GZIP.value,
-            ["n_legs", "animal"],
-            ["n_legs"],
-            pa_read_func_kwargs_provider=pa_kwargs_provider,
-        )
-
-        self.assertEqual(len(result), 6)
-        self.assertEqual(len(result.column_names), 1)
-        schema = result.schema
-        schema_index = schema.get_field_index("n_legs")
-        self.assertEqual(schema.field(schema_index).type, "int64")
-
-
-class TestS3FileToParquet(TestCase):
-    def test_s3_file_to_parquet_sanity(self):
-        test_s3_url = PARQUET_FILE_PATH
-        test_content_type = ContentType.PARQUET.value
-        test_content_encoding = ContentEncoding.IDENTITY.value
-        pa_kwargs_provider = lambda content_type, kwargs: {
-            "reader_type": "pyarrow",
-            **kwargs,
-        }
-        with self.assertLogs(logger=logger.name, level="DEBUG") as cm:
-            result_parquet_file: ParquetFile = s3_file_to_parquet(
-                test_s3_url,
-                test_content_type,
-                test_content_encoding,
-                ["n_legs", "animal"],
-                ["n_legs"],
-                pa_read_func_kwargs_provider=pa_kwargs_provider,
-            )
-        log_message_log_args = cm.records[0].getMessage()
-        log_message_presanitize_kwargs = cm.records[1].getMessage()
-        self.assertIn(
-            f"Reading {test_s3_url} to PyArrow ParquetFile. Content type: {test_content_type}. Encoding: {test_content_encoding}",
-            log_message_log_args,
-        )
-        self.assertIn("{'reader_type': 'pyarrow'}", log_message_presanitize_kwargs)
-        for index, field in enumerate(result_parquet_file.schema_arrow):
-            self.assertEqual(
-                field.name, result_parquet_file.schema_arrow.field(index).name
-            )
-        self.assertEqual(result_parquet_file.schema_arrow.field(0).type, "int64")
-
-    def test_s3_file_to_parquet_when_parquet_gzip_encoding_and_overridden_returns_success(
-        self,
-    ):
-        test_s3_url = PARQUET_FILE_PATH
-        test_content_type = ContentType.PARQUET.value
-        test_content_encoding = ContentEncoding.GZIP.value
-        pa_kwargs_provider = lambda content_type, kwargs: {
-            "reader_type": "pyarrow",
-            OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG: ContentEncoding.IDENTITY.value,
-            **kwargs,
-        }
-        with self.assertLogs(logger=logger.name, level="DEBUG") as cm:
-            result_parquet_file: ParquetFile = s3_file_to_parquet(
-                test_s3_url,
-                test_content_type,
-                test_content_encoding,
-                ["n_legs", "animal"],
-                ["n_legs"],
-                pa_read_func_kwargs_provider=pa_kwargs_provider,
-            )
-        log_message_log_args = cm.records[0].getMessage()
-        log_message_log_new_content_encoding = cm.records[1].getMessage()
-        log_message_presanitize_kwargs = cm.records[2].getMessage()
-        self.assertIn(
-            f"Reading {test_s3_url} to PyArrow ParquetFile. Content type: {test_content_type}. Encoding: {test_content_encoding}",
-            log_message_log_args,
-        )
-        self.assertIn(
-            f"Overriding {test_s3_url} content encoding from {ContentEncoding.GZIP.value} to {ContentEncoding.IDENTITY.value}",
-            log_message_log_new_content_encoding,
-        )
-        self.assertIn("{'reader_type': 'pyarrow'}", log_message_presanitize_kwargs)
-        for index, field in enumerate(result_parquet_file.schema_arrow):
-            self.assertEqual(
-                field.name, result_parquet_file.schema_arrow.field(index).name
-            )
-        self.assertEqual(result_parquet_file.schema_arrow.field(0).type, "int64")
-
-    def test_s3_file_to_parquet_when_parquet_gzip_encoding_not_overridden_throws_error(
-        self,
-    ):
-        test_s3_url = PARQUET_FILE_PATH
-        test_content_type = ContentType.PARQUET.value
-        test_content_encoding = ContentEncoding.GZIP.value
-        pa_kwargs_provider = lambda content_type, kwargs: {
-            "reader_type": "pyarrow",
-            **kwargs,
-        }
-        with self.assertRaises(ContentTypeValidationError):
-            with self.assertLogs(logger=logger.name, level="DEBUG") as cm:
-                s3_file_to_parquet(
-                    test_s3_url,
-                    test_content_type,
-                    test_content_encoding,
-                    ["n_legs", "animal"],
-                    ["n_legs"],
-                    pa_read_func_kwargs_provider=pa_kwargs_provider,
-                )
-        log_message_log_args = cm.records[0].getMessage()
-        self.assertIn(
-            f"Reading {test_s3_url} to PyArrow ParquetFile. Content type: {test_content_type}. Encoding: {test_content_encoding}",
-            log_message_log_args,
-        )
deltacat/types/tables.py
CHANGED
@@ -89,7 +89,7 @@ class TableWriteMode(str, Enum):
    Enum controlling how a given dataset will be written to a table.

    AUTO: CREATE if the table doesn't exist, APPEND if the table exists
-    without
+    without merge keys, and MERGE if the table exists with merge keys.
    CREATE: Create the table if it doesn't exist, throw an error if it does.
    APPEND: Append to the table if it exists, throw an error if it doesn't.
    REPLACE: Replace existing table contents with the data to write.
deltacat/utils/export.py
ADDED
@@ -0,0 +1,59 @@
+import logging
+import json
+import pyarrow as pa
+import pyarrow.parquet
+import pyarrow.feather
+from typing import Callable, Dict
+
+from deltacat.storage.rivulet.reader.query_expression import QueryExpression
+from deltacat import logs
+
+logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+
+def export_parquet(dataset, file_uri: str, query: QueryExpression = QueryExpression()):
+    records = dataset.scan(query).to_arrow()
+    table = pa.Table.from_batches(records)
+    pyarrow.parquet.write_table(table, file_uri)
+
+
+def export_feather(dataset, file_uri: str, query: QueryExpression = QueryExpression()):
+    records = dataset.scan(query).to_arrow()
+    table = pa.Table.from_batches(records)
+    pyarrow.feather.write_feather(table, file_uri)
+
+
+def export_json(dataset, file_uri: str, query: QueryExpression = QueryExpression()):
+    with open(file_uri, "w") as f:
+        for batch in dataset.scan(query).to_pydict():
+            json.dump(batch, f, indent=2)
+            f.write("\n")
+
+
+def export_dataset(dataset, file_uri: str, format: str = "parquet", query=None):
+    """
+    Export the dataset to a file.
+
+    TODO: Make this pluggable for custom formats.
+
+    Args:
+        dataset: The dataset to export.
+        file_uri: The URI to write the dataset to.
+        format: The format to write the dataset in. Options are [parquet, feather, json].
+        query: QueryExpression to filter the dataset before exporting.
+    """
+    # Supported format handlers
+    export_handlers: Dict[str, Callable] = {
+        "parquet": export_parquet,
+        "feather": export_feather,
+        "json": export_json,
+    }
+
+    if format not in export_handlers:
+        raise ValueError(
+            f"Unsupported format: {format}. Supported formats are {list(export_handlers.keys())}"
+        )
+
+    export_handlers[format](dataset, file_uri, query or QueryExpression())
+
+    logger.info(f"Dataset exported to {file_uri} in {format} format.")