deltacat 1.1.36__py3-none-any.whl → 2.0.0b2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (238)
  1. deltacat/__init__.py +42 -3
  2. deltacat/annotations.py +36 -0
  3. deltacat/api.py +168 -0
  4. deltacat/aws/s3u.py +4 -4
  5. deltacat/benchmarking/benchmark_engine.py +82 -0
  6. deltacat/benchmarking/benchmark_report.py +86 -0
  7. deltacat/benchmarking/benchmark_suite.py +11 -0
  8. deltacat/benchmarking/conftest.py +21 -0
  9. deltacat/benchmarking/data/random_row_generator.py +94 -0
  10. deltacat/benchmarking/data/row_generator.py +10 -0
  11. deltacat/benchmarking/test_benchmark_pipeline.py +106 -0
  12. deltacat/catalog/__init__.py +14 -0
  13. deltacat/catalog/delegate.py +199 -106
  14. deltacat/catalog/iceberg/__init__.py +4 -0
  15. deltacat/catalog/iceberg/iceberg_catalog_config.py +26 -0
  16. deltacat/catalog/iceberg/impl.py +368 -0
  17. deltacat/catalog/iceberg/overrides.py +74 -0
  18. deltacat/catalog/interface.py +273 -76
  19. deltacat/catalog/main/impl.py +720 -0
  20. deltacat/catalog/model/catalog.py +227 -20
  21. deltacat/catalog/model/properties.py +116 -0
  22. deltacat/catalog/model/table_definition.py +32 -1
  23. deltacat/compute/compactor/model/compaction_session_audit_info.py +7 -3
  24. deltacat/compute/compactor/model/delta_annotated.py +3 -3
  25. deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
  26. deltacat/compute/compactor/model/delta_file_locator.py +3 -1
  27. deltacat/compute/compactor/model/round_completion_info.py +5 -5
  28. deltacat/compute/compactor/model/table_object_store.py +3 -2
  29. deltacat/compute/compactor/repartition_session.py +1 -1
  30. deltacat/compute/compactor/steps/dedupe.py +11 -4
  31. deltacat/compute/compactor/steps/hash_bucket.py +1 -1
  32. deltacat/compute/compactor/steps/materialize.py +6 -2
  33. deltacat/compute/compactor/utils/io.py +1 -1
  34. deltacat/compute/compactor/utils/sort_key.py +9 -2
  35. deltacat/compute/compactor_v2/compaction_session.py +5 -9
  36. deltacat/compute/compactor_v2/constants.py +1 -30
  37. deltacat/compute/compactor_v2/deletes/utils.py +3 -3
  38. deltacat/compute/compactor_v2/model/merge_input.py +1 -7
  39. deltacat/compute/compactor_v2/private/compaction_utils.py +5 -6
  40. deltacat/compute/compactor_v2/steps/merge.py +17 -126
  41. deltacat/compute/compactor_v2/utils/content_type_params.py +0 -17
  42. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  43. deltacat/compute/compactor_v2/utils/io.py +1 -1
  44. deltacat/compute/compactor_v2/utils/merge.py +0 -1
  45. deltacat/compute/compactor_v2/utils/primary_key_index.py +3 -15
  46. deltacat/compute/compactor_v2/utils/task_options.py +23 -43
  47. deltacat/compute/converter/constants.py +4 -0
  48. deltacat/compute/converter/converter_session.py +143 -0
  49. deltacat/compute/converter/model/convert_input.py +69 -0
  50. deltacat/compute/converter/model/convert_input_files.py +61 -0
  51. deltacat/compute/converter/model/converter_session_params.py +99 -0
  52. deltacat/compute/converter/pyiceberg/__init__.py +0 -0
  53. deltacat/compute/converter/pyiceberg/catalog.py +75 -0
  54. deltacat/compute/converter/pyiceberg/overrides.py +135 -0
  55. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +251 -0
  56. deltacat/compute/converter/steps/__init__.py +0 -0
  57. deltacat/compute/converter/steps/convert.py +211 -0
  58. deltacat/compute/converter/steps/dedupe.py +60 -0
  59. deltacat/compute/converter/utils/__init__.py +0 -0
  60. deltacat/compute/converter/utils/convert_task_options.py +88 -0
  61. deltacat/compute/converter/utils/converter_session_utils.py +109 -0
  62. deltacat/compute/converter/utils/iceberg_columns.py +82 -0
  63. deltacat/compute/converter/utils/io.py +43 -0
  64. deltacat/compute/converter/utils/s3u.py +133 -0
  65. deltacat/compute/resource_estimation/delta.py +1 -19
  66. deltacat/constants.py +47 -1
  67. deltacat/env.py +51 -0
  68. deltacat/examples/__init__.py +0 -0
  69. deltacat/examples/basic_logging.py +101 -0
  70. deltacat/examples/common/__init__.py +0 -0
  71. deltacat/examples/common/fixtures.py +15 -0
  72. deltacat/examples/hello_world.py +27 -0
  73. deltacat/examples/iceberg/__init__.py +0 -0
  74. deltacat/examples/iceberg/iceberg_bucket_writer.py +139 -0
  75. deltacat/examples/iceberg/iceberg_reader.py +149 -0
  76. deltacat/exceptions.py +51 -9
  77. deltacat/logs.py +4 -1
  78. deltacat/storage/__init__.py +118 -28
  79. deltacat/storage/iceberg/__init__.py +0 -0
  80. deltacat/storage/iceberg/iceberg_scan_planner.py +28 -0
  81. deltacat/storage/iceberg/impl.py +737 -0
  82. deltacat/storage/iceberg/model.py +709 -0
  83. deltacat/storage/interface.py +217 -134
  84. deltacat/storage/main/__init__.py +0 -0
  85. deltacat/storage/main/impl.py +2077 -0
  86. deltacat/storage/model/delta.py +118 -71
  87. deltacat/storage/model/interop.py +24 -0
  88. deltacat/storage/model/list_result.py +8 -0
  89. deltacat/storage/model/locator.py +93 -3
  90. deltacat/{aws/redshift → storage}/model/manifest.py +122 -98
  91. deltacat/storage/model/metafile.py +1316 -0
  92. deltacat/storage/model/namespace.py +34 -18
  93. deltacat/storage/model/partition.py +362 -37
  94. deltacat/storage/model/scan/__init__.py +0 -0
  95. deltacat/storage/model/scan/push_down.py +19 -0
  96. deltacat/storage/model/scan/scan_plan.py +10 -0
  97. deltacat/storage/model/scan/scan_task.py +34 -0
  98. deltacat/storage/model/schema.py +892 -0
  99. deltacat/storage/model/shard.py +47 -0
  100. deltacat/storage/model/sort_key.py +170 -13
  101. deltacat/storage/model/stream.py +208 -80
  102. deltacat/storage/model/table.py +123 -29
  103. deltacat/storage/model/table_version.py +322 -46
  104. deltacat/storage/model/transaction.py +757 -0
  105. deltacat/storage/model/transform.py +198 -61
  106. deltacat/storage/model/types.py +111 -13
  107. deltacat/storage/rivulet/__init__.py +11 -0
  108. deltacat/storage/rivulet/arrow/__init__.py +0 -0
  109. deltacat/storage/rivulet/arrow/serializer.py +75 -0
  110. deltacat/storage/rivulet/dataset.py +744 -0
  111. deltacat/storage/rivulet/dataset_executor.py +87 -0
  112. deltacat/storage/rivulet/feather/__init__.py +5 -0
  113. deltacat/storage/rivulet/feather/file_reader.py +136 -0
  114. deltacat/storage/rivulet/feather/serializer.py +35 -0
  115. deltacat/storage/rivulet/fs/__init__.py +0 -0
  116. deltacat/storage/rivulet/fs/file_provider.py +105 -0
  117. deltacat/storage/rivulet/fs/file_store.py +130 -0
  118. deltacat/storage/rivulet/fs/input_file.py +76 -0
  119. deltacat/storage/rivulet/fs/output_file.py +86 -0
  120. deltacat/storage/rivulet/logical_plan.py +105 -0
  121. deltacat/storage/rivulet/metastore/__init__.py +0 -0
  122. deltacat/storage/rivulet/metastore/delta.py +190 -0
  123. deltacat/storage/rivulet/metastore/json_sst.py +105 -0
  124. deltacat/storage/rivulet/metastore/sst.py +82 -0
  125. deltacat/storage/rivulet/metastore/sst_interval_tree.py +260 -0
  126. deltacat/storage/rivulet/mvp/Table.py +101 -0
  127. deltacat/storage/rivulet/mvp/__init__.py +5 -0
  128. deltacat/storage/rivulet/parquet/__init__.py +5 -0
  129. deltacat/storage/rivulet/parquet/data_reader.py +0 -0
  130. deltacat/storage/rivulet/parquet/file_reader.py +127 -0
  131. deltacat/storage/rivulet/parquet/serializer.py +37 -0
  132. deltacat/storage/rivulet/reader/__init__.py +0 -0
  133. deltacat/storage/rivulet/reader/block_scanner.py +378 -0
  134. deltacat/storage/rivulet/reader/data_reader.py +136 -0
  135. deltacat/storage/rivulet/reader/data_scan.py +63 -0
  136. deltacat/storage/rivulet/reader/dataset_metastore.py +178 -0
  137. deltacat/storage/rivulet/reader/dataset_reader.py +156 -0
  138. deltacat/storage/rivulet/reader/pyarrow_data_reader.py +121 -0
  139. deltacat/storage/rivulet/reader/query_expression.py +99 -0
  140. deltacat/storage/rivulet/reader/reader_type_registrar.py +84 -0
  141. deltacat/storage/rivulet/schema/__init__.py +0 -0
  142. deltacat/storage/rivulet/schema/datatype.py +128 -0
  143. deltacat/storage/rivulet/schema/schema.py +251 -0
  144. deltacat/storage/rivulet/serializer.py +40 -0
  145. deltacat/storage/rivulet/serializer_factory.py +42 -0
  146. deltacat/storage/rivulet/writer/__init__.py +0 -0
  147. deltacat/storage/rivulet/writer/dataset_writer.py +29 -0
  148. deltacat/storage/rivulet/writer/memtable_dataset_writer.py +294 -0
  149. deltacat/storage/util/__init__.py +0 -0
  150. deltacat/storage/util/scan_planner.py +26 -0
  151. deltacat/tests/_io/__init__.py +1 -0
  152. deltacat/tests/catalog/test_catalogs.py +324 -0
  153. deltacat/tests/catalog/test_default_catalog_impl.py +16 -8
  154. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
  155. deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
  156. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
  157. deltacat/tests/compute/compact_partition_test_cases.py +19 -53
  158. deltacat/tests/compute/compactor/steps/test_repartition.py +2 -2
  159. deltacat/tests/compute/compactor/utils/test_io.py +6 -8
  160. deltacat/tests/compute/compactor_v2/test_compaction_session.py +0 -466
  161. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +1 -273
  162. deltacat/tests/compute/conftest.py +75 -0
  163. deltacat/tests/compute/converter/__init__.py +0 -0
  164. deltacat/tests/compute/converter/conftest.py +80 -0
  165. deltacat/tests/compute/converter/test_convert_session.py +478 -0
  166. deltacat/tests/compute/converter/utils.py +123 -0
  167. deltacat/tests/compute/resource_estimation/test_delta.py +0 -16
  168. deltacat/tests/compute/test_compact_partition_incremental.py +2 -42
  169. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +5 -46
  170. deltacat/tests/compute/test_compact_partition_params.py +3 -3
  171. deltacat/tests/compute/test_compact_partition_rebase.py +1 -46
  172. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +5 -46
  173. deltacat/tests/compute/test_util_common.py +19 -12
  174. deltacat/tests/compute/test_util_create_table_deltas_repo.py +13 -22
  175. deltacat/tests/local_deltacat_storage/__init__.py +76 -103
  176. deltacat/tests/storage/__init__.py +0 -0
  177. deltacat/tests/storage/conftest.py +25 -0
  178. deltacat/tests/storage/main/__init__.py +0 -0
  179. deltacat/tests/storage/main/test_main_storage.py +1399 -0
  180. deltacat/tests/storage/model/__init__.py +0 -0
  181. deltacat/tests/storage/model/test_delete_parameters.py +21 -0
  182. deltacat/tests/storage/model/test_metafile_io.py +2535 -0
  183. deltacat/tests/storage/model/test_schema.py +308 -0
  184. deltacat/tests/storage/model/test_shard.py +22 -0
  185. deltacat/tests/storage/model/test_table_version.py +110 -0
  186. deltacat/tests/storage/model/test_transaction.py +308 -0
  187. deltacat/tests/storage/rivulet/__init__.py +0 -0
  188. deltacat/tests/storage/rivulet/conftest.py +149 -0
  189. deltacat/tests/storage/rivulet/fs/__init__.py +0 -0
  190. deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +93 -0
  191. deltacat/tests/storage/rivulet/schema/__init__.py +0 -0
  192. deltacat/tests/storage/rivulet/schema/test_schema.py +241 -0
  193. deltacat/tests/storage/rivulet/test_dataset.py +406 -0
  194. deltacat/tests/storage/rivulet/test_manifest.py +67 -0
  195. deltacat/tests/storage/rivulet/test_sst_interval_tree.py +232 -0
  196. deltacat/tests/storage/rivulet/test_utils.py +122 -0
  197. deltacat/tests/storage/rivulet/writer/__init__.py +0 -0
  198. deltacat/tests/storage/rivulet/writer/test_dataset_write_then_read.py +341 -0
  199. deltacat/tests/storage/rivulet/writer/test_dataset_writer.py +79 -0
  200. deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
  201. deltacat/tests/test_deltacat_api.py +39 -0
  202. deltacat/tests/test_utils/filesystem.py +14 -0
  203. deltacat/tests/test_utils/message_pack_utils.py +54 -0
  204. deltacat/tests/test_utils/pyarrow.py +8 -15
  205. deltacat/tests/test_utils/storage.py +266 -3
  206. deltacat/tests/utils/test_daft.py +3 -3
  207. deltacat/tests/utils/test_pyarrow.py +0 -432
  208. deltacat/types/partial_download.py +1 -1
  209. deltacat/types/tables.py +1 -1
  210. deltacat/utils/export.py +59 -0
  211. deltacat/utils/filesystem.py +320 -0
  212. deltacat/utils/metafile_locator.py +73 -0
  213. deltacat/utils/pyarrow.py +36 -183
  214. deltacat-2.0.0b2.dist-info/METADATA +65 -0
  215. deltacat-2.0.0b2.dist-info/RECORD +349 -0
  216. deltacat/aws/redshift/__init__.py +0 -19
  217. deltacat/catalog/default_catalog_impl/__init__.py +0 -369
  218. deltacat/io/dataset.py +0 -73
  219. deltacat/io/read_api.py +0 -143
  220. deltacat/storage/model/delete_parameters.py +0 -40
  221. deltacat/storage/model/partition_spec.py +0 -71
  222. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +0 -253
  223. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +0 -45
  224. deltacat-1.1.36.dist-info/METADATA +0 -64
  225. deltacat-1.1.36.dist-info/RECORD +0 -219
  226. /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
  227. /deltacat/{io/aws → catalog/main}/__init__.py +0 -0
  228. /deltacat/{io/aws/redshift → compute/converter}/__init__.py +0 -0
  229. /deltacat/{tests/io → compute/converter/model}/__init__.py +0 -0
  230. /deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +0 -0
  231. /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
  232. /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
  233. /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
  234. /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
  235. /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
  236. {deltacat-1.1.36.dist-info → deltacat-2.0.0b2.dist-info}/LICENSE +0 -0
  237. {deltacat-1.1.36.dist-info → deltacat-2.0.0b2.dist-info}/WHEEL +0 -0
  238. {deltacat-1.1.36.dist-info → deltacat-2.0.0b2.dist-info}/top_level.txt +0 -0
deltacat/tests/utils/test_pyarrow.py CHANGED
@@ -2,33 +2,20 @@ from unittest import TestCase
 from deltacat.utils.pyarrow import (
     s3_partial_parquet_file_to_table,
     pyarrow_read_csv,
-    ContentTypeValidationError,
     content_type_to_reader_kwargs,
     _add_column_kwargs,
-    logger,
     s3_file_to_table,
-    s3_file_to_parquet,
     ReadKwargsProviderPyArrowSchemaOverride,
     RAISE_ON_EMPTY_CSV_KWARG,
-    RAISE_ON_DECIMAL_OVERFLOW,
-    OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG,
 )
-import decimal
 from deltacat.types.media import ContentEncoding, ContentType
 from deltacat.types.partial_download import PartialParquetParameters
 from pyarrow.parquet import ParquetFile
 import pyarrow as pa
 
 PARQUET_FILE_PATH = "deltacat/tests/utils/data/test_file.parquet"
-PARQUET_GZIP_COMPRESSED_FILE_PATH = "deltacat/tests/utils/data/test_file.parquet.gz"
 EMPTY_UTSV_PATH = "deltacat/tests/utils/data/empty.csv"
 NON_EMPTY_VALID_UTSV_PATH = "deltacat/tests/utils/data/non_empty_valid.csv"
-OVERFLOWING_DECIMAL_PRECISION_UTSV_PATH = (
-    "deltacat/tests/utils/data/overflowing_decimal_precision.csv"
-)
-OVERFLOWING_DECIMAL_SCALE_UTSV_PATH = (
-    "deltacat/tests/utils/data/overflowing_decimal_scale.csv"
-)
 GZIP_COMPRESSED_FILE_UTSV_PATH = "deltacat/tests/utils/data/non_empty_compressed.gz"
 BZ2_COMPRESSED_FILE_UTSV_PATH = "deltacat/tests/utils/data/non_empty_compressed.bz2"
 
@@ -420,253 +407,6 @@ class TestReadCSV(TestCase):
             ),
         )
 
-    def test_read_csv_when_decimal_precision_overflows_and_raise_kwarg_specified(self):
-        schema = pa.schema(
-            [("is_active", pa.string()), ("decimal_value", pa.decimal128(4, 2))]
-        )
-        kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
-        _add_column_kwargs(
-            ContentType.UNESCAPED_TSV.value,
-            ["is_active", "decimal_value"],
-            ["is_active", "decimal_value"],
-            kwargs,
-        )
-        read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
-
-        kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
-        self.assertRaises(
-            pa.lib.ArrowInvalid,
-            lambda: pyarrow_read_csv(
-                OVERFLOWING_DECIMAL_PRECISION_UTSV_PATH,
-                **{**kwargs, RAISE_ON_DECIMAL_OVERFLOW: True},
-            ),
-        )
-
-    def test_read_csv_when_decimal_precision_overflows_sanity(self):
-        schema = pa.schema(
-            [("is_active", pa.string()), ("decimal_value", pa.decimal128(4, 2))]
-        )
-        kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
-        _add_column_kwargs(
-            ContentType.UNESCAPED_TSV.value,
-            ["is_active", "decimal_value"],
-            ["is_active", "decimal_value"],
-            kwargs,
-        )
-
-        read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
-
-        kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
-
-        self.assertRaises(
-            pa.lib.ArrowInvalid,
-            lambda: pyarrow_read_csv(OVERFLOWING_DECIMAL_PRECISION_UTSV_PATH, **kwargs),
-        )
-
-    def test_read_csv_when_decimal_scale_overflows_and_raise_kwarg_specified(self):
-        schema = pa.schema(
-            [("is_active", pa.string()), ("decimal_value", pa.decimal128(20, 2))]
-        )
-        kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
-        _add_column_kwargs(
-            ContentType.UNESCAPED_TSV.value,
-            ["is_active", "decimal_value"],
-            ["is_active", "decimal_value"],
-            kwargs,
-        )
-        read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
-
-        kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
-
-        self.assertRaises(
-            pa.lib.ArrowInvalid,
-            lambda: pyarrow_read_csv(
-                OVERFLOWING_DECIMAL_SCALE_UTSV_PATH,
-                **{**kwargs, RAISE_ON_DECIMAL_OVERFLOW: True},
-            ),
-        )
-
-    def test_read_csv_when_decimal_scale_overflows_sanity(self):
-        schema = pa.schema(
-            [("is_active", pa.string()), ("decimal_value", pa.decimal128(20, 2))]
-        )
-        kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
-        _add_column_kwargs(
-            ContentType.UNESCAPED_TSV.value,
-            ["is_active", "decimal_value"],
-            ["is_active", "decimal_value"],
-            kwargs,
-        )
-
-        read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
-
-        kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
-
-        result = pyarrow_read_csv(OVERFLOWING_DECIMAL_SCALE_UTSV_PATH, **kwargs)
-
-        self.assertEqual(len(result), 3)
-        self.assertEqual(
-            result[1][0].as_py(), decimal.Decimal("322236.66")
-        )  # rounding decimal
-        self.assertEqual(result[1][1].as_py(), decimal.Decimal("32.33"))  # not rounded
-        self.assertEqual(len(result.column_names), 2)
-        result_schema = result.schema
-        self.assertEqual(result_schema.field(0).type, "string")
-        self.assertEqual(result_schema.field(1).type, pa.decimal128(20, 2))
-
-    def test_read_csv_when_decimal_scale_overflows_and_negative_scale(self):
-        schema = pa.schema(
-            [("is_active", pa.string()), ("decimal_value", pa.decimal128(20, -2))]
-        )
-        kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
-        _add_column_kwargs(
-            ContentType.UNESCAPED_TSV.value,
-            ["is_active", "decimal_value"],
-            ["is_active", "decimal_value"],
-            kwargs,
-        )
-
-        read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
-
-        kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
-
-        result = pyarrow_read_csv(OVERFLOWING_DECIMAL_SCALE_UTSV_PATH, **kwargs)
-
-        self.assertEqual(len(result), 3)
-        self.assertEqual(
-            result[1][0].as_py(),
-            decimal.Decimal("322200"),  # consequence of negative scale
-        )  # rounding decimal
-        self.assertEqual(result[1][1].as_py(), decimal.Decimal("00"))
-        self.assertEqual(len(result.column_names), 2)
-        result_schema = result.schema
-        self.assertEqual(result_schema.field(0).type, "string")
-        self.assertEqual(result_schema.field(1).type, pa.decimal128(20, -2))
-
-    def test_read_csv_when_decimal_scale_overflows_with_decimal256(self):
-        schema = pa.schema(
-            [("is_active", pa.string()), ("decimal_value", pa.decimal256(20, 2))]
-        )
-        kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
-        _add_column_kwargs(
-            ContentType.UNESCAPED_TSV.value,
-            ["is_active", "decimal_value"],
-            ["is_active", "decimal_value"],
-            kwargs,
-        )
-
-        read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
-
-        kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
-
-        result = pyarrow_read_csv(OVERFLOWING_DECIMAL_SCALE_UTSV_PATH, **kwargs)
-
-        self.assertEqual(len(result), 3)
-        self.assertEqual(
-            result[1][0].as_py(), decimal.Decimal("322236.66")
-        )  # rounding decimal
-        self.assertEqual(result[1][1].as_py(), decimal.Decimal("32.33"))  # not rounded
-        self.assertEqual(len(result.column_names), 2)
-        result_schema = result.schema
-        self.assertEqual(result_schema.field(0).type, "string")
-        self.assertEqual(result_schema.field(1).type, pa.decimal256(20, 2))
-
-    def test_read_csv_when_decimal_scale_overflows_with_decimal256_and_raise_on_overflow(
-        self,
-    ):
-        schema = pa.schema(
-            [("is_active", pa.string()), ("decimal_value", pa.decimal256(20, 2))]
-        )
-        kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
-        _add_column_kwargs(
-            ContentType.UNESCAPED_TSV.value,
-            ["is_active", "decimal_value"],
-            ["is_active", "decimal_value"],
-            kwargs,
-        )
-
-        read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
-
-        kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
-
-        self.assertRaises(
-            pa.lib.ArrowNotImplementedError,
-            lambda: pyarrow_read_csv(
-                OVERFLOWING_DECIMAL_SCALE_UTSV_PATH,
-                **{**kwargs, RAISE_ON_DECIMAL_OVERFLOW: True},
-            ),
-        )
-
-    def test_read_csv_when_decimal_scale_overflows_without_any_schema_then_infers(self):
-        kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
-        read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=None)
-        kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
-
-        result = pyarrow_read_csv(OVERFLOWING_DECIMAL_SCALE_UTSV_PATH, **kwargs)
-
-        # The default behavior of pyarrow is to skip invalid rows
-        self.assertEqual(len(result), 2)
-        self.assertEqual(result[1][0].as_py(), 32.33)  # rounding decimal
-        self.assertEqual(result[1][1].as_py(), 0.4)  # not rounded
-        self.assertEqual(len(result.column_names), 2)
-        result_schema = result.schema
-        self.assertEqual(result_schema.field(0).type, "string")
-        self.assertEqual(result_schema.field(1).type, pa.float64())
-
-    def test_read_csv_when_decimal_scale_and_precision_overflow_and_raise_on_overflow(
-        self,
-    ):
-        schema = pa.schema(
-            [("is_active", pa.string()), ("decimal_value", pa.decimal128(5, 2))]
-        )
-        kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
-        _add_column_kwargs(
-            ContentType.UNESCAPED_TSV.value,
-            ["is_active", "decimal_value"],
-            ["is_active", "decimal_value"],
-            kwargs,
-        )
-
-        read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
-
-        kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
-
-        self.assertRaises(
-            pa.lib.ArrowInvalid,
-            lambda: pyarrow_read_csv(OVERFLOWING_DECIMAL_SCALE_UTSV_PATH, **kwargs),
-        )
-
-    def test_read_csv_when_decimal_scale_overflow_and_file_like_obj_passed(self):
-        schema = pa.schema(
-            [("is_active", pa.string()), ("decimal_value", pa.decimal128(15, 2))]
-        )
-        kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
-        _add_column_kwargs(
-            ContentType.UNESCAPED_TSV.value,
-            ["is_active", "decimal_value"],
-            ["is_active", "decimal_value"],
-            kwargs,
-        )
-
-        read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
-
-        kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
-
-        with open(OVERFLOWING_DECIMAL_SCALE_UTSV_PATH, "rb") as file:
-            result = pyarrow_read_csv(file, **kwargs)
-
-            self.assertEqual(len(result), 3)
-            self.assertEqual(
-                result[1][0].as_py(), decimal.Decimal("322236.66")
-            )  # rounding decimal
-            self.assertEqual(
-                result[1][1].as_py(), decimal.Decimal("32.33")
-            )  # not rounded
-            self.assertEqual(len(result.column_names), 2)
-            result_schema = result.schema
-            self.assertEqual(result_schema.field(0).type, "string")
-            self.assertEqual(result_schema.field(1).type, pa.decimal128(15, 2))
-
 
 class TestS3FileToTable(TestCase):
     def test_s3_file_to_table_identity_sanity(self):
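Note: the removed TestReadCSV cases above pinned down deltacat 1.1.x's pyarrow_read_csv decimal handling: precision overflow raises pa.lib.ArrowInvalid even by default, while scale overflow is rounded to the declared scale unless the (also removed) RAISE_ON_DECIMAL_OVERFLOW kwarg forces a raise. A minimal standalone sketch of the raw PyArrow failure that wrapper guarded against (illustrative; not part of this diff):

    import io

    import pyarrow as pa
    import pyarrow.csv as pacsv

    # "123456.78" needs 8 digits of precision, but the declared type only
    # allows 4, so the CSV convert step fails with ArrowInvalid.
    data = io.BytesIO(b"decimal_value\n123456.78\n")
    opts = pacsv.ConvertOptions(column_types={"decimal_value": pa.decimal128(4, 2)})
    try:
        pacsv.read_csv(data, convert_options=opts)
    except pa.lib.ArrowInvalid as e:
        print(f"precision overflow surfaced as expected: {e}")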
@@ -794,175 +534,3 @@ class TestS3FileToTable(TestCase):
             self.assertEqual(field.name, schema.field(index).name)
 
         self.assertEqual(result.schema.field(1).type, "string")
-
-    def test_s3_file_to_table_when_parquet_gzip(self):
-
-        pa_kwargs_provider = lambda content_type, kwargs: {
-            "reader_type": "pyarrow",
-            **kwargs,
-        }
-
-        result = s3_file_to_table(
-            PARQUET_GZIP_COMPRESSED_FILE_PATH,
-            ContentType.PARQUET.value,
-            ContentEncoding.GZIP.value,
-            ["n_legs", "animal"],
-            ["n_legs"],
-            pa_read_func_kwargs_provider=pa_kwargs_provider,
-        )
-
-        self.assertEqual(len(result), 6)
-        self.assertEqual(len(result.column_names), 1)
-        schema = result.schema
-        schema_index = schema.get_field_index("n_legs")
-        self.assertEqual(schema.field(schema_index).type, "int64")
-
-    def test_s3_file_to_table_when_utsv_gzip_and_content_type_overridden(self):
-        schema = pa.schema(
-            [("is_active", pa.string()), ("ship_datetime_utc", pa.timestamp("us"))]
-        )
-        # OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG has no effect on uTSV files
-        pa_kwargs_provider = lambda content_type, kwargs: {
-            "reader_type": "pyarrow",
-            **kwargs,
-        }
-        pa_kwargs_provider = lambda content_type, kwargs: {
-            "reader_type": "pyarrow",
-            OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG: ContentEncoding.IDENTITY.value,
-            **kwargs,
-        }
-
-        result = s3_file_to_table(
-            GZIP_COMPRESSED_FILE_UTSV_PATH,
-            ContentType.UNESCAPED_TSV.value,
-            ContentEncoding.GZIP.value,
-            ["is_active", "ship_datetime_utc"],
-            None,
-            pa_read_func_kwargs_provider=pa_kwargs_provider,
-        )
-
-        self.assertEqual(len(result), 3)
-        self.assertEqual(len(result.column_names), 2)
-        result_schema = result.schema
-        for index, field in enumerate(result_schema):
-            self.assertEqual(field.name, schema.field(index).name)
-
-        self.assertEqual(result.schema.field(0).type, "string")
-
-    def test_s3_file_to_table_when_parquet_gzip_and_encoding_overridden(self):
-        pa_kwargs_provider = lambda content_type, kwargs: {
-            "reader_type": "pyarrow",
-            OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG: ContentEncoding.IDENTITY.value,
-            **kwargs,
-        }
-
-        result = s3_file_to_table(
-            PARQUET_FILE_PATH,
-            ContentType.PARQUET.value,
-            ContentEncoding.GZIP.value,
-            ["n_legs", "animal"],
-            ["n_legs"],
-            pa_read_func_kwargs_provider=pa_kwargs_provider,
-        )
-
-        self.assertEqual(len(result), 6)
-        self.assertEqual(len(result.column_names), 1)
-        schema = result.schema
-        schema_index = schema.get_field_index("n_legs")
-        self.assertEqual(schema.field(schema_index).type, "int64")
-
-
-class TestS3FileToParquet(TestCase):
-    def test_s3_file_to_parquet_sanity(self):
-        test_s3_url = PARQUET_FILE_PATH
-        test_content_type = ContentType.PARQUET.value
-        test_content_encoding = ContentEncoding.IDENTITY.value
-        pa_kwargs_provider = lambda content_type, kwargs: {
-            "reader_type": "pyarrow",
-            **kwargs,
-        }
-        with self.assertLogs(logger=logger.name, level="DEBUG") as cm:
-            result_parquet_file: ParquetFile = s3_file_to_parquet(
-                test_s3_url,
-                test_content_type,
-                test_content_encoding,
-                ["n_legs", "animal"],
-                ["n_legs"],
-                pa_read_func_kwargs_provider=pa_kwargs_provider,
-            )
-        log_message_log_args = cm.records[0].getMessage()
-        log_message_presanitize_kwargs = cm.records[1].getMessage()
-        self.assertIn(
-            f"Reading {test_s3_url} to PyArrow ParquetFile. Content type: {test_content_type}. Encoding: {test_content_encoding}",
-            log_message_log_args,
-        )
-        self.assertIn("{'reader_type': 'pyarrow'}", log_message_presanitize_kwargs)
-        for index, field in enumerate(result_parquet_file.schema_arrow):
-            self.assertEqual(
-                field.name, result_parquet_file.schema_arrow.field(index).name
-            )
-        self.assertEqual(result_parquet_file.schema_arrow.field(0).type, "int64")
-
-    def test_s3_file_to_parquet_when_parquet_gzip_encoding_and_overridden_returns_success(
-        self,
-    ):
-        test_s3_url = PARQUET_FILE_PATH
-        test_content_type = ContentType.PARQUET.value
-        test_content_encoding = ContentEncoding.GZIP.value
-        pa_kwargs_provider = lambda content_type, kwargs: {
-            "reader_type": "pyarrow",
-            OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG: ContentEncoding.IDENTITY.value,
-            **kwargs,
-        }
-        with self.assertLogs(logger=logger.name, level="DEBUG") as cm:
-            result_parquet_file: ParquetFile = s3_file_to_parquet(
-                test_s3_url,
-                test_content_type,
-                test_content_encoding,
-                ["n_legs", "animal"],
-                ["n_legs"],
-                pa_read_func_kwargs_provider=pa_kwargs_provider,
-            )
-        log_message_log_args = cm.records[0].getMessage()
-        log_message_log_new_content_encoding = cm.records[1].getMessage()
-        log_message_presanitize_kwargs = cm.records[2].getMessage()
-        self.assertIn(
-            f"Reading {test_s3_url} to PyArrow ParquetFile. Content type: {test_content_type}. Encoding: {test_content_encoding}",
-            log_message_log_args,
-        )
-        self.assertIn(
-            f"Overriding {test_s3_url} content encoding from {ContentEncoding.GZIP.value} to {ContentEncoding.IDENTITY.value}",
-            log_message_log_new_content_encoding,
-        )
-        self.assertIn("{'reader_type': 'pyarrow'}", log_message_presanitize_kwargs)
-        for index, field in enumerate(result_parquet_file.schema_arrow):
-            self.assertEqual(
-                field.name, result_parquet_file.schema_arrow.field(index).name
-            )
-        self.assertEqual(result_parquet_file.schema_arrow.field(0).type, "int64")
-
-    def test_s3_file_to_parquet_when_parquet_gzip_encoding_not_overridden_throws_error(
-        self,
-    ):
-        test_s3_url = PARQUET_FILE_PATH
-        test_content_type = ContentType.PARQUET.value
-        test_content_encoding = ContentEncoding.GZIP.value
-        pa_kwargs_provider = lambda content_type, kwargs: {
-            "reader_type": "pyarrow",
-            **kwargs,
-        }
-        with self.assertRaises(ContentTypeValidationError):
-            with self.assertLogs(logger=logger.name, level="DEBUG") as cm:
-                s3_file_to_parquet(
-                    test_s3_url,
-                    test_content_type,
-                    test_content_encoding,
-                    ["n_legs", "animal"],
-                    ["n_legs"],
-                    pa_read_func_kwargs_provider=pa_kwargs_provider,
-                )
-        log_message_log_args = cm.records[0].getMessage()
-        self.assertIn(
-            f"Reading {test_s3_url} to PyArrow ParquetFile. Content type: {test_content_type}. Encoding: {test_content_encoding}",
-            log_message_log_args,
-        )
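Although the TestS3FileToParquet class is deleted wholesale (s3_file_to_parquet and the content-encoding override kwarg do not carry into 2.0), the kwargs-provider pattern it exercised does survive: any callable taking (content_type, kwargs) and returning merged reader kwargs. A hedged sketch of the pattern in isolation, using only imports the 2.0 test module above still keeps:

    import pyarrow as pa

    from deltacat.types.media import ContentType
    from deltacat.utils.pyarrow import ReadKwargsProviderPyArrowSchemaOverride

    # Inline provider: merge caller kwargs over fixed defaults.
    pa_kwargs_provider = lambda content_type, kwargs: {
        "reader_type": "pyarrow",
        **kwargs,
    }

    # Class-based provider: pins an explicit read schema for CSV-family readers.
    provider = ReadKwargsProviderPyArrowSchemaOverride(
        schema=pa.schema([("is_active", pa.string())])
    )
    kwargs = provider(ContentType.UNESCAPED_TSV.value, pa_kwargs_provider(None, {}))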
deltacat/types/partial_download.py CHANGED
@@ -63,7 +63,7 @@ class PartialParquetParameters(PartialFileDownloadParams):
         return self["row_groups_to_download"]
 
     @property
-    def num_row_groups(self) -> List[int]:
+    def num_row_groups(self) -> int:
         return self["num_row_groups"]
 
     @property
deltacat/types/tables.py CHANGED
@@ -89,7 +89,7 @@ class TableWriteMode(str, Enum):
     Enum controlling how a given dataset will be written to a table.
 
     AUTO: CREATE if the table doesn't exist, APPEND if the table exists
-    without primary keys, and MERGE if the table exists with primary keys.
+    without merge keys, and MERGE if the table exists with merge keys.
     CREATE: Create the table if it doesn't exist, throw an error if it does.
     APPEND: Append to the table if it exists, throw an error if it doesn't.
     REPLACE: Replace existing table contents with the data to write.
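The docstring rewording shifts AUTO's dispatch condition from primary keys to merge keys. A hedged sketch of the rule as the docstring now states it (resolve_auto is illustrative, not DeltaCAT API):

    from deltacat.types.tables import TableWriteMode

    def resolve_auto(table_exists: bool, has_merge_keys: bool) -> TableWriteMode:
        # AUTO semantics per the docstring above.
        if not table_exists:
            return TableWriteMode.CREATE
        return TableWriteMode.MERGE if has_merge_keys else TableWriteMode.APPEND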
deltacat/utils/export.py ADDED
@@ -0,0 +1,59 @@
+import logging
+import json
+import pyarrow as pa
+import pyarrow.parquet
+import pyarrow.feather
+from typing import Callable, Dict
+
+from deltacat.storage.rivulet.reader.query_expression import QueryExpression
+from deltacat import logs
+
+logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+
+def export_parquet(dataset, file_uri: str, query: QueryExpression = QueryExpression()):
+    records = dataset.scan(query).to_arrow()
+    table = pa.Table.from_batches(records)
+    pyarrow.parquet.write_table(table, file_uri)
+
+
+def export_feather(dataset, file_uri: str, query: QueryExpression = QueryExpression()):
+    records = dataset.scan(query).to_arrow()
+    table = pa.Table.from_batches(records)
+    pyarrow.feather.write_feather(table, file_uri)
+
+
+def export_json(dataset, file_uri: str, query: QueryExpression = QueryExpression()):
+    with open(file_uri, "w") as f:
+        for batch in dataset.scan(query).to_pydict():
+            json.dump(batch, f, indent=2)
+            f.write("\n")
+
+
+def export_dataset(dataset, file_uri: str, format: str = "parquet", query=None):
+    """
+    Export the dataset to a file.
+
+    TODO: Make this pluggable for custom formats.
+
+    Args:
+        dataset: The dataset to export.
+        file_uri: The URI to write the dataset to.
+        format: The format to write the dataset in. Options are [parquet, feather, json].
+        query: QueryExpression to filter the dataset before exporting.
+    """
+    # Supported format handlers
+    export_handlers: Dict[str, Callable] = {
+        "parquet": export_parquet,
+        "feather": export_feather,
+        "json": export_json,
+    }
+
+    if format not in export_handlers:
+        raise ValueError(
+            f"Unsupported format: {format}. Supported formats are {list(export_handlers.keys())}"
+        )
+
+    export_handlers[format](dataset, file_uri, query or QueryExpression())
+
+    logger.info(f"Dataset exported to {file_uri} in {format} format.")
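A brief usage sketch for the new module (illustrative; ds stands in for a rivulet Dataset exposing scan(), whose construction depends on catalog setup not shown in this diff):

    from deltacat.storage.rivulet.reader.query_expression import QueryExpression
    from deltacat.utils.export import export_dataset

    # ds: deltacat.storage.rivulet.dataset.Dataset (assumed already created)
    export_dataset(ds, "/tmp/events.parquet")                   # default format
    export_dataset(ds, "/tmp/events.feather", format="feather")
    export_dataset(ds, "/tmp/events.json", format="json",
                   query=QueryExpression())                     # optional filter

The dispatch table keeps the handlers uniform, so making exports pluggable (the module's TODO) would only mean registering another callable with the same (dataset, file_uri, query) signature.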