deltacat 2.0.0b10__py3-none-any.whl → 2.0.0b12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (298) hide show
  1. deltacat/__init__.py +96 -17
  2. deltacat/api.py +122 -67
  3. deltacat/aws/constants.py +0 -23
  4. deltacat/aws/s3u.py +4 -631
  5. deltacat/benchmarking/benchmark_engine.py +4 -2
  6. deltacat/benchmarking/conftest.py +0 -18
  7. deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
  8. deltacat/catalog/__init__.py +64 -5
  9. deltacat/catalog/delegate.py +445 -63
  10. deltacat/catalog/interface.py +188 -62
  11. deltacat/catalog/main/impl.py +2435 -279
  12. deltacat/catalog/model/catalog.py +154 -77
  13. deltacat/catalog/model/properties.py +63 -22
  14. deltacat/compute/compactor/compaction_session.py +97 -75
  15. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  16. deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
  17. deltacat/compute/compactor/model/round_completion_info.py +16 -6
  18. deltacat/compute/compactor/repartition_session.py +8 -21
  19. deltacat/compute/compactor/steps/hash_bucket.py +5 -5
  20. deltacat/compute/compactor/steps/materialize.py +9 -7
  21. deltacat/compute/compactor/steps/repartition.py +12 -11
  22. deltacat/compute/compactor/utils/io.py +6 -5
  23. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  24. deltacat/compute/compactor/utils/system_columns.py +3 -1
  25. deltacat/compute/compactor_v2/compaction_session.py +17 -14
  26. deltacat/compute/compactor_v2/constants.py +30 -1
  27. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  28. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  29. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  30. deltacat/compute/compactor_v2/model/merge_input.py +33 -8
  31. deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
  32. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  33. deltacat/compute/compactor_v2/steps/merge.py +267 -55
  34. deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
  35. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  36. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  37. deltacat/compute/compactor_v2/utils/io.py +11 -4
  38. deltacat/compute/compactor_v2/utils/merge.py +15 -2
  39. deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
  40. deltacat/compute/compactor_v2/utils/task_options.py +45 -33
  41. deltacat/compute/converter/converter_session.py +145 -32
  42. deltacat/compute/converter/model/convert_input.py +26 -19
  43. deltacat/compute/converter/model/convert_input_files.py +33 -16
  44. deltacat/compute/converter/model/convert_result.py +35 -16
  45. deltacat/compute/converter/model/converter_session_params.py +24 -21
  46. deltacat/compute/converter/pyiceberg/catalog.py +21 -18
  47. deltacat/compute/converter/pyiceberg/overrides.py +18 -9
  48. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
  49. deltacat/compute/converter/steps/convert.py +157 -50
  50. deltacat/compute/converter/steps/dedupe.py +24 -11
  51. deltacat/compute/converter/utils/convert_task_options.py +27 -12
  52. deltacat/compute/converter/utils/converter_session_utils.py +126 -60
  53. deltacat/compute/converter/utils/iceberg_columns.py +8 -8
  54. deltacat/compute/converter/utils/io.py +101 -12
  55. deltacat/compute/converter/utils/s3u.py +33 -27
  56. deltacat/compute/janitor.py +205 -0
  57. deltacat/compute/jobs/client.py +25 -12
  58. deltacat/compute/resource_estimation/delta.py +38 -6
  59. deltacat/compute/resource_estimation/model.py +8 -0
  60. deltacat/constants.py +45 -2
  61. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  62. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  63. deltacat/env.py +10 -0
  64. deltacat/examples/basic_logging.py +1 -3
  65. deltacat/examples/compactor/aws/__init__.py +1 -0
  66. deltacat/examples/compactor/bootstrap.py +863 -0
  67. deltacat/examples/compactor/compactor.py +373 -0
  68. deltacat/examples/compactor/explorer.py +473 -0
  69. deltacat/examples/compactor/gcp/__init__.py +1 -0
  70. deltacat/examples/compactor/job_runner.py +439 -0
  71. deltacat/examples/compactor/utils/__init__.py +1 -0
  72. deltacat/examples/compactor/utils/common.py +261 -0
  73. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  74. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  75. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  76. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  77. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  78. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  79. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +3 -5
  80. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
  81. deltacat/examples/indexer/indexer.py +2 -2
  82. deltacat/examples/indexer/job_runner.py +1 -2
  83. deltacat/exceptions.py +66 -4
  84. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  85. deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
  86. deltacat/{catalog → experimental/catalog}/iceberg/impl.py +29 -11
  87. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  88. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  89. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  90. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  91. deltacat/{storage → experimental/storage}/iceberg/impl.py +6 -4
  92. deltacat/{storage → experimental/storage}/iceberg/model.py +7 -3
  93. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  94. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  95. deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
  96. deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -12
  97. deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
  98. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  99. deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
  100. deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
  101. deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
  102. deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
  103. deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
  104. deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
  105. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  106. deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -3
  107. deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
  108. deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
  109. deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
  110. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  111. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  112. deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
  113. deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
  114. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  115. deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
  116. deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
  117. deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
  118. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +7 -6
  119. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
  120. deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
  121. deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
  122. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  123. deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
  124. deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
  125. deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
  126. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  127. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  128. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  129. deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
  130. deltacat/io/datasource/deltacat_datasource.py +0 -1
  131. deltacat/io/reader/deltacat_read_api.py +1 -1
  132. deltacat/storage/__init__.py +20 -2
  133. deltacat/storage/interface.py +54 -32
  134. deltacat/storage/main/impl.py +1494 -541
  135. deltacat/storage/model/delta.py +27 -3
  136. deltacat/storage/model/locator.py +6 -12
  137. deltacat/storage/model/manifest.py +182 -6
  138. deltacat/storage/model/metafile.py +151 -78
  139. deltacat/storage/model/namespace.py +8 -1
  140. deltacat/storage/model/partition.py +117 -42
  141. deltacat/storage/model/schema.py +2427 -159
  142. deltacat/storage/model/shard.py +6 -2
  143. deltacat/storage/model/sort_key.py +40 -0
  144. deltacat/storage/model/stream.py +9 -2
  145. deltacat/storage/model/table.py +12 -1
  146. deltacat/storage/model/table_version.py +11 -0
  147. deltacat/storage/model/transaction.py +1184 -208
  148. deltacat/storage/model/transform.py +81 -2
  149. deltacat/storage/model/types.py +48 -26
  150. deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
  151. deltacat/tests/aws/test_s3u.py +2 -31
  152. deltacat/tests/catalog/data/__init__.py +0 -0
  153. deltacat/tests/catalog/main/__init__.py +0 -0
  154. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  155. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  156. deltacat/tests/catalog/model/__init__.py +0 -0
  157. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  158. deltacat/tests/catalog/test_catalogs.py +103 -106
  159. deltacat/tests/catalog/test_default_catalog_impl.py +12152 -72
  160. deltacat/tests/compute/compact_partition_test_cases.py +35 -8
  161. deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
  162. deltacat/tests/compute/compactor/utils/test_io.py +124 -120
  163. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  164. deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
  165. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
  166. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
  167. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
  168. deltacat/tests/compute/conftest.py +8 -44
  169. deltacat/tests/compute/converter/test_convert_session.py +675 -490
  170. deltacat/tests/compute/converter/utils.py +15 -6
  171. deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
  172. deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
  173. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
  174. deltacat/tests/compute/test_compact_partition_params.py +13 -8
  175. deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
  176. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
  177. deltacat/tests/compute/test_janitor.py +236 -0
  178. deltacat/tests/compute/test_util_common.py +716 -43
  179. deltacat/tests/compute/test_util_constant.py +0 -1
  180. deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
  181. deltacat/tests/daft/__init__.py +0 -0
  182. deltacat/tests/daft/test_model.py +97 -0
  183. deltacat/tests/experimental/__init__.py +1 -0
  184. deltacat/tests/experimental/catalog/__init__.py +0 -0
  185. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  186. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  187. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  188. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  189. deltacat/tests/experimental/daft/__init__.py +0 -0
  190. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  191. deltacat/tests/experimental/storage/__init__.py +0 -0
  192. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  193. deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
  194. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  195. deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -3
  196. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  197. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  198. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  199. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  200. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  201. deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
  202. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  203. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  204. deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +5 -3
  205. deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
  206. deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
  207. deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
  208. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  209. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
  210. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
  211. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
  212. deltacat/tests/storage/main/test_main_storage.py +6900 -95
  213. deltacat/tests/storage/model/test_metafile_io.py +78 -173
  214. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  215. deltacat/tests/storage/model/test_schema.py +171 -0
  216. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  217. deltacat/tests/storage/model/test_shard.py +3 -1
  218. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  219. deltacat/tests/storage/model/test_transaction.py +393 -48
  220. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  221. deltacat/tests/test_deltacat_api.py +988 -4
  222. deltacat/tests/test_exceptions.py +9 -5
  223. deltacat/tests/test_utils/pyarrow.py +52 -21
  224. deltacat/tests/test_utils/storage.py +23 -34
  225. deltacat/tests/types/__init__.py +0 -0
  226. deltacat/tests/types/test_tables.py +104 -0
  227. deltacat/tests/utils/exceptions.py +22 -0
  228. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  229. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  230. deltacat/tests/utils/test_daft.py +121 -31
  231. deltacat/tests/utils/test_numpy.py +1193 -0
  232. deltacat/tests/utils/test_pandas.py +1106 -0
  233. deltacat/tests/utils/test_polars.py +1040 -0
  234. deltacat/tests/utils/test_pyarrow.py +1370 -89
  235. deltacat/types/media.py +224 -14
  236. deltacat/types/tables.py +2329 -59
  237. deltacat/utils/arguments.py +33 -1
  238. deltacat/utils/daft.py +823 -36
  239. deltacat/utils/export.py +3 -1
  240. deltacat/utils/filesystem.py +100 -0
  241. deltacat/utils/metafile_locator.py +2 -1
  242. deltacat/utils/numpy.py +118 -26
  243. deltacat/utils/pandas.py +577 -48
  244. deltacat/utils/polars.py +658 -27
  245. deltacat/utils/pyarrow.py +1258 -213
  246. deltacat/utils/ray_utils/dataset.py +101 -10
  247. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  248. deltacat/utils/url.py +57 -16
  249. deltacat-2.0.0b12.dist-info/METADATA +1163 -0
  250. deltacat-2.0.0b12.dist-info/RECORD +439 -0
  251. {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info}/WHEEL +1 -1
  252. deltacat/catalog/iceberg/__init__.py +0 -4
  253. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  254. deltacat/compute/merge_on_read/__init__.py +0 -4
  255. deltacat/compute/merge_on_read/daft.py +0 -40
  256. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  257. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  258. deltacat/daft/daft_scan.py +0 -115
  259. deltacat/daft/model.py +0 -258
  260. deltacat/daft/translator.py +0 -126
  261. deltacat/examples/common/fixtures.py +0 -15
  262. deltacat/storage/iceberg/iceberg_scan_planner.py +0 -28
  263. deltacat/storage/rivulet/__init__.py +0 -11
  264. deltacat/storage/rivulet/feather/__init__.py +0 -5
  265. deltacat/storage/rivulet/parquet/__init__.py +0 -5
  266. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  267. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
  268. deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
  269. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  270. deltacat/utils/s3fs.py +0 -21
  271. deltacat-2.0.0b10.dist-info/METADATA +0 -68
  272. deltacat-2.0.0b10.dist-info/RECORD +0 -381
  273. /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
  274. /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
  275. /deltacat/{daft → docs/autogen/schema}/__init__.py +0 -0
  276. /deltacat/{examples/common → docs/autogen/schema/inference}/__init__.py +0 -0
  277. /deltacat/examples/{iceberg → compactor}/__init__.py +0 -0
  278. /deltacat/{storage/iceberg → examples/experimental}/__init__.py +0 -0
  279. /deltacat/{storage/rivulet/arrow → examples/experimental/iceberg}/__init__.py +0 -0
  280. /deltacat/{storage/rivulet/fs → examples/experimental/iceberg/converter}/__init__.py +0 -0
  281. /deltacat/{storage/rivulet/metastore → examples/experimental/iceberg/converter/beam}/__init__.py +0 -0
  282. /deltacat/{storage/rivulet/reader → experimental/catalog}/__init__.py +0 -0
  283. /deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +0 -0
  284. /deltacat/{storage/rivulet/schema → experimental/compatibility}/__init__.py +0 -0
  285. /deltacat/{storage/rivulet/writer → experimental/converter_agent}/__init__.py +0 -0
  286. /deltacat/{tests/storage/rivulet → experimental/converter_agent/beam}/__init__.py +0 -0
  287. /deltacat/{tests/storage/rivulet/fs → experimental/storage}/__init__.py +0 -0
  288. /deltacat/{tests/storage/rivulet/schema → experimental/storage/iceberg}/__init__.py +0 -0
  289. /deltacat/{tests/storage/rivulet/writer → experimental/storage/rivulet/arrow}/__init__.py +0 -0
  290. /deltacat/{storage/rivulet/parquet/data_reader.py → experimental/storage/rivulet/fs/__init__.py} +0 -0
  291. /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
  292. /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
  293. /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
  294. /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
  295. /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
  296. /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
  297. {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info/licenses}/LICENSE +0 -0
  298. {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info}/top_level.txt +0 -0
@@ -1,231 +0,0 @@
1
- import pytest
2
- import os
3
- from moto import mock_s3
4
- import boto3
5
- from boto3.resources.base import ServiceResource
6
- from deltacat.compute.compactor.utils.round_completion_file import (
7
- read_round_completion_file,
8
- write_round_completion_file,
9
- )
10
- from deltacat.tests.compute.test_util_common import get_test_partition_locator
11
- from deltacat.compute.compactor import RoundCompletionInfo
12
-
13
- RCF_BUCKET_NAME = "rcf-bucket"
14
-
15
-
16
- @pytest.fixture(autouse=True, scope="module")
17
- def mock_aws_credential():
18
- os.environ["AWS_ACCESS_KEY_ID"] = "testing"
19
- os.environ["AWS_SECRET_ACCESS_ID"] = "testing"
20
- os.environ["AWS_SECURITY_TOKEN"] = "testing"
21
- os.environ["AWS_SESSION_TOKEN"] = "testing"
22
- os.environ["AWS_DEFAULT_REGION"] = "us-east-1"
23
- yield
24
-
25
-
26
- @pytest.fixture(autouse=True, scope="module")
27
- def s3_resource(mock_aws_credential):
28
- with mock_s3():
29
- yield boto3.resource("s3")
30
-
31
-
32
- @pytest.fixture(autouse=True, scope="function")
33
- def setup_compaction_artifacts_s3_bucket(s3_resource: ServiceResource):
34
- s3_resource.create_bucket(
35
- ACL="authenticated-read",
36
- Bucket=RCF_BUCKET_NAME,
37
- )
38
- yield
39
- s3_resource.Bucket(RCF_BUCKET_NAME).objects.all().delete()
40
-
41
-
42
- class TestReadWriteRoundCompletionFile:
43
- def test_read_when_rcf_written_without_destination(self):
44
- """
45
- This test case tests the backward compatibility by successfully
46
- reading the previously written rcf.
47
- """
48
-
49
- source_locator = get_test_partition_locator("source")
50
- destination_locator = get_test_partition_locator("destination")
51
-
52
- expected_rcf = RoundCompletionInfo.of(
53
- high_watermark=122,
54
- compacted_delta_locator={},
55
- compacted_pyarrow_write_result={},
56
- sort_keys_bit_width=12,
57
- )
58
-
59
- rcf_url = write_round_completion_file(
60
- RCF_BUCKET_NAME, source_locator, None, expected_rcf
61
- )
62
-
63
- rcf = read_round_completion_file(
64
- RCF_BUCKET_NAME, source_locator, destination_locator
65
- )
66
-
67
- assert (
68
- rcf_url == "s3://rcf-bucket/f9829af39770d904dbb811bd8f4e886dd307f507.json"
69
- )
70
- assert rcf == expected_rcf
71
-
72
- def test_read_when_rcf_written_with_destination(self):
73
- """
74
- This test case tests the backward compatibility by successfully
75
- reading the previously written rcf.
76
- """
77
-
78
- source_locator = get_test_partition_locator("source")
79
- destination_locator = get_test_partition_locator("destination")
80
-
81
- expected_rcf = RoundCompletionInfo.of(
82
- high_watermark=122,
83
- compacted_delta_locator={},
84
- compacted_pyarrow_write_result={},
85
- sort_keys_bit_width=12,
86
- )
87
-
88
- rcf_url = write_round_completion_file(
89
- RCF_BUCKET_NAME, source_locator, destination_locator, expected_rcf
90
- )
91
-
92
- rcf = read_round_completion_file(
93
- RCF_BUCKET_NAME, source_locator, destination_locator
94
- )
95
-
96
- assert (
97
- rcf_url
98
- == "s3://rcf-bucket/f9829af39770d904dbb811bd8f4e886dd307f507/e9939deadc091b3289a2eb0ca56b1ba86b9892f4.json"
99
- )
100
- assert rcf == expected_rcf
101
-
102
- def test_read_without_destination_when_rcf_written_with_destination(self):
103
- """
104
- This test case tests the backward compatibility by successfully
105
- reading the previously written rcf.
106
- """
107
-
108
- source_locator = get_test_partition_locator("source")
109
- destination_locator = get_test_partition_locator("destination")
110
-
111
- expected_rcf = RoundCompletionInfo.of(
112
- high_watermark=122,
113
- compacted_delta_locator={},
114
- compacted_pyarrow_write_result={},
115
- sort_keys_bit_width=12,
116
- )
117
-
118
- write_round_completion_file(
119
- RCF_BUCKET_NAME, source_locator, destination_locator, expected_rcf
120
- )
121
-
122
- rcf = read_round_completion_file(RCF_BUCKET_NAME, source_locator, None)
123
-
124
- assert rcf is None
125
-
126
- def test_read_without_destination_when_rcf_written_without_destination(self):
127
- """
128
- This test case tests the backward compatibility by successfully
129
- reading the previously written rcf.
130
- """
131
-
132
- source_locator = get_test_partition_locator("source")
133
-
134
- expected_rcf = RoundCompletionInfo.of(
135
- high_watermark=122,
136
- compacted_delta_locator={},
137
- compacted_pyarrow_write_result={},
138
- sort_keys_bit_width=12,
139
- )
140
-
141
- write_round_completion_file(RCF_BUCKET_NAME, source_locator, None, expected_rcf)
142
-
143
- rcf = read_round_completion_file(RCF_BUCKET_NAME, source_locator, None)
144
-
145
- assert rcf == expected_rcf
146
-
147
- def test_read_when_rcf_written_both_with_and_without_destination(self):
148
- """
149
- This test case tests the backward compatibility by successfully
150
- reading the previously written rcf.
151
- """
152
-
153
- source_locator = get_test_partition_locator("source")
154
- destination_locator = get_test_partition_locator("destination")
155
-
156
- expected_rcf = RoundCompletionInfo.of(
157
- high_watermark=122,
158
- compacted_delta_locator={},
159
- compacted_pyarrow_write_result={},
160
- sort_keys_bit_width=12,
161
- )
162
-
163
- expected_rcf_2 = RoundCompletionInfo.of(
164
- high_watermark=1223,
165
- compacted_delta_locator={},
166
- compacted_pyarrow_write_result={},
167
- sort_keys_bit_width=1233,
168
- )
169
-
170
- write_round_completion_file(RCF_BUCKET_NAME, source_locator, None, expected_rcf)
171
-
172
- write_round_completion_file(
173
- RCF_BUCKET_NAME, source_locator, destination_locator, expected_rcf_2
174
- )
175
-
176
- rcf = read_round_completion_file(
177
- RCF_BUCKET_NAME, source_locator, destination_locator
178
- )
179
-
180
- assert rcf == expected_rcf_2
181
-
182
- def test_read_when_none_destination_partition_id(self):
183
-
184
- source_locator = get_test_partition_locator("source")
185
- destination_locator = get_test_partition_locator(None)
186
-
187
- expected_rcf = RoundCompletionInfo.of(
188
- high_watermark=122,
189
- compacted_delta_locator={},
190
- compacted_pyarrow_write_result={},
191
- sort_keys_bit_width=12,
192
- )
193
-
194
- write_round_completion_file(
195
- RCF_BUCKET_NAME, source_locator, destination_locator, expected_rcf
196
- )
197
-
198
- rcf = read_round_completion_file(
199
- RCF_BUCKET_NAME, source_locator, destination_locator
200
- )
201
-
202
- assert rcf == expected_rcf
203
-
204
- def test_write_when_custom_url_is_passed(self):
205
- """
206
- This test case tests the backward compatibility by successfully
207
- reading the previously written rcf.
208
- """
209
-
210
- source_locator = get_test_partition_locator("source")
211
-
212
- expected_rcf = RoundCompletionInfo.of(
213
- high_watermark=122,
214
- compacted_delta_locator={},
215
- compacted_pyarrow_write_result={},
216
- sort_keys_bit_width=12,
217
- )
218
-
219
- completion_file_s3_url = f"s3://{RCF_BUCKET_NAME}/test.json"
220
- rcf_url = write_round_completion_file(
221
- RCF_BUCKET_NAME,
222
- source_locator,
223
- None,
224
- expected_rcf,
225
- completion_file_s3_url=completion_file_s3_url,
226
- )
227
-
228
- rcf = read_round_completion_file(RCF_BUCKET_NAME, source_locator, None)
229
-
230
- assert rcf_url == completion_file_s3_url
231
- assert rcf is None
@@ -1,388 +0,0 @@
1
- # Allow classes to use self-referencing Type hints in Python 3.7.
2
- from __future__ import annotations
3
- from typing import Any, Dict, List, Optional, Tuple
4
- import pyarrow as pa
5
-
6
- from deltacat.tests.compute.test_util_common import (
7
- PartitionKey,
8
- )
9
-
10
- from deltacat.storage import (
11
- Delta,
12
- DeltaType,
13
- Partition,
14
- PartitionLocator,
15
- Stream,
16
- )
17
- from deltacat.tests.compute.test_util_common import (
18
- create_src_table,
19
- create_destination_table,
20
- create_rebase_table,
21
- )
22
- import logging
23
- from deltacat import logs
24
-
25
- logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
26
-
27
-
28
- def _add_deltas_to_partition(
29
- deltas_ingredients: List[Tuple[pa.Table, DeltaType, Optional[Dict[str, str]]]],
30
- partition: Optional[Partition],
31
- ds_mock_kwargs: Optional[Dict[str, Any]],
32
- ) -> List[Optional[Delta], int]:
33
- import deltacat.tests.local_deltacat_storage as ds
34
-
35
- all_deltas_length = 0
36
- for (delta_data, delta_type, delete_parameters) in deltas_ingredients:
37
- staged_delta: Delta = ds.stage_delta(
38
- delta_data,
39
- partition,
40
- delta_type,
41
- entry_params=delete_parameters,
42
- **ds_mock_kwargs,
43
- )
44
- incremental_delta = ds.commit_delta(
45
- staged_delta,
46
- **ds_mock_kwargs,
47
- )
48
- all_deltas_length += len(delta_data) if delta_data else 0
49
- return incremental_delta, all_deltas_length
50
-
51
-
52
- def add_late_deltas_to_partition(
53
- late_deltas: List[Tuple[pa.Table, DeltaType, Optional[Dict[str, str]]]],
54
- source_partition: Optional[Partition],
55
- ds_mock_kwargs: Optional[Dict[str, Any]],
56
- ) -> List[Optional[Delta], int]:
57
- return _add_deltas_to_partition(late_deltas, source_partition, ds_mock_kwargs)
58
-
59
-
60
- def create_incremental_deltas_on_source_table(
61
- source_namespace: str,
62
- source_table_name: str,
63
- source_table_version: str,
64
- source_table_stream: Stream,
65
- partition_values_param,
66
- incremental_deltas: List[Tuple[pa.Table, DeltaType, Optional[Dict[str, str]]]],
67
- ds_mock_kwargs: Optional[Dict[str, Any]] = None,
68
- ) -> Tuple[PartitionLocator, Delta, int, bool]:
69
- import deltacat.tests.local_deltacat_storage as ds
70
-
71
- incremental_delta_length = 0
72
- is_delete = False
73
- src_partition: Partition = ds.get_partition(
74
- source_table_stream.locator,
75
- partition_values_param,
76
- **ds_mock_kwargs,
77
- )
78
- for (
79
- incremental_data,
80
- incremental_delta_type,
81
- incremental_delete_parameters,
82
- ) in incremental_deltas:
83
- if incremental_delta_type is DeltaType.DELETE:
84
- is_delete = True
85
- incremental_delta: Delta = ds.commit_delta(
86
- ds.stage_delta(
87
- incremental_data,
88
- src_partition,
89
- incremental_delta_type,
90
- entry_params=incremental_delete_parameters,
91
- **ds_mock_kwargs,
92
- ),
93
- **ds_mock_kwargs,
94
- )
95
- incremental_delta_length += len(incremental_data) if incremental_data else 0
96
- src_table_stream_after_committed_delta: Stream = ds.get_stream(
97
- source_namespace,
98
- source_table_name,
99
- source_table_version,
100
- **ds_mock_kwargs,
101
- )
102
- src_partition_after_committed_delta: Partition = ds.get_partition(
103
- src_table_stream_after_committed_delta.locator,
104
- partition_values_param,
105
- **ds_mock_kwargs,
106
- )
107
- return (
108
- src_partition_after_committed_delta.locator,
109
- incremental_delta,
110
- incremental_delta_length,
111
- is_delete,
112
- )
113
-
114
-
115
def create_src_w_deltas_destination_plus_destination(
    sort_keys: Optional[List[Any]],
    partition_keys: Optional[List[PartitionKey]],
    input_deltas: pa.Table,
    input_delta_type: DeltaType,
    partition_values: Optional[List[Any]],
    ds_mock_kwargs: Optional[Dict[str, Any]],
    simulate_is_inplace: bool = False,
) -> Tuple[Stream, Stream, Optional[Stream], str, str, str]:
    """Create a source table with one committed delta and resolve a destination stream.

    A source table is created, a partition is staged on its stream, a single
    delta built from ``input_deltas`` is staged and committed into that
    partition, and the partition is committed.

    Args:
        sort_keys: Forwarded to the table-creation helpers.
        partition_keys: Forwarded to the table-creation helpers.
        input_deltas: Data staged as the single source delta.
        input_delta_type: Delta type used when staging the source delta.
        partition_values: Partition values used when staging the partition.
        ds_mock_kwargs: Keyword arguments forwarded to every storage call.
            It is unpacked with ``**``, so it must be a mapping at call time.
        simulate_is_inplace: When truthy, no destination table is created and
            the destination stream is looked up with the source table's
            namespace, name and version.

    Returns:
        A 6-tuple of the source stream (re-read after the commit), the
        destination stream, ``None`` (placeholder third slot), the source
        namespace, the source table name and the source table version.
    """
    import deltacat.tests.local_deltacat_storage as ds

    def _lookup_stream(namespace, table_name, table_version) -> Stream:
        # Single place for the keyword-style stream lookup used three times below.
        return ds.get_stream(
            namespace=namespace,
            table_name=table_name,
            table_version=table_version,
            **ds_mock_kwargs,
        )

    source_namespace, source_table_name, source_table_version = create_src_table(
        sort_keys, partition_keys, ds_mock_kwargs
    )
    src_coordinates = (source_namespace, source_table_name, source_table_version)

    # Stage a partition on the source stream, then commit one delta into it.
    src_partition: Partition = ds.stage_partition(
        _lookup_stream(*src_coordinates), partition_values, **ds_mock_kwargs
    )
    pending_delta = ds.stage_delta(
        input_deltas, src_partition, input_delta_type, **ds_mock_kwargs
    )
    ds.commit_delta(pending_delta, **ds_mock_kwargs)
    ds.commit_partition(src_partition, **ds_mock_kwargs)

    # Re-read the source stream now that the partition has been committed.
    committed_src_stream: Stream = _lookup_stream(*src_coordinates)

    if simulate_is_inplace:
        # not creating a table as in-place
        dest_coordinates = src_coordinates
    else:
        dest_coordinates = create_destination_table(
            sort_keys, partition_keys, ds_mock_kwargs
        )
    dest_namespace, dest_table_name, dest_table_version = dest_coordinates

    destination_stream: Stream = _lookup_stream(
        dest_namespace, dest_table_name, dest_table_version
    )
    return (
        committed_src_stream,
        destination_stream,
        None,
        source_namespace,
        source_table_name,
        source_table_version,
    )
181
-
182
-
183
def create_src_w_deltas_destination_rebase_w_deltas_strategy(
    sort_keys: Optional[List[Any]],
    partition_keys: Optional[List[PartitionKey]],
    input_deltas: pa.Table,
    input_delta_type: DeltaType,
    partition_values: Optional[List[Any]],
    ds_mock_kwargs: Optional[Dict[str, Any]],
) -> Tuple[Stream, Stream, Optional[Stream]]:
    """Create source, destination and rebase tables; commit one delta to source and rebase.

    The same ``input_deltas`` payload is staged and committed once into a
    partition of the source table and once into a partition of the rebase
    table. Both staged deltas have their locator's ``stream_position``
    overwritten with one timestamp captured at the start of this function.
    The destination table is created but receives no deltas.

    Args:
        sort_keys: Forwarded to the table-creation helpers.
        partition_keys: Forwarded to the table-creation helpers.
        input_deltas: Data staged as the delta for both source and rebase.
        input_delta_type: Delta type used when staging the source delta.
        partition_values: Partition values used when staging both partitions.
        ds_mock_kwargs: Keyword arguments forwarded to every storage call.
            It is unpacked with ``**``, so it must be a mapping at call time.

    Returns:
        A 3-tuple of the source stream (re-read after its commit), the
        destination stream, and the rebase stream (re-read after its commit).
    """
    import deltacat.tests.local_deltacat_storage as ds
    from deltacat.storage import Delta
    from deltacat.utils.common import current_time_ms

    def _lookup_stream(namespace, table_name, table_version) -> Stream:
        # Single place for the keyword-style stream lookup used repeatedly below.
        return ds.get_stream(
            namespace=namespace,
            table_name=table_name,
            table_version=table_version,
            **ds_mock_kwargs,
        )

    # Captured before any table exists; both committed deltas are pinned to it.
    pinned_stream_position = current_time_ms()

    src_coordinates = create_src_table(sort_keys, partition_keys, ds_mock_kwargs)
    source_namespace, source_table_name, source_table_version = src_coordinates

    # --- source table: stage partition, stage + commit one delta ---
    src_partition: Partition = ds.stage_partition(
        _lookup_stream(source_namespace, source_table_name, source_table_version),
        partition_values,
        **ds_mock_kwargs,
    )
    src_delta: Delta = ds.stage_delta(
        input_deltas, src_partition, input_delta_type, **ds_mock_kwargs
    )
    src_delta.locator.stream_position = pinned_stream_position
    ds.commit_delta(src_delta, **ds_mock_kwargs)
    ds.commit_partition(src_partition, **ds_mock_kwargs)
    committed_src_stream: Stream = _lookup_stream(
        source_namespace, source_table_name, source_table_version
    )

    # create the destination table
    dest_coordinates = create_destination_table(
        sort_keys, partition_keys, ds_mock_kwargs
    )
    dest_namespace, dest_table_name, dest_table_version = dest_coordinates

    # create the rebase table
    rebase_coordinates = create_rebase_table(sort_keys, partition_keys, ds_mock_kwargs)
    rebase_namespace, rebase_table_name, rebase_table_version = rebase_coordinates

    # --- rebase table: stage partition, stage + commit one delta ---
    rebase_partition: Partition = ds.stage_partition(
        _lookup_stream(rebase_namespace, rebase_table_name, rebase_table_version),
        partition_values,
        **ds_mock_kwargs,
    )
    # NOTE(review): unlike the source delta above, no delta type is passed
    # here, so the storage layer's default applies -- confirm this is intended.
    rebase_delta: Delta = ds.stage_delta(
        input_deltas, rebase_partition, **ds_mock_kwargs
    )
    rebase_delta.locator.stream_position = pinned_stream_position
    ds.commit_delta(rebase_delta, **ds_mock_kwargs)
    ds.commit_partition(rebase_partition, **ds_mock_kwargs)

    # get streams
    # TODO: Add deltas to destination stream
    destination_stream: Stream = _lookup_stream(
        dest_namespace, dest_table_name, dest_table_version
    )
    committed_rebase_stream: Stream = _lookup_stream(
        rebase_namespace, rebase_table_name, rebase_table_version
    )
    return (
        committed_src_stream,
        destination_stream,
        committed_rebase_stream,
    )
274
-
275
-
276
def multiple_rounds_create_src_w_deltas_destination_rebase_w_deltas_strategy(
    sort_keys: Optional[List[Any]],
    partition_keys: Optional[List[PartitionKey]],
    input_deltas: List[Tuple[pa.Table, DeltaType, Any]],
    partition_values: Optional[List[Any]],
    ds_mock_kwargs: Optional[Dict[str, Any]],
) -> Tuple[Stream, Stream, Optional[Stream], bool]:
    """Create source, destination and rebase tables; commit several deltas to source and rebase.

    Every entry of ``input_deltas`` is staged and committed, in order, first
    into a partition of the source table and then into a partition of the
    rebase table. The destination table is created but receives no deltas.

    Args:
        sort_keys: Forwarded to the table-creation helpers.
        partition_keys: Forwarded to the table-creation helpers.
        input_deltas: Sequence of ``(data, delta_type, entry_params)``
            triples. ``entry_params`` is passed to ``stage_delta`` as its
            ``entry_params`` keyword argument.
        partition_values: Partition values used when staging both partitions.
        ds_mock_kwargs: Keyword arguments forwarded to every storage call.
            It is unpacked with ``**``, so it must be a mapping at call time.

    Returns:
        A 4-tuple of the source stream (re-read after its commit), the
        destination stream, the rebase stream (re-read after its commit), and
        a flag that is ``True`` when any committed delta had type
        ``DeltaType.DELETE``.
    """
    import deltacat.tests.local_deltacat_storage as ds
    from deltacat.storage import Partition, Stream

    def _lookup_stream(namespace, table_name, table_version) -> Stream:
        # Single place for the keyword-style stream lookup used repeatedly below.
        return ds.get_stream(
            namespace=namespace,
            table_name=table_name,
            table_version=table_version,
            **ds_mock_kwargs,
        )

    def _commit_all_deltas(target_partition: Partition) -> bool:
        # Stage and commit every input delta into `target_partition`, then
        # commit the partition. Returns True if a DELETE delta was among them.
        saw_delete = False
        for delta_data, delta_type, delta_entry_params in input_deltas:
            if delta_type is DeltaType.DELETE:
                saw_delete = True
            staged_delta = ds.stage_delta(
                delta_data,
                target_partition,
                delta_type,
                entry_params=delta_entry_params,
                **ds_mock_kwargs,
            )
            ds.commit_delta(staged_delta, **ds_mock_kwargs)
        ds.commit_partition(target_partition, **ds_mock_kwargs)
        return saw_delete

    source_namespace, source_table_name, source_table_version = create_src_table(
        sort_keys, partition_keys, ds_mock_kwargs
    )

    # --- source table ---
    src_partition: Partition = ds.stage_partition(
        _lookup_stream(source_namespace, source_table_name, source_table_version),
        partition_values,
        **ds_mock_kwargs,
    )
    src_has_delete = _commit_all_deltas(src_partition)
    source_table_stream_after_committed: Stream = _lookup_stream(
        source_namespace, source_table_name, source_table_version
    )

    # create the destination table
    (
        destination_table_namespace,
        destination_table_name,
        destination_table_version,
    ) = create_destination_table(sort_keys, partition_keys, ds_mock_kwargs)

    # create the rebase table
    (
        rebase_table_namespace,
        rebase_table_name,
        rebase_table_version,
    ) = create_rebase_table(sort_keys, partition_keys, ds_mock_kwargs)

    # --- rebase table ---
    rebase_partition: Partition = ds.stage_partition(
        _lookup_stream(rebase_table_namespace, rebase_table_name, rebase_table_version),
        partition_values,
        **ds_mock_kwargs,
    )
    rebase_has_delete = _commit_all_deltas(rebase_partition)

    # get streams
    destination_table_stream: Stream = _lookup_stream(
        destination_table_namespace,
        destination_table_name,
        destination_table_version,
    )
    rebased_stream_after_committed: Stream = _lookup_stream(
        rebase_table_namespace, rebase_table_name, rebase_table_version
    )

    # A DELETE seen while loading either table marks the whole setup.
    is_delete = src_has_delete or rebase_has_delete
    return (
        source_table_stream_after_committed,
        destination_table_stream,
        rebased_stream_after_committed,
        is_delete,
    )