deltacat 1.1.35__py3-none-any.whl → 2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (235) hide show
  1. deltacat/__init__.py +42 -3
  2. deltacat/annotations.py +36 -0
  3. deltacat/api.py +168 -0
  4. deltacat/aws/s3u.py +4 -4
  5. deltacat/benchmarking/benchmark_engine.py +82 -0
  6. deltacat/benchmarking/benchmark_report.py +86 -0
  7. deltacat/benchmarking/benchmark_suite.py +11 -0
  8. deltacat/benchmarking/conftest.py +21 -0
  9. deltacat/benchmarking/data/random_row_generator.py +94 -0
  10. deltacat/benchmarking/data/row_generator.py +10 -0
  11. deltacat/benchmarking/test_benchmark_pipeline.py +106 -0
  12. deltacat/catalog/__init__.py +14 -0
  13. deltacat/catalog/delegate.py +199 -106
  14. deltacat/catalog/iceberg/__init__.py +4 -0
  15. deltacat/catalog/iceberg/iceberg_catalog_config.py +26 -0
  16. deltacat/catalog/iceberg/impl.py +368 -0
  17. deltacat/catalog/iceberg/overrides.py +74 -0
  18. deltacat/catalog/interface.py +273 -76
  19. deltacat/catalog/main/impl.py +720 -0
  20. deltacat/catalog/model/catalog.py +227 -20
  21. deltacat/catalog/model/properties.py +116 -0
  22. deltacat/catalog/model/table_definition.py +32 -1
  23. deltacat/compute/compactor/model/compaction_session_audit_info.py +7 -3
  24. deltacat/compute/compactor/model/delta_annotated.py +3 -3
  25. deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
  26. deltacat/compute/compactor/model/delta_file_locator.py +3 -1
  27. deltacat/compute/compactor/model/round_completion_info.py +5 -5
  28. deltacat/compute/compactor/model/table_object_store.py +3 -2
  29. deltacat/compute/compactor/repartition_session.py +1 -1
  30. deltacat/compute/compactor/steps/dedupe.py +11 -4
  31. deltacat/compute/compactor/steps/hash_bucket.py +1 -1
  32. deltacat/compute/compactor/steps/materialize.py +6 -2
  33. deltacat/compute/compactor/utils/io.py +1 -1
  34. deltacat/compute/compactor/utils/sort_key.py +9 -2
  35. deltacat/compute/compactor_v2/compaction_session.py +2 -3
  36. deltacat/compute/compactor_v2/constants.py +1 -30
  37. deltacat/compute/compactor_v2/deletes/utils.py +3 -3
  38. deltacat/compute/compactor_v2/model/merge_input.py +1 -1
  39. deltacat/compute/compactor_v2/private/compaction_utils.py +5 -5
  40. deltacat/compute/compactor_v2/steps/merge.py +11 -80
  41. deltacat/compute/compactor_v2/utils/content_type_params.py +0 -17
  42. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  43. deltacat/compute/compactor_v2/utils/io.py +1 -1
  44. deltacat/compute/compactor_v2/utils/primary_key_index.py +3 -15
  45. deltacat/compute/compactor_v2/utils/task_options.py +23 -43
  46. deltacat/compute/converter/constants.py +4 -0
  47. deltacat/compute/converter/converter_session.py +143 -0
  48. deltacat/compute/converter/model/convert_input.py +69 -0
  49. deltacat/compute/converter/model/convert_input_files.py +61 -0
  50. deltacat/compute/converter/model/converter_session_params.py +99 -0
  51. deltacat/compute/converter/pyiceberg/__init__.py +0 -0
  52. deltacat/compute/converter/pyiceberg/catalog.py +75 -0
  53. deltacat/compute/converter/pyiceberg/overrides.py +135 -0
  54. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +251 -0
  55. deltacat/compute/converter/steps/__init__.py +0 -0
  56. deltacat/compute/converter/steps/convert.py +211 -0
  57. deltacat/compute/converter/steps/dedupe.py +60 -0
  58. deltacat/compute/converter/utils/__init__.py +0 -0
  59. deltacat/compute/converter/utils/convert_task_options.py +88 -0
  60. deltacat/compute/converter/utils/converter_session_utils.py +109 -0
  61. deltacat/compute/converter/utils/iceberg_columns.py +82 -0
  62. deltacat/compute/converter/utils/io.py +43 -0
  63. deltacat/compute/converter/utils/s3u.py +133 -0
  64. deltacat/compute/resource_estimation/delta.py +1 -19
  65. deltacat/constants.py +47 -1
  66. deltacat/env.py +51 -0
  67. deltacat/examples/__init__.py +0 -0
  68. deltacat/examples/basic_logging.py +101 -0
  69. deltacat/examples/common/__init__.py +0 -0
  70. deltacat/examples/common/fixtures.py +15 -0
  71. deltacat/examples/hello_world.py +27 -0
  72. deltacat/examples/iceberg/__init__.py +0 -0
  73. deltacat/examples/iceberg/iceberg_bucket_writer.py +139 -0
  74. deltacat/examples/iceberg/iceberg_reader.py +149 -0
  75. deltacat/exceptions.py +51 -9
  76. deltacat/logs.py +4 -1
  77. deltacat/storage/__init__.py +118 -28
  78. deltacat/storage/iceberg/__init__.py +0 -0
  79. deltacat/storage/iceberg/iceberg_scan_planner.py +28 -0
  80. deltacat/storage/iceberg/impl.py +737 -0
  81. deltacat/storage/iceberg/model.py +709 -0
  82. deltacat/storage/interface.py +217 -134
  83. deltacat/storage/main/__init__.py +0 -0
  84. deltacat/storage/main/impl.py +2077 -0
  85. deltacat/storage/model/delta.py +118 -71
  86. deltacat/storage/model/interop.py +24 -0
  87. deltacat/storage/model/list_result.py +8 -0
  88. deltacat/storage/model/locator.py +93 -3
  89. deltacat/{aws/redshift → storage}/model/manifest.py +122 -98
  90. deltacat/storage/model/metafile.py +1316 -0
  91. deltacat/storage/model/namespace.py +34 -18
  92. deltacat/storage/model/partition.py +362 -37
  93. deltacat/storage/model/scan/__init__.py +0 -0
  94. deltacat/storage/model/scan/push_down.py +19 -0
  95. deltacat/storage/model/scan/scan_plan.py +10 -0
  96. deltacat/storage/model/scan/scan_task.py +34 -0
  97. deltacat/storage/model/schema.py +892 -0
  98. deltacat/storage/model/shard.py +47 -0
  99. deltacat/storage/model/sort_key.py +170 -13
  100. deltacat/storage/model/stream.py +208 -80
  101. deltacat/storage/model/table.py +123 -29
  102. deltacat/storage/model/table_version.py +322 -46
  103. deltacat/storage/model/transaction.py +757 -0
  104. deltacat/storage/model/transform.py +198 -61
  105. deltacat/storage/model/types.py +111 -13
  106. deltacat/storage/rivulet/__init__.py +11 -0
  107. deltacat/storage/rivulet/arrow/__init__.py +0 -0
  108. deltacat/storage/rivulet/arrow/serializer.py +75 -0
  109. deltacat/storage/rivulet/dataset.py +744 -0
  110. deltacat/storage/rivulet/dataset_executor.py +87 -0
  111. deltacat/storage/rivulet/feather/__init__.py +5 -0
  112. deltacat/storage/rivulet/feather/file_reader.py +136 -0
  113. deltacat/storage/rivulet/feather/serializer.py +35 -0
  114. deltacat/storage/rivulet/fs/__init__.py +0 -0
  115. deltacat/storage/rivulet/fs/file_provider.py +105 -0
  116. deltacat/storage/rivulet/fs/file_store.py +130 -0
  117. deltacat/storage/rivulet/fs/input_file.py +76 -0
  118. deltacat/storage/rivulet/fs/output_file.py +86 -0
  119. deltacat/storage/rivulet/logical_plan.py +105 -0
  120. deltacat/storage/rivulet/metastore/__init__.py +0 -0
  121. deltacat/storage/rivulet/metastore/delta.py +190 -0
  122. deltacat/storage/rivulet/metastore/json_sst.py +105 -0
  123. deltacat/storage/rivulet/metastore/sst.py +82 -0
  124. deltacat/storage/rivulet/metastore/sst_interval_tree.py +260 -0
  125. deltacat/storage/rivulet/mvp/Table.py +101 -0
  126. deltacat/storage/rivulet/mvp/__init__.py +5 -0
  127. deltacat/storage/rivulet/parquet/__init__.py +5 -0
  128. deltacat/storage/rivulet/parquet/data_reader.py +0 -0
  129. deltacat/storage/rivulet/parquet/file_reader.py +127 -0
  130. deltacat/storage/rivulet/parquet/serializer.py +37 -0
  131. deltacat/storage/rivulet/reader/__init__.py +0 -0
  132. deltacat/storage/rivulet/reader/block_scanner.py +378 -0
  133. deltacat/storage/rivulet/reader/data_reader.py +136 -0
  134. deltacat/storage/rivulet/reader/data_scan.py +63 -0
  135. deltacat/storage/rivulet/reader/dataset_metastore.py +178 -0
  136. deltacat/storage/rivulet/reader/dataset_reader.py +156 -0
  137. deltacat/storage/rivulet/reader/pyarrow_data_reader.py +121 -0
  138. deltacat/storage/rivulet/reader/query_expression.py +99 -0
  139. deltacat/storage/rivulet/reader/reader_type_registrar.py +84 -0
  140. deltacat/storage/rivulet/schema/__init__.py +0 -0
  141. deltacat/storage/rivulet/schema/datatype.py +128 -0
  142. deltacat/storage/rivulet/schema/schema.py +251 -0
  143. deltacat/storage/rivulet/serializer.py +40 -0
  144. deltacat/storage/rivulet/serializer_factory.py +42 -0
  145. deltacat/storage/rivulet/writer/__init__.py +0 -0
  146. deltacat/storage/rivulet/writer/dataset_writer.py +29 -0
  147. deltacat/storage/rivulet/writer/memtable_dataset_writer.py +294 -0
  148. deltacat/tests/_io/__init__.py +1 -0
  149. deltacat/tests/catalog/test_catalogs.py +324 -0
  150. deltacat/tests/catalog/test_default_catalog_impl.py +16 -8
  151. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
  152. deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
  153. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
  154. deltacat/tests/compute/compact_partition_test_cases.py +19 -53
  155. deltacat/tests/compute/compactor/steps/test_repartition.py +2 -2
  156. deltacat/tests/compute/compactor/utils/test_io.py +6 -8
  157. deltacat/tests/compute/compactor_v2/test_compaction_session.py +0 -466
  158. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +1 -273
  159. deltacat/tests/compute/conftest.py +75 -0
  160. deltacat/tests/compute/converter/__init__.py +0 -0
  161. deltacat/tests/compute/converter/conftest.py +80 -0
  162. deltacat/tests/compute/converter/test_convert_session.py +478 -0
  163. deltacat/tests/compute/converter/utils.py +123 -0
  164. deltacat/tests/compute/resource_estimation/test_delta.py +0 -16
  165. deltacat/tests/compute/test_compact_partition_incremental.py +2 -42
  166. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +5 -46
  167. deltacat/tests/compute/test_compact_partition_params.py +3 -3
  168. deltacat/tests/compute/test_compact_partition_rebase.py +1 -46
  169. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +5 -46
  170. deltacat/tests/compute/test_util_common.py +19 -12
  171. deltacat/tests/compute/test_util_create_table_deltas_repo.py +13 -22
  172. deltacat/tests/local_deltacat_storage/__init__.py +76 -103
  173. deltacat/tests/storage/__init__.py +0 -0
  174. deltacat/tests/storage/conftest.py +25 -0
  175. deltacat/tests/storage/main/__init__.py +0 -0
  176. deltacat/tests/storage/main/test_main_storage.py +1399 -0
  177. deltacat/tests/storage/model/__init__.py +0 -0
  178. deltacat/tests/storage/model/test_delete_parameters.py +21 -0
  179. deltacat/tests/storage/model/test_metafile_io.py +2535 -0
  180. deltacat/tests/storage/model/test_schema.py +308 -0
  181. deltacat/tests/storage/model/test_shard.py +22 -0
  182. deltacat/tests/storage/model/test_table_version.py +110 -0
  183. deltacat/tests/storage/model/test_transaction.py +308 -0
  184. deltacat/tests/storage/rivulet/__init__.py +0 -0
  185. deltacat/tests/storage/rivulet/conftest.py +149 -0
  186. deltacat/tests/storage/rivulet/fs/__init__.py +0 -0
  187. deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +93 -0
  188. deltacat/tests/storage/rivulet/schema/__init__.py +0 -0
  189. deltacat/tests/storage/rivulet/schema/test_schema.py +241 -0
  190. deltacat/tests/storage/rivulet/test_dataset.py +406 -0
  191. deltacat/tests/storage/rivulet/test_manifest.py +67 -0
  192. deltacat/tests/storage/rivulet/test_sst_interval_tree.py +232 -0
  193. deltacat/tests/storage/rivulet/test_utils.py +122 -0
  194. deltacat/tests/storage/rivulet/writer/__init__.py +0 -0
  195. deltacat/tests/storage/rivulet/writer/test_dataset_write_then_read.py +341 -0
  196. deltacat/tests/storage/rivulet/writer/test_dataset_writer.py +79 -0
  197. deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
  198. deltacat/tests/test_deltacat_api.py +39 -0
  199. deltacat/tests/test_utils/filesystem.py +14 -0
  200. deltacat/tests/test_utils/message_pack_utils.py +54 -0
  201. deltacat/tests/test_utils/pyarrow.py +8 -15
  202. deltacat/tests/test_utils/storage.py +266 -3
  203. deltacat/tests/utils/test_daft.py +3 -3
  204. deltacat/tests/utils/test_pyarrow.py +0 -432
  205. deltacat/types/partial_download.py +1 -1
  206. deltacat/types/tables.py +1 -1
  207. deltacat/utils/export.py +59 -0
  208. deltacat/utils/filesystem.py +320 -0
  209. deltacat/utils/metafile_locator.py +73 -0
  210. deltacat/utils/pyarrow.py +36 -183
  211. deltacat-2.0.dist-info/METADATA +65 -0
  212. deltacat-2.0.dist-info/RECORD +347 -0
  213. deltacat/aws/redshift/__init__.py +0 -19
  214. deltacat/catalog/default_catalog_impl/__init__.py +0 -369
  215. deltacat/io/dataset.py +0 -73
  216. deltacat/io/read_api.py +0 -143
  217. deltacat/storage/model/delete_parameters.py +0 -40
  218. deltacat/storage/model/partition_spec.py +0 -71
  219. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +0 -253
  220. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +0 -45
  221. deltacat-1.1.35.dist-info/METADATA +0 -64
  222. deltacat-1.1.35.dist-info/RECORD +0 -219
  223. /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
  224. /deltacat/{io/aws → catalog/main}/__init__.py +0 -0
  225. /deltacat/{io/aws/redshift → compute/converter}/__init__.py +0 -0
  226. /deltacat/{tests/io → compute/converter/model}/__init__.py +0 -0
  227. /deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +0 -0
  228. /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
  229. /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
  230. /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
  231. /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
  232. /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
  233. {deltacat-1.1.35.dist-info → deltacat-2.0.dist-info}/LICENSE +0 -0
  234. {deltacat-1.1.35.dist-info → deltacat-2.0.dist-info}/WHEEL +0 -0
  235. {deltacat-1.1.35.dist-info → deltacat-2.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,308 @@
1
+ import pytest
2
+
3
+ from deltacat.storage import (
4
+ Transaction,
5
+ TransactionOperation,
6
+ TransactionType,
7
+ TransactionOperationType,
8
+ )
9
+ from deltacat.storage.model.metafile import (
10
+ Metafile,
11
+ )
12
+
13
+
14
class TestAbsToRelative:
    """
    Tests for relativizing absolute transaction paths against a catalog root.

    Two entry points on ``Transaction`` are exercised:

    * ``_abs_txn_meta_path_to_relative`` -- called directly with a root and a
      target path (both plain strings).
    * ``relativize_operation_paths`` -- called with a transaction operation
      and a root; the operation's ``metafile_write_paths`` and
      ``locator_write_paths`` are inspected afterwards.
    """

    @classmethod
    def setup_method(cls):
        # Stored on the class (not on an instance) so that tests can read it
        # back as ``TestAbsToRelative.catalog_root``.
        cls.catalog_root = "/catalog/root/path"

    @staticmethod
    def _make_operation(
        op_type, metafile=None, *, metafile_paths=None, locator_paths=None
    ):
        """
        Build a ``TransactionOperation`` of ``op_type`` for the tests below.

        A fresh dummy ``Metafile`` is used as the destination unless
        ``metafile`` is supplied. ``metafile_paths`` / ``locator_paths`` are
        assigned to the operation's write-path attributes only when given
        (an empty list counts as given).
        """
        if metafile is None:
            metafile = Metafile({"id": "dummy_metafile_id"})
        operation = TransactionOperation.of(
            operation_type=op_type,
            dest_metafile=metafile,
        )
        if metafile_paths is not None:
            operation.metafile_write_paths = metafile_paths
        if locator_paths is not None:
            operation.locator_write_paths = locator_paths
        return operation

    # ------------------------------------------------------------------
    # Transaction._abs_txn_meta_path_to_relative
    # ------------------------------------------------------------------
    def test_abs_to_relative_simple(self):
        """
        An absolute path below the catalog root comes back as the remainder
        of the path after the root, with no leading separator.
        """
        target = "/catalog/root/path/namespace/table/table_version/stream_id/partition_id/00000000000000000001.mpk"
        expected = "namespace/table/table_version/stream_id/partition_id/00000000000000000001.mpk"
        actual = Transaction._abs_txn_meta_path_to_relative(
            TestAbsToRelative.catalog_root, target
        )
        assert actual == expected

    def test_abs_to_relative_same_paths(self):
        """Passing the root itself as the target raises ``ValueError``."""
        root = TestAbsToRelative.catalog_root
        target = TestAbsToRelative.catalog_root
        expected_error = pytest.raises(
            ValueError,
            match="Target and root are identical, but expected target to be a child of root.",
        )
        with expected_error:
            Transaction._abs_txn_meta_path_to_relative(root, target)

    def test_abs_to_relative_root_with_trailing_slash(self):
        """A root written with a trailing slash yields the same relative path."""
        root = "/catalog/root/path/"
        target = "/catalog/root/path/namespace/table/table_version/stream_id/partition_id/00000000000000000001.mpk"
        expected = "namespace/table/table_version/stream_id/partition_id/00000000000000000001.mpk"
        actual = Transaction._abs_txn_meta_path_to_relative(root, target)
        assert actual == expected

    def test_abs_to_relative_bad_root(self):
        """A target that does not live under the root raises ``ValueError``."""
        root = TestAbsToRelative.catalog_root
        target = "/cat/rt/pth/namespace/table/table_version/stream_id/partition_id/00000000000000000001.mpk"
        with pytest.raises(ValueError, match="Expected target to be a child of root."):
            Transaction._abs_txn_meta_path_to_relative(root, target)

    def test_abs_to_relative_empty_path(self):
        """An empty root or an empty target raises ``ValueError``."""
        # Empty root, non-empty target.
        with pytest.raises(ValueError, match="Expected target to be a child of root."):
            Transaction._abs_txn_meta_path_to_relative("", "/lorem/ipsum")
        # Non-empty root, empty target.
        with pytest.raises(ValueError, match="Expected target to be a child of root."):
            Transaction._abs_txn_meta_path_to_relative("/lorem/ipsum/", "")

    # ------------------------------------------------------------------
    # Transaction.relativize_operation_paths
    # ------------------------------------------------------------------
    def test_relativize_metafile_write_paths(self):
        """Metafile write paths of a single operation are relativized."""
        root = "/catalog/root"
        expected = [
            "path/to/metafile1.mpk",
            "path/to/metafile2.mpk",
            "another/path/lore_ipsum.mpk",
            "another/path/meta/to/lorem_ipsum.mpk",
            "another/path/lorem_ipsum.mpk",
            "here.mpk",
        ]
        absolute = [
            "/catalog/root/path/to/metafile1.mpk",
            "/catalog/root/path/to/metafile2.mpk",
            "/catalog/root/another/path/lore_ipsum.mpk",
            "/catalog/root/another/path/meta/to/lorem_ipsum.mpk",
            "/catalog/root/another/path/lorem_ipsum.mpk",
            "/catalog/root/here.mpk",
        ]
        # Dummy CREATE operation carrying the absolute paths.
        operation = self._make_operation(
            TransactionOperationType.CREATE, metafile_paths=absolute
        )
        txn = Transaction.of(
            txn_type=TransactionType.APPEND, txn_operations=[operation]
        )
        txn.relativize_operation_paths(operation, root)
        assert operation.metafile_write_paths == expected

    def test_relativize_locator_write_paths(self):
        """Locator write paths of a single operation are relativized."""
        root = "/catalog/root"
        expected = [
            "path/to/loc1.mpk",
            "path/to/loc2.mpk",
            "another/path/lore_ipsum.mpk",
            "another/path/meta/to/lorem_ipsum.mpk",
            "another/path/lorem_ipsum.mpk",
            "here.mpk",
        ]
        absolute = [
            "/catalog/root/path/to/loc1.mpk",
            "/catalog/root/path/to/loc2.mpk",
            "/catalog/root/another/path/lore_ipsum.mpk",
            "/catalog/root/another/path/meta/to/lorem_ipsum.mpk",
            "/catalog/root/another/path/lorem_ipsum.mpk",
            "/catalog/root/here.mpk",
        ]
        # Dummy CREATE operation carrying the absolute paths.
        operation = self._make_operation(
            TransactionOperationType.CREATE, locator_paths=absolute
        )
        txn = Transaction.of(
            txn_type=TransactionType.APPEND, txn_operations=[operation]
        )
        txn.relativize_operation_paths(operation, root)
        assert operation.locator_write_paths == expected

    def test_relativize_metafile_and_locator_paths(self):
        """Both path lists of one operation are relativized in a single call."""
        catalog_root = "/meta_catalog/root_dir/a/b/c"
        meta_relative_paths = [
            "namespace/table/table_version/stream_id/partition_id/00000000000000000001.mpk",
            "namespace/table/table_version/stream_id/partition_id/00000000000000000002.mpk",
            "namespace/table/table_version/stream_id/partition_id/00000000000000000003.mpk",
        ]
        loc_relative_paths = [
            "d/table/table_version/stream_id/partition_id/00000000000000000001.mpk",
            "e/table/table_version/stream_id/partition_id/00000000000000000002.mpk",
            "f/table/table_version/stream_id/partition_id/00000000000000000003.mpk",
        ]
        meta_absolute_paths = [
            "/meta_catalog/root_dir/a/b/c/namespace/table/table_version/stream_id/partition_id/00000000000000000001.mpk",
            "/meta_catalog/root_dir/a/b/c/namespace/table/table_version/stream_id/partition_id/00000000000000000002.mpk",
            "/meta_catalog/root_dir/a/b/c/namespace/table/table_version/stream_id/partition_id/00000000000000000003.mpk",
        ]
        loc_absolute_paths = [
            "/meta_catalog/root_dir/a/b/c/d/table/table_version/stream_id/partition_id/00000000000000000001.mpk",
            "/meta_catalog/root_dir/a/b/c/e/table/table_version/stream_id/partition_id/00000000000000000002.mpk",
            "/meta_catalog/root_dir/a/b/c/f/table/table_version/stream_id/partition_id/00000000000000000003.mpk",
        ]
        # Dummy CREATE operation carrying both absolute path lists.
        transaction_operation = self._make_operation(
            TransactionOperationType.CREATE,
            metafile_paths=meta_absolute_paths,
            locator_paths=loc_absolute_paths,
        )
        transaction = Transaction.of(
            txn_type=TransactionType.APPEND, txn_operations=[transaction_operation]
        )
        transaction.relativize_operation_paths(transaction_operation, catalog_root)
        assert (
            transaction_operation.metafile_write_paths == meta_relative_paths
        ), f"Expected: {meta_relative_paths}, but got: {transaction_operation.metafile_write_paths}"
        assert (
            transaction_operation.locator_write_paths == loc_relative_paths
        ), f"Expected: {loc_relative_paths}, but got: {transaction_operation.locator_write_paths}"

    def test_multiple_operations_relativize_paths(self):
        """Eleven operations in one transaction are each relativized and checked."""
        root = "/catalog/root"
        meta_expected = [
            "path/to/metafile1.mpk",
            "path/to/metafile2.mpk",
            "another/path/lore_ipsum.mpk",
            "another/path/meta/to/lorem_ipsum.mpk",
            "another/path/lorem_ipsum.mpk",
            "here.mpk",
        ]
        loc_expected = [
            "path/to/loc1.mpk",
            "path/to/loc2.mpk",
            "another/path/lore_ipsum.mpk",
            "another/path/meta/to/lorem_ipsum.mpk",
            "another/path/lorem_ipsum.mpk",
            "here.mpk",
        ]
        meta_absolute = [
            "/catalog/root/path/to/metafile1.mpk",
            "/catalog/root/path/to/metafile2.mpk",
            "/catalog/root/another/path/lore_ipsum.mpk",
            "/catalog/root/another/path/meta/to/lorem_ipsum.mpk",
            "/catalog/root/another/path/lorem_ipsum.mpk",
            "/catalog/root/here.mpk",
        ]
        loc_absolute = [
            "/catalog/root/path/to/loc1.mpk",
            "/catalog/root/path/to/loc2.mpk",
            "/catalog/root/another/path/lore_ipsum.mpk",
            "/catalog/root/another/path/meta/to/lorem_ipsum.mpk",
            "/catalog/root/another/path/lorem_ipsum.mpk",
            "/catalog/root/here.mpk",
        ]
        # One dummy metafile instance is shared by every operation.
        shared_metafile = Metafile({"id": "dummy_metafile_id"})
        operations = []
        for _ in range(11):
            operations.append(
                self._make_operation(
                    TransactionOperationType.CREATE,
                    shared_metafile,
                    metafile_paths=meta_absolute,
                    locator_paths=loc_absolute,
                )
            )
        txn = Transaction.of(
            txn_type=TransactionType.APPEND, txn_operations=operations
        )
        for operation in operations:
            txn.relativize_operation_paths(operation, root)
        # Every operation must end up with the relativized lists.
        for operation in operations:
            assert operation.metafile_write_paths == meta_expected
            assert operation.locator_write_paths == loc_expected

    def test_empty_metafile_and_locator_write_paths(self):
        """Empty path lists are still empty lists after relativization."""
        root = "/catalog/root"
        operation = self._make_operation(
            TransactionOperationType.CREATE,
            metafile_paths=[],
            locator_paths=[],
        )
        txn = Transaction.of(
            txn_type=TransactionType.APPEND, txn_operations=[operation]
        )
        txn.relativize_operation_paths(operation, root)
        assert operation.metafile_write_paths == []
        assert operation.locator_write_paths == []

    def test_large_number_of_paths(self):
        """5000 metafile write paths on one operation are all relativized."""
        root = "/catalog/root"
        expected = [f"path/to/file{i}.mpk" for i in range(5000)]
        absolute = [f"/catalog/root/path/to/file{i}.mpk" for i in range(5000)]
        operation = self._make_operation(
            TransactionOperationType.CREATE, metafile_paths=absolute
        )
        txn = Transaction.of(
            txn_type=TransactionType.APPEND, txn_operations=[operation]
        )
        txn.relativize_operation_paths(operation, root)
        assert operation.metafile_write_paths == expected

    def test_large_number_of_paths_multi_ops(self):
        """
        Relativizes 1000 metafile write paths once per transaction type.

        For each transaction type, one operation is built per entry of
        ``operation_types``; the transaction is then created from, and the
        assertion run against, the operation built last.
        """
        catalog_root = "/catalog/root"
        expected_paths = [f"path/to/file{i}.mpk" for i in range(1000)]
        absolute_paths = [f"/catalog/root/path/to/file{i}.mpk" for i in range(1000)]

        # Operation types to build (UPDATE is intentionally left out here,
        # as in the original list).
        operation_types = [
            TransactionOperationType.CREATE,
            # TransactionOperationType.UPDATE,
            TransactionOperationType.DELETE,
            TransactionOperationType.READ_EXISTS,
            TransactionOperationType.READ_LATEST,
            TransactionOperationType.READ_CHILDREN,
            TransactionOperationType.READ_SIBLINGS,
        ]

        # Transaction types to test.
        txn_types = [
            TransactionType.APPEND,
            TransactionType.ALTER,
            TransactionType.DELETE,
            TransactionType.OVERWRITE,
            TransactionType.READ,
            TransactionType.RESTATE,
        ]

        for txn_type in txn_types:
            transaction_ops = []
            for op_type in operation_types:
                transaction_operation = self._make_operation(
                    op_type, metafile_paths=absolute_paths
                )
                transaction_ops.append(transaction_operation)
            # NOTE(review): ``transaction_ops`` is filled but never read; only
            # the last operation built above goes into the transaction and is
            # asserted on. Confirm whether every operation type was meant to
            # be verified, and which type combinations ``Transaction.of``
            # accepts, before widening this.
            transaction = Transaction.of(
                txn_type=txn_type, txn_operations=[transaction_operation]
            )
            transaction.relativize_operation_paths(transaction_operation, catalog_root)
            assert (
                transaction_operation.metafile_write_paths == expected_paths
            ), f"Failed for transaction type {txn_type} and operation type {op_type}"
File without changes
@@ -0,0 +1,149 @@
1
+ import io
2
+
3
+ import pytest
4
+ from faker import Faker
5
+
6
+ from deltacat.storage.rivulet.schema.datatype import Datatype
7
+ from deltacat.storage.rivulet.mvp.Table import MvpTable
8
+ from deltacat.storage.rivulet.schema.schema import Schema
9
+ import random
10
+ import string
11
+ from PIL import Image
12
+
13
+ FIXTURE_ROW_COUNT = 10000
14
+
15
+
16
@pytest.fixture
def ds1_dataset() -> MvpTable:
    """
    Build a table with ``FIXTURE_ROW_COUNT`` rows and columns ``id``,
    ``name`` and ``age``.

    ``id`` contains each integer from 1 to ``FIXTURE_ROW_COUNT`` exactly once,
    in shuffled order. ``name`` is a random string of 3-10 ASCII letters and
    ``age`` a random integer from 18 to 100 (inclusive).

    TODO: use Faker instead of int ranges
    """
    letters = string.ascii_uppercase + string.ascii_lowercase

    def _random_name():
        # Random length first, then that many letters drawn with replacement.
        return "".join(random.choices(letters, k=random.randint(3, 10)))

    # Unique ids 1..FIXTURE_ROW_COUNT in random order.
    shuffled_ids = list(range(1, FIXTURE_ROW_COUNT + 1))
    random.shuffle(shuffled_ids)

    names = [_random_name() for _ in range(FIXTURE_ROW_COUNT)]
    ages = [random.randint(18, 100) for _ in range(FIXTURE_ROW_COUNT)]

    columns = {
        "id": shuffled_ids,
        "name": names,
        "age": ages,
    }
    return MvpTable(columns)
45
+
46
+
47
@pytest.fixture
def ds1_schema():
    """
    Schema with fields ``id`` (int32), ``name`` (string) and ``age`` (int32).

    ``"id"`` is passed as the second positional argument to ``Schema``
    (presumably the merge key -- confirm against ``Schema``).
    """
    # NOTE(review): the fields are handed over as a set literal, so this code
    # does not fix their iteration order -- confirm Schema does not rely on it.
    field_specs = {
        ("id", Datatype.int32()),
        ("name", Datatype.string()),
        ("age", Datatype.int32()),
    }
    return Schema(field_specs, "id")
57
+
58
+
59
@pytest.fixture
def ds2_dataset():
    """
    Build a table with ``FIXTURE_ROW_COUNT`` rows and columns ``id``,
    ``address`` and ``zip``.

    ``id`` contains each integer from 1 to ``FIXTURE_ROW_COUNT`` exactly once,
    in shuffled order -- the same id range ``ds1_dataset`` uses, so the two
    tables can be joined on ``id``. ``address`` and ``zip`` come from Faker.
    """
    # Unique ids 1..FIXTURE_ROW_COUNT in random order.
    shuffled_ids = list(range(1, FIXTURE_ROW_COUNT + 1))
    random.shuffle(shuffled_ids)

    fake = Faker()
    addresses = [fake.address() for _ in range(FIXTURE_ROW_COUNT)]
    zip_codes = [fake.zipcode() for _ in range(FIXTURE_ROW_COUNT)]

    columns = {
        "id": shuffled_ids,
        "address": addresses,
        "zip": zip_codes,
    }
    return MvpTable(columns)
79
+
80
+
81
@pytest.fixture
def ds2_schema():
    """
    Schema with fields ``id`` (int32), ``address`` (string) and ``zip``
    (string).

    ``"id"`` is passed as the second positional argument to ``Schema``
    (presumably the merge key -- confirm against ``Schema``).
    """
    # NOTE(review): the fields are handed over as a set literal, so this code
    # does not fix their iteration order -- confirm Schema does not rely on it.
    field_specs = {
        ("id", Datatype.int32()),
        ("address", Datatype.string()),
        ("zip", Datatype.string()),
    }
    return Schema(field_specs, "id")
91
+
92
+
93
@pytest.fixture
def combined_schema(ds1_schema, ds2_schema):
    """
    Schema listing ``id``, ``address``, ``zip``, ``name`` and ``age``.

    The field list is written out literally here; the ``ds1_schema`` and
    ``ds2_schema`` arguments are requested as fixtures but not read in this
    body.
    """
    # NOTE(review): the fields are handed over as a set literal, so this code
    # does not fix their iteration order -- confirm Schema does not rely on it.
    field_specs = {
        ("id", Datatype.int32()),
        ("address", Datatype.string()),
        ("zip", Datatype.string()),
        ("name", Datatype.string()),
        ("age", Datatype.int32()),
    }
    return Schema(field_specs, "id")
105
+
106
+
107
@pytest.fixture
def dataset_images_with_label() -> (MvpTable, Schema):
    """
    Build a 1000-row table of ``id`` / ``image`` / ``label`` plus its schema.

    ``id`` contains each integer from 1 to 1000 exactly once, in shuffled
    order. A single 512x512 RGB image of one random colour is generated and
    its encoded bytes are reused for every row. ``label`` is a Faker name.

    Returns a ``(table, schema)`` tuple.
    """
    ROW_COUNT = 1000
    fake = Faker()
    # NOTE(review): the image field is declared as ``image("jpg")`` while the
    # bytes produced below are PNG-encoded -- confirm which one is intended.
    schema = Schema(
        {
            ("id", Datatype.int32()),
            ("image", Datatype.image("jpg")),
            ("label", Datatype.string()),
        },
        "id",
    )

    # Unique ids 1..ROW_COUNT in random order.
    shuffled_ids = list(range(1, ROW_COUNT + 1))
    random.shuffle(shuffled_ids)

    # One solid-colour image; channels are drawn in R, G, B order.
    red = random.randint(0, 255)
    green = random.randint(0, 255)
    blue = random.randint(0, 255)
    fake_image = Image.new("RGB", (512, 512), color=(red, green, blue))

    # Encode the image as PNG in memory and take the full buffer contents.
    png_buffer = io.BytesIO()
    fake_image.save(png_buffer, format="PNG")
    image_bytes = png_buffer.getvalue()

    # Every row refers to the same bytes object.
    images = [image_bytes] * ROW_COUNT
    labels = [fake.name() for _ in range(ROW_COUNT)]

    table = MvpTable(
        {
            "id": shuffled_ids,
            "image": images,
            "label": labels,
        }
    )
    return (table, schema)
File without changes
@@ -0,0 +1,93 @@
1
+ import pytest
2
+
3
+ import pyarrow as pa
4
+ import pyarrow.parquet as pq
5
+ from deltacat import Datatype, Dataset
6
+ from deltacat.storage.rivulet import Schema, Field
7
+ from deltacat.utils.metafile_locator import _find_partition_path
8
+
9
+
10
@pytest.fixture
def sample_schema():
    """
    Schema with fields ``id`` (int32, flagged ``is_merge_key=True``),
    ``name`` (string) and ``age`` (int32).
    """
    id_field = Field("id", Datatype.int32(), is_merge_key=True)
    name_field = Field("name", Datatype.string())
    age_field = Field("age", Datatype.int32())
    return Schema(fields=[id_field, name_field, age_field])
19
+
20
+
21
@pytest.fixture
def sample_pydict():
    """Three rows of column-oriented sample data: ``id``, ``name``, ``age``."""
    columns = {
        "id": [1, 2, 3],
        "name": ["Alice", "Bob", "Charlie"],
        "age": [25, 30, 35],
    }
    return columns
24
+
25
+
26
@pytest.fixture
def temp_storage_path(tmp_path):
    """Expose pytest's ``tmp_path`` under a storage-specific fixture name."""
    storage_root = tmp_path
    return storage_root
29
+
30
+
31
@pytest.fixture
def sample_parquet_data(temp_storage_path, sample_pydict):
    """
    Write ``sample_pydict`` to ``test.parquet`` under ``temp_storage_path``
    and return the path of the written file.
    """
    destination = temp_storage_path / "test.parquet"
    pq.write_table(pa.Table.from_pydict(sample_pydict), destination)
    return destination
37
+
38
+
39
@pytest.fixture
def dataset(sample_parquet_data):
    """
    ``Dataset`` created via ``Dataset.from_parquet`` from the sample parquet
    file, named ``"dataset"`` with ``"id"`` passed as ``merge_keys``.
    """
    from_parquet_kwargs = {
        "file_uri": sample_parquet_data,
        "name": "dataset",
        "merge_keys": "id",
    }
    return Dataset.from_parquet(**from_parquet_kwargs)
44
+
45
+
46
@pytest.fixture
def file_provider(dataset):
    """Return the ``_file_provider`` attribute of the ``dataset`` fixture."""
    provider = dataset._file_provider
    return provider
49
+
50
+
51
def test_provide_data_file(file_provider):
    """
    Two successive ``provide_data_file("parquet")`` calls each return a file
    whose location contains ``data`` and ends in ``.parquet``, and the two
    locations differ.
    """
    locations = []
    for _ in range(2):
        # Request a file, then validate it before requesting the next one.
        produced = file_provider.provide_data_file("parquet")
        assert "data" in produced.location
        assert produced.location.endswith(".parquet")
        locations.append(produced.location)

    first_location, second_location = locations
    assert (
        first_location != second_location
    ), "Two output files should have different locations."
63
+
64
+
65
def test_provide_manifest_file(file_provider):
    """
    The manifest file location contains ``metadata/manifests`` and ends in
    ``.json``.
    """
    manifest_file = file_provider.provide_manifest_file()
    assert "metadata/manifests" in manifest_file.location
    assert manifest_file.location.endswith(".json")
69
+
70
+
71
def test_provide_l0_sst_file(file_provider):
    """
    The L0 SST file location contains ``metadata/ssts/0`` and ends in
    ``.json``.
    """
    sst_file = file_provider.provide_l0_sst_file()
    assert "metadata/ssts/0" in sst_file.location
    assert sst_file.location.endswith(".json")
75
+
76
+
77
def test_provide_input_file(file_provider, sample_parquet_data):
    """
    ``provide_input_file`` returns a file whose location equals the string
    path it was given.
    """
    source_uri = str(sample_parquet_data)
    provided = file_provider.provide_input_file(source_uri)
    assert provided.location == str(sample_parquet_data)
80
+
81
+
82
def test_generate_sst_uris(file_provider):
    """
    Every file yielded by ``generate_sst_uris`` has a location containing
    ``metadata/ssts/0`` and ending in ``.json``.
    """
    # NOTE(review): if nothing is generated the loop body never runs and the
    # test passes without checking anything -- confirm that is acceptable.
    sst_files = list(file_provider.generate_sst_uris())
    for sst_file in sst_files:
        assert "metadata/ssts/0" in sst_file.location
        assert sst_file.location.endswith(".json")
87
+
88
+
89
def test_get_scan_directories(file_provider):
    """
    ``get_sst_scan_directories`` returns exactly one directory:
    ``<partition path>/metadata/ssts/0/``, where the partition path is
    resolved with ``_find_partition_path`` from the provider's ``uri`` and
    ``_locator``.
    """
    partition_path = _find_partition_path(file_provider.uri, file_provider._locator)
    expected_directories = [f"{partition_path}/metadata/ssts/0/"]
    assert file_provider.get_sst_scan_directories() == expected_directories
File without changes