deltacat 1.1.36__py3-none-any.whl → 2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (236)
  1. deltacat/__init__.py +42 -3
  2. deltacat/annotations.py +36 -0
  3. deltacat/api.py +168 -0
  4. deltacat/aws/s3u.py +4 -4
  5. deltacat/benchmarking/benchmark_engine.py +82 -0
  6. deltacat/benchmarking/benchmark_report.py +86 -0
  7. deltacat/benchmarking/benchmark_suite.py +11 -0
  8. deltacat/benchmarking/conftest.py +21 -0
  9. deltacat/benchmarking/data/random_row_generator.py +94 -0
  10. deltacat/benchmarking/data/row_generator.py +10 -0
  11. deltacat/benchmarking/test_benchmark_pipeline.py +106 -0
  12. deltacat/catalog/__init__.py +14 -0
  13. deltacat/catalog/delegate.py +199 -106
  14. deltacat/catalog/iceberg/__init__.py +4 -0
  15. deltacat/catalog/iceberg/iceberg_catalog_config.py +26 -0
  16. deltacat/catalog/iceberg/impl.py +368 -0
  17. deltacat/catalog/iceberg/overrides.py +74 -0
  18. deltacat/catalog/interface.py +273 -76
  19. deltacat/catalog/main/impl.py +720 -0
  20. deltacat/catalog/model/catalog.py +227 -20
  21. deltacat/catalog/model/properties.py +116 -0
  22. deltacat/catalog/model/table_definition.py +32 -1
  23. deltacat/compute/compactor/model/compaction_session_audit_info.py +7 -3
  24. deltacat/compute/compactor/model/delta_annotated.py +3 -3
  25. deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
  26. deltacat/compute/compactor/model/delta_file_locator.py +3 -1
  27. deltacat/compute/compactor/model/round_completion_info.py +5 -5
  28. deltacat/compute/compactor/model/table_object_store.py +3 -2
  29. deltacat/compute/compactor/repartition_session.py +1 -1
  30. deltacat/compute/compactor/steps/dedupe.py +11 -4
  31. deltacat/compute/compactor/steps/hash_bucket.py +1 -1
  32. deltacat/compute/compactor/steps/materialize.py +6 -2
  33. deltacat/compute/compactor/utils/io.py +1 -1
  34. deltacat/compute/compactor/utils/sort_key.py +9 -2
  35. deltacat/compute/compactor_v2/compaction_session.py +5 -9
  36. deltacat/compute/compactor_v2/constants.py +1 -30
  37. deltacat/compute/compactor_v2/deletes/utils.py +3 -3
  38. deltacat/compute/compactor_v2/model/merge_input.py +1 -7
  39. deltacat/compute/compactor_v2/private/compaction_utils.py +5 -6
  40. deltacat/compute/compactor_v2/steps/merge.py +17 -126
  41. deltacat/compute/compactor_v2/utils/content_type_params.py +0 -17
  42. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  43. deltacat/compute/compactor_v2/utils/io.py +1 -1
  44. deltacat/compute/compactor_v2/utils/merge.py +0 -1
  45. deltacat/compute/compactor_v2/utils/primary_key_index.py +3 -15
  46. deltacat/compute/compactor_v2/utils/task_options.py +23 -43
  47. deltacat/compute/converter/constants.py +4 -0
  48. deltacat/compute/converter/converter_session.py +143 -0
  49. deltacat/compute/converter/model/convert_input.py +69 -0
  50. deltacat/compute/converter/model/convert_input_files.py +61 -0
  51. deltacat/compute/converter/model/converter_session_params.py +99 -0
  52. deltacat/compute/converter/pyiceberg/__init__.py +0 -0
  53. deltacat/compute/converter/pyiceberg/catalog.py +75 -0
  54. deltacat/compute/converter/pyiceberg/overrides.py +135 -0
  55. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +251 -0
  56. deltacat/compute/converter/steps/__init__.py +0 -0
  57. deltacat/compute/converter/steps/convert.py +211 -0
  58. deltacat/compute/converter/steps/dedupe.py +60 -0
  59. deltacat/compute/converter/utils/__init__.py +0 -0
  60. deltacat/compute/converter/utils/convert_task_options.py +88 -0
  61. deltacat/compute/converter/utils/converter_session_utils.py +109 -0
  62. deltacat/compute/converter/utils/iceberg_columns.py +82 -0
  63. deltacat/compute/converter/utils/io.py +43 -0
  64. deltacat/compute/converter/utils/s3u.py +133 -0
  65. deltacat/compute/resource_estimation/delta.py +1 -19
  66. deltacat/constants.py +47 -1
  67. deltacat/env.py +51 -0
  68. deltacat/examples/__init__.py +0 -0
  69. deltacat/examples/basic_logging.py +101 -0
  70. deltacat/examples/common/__init__.py +0 -0
  71. deltacat/examples/common/fixtures.py +15 -0
  72. deltacat/examples/hello_world.py +27 -0
  73. deltacat/examples/iceberg/__init__.py +0 -0
  74. deltacat/examples/iceberg/iceberg_bucket_writer.py +139 -0
  75. deltacat/examples/iceberg/iceberg_reader.py +149 -0
  76. deltacat/exceptions.py +51 -9
  77. deltacat/logs.py +4 -1
  78. deltacat/storage/__init__.py +118 -28
  79. deltacat/storage/iceberg/__init__.py +0 -0
  80. deltacat/storage/iceberg/iceberg_scan_planner.py +28 -0
  81. deltacat/storage/iceberg/impl.py +737 -0
  82. deltacat/storage/iceberg/model.py +709 -0
  83. deltacat/storage/interface.py +217 -134
  84. deltacat/storage/main/__init__.py +0 -0
  85. deltacat/storage/main/impl.py +2077 -0
  86. deltacat/storage/model/delta.py +118 -71
  87. deltacat/storage/model/interop.py +24 -0
  88. deltacat/storage/model/list_result.py +8 -0
  89. deltacat/storage/model/locator.py +93 -3
  90. deltacat/{aws/redshift → storage}/model/manifest.py +122 -98
  91. deltacat/storage/model/metafile.py +1316 -0
  92. deltacat/storage/model/namespace.py +34 -18
  93. deltacat/storage/model/partition.py +362 -37
  94. deltacat/storage/model/scan/__init__.py +0 -0
  95. deltacat/storage/model/scan/push_down.py +19 -0
  96. deltacat/storage/model/scan/scan_plan.py +10 -0
  97. deltacat/storage/model/scan/scan_task.py +34 -0
  98. deltacat/storage/model/schema.py +892 -0
  99. deltacat/storage/model/shard.py +47 -0
  100. deltacat/storage/model/sort_key.py +170 -13
  101. deltacat/storage/model/stream.py +208 -80
  102. deltacat/storage/model/table.py +123 -29
  103. deltacat/storage/model/table_version.py +322 -46
  104. deltacat/storage/model/transaction.py +757 -0
  105. deltacat/storage/model/transform.py +198 -61
  106. deltacat/storage/model/types.py +111 -13
  107. deltacat/storage/rivulet/__init__.py +11 -0
  108. deltacat/storage/rivulet/arrow/__init__.py +0 -0
  109. deltacat/storage/rivulet/arrow/serializer.py +75 -0
  110. deltacat/storage/rivulet/dataset.py +744 -0
  111. deltacat/storage/rivulet/dataset_executor.py +87 -0
  112. deltacat/storage/rivulet/feather/__init__.py +5 -0
  113. deltacat/storage/rivulet/feather/file_reader.py +136 -0
  114. deltacat/storage/rivulet/feather/serializer.py +35 -0
  115. deltacat/storage/rivulet/fs/__init__.py +0 -0
  116. deltacat/storage/rivulet/fs/file_provider.py +105 -0
  117. deltacat/storage/rivulet/fs/file_store.py +130 -0
  118. deltacat/storage/rivulet/fs/input_file.py +76 -0
  119. deltacat/storage/rivulet/fs/output_file.py +86 -0
  120. deltacat/storage/rivulet/logical_plan.py +105 -0
  121. deltacat/storage/rivulet/metastore/__init__.py +0 -0
  122. deltacat/storage/rivulet/metastore/delta.py +190 -0
  123. deltacat/storage/rivulet/metastore/json_sst.py +105 -0
  124. deltacat/storage/rivulet/metastore/sst.py +82 -0
  125. deltacat/storage/rivulet/metastore/sst_interval_tree.py +260 -0
  126. deltacat/storage/rivulet/mvp/Table.py +101 -0
  127. deltacat/storage/rivulet/mvp/__init__.py +5 -0
  128. deltacat/storage/rivulet/parquet/__init__.py +5 -0
  129. deltacat/storage/rivulet/parquet/data_reader.py +0 -0
  130. deltacat/storage/rivulet/parquet/file_reader.py +127 -0
  131. deltacat/storage/rivulet/parquet/serializer.py +37 -0
  132. deltacat/storage/rivulet/reader/__init__.py +0 -0
  133. deltacat/storage/rivulet/reader/block_scanner.py +378 -0
  134. deltacat/storage/rivulet/reader/data_reader.py +136 -0
  135. deltacat/storage/rivulet/reader/data_scan.py +63 -0
  136. deltacat/storage/rivulet/reader/dataset_metastore.py +178 -0
  137. deltacat/storage/rivulet/reader/dataset_reader.py +156 -0
  138. deltacat/storage/rivulet/reader/pyarrow_data_reader.py +121 -0
  139. deltacat/storage/rivulet/reader/query_expression.py +99 -0
  140. deltacat/storage/rivulet/reader/reader_type_registrar.py +84 -0
  141. deltacat/storage/rivulet/schema/__init__.py +0 -0
  142. deltacat/storage/rivulet/schema/datatype.py +128 -0
  143. deltacat/storage/rivulet/schema/schema.py +251 -0
  144. deltacat/storage/rivulet/serializer.py +40 -0
  145. deltacat/storage/rivulet/serializer_factory.py +42 -0
  146. deltacat/storage/rivulet/writer/__init__.py +0 -0
  147. deltacat/storage/rivulet/writer/dataset_writer.py +29 -0
  148. deltacat/storage/rivulet/writer/memtable_dataset_writer.py +294 -0
  149. deltacat/tests/_io/__init__.py +1 -0
  150. deltacat/tests/catalog/test_catalogs.py +324 -0
  151. deltacat/tests/catalog/test_default_catalog_impl.py +16 -8
  152. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
  153. deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
  154. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
  155. deltacat/tests/compute/compact_partition_test_cases.py +19 -53
  156. deltacat/tests/compute/compactor/steps/test_repartition.py +2 -2
  157. deltacat/tests/compute/compactor/utils/test_io.py +6 -8
  158. deltacat/tests/compute/compactor_v2/test_compaction_session.py +0 -466
  159. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +1 -273
  160. deltacat/tests/compute/conftest.py +75 -0
  161. deltacat/tests/compute/converter/__init__.py +0 -0
  162. deltacat/tests/compute/converter/conftest.py +80 -0
  163. deltacat/tests/compute/converter/test_convert_session.py +478 -0
  164. deltacat/tests/compute/converter/utils.py +123 -0
  165. deltacat/tests/compute/resource_estimation/test_delta.py +0 -16
  166. deltacat/tests/compute/test_compact_partition_incremental.py +2 -42
  167. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +5 -46
  168. deltacat/tests/compute/test_compact_partition_params.py +3 -3
  169. deltacat/tests/compute/test_compact_partition_rebase.py +1 -46
  170. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +5 -46
  171. deltacat/tests/compute/test_util_common.py +19 -12
  172. deltacat/tests/compute/test_util_create_table_deltas_repo.py +13 -22
  173. deltacat/tests/local_deltacat_storage/__init__.py +76 -103
  174. deltacat/tests/storage/__init__.py +0 -0
  175. deltacat/tests/storage/conftest.py +25 -0
  176. deltacat/tests/storage/main/__init__.py +0 -0
  177. deltacat/tests/storage/main/test_main_storage.py +1399 -0
  178. deltacat/tests/storage/model/__init__.py +0 -0
  179. deltacat/tests/storage/model/test_delete_parameters.py +21 -0
  180. deltacat/tests/storage/model/test_metafile_io.py +2535 -0
  181. deltacat/tests/storage/model/test_schema.py +308 -0
  182. deltacat/tests/storage/model/test_shard.py +22 -0
  183. deltacat/tests/storage/model/test_table_version.py +110 -0
  184. deltacat/tests/storage/model/test_transaction.py +308 -0
  185. deltacat/tests/storage/rivulet/__init__.py +0 -0
  186. deltacat/tests/storage/rivulet/conftest.py +149 -0
  187. deltacat/tests/storage/rivulet/fs/__init__.py +0 -0
  188. deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +93 -0
  189. deltacat/tests/storage/rivulet/schema/__init__.py +0 -0
  190. deltacat/tests/storage/rivulet/schema/test_schema.py +241 -0
  191. deltacat/tests/storage/rivulet/test_dataset.py +406 -0
  192. deltacat/tests/storage/rivulet/test_manifest.py +67 -0
  193. deltacat/tests/storage/rivulet/test_sst_interval_tree.py +232 -0
  194. deltacat/tests/storage/rivulet/test_utils.py +122 -0
  195. deltacat/tests/storage/rivulet/writer/__init__.py +0 -0
  196. deltacat/tests/storage/rivulet/writer/test_dataset_write_then_read.py +341 -0
  197. deltacat/tests/storage/rivulet/writer/test_dataset_writer.py +79 -0
  198. deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
  199. deltacat/tests/test_deltacat_api.py +39 -0
  200. deltacat/tests/test_utils/filesystem.py +14 -0
  201. deltacat/tests/test_utils/message_pack_utils.py +54 -0
  202. deltacat/tests/test_utils/pyarrow.py +8 -15
  203. deltacat/tests/test_utils/storage.py +266 -3
  204. deltacat/tests/utils/test_daft.py +3 -3
  205. deltacat/tests/utils/test_pyarrow.py +0 -432
  206. deltacat/types/partial_download.py +1 -1
  207. deltacat/types/tables.py +1 -1
  208. deltacat/utils/export.py +59 -0
  209. deltacat/utils/filesystem.py +320 -0
  210. deltacat/utils/metafile_locator.py +73 -0
  211. deltacat/utils/pyarrow.py +36 -183
  212. deltacat-2.0.dist-info/METADATA +65 -0
  213. deltacat-2.0.dist-info/RECORD +347 -0
  214. deltacat/aws/redshift/__init__.py +0 -19
  215. deltacat/catalog/default_catalog_impl/__init__.py +0 -369
  216. deltacat/io/dataset.py +0 -73
  217. deltacat/io/read_api.py +0 -143
  218. deltacat/storage/model/delete_parameters.py +0 -40
  219. deltacat/storage/model/partition_spec.py +0 -71
  220. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +0 -253
  221. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +0 -45
  222. deltacat-1.1.36.dist-info/METADATA +0 -64
  223. deltacat-1.1.36.dist-info/RECORD +0 -219
  224. /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
  225. /deltacat/{io/aws → catalog/main}/__init__.py +0 -0
  226. /deltacat/{io/aws/redshift → compute/converter}/__init__.py +0 -0
  227. /deltacat/{tests/io → compute/converter/model}/__init__.py +0 -0
  228. /deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +0 -0
  229. /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
  230. /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
  231. /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
  232. /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
  233. /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
  234. {deltacat-1.1.36.dist-info → deltacat-2.0.dist-info}/LICENSE +0 -0
  235. {deltacat-1.1.36.dist-info → deltacat-2.0.dist-info}/WHEEL +0 -0
  236. {deltacat-1.1.36.dist-info → deltacat-2.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,241 @@
1
+ import pytest
2
+ import pyarrow as pa
3
+ from deltacat.storage.rivulet import Schema, Field, Datatype
4
+
5
+
6
def test_field_initialization():
    """A Field carries its name, datatype, and merge-key flag."""
    field = Field(name="test_field", datatype=Datatype.string(), is_merge_key=True)
    assert field.name == "test_field"
    assert field.datatype == Datatype.string()
    assert field.is_merge_key


def test_schema_initialization():
    """A Schema built from (name, datatype) tuples exposes each field by name."""
    fields = [("id", Datatype.int64()), ("name", Datatype.string())]
    schema = Schema(fields, merge_keys=["id"])
    assert len(schema) == 2
    # Every declared column must be present with its declared datatype.
    for name, datatype in fields:
        assert name in schema.keys()
        assert schema[name].datatype == datatype
21
+
22
+
23
def test_merge_key_conflict_on_init():
    """A Field flagged is_merge_key=False cannot also be listed in merge_keys."""
    fields = [
        Field("id", Datatype.int64(), is_merge_key=False),  # flag disabled here
        ("name", Datatype.string()),
    ]
    with pytest.raises(TypeError):
        Schema(fields, merge_keys=["id"])  # ...but requested here


def test_simultaneous_duplicate_field():
    """Duplicate field names in the constructor input are rejected."""
    duplicated = [
        ("id", Datatype.int32()),
        ("name", Datatype.string()),
        ("age", Datatype.int32()),
        ("age", Datatype.string()),  # second "age" triggers the error
    ]
    with pytest.raises(ValueError):
        Schema(duplicated, merge_keys=["id"])
43
+
44
+
45
def test_add_field():
    """add_field appends new fields and rejects duplicates."""
    schema = Schema()

    first = Field("new_field", Datatype.float(), True)
    schema.add_field(first)
    assert len(schema) == 1
    assert "new_field" in schema.keys()
    assert schema["new_field"].datatype == Datatype.float()

    second = Field("another_field", Datatype.string(), True)
    schema.add_field(second)
    assert len(schema) == 2
    assert "another_field" in schema.keys()
    assert schema["another_field"].datatype == Datatype.string()

    # Adding the same field a second time is an error.
    with pytest.raises(ValueError):
        schema.add_field(second)
61
+
62
+
63
def test_setitem_field():
    """Assigning a Field stores it unchanged under the given key."""
    schema = Schema()
    field = Field("test_field", Datatype.int64(), is_merge_key=True)
    schema["test_field"] = field
    assert schema["test_field"] == field


def test_setitem_datatype():
    """Assigning a bare Datatype creates a non-merge-key Field."""
    schema = Schema()
    schema["id"] = (Datatype.int64(), True)
    schema["test_field"] = Datatype.int64()
    entry = schema["test_field"]
    assert entry.name == "test_field"
    assert entry.datatype == Datatype.int64()
    assert not entry.is_merge_key


def test_setitem_tuple_with_merge_key():
    """Assigning a (Datatype, True) tuple creates a merge-key Field."""
    schema = Schema()
    schema["test_field"] = (Datatype.int64(), True)
    entry = schema["test_field"]
    assert entry.name == "test_field"
    assert entry.datatype == Datatype.int64()
    assert entry.is_merge_key


def test_setitem_invalid_type():
    """Values other than Field / Datatype / tuple are rejected."""
    schema = Schema()
    with pytest.raises(TypeError):
        schema["test_field"] = "invalid"


def test_non_empty_merge_key():
    """merge_keys naming fields absent from an empty field list is an error."""
    with pytest.raises(TypeError):
        _ = Schema([], merge_keys=["id"])
96
+
97
+
98
def test_merge_schemas():
    """merge() pulls the other schema's fields into this one."""
    schema1 = Schema([("id", Datatype.int64())], merge_keys=["id"])
    schema2 = Schema(
        [("other_id", Datatype.string()), ("name", Datatype.string())],
        merge_keys="other_id",
    )
    schema1.merge(schema2)
    assert len(schema1) == 3
    for expected in ("id", "name", "other_id"):
        assert expected in schema1.keys()


def test_merge_schemas_same_merge_key():
    """Schemas sharing an identical merge key merge cleanly."""
    schema1 = Schema(
        [("id", Datatype.int64()), ("name", Datatype.string())], merge_keys=["id"]
    )
    schema2 = Schema(
        [("id", Datatype.int64()), ("other_name", Datatype.string())],
        merge_keys="id",
    )
    schema1.merge(schema2)
    assert len(schema1) == 3
    for expected in ("id", "name", "other_name"):
        assert expected in schema1.keys()


def test_merge_schema_conflict():
    """Merging a conflicting datatype fails; an identical duplicate is tolerated."""
    schema1 = Schema([("id", Datatype.int64())], merge_keys=["id"])
    schema1_dup = Schema([("id", Datatype.int64())], merge_keys=["id"])
    schema2 = Schema([("id", Datatype.string())], merge_keys=["id"])

    with pytest.raises(ValueError):
        schema1.merge(schema2)

    # Merging the same field is allowed (unlike using add_field).
    schema1.merge(schema1_dup)
    assert schema1["id"].datatype == Datatype.int64()
    assert len(schema1) == 1
139
+
140
+
141
def test_to_pyarrow_schema():
    """to_pyarrow() produces an equivalent pa.Schema."""
    fields = [("id", Datatype.int64()), ("name", Datatype.string())]
    pa_schema = Schema(fields, merge_keys=["id"]).to_pyarrow()
    assert isinstance(pa_schema, pa.Schema)
    assert len(pa_schema) == 2
    assert pa_schema.field("id").type == pa.int64()
    assert pa_schema.field("name").type == pa.string()


def test_from_pyarrow_schema():
    """from_pyarrow() imports every field and tags the requested merge keys."""
    pa_schema = pa.schema([("id", pa.int64()), ("name", pa.string())])
    schema = Schema.from_pyarrow(pa_schema, merge_keys=["id"])
    assert len(schema) == 2
    assert schema["id"].is_merge_key


def test_from_pyarrow_schema_invalid_merge_keys():
    """Merge keys absent from the pyarrow schema are rejected."""
    pa_schema = pa.schema([("id", pa.int64()), ("name", pa.string())])
    with pytest.raises(ValueError):
        Schema.from_pyarrow(pa_schema, merge_keys=["bad_key"])
162
+
163
+
164
def test_get_field():
    """Indexing returns the stored Field with its name and datatype."""
    schema = Schema([("id", Datatype.int64())], merge_keys=["id"])
    field = schema["id"]
    assert field.name == "id"
    assert field.datatype == Datatype.int64()


def test_set_field():
    """Assigning a Field under a new key grows the schema."""
    schema = Schema([("id", Datatype.int64())], merge_keys=["id"])
    schema["name"] = Field("name", Datatype.string())
    assert len(schema) == 2
    assert "name" in schema.keys()
    assert schema["name"].datatype == Datatype.string()


def test_delete_field():
    """del removes a non-key field and leaves the rest intact."""
    schema = Schema(
        [("name", Datatype.string()), ("zip", Datatype.int32())], merge_keys=["name"]
    )
    del schema["zip"]
    assert "zip" not in schema.keys()
    assert "name" in schema.keys()


def test_delete_merge_key_field():
    """A merge-key field cannot be deleted."""
    schema = Schema([("id", Datatype.int64())], merge_keys=["id"])
    with pytest.raises(ValueError):
        del schema["id"]
192
+
193
+
194
def test_schema_iter():
    """Iterating a Schema yields its field names (strings)."""
    schema = Schema(
        [
            Field("id", Datatype.int32(), is_merge_key=True),
            Field("name", Datatype.string()),
        ]
    )
    names = list(iter(schema))
    assert len(names) == 2
    assert all(isinstance(item, str) for item in names)


def test_merge_all():
    """merge_all combines the fields of every input schema."""
    schema1 = Schema(
        [
            Field("id", Datatype.int64(), is_merge_key=True),
            Field("name", Datatype.string()),
        ]
    )
    schema2 = Schema(
        [
            Field("age", Datatype.int32()),
            Field("email", Datatype.string(), is_merge_key=True),
        ]
    )
    merged_schema = Schema.merge_all([schema1, schema2])
    assert len(merged_schema) == 4


def test_schema_values():
    """values() yields the schema's Field objects."""
    schema = Schema(
        [
            Field("id", Datatype.int64(), is_merge_key=True),
            Field("name", Datatype.string()),
        ]
    )
    values = list(schema.values())
    assert len(values) == 2
    assert all(isinstance(v, Field) for v in values)


def test_schema_items():
    """items() yields (name, Field) pairs."""
    schema = Schema(
        [
            Field("id", Datatype.int64(), is_merge_key=True),
            Field("name", Datatype.string()),
        ]
    )
    items = list(schema.items())
    assert len(items) == 2
    assert all(isinstance(k, str) and isinstance(v, Field) for k, v in items)
@@ -0,0 +1,406 @@
1
+ import posixpath
2
+ from deltacat.utils.metafile_locator import _find_partition_path
3
+ import pytest
4
+
5
+ import pyarrow as pa
6
+ from deltacat.storage.rivulet import Schema, Field, Datatype
7
+ from deltacat.storage.rivulet.dataset import Dataset
8
+ from deltacat.storage.rivulet.reader.query_expression import QueryExpression
9
+
10
+
11
@pytest.fixture
def sample_schema():
    """Three-column schema keyed on "id"."""
    return Schema(
        fields=[
            Field("id", Datatype.int32(), is_merge_key=True),
            Field("name", Datatype.string()),
            Field("age", Datatype.int32()),
        ]
    )


@pytest.fixture
def sample_pydict():
    """Row data matching sample_schema."""
    return {"id": [1, 2, 3], "name": ["Alice", "Bob", "Charlie"], "age": [25, 30, 35]}


@pytest.fixture
def sample_parquet_data(tmp_path, sample_pydict):
    """Write sample_pydict to a parquet file and return its path."""
    parquet_path = tmp_path / "test.parquet"
    pa.parquet.write_table(pa.Table.from_pydict(sample_pydict), parquet_path)
    return parquet_path
33
+
34
+
35
+ # Updated Tests
36
+
37
+
38
def test_dataset_creation_with_schema(tmp_path, sample_schema):
    """A dataset built from a schema exposes that schema's fields."""
    dataset = Dataset(dataset_name="test_dataset", schema=sample_schema)
    assert len(dataset.fields) == 3
    assert "id" in dataset.fields
    assert dataset.fields["id"].is_merge_key


def test_dataset_initialization_with_metadata(tmp_path):
    """The metadata folder name is derived from the dataset name."""
    dataset = Dataset(dataset_name="test_dataset", metadata_uri=str(tmp_path))
    assert dataset.dataset_name == "test_dataset"
    assert dataset._metadata_folder.startswith(".riv-meta")


def test_invalid_dataset_initialization():
    """An empty dataset name is rejected."""
    with pytest.raises(ValueError, match="Name must be a non-empty string"):
        Dataset(dataset_name="")


def test_dataset_creation_metadata_structure(tmp_path):
    """Creating a dataset lays out the on-disk metadata hierarchy."""
    dataset = Dataset(dataset_name="test_dataset", metadata_uri=str(tmp_path))

    assert dataset._metadata_folder.startswith(".riv-meta")
    assert dataset._namespace == "DEFAULT"
    assert dataset.dataset_name == "test_dataset"
    assert dataset._metadata_path == str(tmp_path / ".riv-meta-test_dataset")

    partition_path = _find_partition_path(dataset._metadata_path, dataset._locator)

    # Ensures that directory structure for namespace -> table ->
    # table_version -> stream_id -> partition_id exists.
    assert posixpath.exists(partition_path)
71
+
72
+
73
def test_fields_accessor_add_field(tmp_path, sample_schema):
    """Fields can be added via .add() or item assignment; bad values raise."""
    dataset = Dataset(dataset_name="test_dataset", schema=sample_schema)

    dataset.fields.add("new_field", Datatype.float())
    assert "new_field" in dataset.fields
    assert dataset.fields["new_field"].datatype == Datatype.float()

    dataset.fields["new_field2"] = Field("new_field2", Datatype.int32())
    assert "new_field2" in dataset.fields
    assert "new_field2" in dataset.schemas["all"]

    with pytest.raises(TypeError):
        dataset.fields["new_field3"] = 2


def test_field_removal(tmp_path, sample_schema):
    """Deleting a field removes it; a repeat delete and a lookup both fail."""
    dataset = Dataset(dataset_name="test_dataset", schema=sample_schema)
    del dataset.fields["age"]
    assert "age" not in dataset.fields
    with pytest.raises(ValueError):
        del dataset.fields["age"]
    with pytest.raises(KeyError):
        _ = dataset.fields["age"]


def test_fields_accessor_repr(tmp_path, sample_schema):
    """repr() of the fields accessor mentions every field name."""
    dataset = Dataset(dataset_name="test_dataset", schema=sample_schema)
    repr_output = repr(dataset.fields)
    for field_name in ("id", "name", "age"):
        assert field_name in repr_output, f"Field '{field_name}' missing in repr output"
101
+
102
+
103
def test_schemas_accessor_add_group(tmp_path, sample_schema):
    """Assigning a list of field names creates a named schema group."""
    dataset = Dataset(dataset_name="test_dataset", schema=sample_schema)
    dataset.schemas["analytics"] = ["id", "name"]
    assert "analytics" in dataset.schemas
    assert len(dataset.schemas["analytics"]) == 2


def test_schema_removal(tmp_path, sample_schema):
    """The built-in "all" schema and missing schemas cannot be deleted;
    user-defined schemas can."""
    dataset = Dataset(dataset_name="test_dataset", schema=sample_schema)
    with pytest.raises(ValueError):
        del dataset.schemas["all"]
    with pytest.raises(ValueError):
        del dataset.schemas["does_not_exist"]
    dataset.schemas["new"] = ["id", "name"]
    del dataset.schemas["new"]
    with pytest.raises(KeyError):
        _ = dataset.schemas["new"]
120
+
121
+
122
def test_dataset_from_parquet(tmp_path, sample_parquet_data):
    """from_parquet infers the schema and tags the requested merge key."""
    dataset = Dataset.from_parquet(
        name="test_dataset",
        file_uri=str(sample_parquet_data),
        metadata_uri=str(tmp_path),
        merge_keys="id",
    )
    assert len(dataset.fields) == 3
    assert "id" in dataset.fields
    assert dataset.fields["id"].is_merge_key


def test_parquet_schema_modes(tmp_path, sample_pydict):
    """schema_mode selects union vs. intersection of per-file schemas."""
    # Two parquet files sharing "id" with one unique column each.
    data_1 = {"id": [1, 2, 3], "name": ["Alice", "Bob", "Charlie"]}
    data_2 = {"id": [4, 5, 6], "age": [25, 30, 35]}

    pa.parquet.write_table(pa.Table.from_pydict(data_1), tmp_path / "data1.parquet")
    pa.parquet.write_table(pa.Table.from_pydict(data_2), tmp_path / "data2.parquet")

    dataset_union = Dataset.from_parquet(
        name="test_dataset_union",
        file_uri=str(tmp_path),
        merge_keys="id",
        schema_mode="union",
    )
    assert len(dataset_union.fields) == 3  # id, name, age

    dataset_intersect = Dataset.from_parquet(
        name="test_dataset_intersect",
        file_uri=str(tmp_path),
        merge_keys="id",
        schema_mode="intersect",
    )
    assert len(dataset_intersect.fields) == 1  # Only id
159
+
160
+
161
def test_merge_all_schemas():
    """Schema.merge_all unions fields across schemas sharing a merge key."""
    schema1 = Schema(
        fields=[
            Field("id", Datatype.int32(), is_merge_key=True),
            Field("name", Datatype.string()),
        ]
    )
    schema2 = Schema(
        fields=[
            Field("id", Datatype.int32(), is_merge_key=True),
            Field("age", Datatype.int32()),
        ]
    )
    merged_schema = Schema.merge_all([schema1, schema2])
    assert len(merged_schema) == 3
    for expected in ("id", "name", "age"):
        assert expected in merged_schema
179
+
180
+
181
def test_writer_creation_with_custom_format(tmp_path, sample_schema):
    """A writer can be created for a non-default file format."""
    dataset = Dataset(dataset_name="test_dataset", schema=sample_schema)
    assert dataset.writer(file_format="feather") is not None


def test_scan_with_query(tmp_path, sample_schema):
    """scan() accepts a query expression and returns a scan object."""
    dataset = Dataset(dataset_name="test_dataset", schema=sample_schema)
    scan = dataset.scan(QueryExpression())  # Placeholder query
    assert scan is not None
192
+
193
+
194
def test_add_schema_to_new_schemas(tmp_path):
    """Test adding a schema to a new field group."""
    dataset = Dataset(dataset_name=str(tmp_path / "test_dataset"))

    schema = Schema(
        [
            ("id", Datatype.int32()),
            ("name", Datatype.string()),
            ("age", Datatype.int32()),
        ],
        merge_keys=["id"],
    )
    dataset.add_schema(schema, schema_name="new_group")

    # Verify the field group is added with every column and datatype intact.
    assert "new_group" in dataset.schemas
    group = dataset.schemas["new_group"]
    assert len(group) == 3
    assert group["id"].datatype == Datatype.int32()
    assert group["name"].datatype == Datatype.string()
    assert group["age"].datatype == Datatype.int32()


def test_add_schema_to_existing_schemas(tmp_path):
    """Test merging a schema into an existing field group."""
    dataset = Dataset(dataset_name=str(tmp_path / "test_dataset"))

    dataset.add_schema(
        Schema(
            [("id", Datatype.int32()), ("name", Datatype.string())],
            merge_keys=["id"],
        ),
        schema_name="existing_group",
    )
    dataset.add_schema(
        Schema(
            [("age", Datatype.int32()), ("email", Datatype.string())],
            merge_keys=["id"],
        ),
        schema_name="existing_group",
    )

    # Verify the merged schema contains all four columns.
    assert "existing_group" in dataset.schemas
    group = dataset.schemas["existing_group"]
    assert len(group) == 4
    assert group["id"].datatype == Datatype.int32()
    assert group["name"].datatype == Datatype.string()
    assert group["age"].datatype == Datatype.int32()
    assert group["email"].datatype == Datatype.string()


def test_add_schema_conflicting_fields(tmp_path):
    """Test adding a schema with conflicting fields."""
    dataset = Dataset(dataset_name=str(tmp_path / "test_dataset"))

    dataset.add_schema(
        Schema(
            [("id", Datatype.int32()), ("name", Datatype.string())],
            merge_keys=["id"],
        ),
        schema_name="conflicting_group",
    )

    conflicting = Schema(
        [
            ("id", Datatype.string()),  # Conflict: datatype mismatch
            ("age", Datatype.int32()),
        ],
        merge_keys=["id"],
    )
    with pytest.raises(ValueError, match="already exists"):
        dataset.add_schema(conflicting, schema_name="conflicting_group")

    compatible = Schema(
        [
            ("id", Datatype.int32()),  # Same datatype: merge succeeds
            ("age", Datatype.int32()),
        ],
        merge_keys=["id"],
    )
    dataset.add_schema(compatible, schema_name="conflicting_group")

    assert "conflicting_group" in dataset.schemas
    group = dataset.schemas["conflicting_group"]
    assert len(group) == 3
    assert group["id"].datatype == Datatype.int32()
    assert group["name"].datatype == Datatype.string()
    assert group["age"].datatype == Datatype.int32()
292
+
293
+
294
def test_add_fields_with_merge_key_field(tmp_path):
    """A field flagged as a merge key becomes the default schema's merge key."""
    dataset = Dataset(dataset_name=str(tmp_path / "test_dataset"))
    # Third positional argument marks the field as a merge key.
    merge_key_field = Field("my_merge_key", Datatype.string(), True)
    dataset.add_fields([merge_key_field])
    assert dataset.schemas["default"].get_merge_key() == "my_merge_key"
299
+
300
+
301
def test_add_schema_to_nonexistent_schemas(tmp_path):
    """Adding a schema under an unknown name creates that field group."""
    dataset = Dataset(dataset_name=str(tmp_path / "test_dataset"))

    new_schema = Schema(
        [("id", Datatype.int32()), ("name", Datatype.string())],
        merge_keys=["id"],
    )

    # The target field group does not exist yet; add_schema creates it.
    dataset.add_schema(new_schema, schema_name="nonexistent_group")

    assert "nonexistent_group" in dataset.schemas
    assert len(dataset.schemas["nonexistent_group"]) == 2
320
+
321
+
322
def test_add_missing_field_to_schema_raises_error(tmp_path, sample_schema):
    """Assigning an unknown field name to the 'all' schema raises ValueError."""
    dataset = Dataset(dataset_name="test_dataset", schema=sample_schema)

    expected_msg = "Field 'missing_field' does not exist in the dataset."
    with pytest.raises(ValueError, match=expected_msg):
        # The accessor validates every listed name against the dataset's fields.
        dataset.schemas["all"] = ["missing_field"]
335
+
336
+
337
def test_schemas_accessor_methods(tmp_path, sample_schema):
    """Exercise __iter__, __len__, and __repr__ on the SchemasAccessor."""
    # The constructor registers the "all" and "default" schemas automatically.
    dataset = Dataset(dataset_name="test_dataset", schema=sample_schema)
    dataset.schemas["schema_1"] = ["id", "name"]
    dataset.schemas["schema_2"] = ["age"]

    # __iter__ yields every registered schema name.
    expected_names = {"schema_1", "schema_2", "all", "default"}
    assert (
        set(dataset.schemas) == expected_names
    ), "Schema names do not match expected values"

    # __len__ counts the two explicit schemas plus the two built-ins.
    assert len(dataset.schemas) == 4, "Length of schemas accessor is incorrect"

    # __repr__ mentions each schema by name.
    repr_output = repr(dataset.schemas)
    for schema_name in ("schema_1", "schema_2", "all"):
        assert (
            schema_name in repr_output
        ), f"Schema '{schema_name}' missing in repr output"
365
+
366
+
367
def test_get_merge_keys(tmp_path, sample_schema):
    """get_merge_keys returns the merge keys from every schema in the dataset."""
    dataset = Dataset(dataset_name="test_dataset", schema=sample_schema)

    # A second schema contributes one more merge key ("id2").
    zip_schema = Schema(
        [("id2", Datatype.int32()), ("zip", Datatype.string())],
        merge_keys=["id2"],
    )
    dataset.add_schema(zip_schema, "id2+zip")

    merge_keys = dataset.get_merge_keys()
    assert merge_keys == [
        "id",
        "id2",
    ], f"Expected merge keys ['id', 'id2'], got {merge_keys}"
386
+
387
+
388
def test_add_fields_no_fields_raises_error(tmp_path, sample_schema):
    """add_fields rejects an empty field list."""
    dataset = Dataset(dataset_name="test_dataset")
    with pytest.raises(ValueError):
        dataset.add_fields(fields=[])
392
+
393
+
394
def test_add_fields_mismatched_merge_keys_raises_error(tmp_path, sample_schema):
    """add_fields validates merge_keys against the supplied fields."""
    dataset = Dataset(dataset_name="test_dataset")

    # Naming a merge key that none of the fields declares is a ValueError.
    missing_key_msg = (
        "The following merge keys were not found in the provided fields: does_not_exist"
    )
    with pytest.raises(ValueError, match=missing_key_msg):
        dataset.add_fields(fields=sample_schema.values(), merge_keys=["does_not_exist"])

    # "id" exists but was not constructed as a merge-key field, so its
    # merge-key status conflicts with merge_keys=["id"] — presumably a
    # TypeError by design; message pinned below.
    plain_fields = [Field("id", Datatype.int32()), Field("name", Datatype.string())]
    with pytest.raises(TypeError, match="Merge key status conflict"):
        dataset.add_fields(fields=plain_fields, merge_keys=["id"])