deltacat 1.1.38__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (367) hide show
  1. deltacat/__init__.py +150 -12
  2. deltacat/annotations.py +36 -0
  3. deltacat/api.py +578 -0
  4. deltacat/aws/constants.py +0 -23
  5. deltacat/aws/s3u.py +4 -631
  6. deltacat/benchmarking/benchmark_engine.py +84 -0
  7. deltacat/benchmarking/benchmark_report.py +86 -0
  8. deltacat/benchmarking/benchmark_suite.py +11 -0
  9. deltacat/benchmarking/conftest.py +22 -19
  10. deltacat/benchmarking/data/random_row_generator.py +94 -0
  11. deltacat/benchmarking/data/row_generator.py +10 -0
  12. deltacat/benchmarking/test_benchmark_pipeline.py +108 -0
  13. deltacat/catalog/__init__.py +73 -0
  14. deltacat/catalog/delegate.py +615 -140
  15. deltacat/catalog/interface.py +404 -81
  16. deltacat/catalog/main/impl.py +2882 -0
  17. deltacat/catalog/model/catalog.py +348 -46
  18. deltacat/catalog/model/properties.py +155 -0
  19. deltacat/catalog/model/table_definition.py +32 -1
  20. deltacat/compute/__init__.py +14 -0
  21. deltacat/compute/compactor/compaction_session.py +97 -75
  22. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  23. deltacat/compute/compactor/model/compaction_session_audit_info.py +23 -30
  24. deltacat/compute/compactor/model/delta_annotated.py +3 -3
  25. deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
  26. deltacat/compute/compactor/model/delta_file_locator.py +3 -1
  27. deltacat/compute/compactor/model/round_completion_info.py +19 -9
  28. deltacat/compute/compactor/model/table_object_store.py +3 -2
  29. deltacat/compute/compactor/repartition_session.py +9 -22
  30. deltacat/compute/compactor/steps/dedupe.py +11 -4
  31. deltacat/compute/compactor/steps/hash_bucket.py +6 -6
  32. deltacat/compute/compactor/steps/materialize.py +15 -9
  33. deltacat/compute/compactor/steps/repartition.py +12 -11
  34. deltacat/compute/compactor/utils/io.py +7 -6
  35. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  36. deltacat/compute/compactor/utils/sort_key.py +9 -2
  37. deltacat/compute/compactor/utils/system_columns.py +3 -1
  38. deltacat/compute/compactor_v2/compaction_session.py +13 -14
  39. deltacat/compute/compactor_v2/deletes/utils.py +3 -3
  40. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  41. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  42. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  43. deltacat/compute/compactor_v2/model/merge_input.py +28 -9
  44. deltacat/compute/compactor_v2/private/compaction_utils.py +171 -73
  45. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  46. deltacat/compute/compactor_v2/steps/merge.py +156 -53
  47. deltacat/compute/compactor_v2/utils/content_type_params.py +17 -6
  48. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  49. deltacat/compute/compactor_v2/utils/io.py +10 -3
  50. deltacat/compute/compactor_v2/utils/merge.py +14 -2
  51. deltacat/compute/compactor_v2/utils/task_options.py +2 -10
  52. deltacat/compute/converter/constants.py +9 -0
  53. deltacat/compute/converter/converter_session.py +298 -0
  54. deltacat/compute/converter/model/convert_input.py +96 -0
  55. deltacat/compute/converter/model/convert_input_files.py +78 -0
  56. deltacat/compute/converter/model/convert_result.py +80 -0
  57. deltacat/compute/converter/model/converter_session_params.py +144 -0
  58. deltacat/compute/converter/pyiceberg/catalog.py +78 -0
  59. deltacat/compute/converter/pyiceberg/overrides.py +263 -0
  60. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +299 -0
  61. deltacat/compute/converter/steps/convert.py +366 -0
  62. deltacat/compute/converter/steps/dedupe.py +94 -0
  63. deltacat/compute/converter/utils/__init__.py +0 -0
  64. deltacat/compute/converter/utils/convert_task_options.py +132 -0
  65. deltacat/compute/converter/utils/converter_session_utils.py +175 -0
  66. deltacat/compute/converter/utils/iceberg_columns.py +87 -0
  67. deltacat/compute/converter/utils/io.py +203 -0
  68. deltacat/compute/converter/utils/s3u.py +148 -0
  69. deltacat/compute/janitor.py +205 -0
  70. deltacat/compute/jobs/__init__.py +0 -0
  71. deltacat/compute/jobs/client.py +417 -0
  72. deltacat/compute/resource_estimation/delta.py +11 -1
  73. deltacat/constants.py +90 -1
  74. deltacat/docs/__init__.py +0 -0
  75. deltacat/docs/autogen/__init__.py +0 -0
  76. deltacat/docs/autogen/schema/__init__.py +0 -0
  77. deltacat/docs/autogen/schema/inference/__init__.py +0 -0
  78. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  79. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  80. deltacat/env.py +61 -0
  81. deltacat/examples/__init__.py +0 -0
  82. deltacat/examples/basic_logging.py +101 -0
  83. deltacat/examples/compactor/__init__.py +0 -0
  84. deltacat/examples/compactor/aws/__init__.py +1 -0
  85. deltacat/examples/compactor/bootstrap.py +863 -0
  86. deltacat/examples/compactor/compactor.py +373 -0
  87. deltacat/examples/compactor/explorer.py +473 -0
  88. deltacat/examples/compactor/gcp/__init__.py +1 -0
  89. deltacat/examples/compactor/job_runner.py +439 -0
  90. deltacat/examples/compactor/utils/__init__.py +1 -0
  91. deltacat/examples/compactor/utils/common.py +261 -0
  92. deltacat/examples/experimental/__init__.py +0 -0
  93. deltacat/examples/experimental/iceberg/__init__.py +0 -0
  94. deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
  95. deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
  96. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  97. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  98. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  99. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  100. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  101. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  102. deltacat/examples/experimental/iceberg/iceberg_bucket_writer.py +184 -0
  103. deltacat/examples/experimental/iceberg/iceberg_reader.py +147 -0
  104. deltacat/examples/hello_world.py +29 -0
  105. deltacat/examples/indexer/__init__.py +0 -0
  106. deltacat/examples/indexer/aws/__init__.py +0 -0
  107. deltacat/examples/indexer/gcp/__init__.py +0 -0
  108. deltacat/examples/indexer/indexer.py +163 -0
  109. deltacat/examples/indexer/job_runner.py +198 -0
  110. deltacat/exceptions.py +116 -12
  111. deltacat/experimental/__init__.py +0 -0
  112. deltacat/experimental/catalog/__init__.py +0 -0
  113. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  114. deltacat/experimental/catalog/iceberg/iceberg_catalog_config.py +26 -0
  115. deltacat/experimental/catalog/iceberg/impl.py +399 -0
  116. deltacat/experimental/catalog/iceberg/overrides.py +72 -0
  117. deltacat/experimental/compatibility/__init__.py +0 -0
  118. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  119. deltacat/experimental/converter_agent/__init__.py +0 -0
  120. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  121. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  122. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  123. deltacat/experimental/daft/__init__.py +4 -0
  124. deltacat/experimental/daft/daft_catalog.py +229 -0
  125. deltacat/experimental/storage/__init__.py +0 -0
  126. deltacat/experimental/storage/iceberg/__init__.py +0 -0
  127. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  128. deltacat/experimental/storage/iceberg/impl.py +739 -0
  129. deltacat/experimental/storage/iceberg/model.py +713 -0
  130. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  131. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  132. deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
  133. deltacat/experimental/storage/rivulet/arrow/serializer.py +78 -0
  134. deltacat/experimental/storage/rivulet/dataset.py +745 -0
  135. deltacat/experimental/storage/rivulet/dataset_executor.py +79 -0
  136. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  137. deltacat/experimental/storage/rivulet/feather/file_reader.py +138 -0
  138. deltacat/experimental/storage/rivulet/feather/serializer.py +35 -0
  139. deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
  140. deltacat/experimental/storage/rivulet/fs/file_provider.py +105 -0
  141. deltacat/experimental/storage/rivulet/fs/file_store.py +130 -0
  142. deltacat/experimental/storage/rivulet/fs/input_file.py +76 -0
  143. deltacat/experimental/storage/rivulet/fs/output_file.py +86 -0
  144. deltacat/experimental/storage/rivulet/logical_plan.py +105 -0
  145. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  146. deltacat/experimental/storage/rivulet/metastore/delta.py +188 -0
  147. deltacat/experimental/storage/rivulet/metastore/json_sst.py +105 -0
  148. deltacat/experimental/storage/rivulet/metastore/sst.py +82 -0
  149. deltacat/experimental/storage/rivulet/metastore/sst_interval_tree.py +260 -0
  150. deltacat/experimental/storage/rivulet/mvp/Table.py +101 -0
  151. deltacat/experimental/storage/rivulet/mvp/__init__.py +5 -0
  152. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  153. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  154. deltacat/experimental/storage/rivulet/parquet/file_reader.py +129 -0
  155. deltacat/experimental/storage/rivulet/parquet/serializer.py +37 -0
  156. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  157. deltacat/experimental/storage/rivulet/reader/block_scanner.py +389 -0
  158. deltacat/experimental/storage/rivulet/reader/data_reader.py +136 -0
  159. deltacat/experimental/storage/rivulet/reader/data_scan.py +65 -0
  160. deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +179 -0
  161. deltacat/experimental/storage/rivulet/reader/dataset_reader.py +158 -0
  162. deltacat/experimental/storage/rivulet/reader/pyarrow_data_reader.py +124 -0
  163. deltacat/experimental/storage/rivulet/reader/query_expression.py +99 -0
  164. deltacat/experimental/storage/rivulet/reader/reader_type_registrar.py +84 -0
  165. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  166. deltacat/experimental/storage/rivulet/schema/datatype.py +128 -0
  167. deltacat/experimental/storage/rivulet/schema/schema.py +251 -0
  168. deltacat/experimental/storage/rivulet/serializer.py +40 -0
  169. deltacat/experimental/storage/rivulet/serializer_factory.py +46 -0
  170. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  171. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  172. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  173. deltacat/experimental/storage/rivulet/writer/dataset_writer.py +29 -0
  174. deltacat/experimental/storage/rivulet/writer/memtable_dataset_writer.py +305 -0
  175. deltacat/io/__init__.py +13 -0
  176. deltacat/io/dataset/__init__.py +0 -0
  177. deltacat/io/dataset/deltacat_dataset.py +91 -0
  178. deltacat/io/datasink/__init__.py +0 -0
  179. deltacat/io/datasink/deltacat_datasink.py +207 -0
  180. deltacat/io/datasource/__init__.py +0 -0
  181. deltacat/io/datasource/deltacat_datasource.py +579 -0
  182. deltacat/io/reader/__init__.py +0 -0
  183. deltacat/io/reader/deltacat_read_api.py +172 -0
  184. deltacat/logs.py +4 -1
  185. deltacat/storage/__init__.py +138 -28
  186. deltacat/storage/interface.py +260 -155
  187. deltacat/storage/main/__init__.py +0 -0
  188. deltacat/storage/main/impl.py +3030 -0
  189. deltacat/storage/model/delta.py +142 -71
  190. deltacat/storage/model/expression/__init__.py +47 -0
  191. deltacat/storage/model/expression/expression.py +656 -0
  192. deltacat/storage/model/expression/visitor.py +248 -0
  193. deltacat/storage/model/interop.py +24 -0
  194. deltacat/storage/model/list_result.py +8 -0
  195. deltacat/storage/model/locator.py +93 -9
  196. deltacat/storage/model/manifest.py +643 -0
  197. deltacat/storage/model/metafile.py +1421 -0
  198. deltacat/storage/model/namespace.py +41 -18
  199. deltacat/storage/model/partition.py +443 -43
  200. deltacat/storage/model/scan/__init__.py +0 -0
  201. deltacat/storage/model/scan/push_down.py +46 -0
  202. deltacat/storage/model/scan/scan_plan.py +10 -0
  203. deltacat/storage/model/scan/scan_task.py +34 -0
  204. deltacat/storage/model/schema.py +3160 -0
  205. deltacat/storage/model/shard.py +51 -0
  206. deltacat/storage/model/sort_key.py +210 -13
  207. deltacat/storage/model/stream.py +215 -80
  208. deltacat/storage/model/table.py +134 -29
  209. deltacat/storage/model/table_version.py +333 -46
  210. deltacat/storage/model/transaction.py +1733 -0
  211. deltacat/storage/model/transform.py +274 -58
  212. deltacat/storage/model/types.py +138 -16
  213. deltacat/storage/util/__init__.py +0 -0
  214. deltacat/storage/util/scan_planner.py +26 -0
  215. deltacat/tests/_io/__init__.py +1 -0
  216. deltacat/tests/_io/reader/__init__.py +0 -0
  217. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  218. deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +8 -4
  219. deltacat/tests/aws/test_s3u.py +2 -31
  220. deltacat/tests/catalog/data/__init__.py +0 -0
  221. deltacat/tests/catalog/main/__init__.py +0 -0
  222. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  223. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  224. deltacat/tests/catalog/model/__init__.py +0 -0
  225. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  226. deltacat/tests/catalog/test_catalogs.py +321 -0
  227. deltacat/tests/catalog/test_default_catalog_impl.py +12154 -66
  228. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
  229. deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
  230. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
  231. deltacat/tests/compute/compact_partition_test_cases.py +23 -30
  232. deltacat/tests/compute/compactor/steps/test_repartition.py +14 -14
  233. deltacat/tests/compute/compactor/utils/test_io.py +125 -123
  234. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  235. deltacat/tests/compute/compactor_v2/test_compaction_session.py +387 -830
  236. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +70 -57
  237. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +0 -3
  238. deltacat/tests/compute/conftest.py +39 -0
  239. deltacat/tests/compute/converter/__init__.py +0 -0
  240. deltacat/tests/compute/converter/conftest.py +80 -0
  241. deltacat/tests/compute/converter/test_convert_session.py +826 -0
  242. deltacat/tests/compute/converter/utils.py +132 -0
  243. deltacat/tests/compute/resource_estimation/test_delta.py +88 -104
  244. deltacat/tests/compute/test_compact_partition_incremental.py +91 -98
  245. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +79 -97
  246. deltacat/tests/compute/test_compact_partition_params.py +16 -11
  247. deltacat/tests/compute/test_compact_partition_rebase.py +63 -93
  248. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +249 -220
  249. deltacat/tests/compute/test_janitor.py +236 -0
  250. deltacat/tests/compute/test_util_common.py +726 -46
  251. deltacat/tests/compute/test_util_constant.py +0 -1
  252. deltacat/tests/conftest.py +25 -0
  253. deltacat/tests/daft/__init__.py +0 -0
  254. deltacat/tests/daft/test_model.py +97 -0
  255. deltacat/tests/experimental/__init__.py +1 -0
  256. deltacat/tests/experimental/catalog/__init__.py +0 -0
  257. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  258. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  259. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  260. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  261. deltacat/tests/experimental/daft/__init__.py +0 -0
  262. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  263. deltacat/tests/experimental/storage/__init__.py +0 -0
  264. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  265. deltacat/tests/experimental/storage/rivulet/conftest.py +149 -0
  266. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  267. deltacat/tests/experimental/storage/rivulet/fs/test_file_location_provider.py +94 -0
  268. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  269. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  270. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  271. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  272. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  273. deltacat/tests/experimental/storage/rivulet/schema/test_schema.py +241 -0
  274. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  275. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  276. deltacat/tests/experimental/storage/rivulet/test_dataset.py +408 -0
  277. deltacat/tests/experimental/storage/rivulet/test_manifest.py +67 -0
  278. deltacat/tests/experimental/storage/rivulet/test_sst_interval_tree.py +232 -0
  279. deltacat/tests/experimental/storage/rivulet/test_utils.py +124 -0
  280. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  281. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_write_then_read.py +343 -0
  282. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_writer.py +79 -0
  283. deltacat/tests/experimental/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
  284. deltacat/tests/storage/__init__.py +0 -0
  285. deltacat/tests/storage/main/__init__.py +0 -0
  286. deltacat/tests/storage/main/test_main_storage.py +8204 -0
  287. deltacat/tests/storage/model/__init__.py +0 -0
  288. deltacat/tests/storage/model/test_delete_parameters.py +21 -0
  289. deltacat/tests/storage/model/test_expression.py +327 -0
  290. deltacat/tests/storage/model/test_manifest.py +129 -0
  291. deltacat/tests/storage/model/test_metafile_io.py +2440 -0
  292. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  293. deltacat/tests/storage/model/test_schema.py +479 -0
  294. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  295. deltacat/tests/storage/model/test_shard.py +24 -0
  296. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  297. deltacat/tests/storage/model/test_table_version.py +110 -0
  298. deltacat/tests/storage/model/test_transaction.py +653 -0
  299. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  300. deltacat/tests/test_deltacat_api.py +1064 -0
  301. deltacat/tests/test_exceptions.py +9 -5
  302. deltacat/tests/test_utils/filesystem.py +14 -0
  303. deltacat/tests/test_utils/message_pack_utils.py +54 -0
  304. deltacat/tests/test_utils/pyarrow.py +50 -26
  305. deltacat/tests/test_utils/storage.py +256 -4
  306. deltacat/tests/types/__init__.py +0 -0
  307. deltacat/tests/types/test_tables.py +104 -0
  308. deltacat/tests/utils/exceptions.py +22 -0
  309. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  310. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  311. deltacat/tests/utils/test_daft.py +124 -34
  312. deltacat/tests/utils/test_numpy.py +1193 -0
  313. deltacat/tests/utils/test_pandas.py +1106 -0
  314. deltacat/tests/utils/test_polars.py +1040 -0
  315. deltacat/tests/utils/test_pyarrow.py +1107 -258
  316. deltacat/types/media.py +345 -37
  317. deltacat/types/partial_download.py +1 -1
  318. deltacat/types/tables.py +2345 -47
  319. deltacat/utils/arguments.py +33 -1
  320. deltacat/utils/daft.py +824 -40
  321. deltacat/utils/export.py +61 -0
  322. deltacat/utils/filesystem.py +450 -0
  323. deltacat/utils/metafile_locator.py +74 -0
  324. deltacat/utils/numpy.py +118 -26
  325. deltacat/utils/pandas.py +577 -48
  326. deltacat/utils/polars.py +759 -0
  327. deltacat/utils/pyarrow.py +1212 -178
  328. deltacat/utils/ray_utils/concurrency.py +1 -1
  329. deltacat/utils/ray_utils/dataset.py +101 -10
  330. deltacat/utils/ray_utils/runtime.py +56 -4
  331. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  332. deltacat/utils/url.py +1325 -0
  333. deltacat-2.0.0.dist-info/METADATA +1163 -0
  334. deltacat-2.0.0.dist-info/RECORD +439 -0
  335. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
  336. deltacat/aws/redshift/__init__.py +0 -19
  337. deltacat/aws/redshift/model/manifest.py +0 -394
  338. deltacat/catalog/default_catalog_impl/__init__.py +0 -369
  339. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  340. deltacat/compute/merge_on_read/__init__.py +0 -4
  341. deltacat/compute/merge_on_read/daft.py +0 -40
  342. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  343. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  344. deltacat/io/dataset.py +0 -73
  345. deltacat/io/read_api.py +0 -143
  346. deltacat/storage/model/delete_parameters.py +0 -40
  347. deltacat/storage/model/partition_spec.py +0 -71
  348. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  349. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -397
  350. deltacat/tests/local_deltacat_storage/__init__.py +0 -1262
  351. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  352. deltacat/utils/s3fs.py +0 -21
  353. deltacat-1.1.38.dist-info/METADATA +0 -64
  354. deltacat-1.1.38.dist-info/RECORD +0 -219
  355. /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
  356. /deltacat/{compute/merge_on_read/model → catalog/main}/__init__.py +0 -0
  357. /deltacat/compute/{merge_on_read/utils → converter}/__init__.py +0 -0
  358. /deltacat/{io/aws → compute/converter/model}/__init__.py +0 -0
  359. /deltacat/{io/aws/redshift → compute/converter/pyiceberg}/__init__.py +0 -0
  360. /deltacat/{tests/io → compute/converter/steps}/__init__.py +0 -0
  361. /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
  362. /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
  363. /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
  364. /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
  365. /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
  366. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
  367. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,124 @@
1
+ import inspect
2
+ import os
3
+
4
+ from pyarrow import RecordBatch, Table
5
+
6
+ from deltacat.experimental.storage.rivulet.dataset import Dataset
7
+ from deltacat.experimental.storage.rivulet.reader.query_expression import (
8
+ QueryExpression,
9
+ )
10
+ from deltacat.experimental.storage.rivulet.writer.dataset_writer import DatasetWriter
11
+
12
+ from deltacat.experimental.storage.rivulet.mvp.Table import MvpTable, MvpRow
13
+ from deltacat.experimental.storage.rivulet import Schema
14
+ from typing import Dict, List, Generator, Set
15
+
16
+ FIXTURE_ROW_COUNT = 10000
17
+
18
+
19
def write_mvp_table(writer: DatasetWriter, table: MvpTable):
    """Write all rows of ``table`` to ``writer`` in a single ``write`` call.

    Args:
        writer: Dataset writer that receives the rows.
        table: Table whose ``to_rows_list()`` output is written.
    """
    rows = table.to_rows_list()
    writer.write(rows)
21
+
22
+
23
def mvp_table_to_record_batches(table: MvpTable, schema: Schema) -> RecordBatch:
    """Build one pyarrow ``RecordBatch`` from the rows of ``table``.

    Columns are taken from ``schema.keys()``; each column holds the value
    every row returns from ``row.get(column_name)``, in row order.

    Args:
        table: Source table; its rows come from ``to_rows_list()``.
        schema: Supplies the column names and, via ``to_pyarrow()``, the
            arrow schema applied to the batch.

    Returns:
        A single ``RecordBatch`` containing every row of ``table``.
    """
    rows = table.to_rows_list()
    columns = {}
    for column_name in schema.keys():
        columns[column_name] = [row.get(column_name) for row in rows]
    return RecordBatch.from_pydict(columns, schema=schema.to_pyarrow())
28
+
29
+
30
def compare_mvp_table_to_scan_results(
    table: MvpTable, scan_results: List[dict], pk: str
):
    """Assert that ``scan_results`` holds exactly the rows of ``table``.

    The check passes when the scan has as many records as the table has rows,
    the table has that same number of distinct ``pk`` values, and every
    scanned record equals the ``data`` of the table row sharing its ``pk``.

    Args:
        table: Expected contents.
        scan_results: Records produced by a scan, one dict per record.
        pk: Name of the key field used to pair records with table rows.

    Raises:
        AssertionError: If counts differ or a record does not match.
        KeyError: If a record lacks ``pk`` or its key is not in the table.
    """
    expected_rows = table.to_rows_list()
    assert len(scan_results) == len(expected_rows)

    expected_by_pk: Dict[str, MvpRow] = table.to_rows_by_key(pk)
    assert len(expected_by_pk) == len(scan_results)

    for scanned in scan_results:
        assert expected_by_pk[scanned[pk]].data == scanned
40
+
41
+
42
def validate_with_full_scan(dataset: Dataset, expected: MvpTable, schema: Schema):
    """Scan ``dataset`` with an empty query and compare it to ``expected``.

    Records are read back through the dataset's own scan path and matched
    against ``expected`` using the first merge key the dataset reports.

    Args:
        dataset: Dataset to read.
        expected: Table the scan output must equal.
        schema: Not used by this function; kept so callers need not change.
    """
    scan = dataset.scan(QueryExpression())
    scanned_records = list(scan.to_pydict())
    merge_key = list(dataset.get_merge_keys())[0]
    compare_mvp_table_to_scan_results(expected, scanned_records, merge_key)
48
+
49
+
50
def generate_data_files(dataset: Dataset) -> Generator[str, None, None]:
    """Yield the ``uri`` of every SSTable row reachable from ``dataset``.

    Walks each manifest produced by the dataset's metastore, then each
    SSTable of that manifest, then each row of that SSTable.
    """
    for manifest_accessor in dataset._metastore.generate_manifests():
        for sstable in manifest_accessor.generate_sstables():
            yield from (sst_row.uri for sst_row in sstable.rows)
55
+
56
+
57
def assert_data_file_extension(dataset: Dataset, file_extension: str):
    """Assert that every data file of ``dataset`` ends with ``file_extension``.

    Also fails when the dataset yields no data files at all, so an empty
    dataset cannot pass vacuously. Prints a short summary on success.

    Raises:
        AssertionError: On the first non-matching file, or if none exist.
    """
    data_file_count = 0
    for data_file_count, data_file in enumerate(
        generate_data_files(dataset), start=1
    ):
        assert data_file.endswith(file_extension)

    assert data_file_count > 0, "No data files found in dataset"
    print(f"Asserted that {data_file_count} data files end with {file_extension}")
64
+
65
+
66
def assert_data_file_extension_set(dataset: Dataset, file_extension_set: Set[str]):
    """
    Asserts that each file extension in set appears at least once in dataset

    Each data file is credited to at most one extension: the first one in
    the set's iteration order that the file name ends with. The dataset must
    contain at least one data file. Prints a short summary on success.
    """
    data_file_count = 0
    found_extensions = set()

    for data_file in generate_data_files(dataset):
        data_file_count += 1
        # First extension this file ends with, or None when nothing matches.
        matched = next(
            (ext for ext in file_extension_set if data_file.endswith(ext)),
            None,
        )
        if matched is not None:
            found_extensions.add(matched)

    assert data_file_count > 0, "No data files found in dataset"
    assert (
        found_extensions == file_extension_set
    ), f"Missing extensions: {file_extension_set - found_extensions}"
    print(
        f"Asserted that among {data_file_count} data files, all extensions {file_extension_set} were found"
    )
87
+
88
+
89
def create_dataset_for_method(temp_dir: str):
    """
    Given a temp directory, creates a directory within it based on the name of the function calling this.
    Then returns a dataset rooted at that directory.

    Args:
        temp_dir: Existing directory under which the per-caller directory
            is created.

    Returns:
        A ``Dataset`` named ``dataset-<caller function name>`` whose
        ``metadata_uri`` is the newly created directory.

    Raises:
        FileExistsError: If the per-caller directory already exists, e.g.
            when the same function calls this twice with one ``temp_dir``.
    """
    # Index 0 is this function's own frame; index 1 is the direct caller.
    caller_frame = inspect.getouterframes(inspect.currentframe())[1]
    caller_name = caller_frame.function
    dataset_dir = os.path.join(temp_dir, caller_name)
    os.makedirs(dataset_dir)
    # Python f-strings interpolate with "{...}"; the previous "${...}" form
    # left a literal "$" in every dataset name.
    return Dataset(dataset_name=f"dataset-{caller_name}", metadata_uri=dataset_dir)
100
+
101
+
102
def verify_pyarrow_scan(
    scan_result: Generator[RecordBatch, None, None],
    expected_schema: Schema,
    expected_data: dict,
):
    """Assert that an arrow scan produced the expected columns and values.

    All batches are drained from ``scan_result`` and combined into a single
    table. The table's column names must equal the set of ``name``
    attributes of ``expected_schema.values()``, and each column's values
    must equal ``expected_data[<column name>]``.

    Args:
        scan_result: Iterable of record batches; consumed completely.
        expected_schema: Schema whose field names define the expected columns.
        expected_data: Mapping from column name to the expected value list.

    Raises:
        AssertionError: If no batch is returned, the column sets differ, or
            any column's data does not match.
    """
    batches = list(scan_result)
    assert len(batches) > 0, "Scan should return at least one record batch."

    combined = Table.from_batches(batches)

    wanted_names = set(f.name for f in expected_schema.values())
    actual_names = set(combined.schema.names)
    assert (
        actual_names == wanted_names
    ), f"Scanned fields {actual_names} do not match expected fields {wanted_names}."

    for name in wanted_names:
        assert (
            name in combined.column_names
        ), f"Field '{name}' is missing in the scan result."
        column_values = combined[name].to_pylist()
        assert (
            column_values == expected_data[name]
        ), f"Field '{name}' data does not match expected values."
@@ -0,0 +1,343 @@
1
+ import math
2
+ import shutil
3
+ import tempfile
4
+ from typing import Dict, List, Iterator
5
+ import msgpack
6
+
7
+ import pytest
8
+ from pyarrow import RecordBatch
9
+
10
+ from deltacat.experimental.storage.rivulet.dataset import Dataset
11
+ from deltacat.experimental.storage.rivulet.fs.file_store import FileStore
12
+ from deltacat.experimental.storage.rivulet.metastore.delta import (
13
+ ManifestIO,
14
+ TreeLevel,
15
+ DeltacatManifestIO,
16
+ )
17
+
18
+ from deltacat.experimental.storage.rivulet.mvp.Table import MvpTable, MvpRow
19
+ from deltacat.experimental.storage.rivulet.reader.query_expression import (
20
+ QueryExpression,
21
+ )
22
+ from deltacat.experimental.storage.rivulet import Schema
23
+ from deltacat.experimental.storage.rivulet.writer.memtable_dataset_writer import (
24
+ MemtableDatasetWriter,
25
+ )
26
+
27
+ from deltacat.tests.experimental.storage.rivulet.test_utils import FIXTURE_ROW_COUNT
28
+ from deltacat.tests.experimental.storage.rivulet.test_utils import (
29
+ write_mvp_table,
30
+ compare_mvp_table_to_scan_results,
31
+ mvp_table_to_record_batches,
32
+ validate_with_full_scan,
33
+ assert_data_file_extension_set,
34
+ create_dataset_for_method,
35
+ )
36
+
37
+ MemtableDatasetWriter.MAX_ROW_SIZE = 100
38
+
39
+
40
class TestBasicEndToEnd:
    """Write a table into a dataset, then read it back through several scans.

    One dataset backed by a temporary directory is created per class and
    shared by all tests; the directory is removed when the class finishes.
    """

    # Path of the class-wide temporary directory; assigned in setup_class.
    temp_dir = None

    @classmethod
    def setup_class(cls):
        """Create the temporary directory and the dataset stored in it."""
        cls.temp_dir = tempfile.mkdtemp()
        cls.dataset = Dataset(dataset_name="test", metadata_uri=cls.temp_dir)

    @classmethod
    def teardown_class(cls):
        """Delete the temporary directory and everything written to it."""
        shutil.rmtree(cls.temp_dir)

    @pytest.fixture
    def ds1_schema(self, ds1_schema: Schema, ds1_dataset: MvpTable):
        """Register the schema on the shared dataset and write ``ds1_dataset``.

        Takes the fixture of the same name as input and returns that schema
        unchanged, so tests receive it with the data already written.
        """
        schema_name = "ds1_schema"
        self.dataset.add_schema(ds1_schema, schema_name)
        with self.dataset.writer(schema_name) as writer:
            write_mvp_table(writer, ds1_dataset)
        return ds1_schema

    def test_end_to_end_scan_pydict(self, ds1_schema, ds1_dataset):
        """A full scan read as dicts returns exactly the written table."""
        full_scan = self.dataset.scan(QueryExpression())
        read_records: List[Dict] = list(full_scan.to_pydict())
        merge_key = ds1_schema.get_merge_key()
        compare_mvp_table_to_scan_results(ds1_dataset, read_records, merge_key)

    def test_end_to_end_scan_key_range(self, ds1_schema, ds1_dataset):
        """A scan over key range (100, 500) returns 401 records."""
        # 401 == 500 - 100 + 1, which is consistent with an inclusive range
        # over contiguous ids; the fixture data is defined outside this file.
        range_query = QueryExpression().with_range(100, 500)
        read_records_range: List[Dict] = list(
            self.dataset.scan(range_query).to_pydict()
        )
        assert len(read_records_range) == 401

    def test_end_to_end_scan_single_key(self, ds1_schema, ds1_dataset):
        """A scan for key 600 returns the single record whose id is 600."""
        key_query = QueryExpression().with_key(600)
        read_records_single_key: List[Dict] = list(
            self.dataset.scan(key_query).to_pydict()
        )
        assert len(read_records_single_key) == 1
        assert read_records_single_key[0]["id"] == 600

    def test_end_to_end_scan_pyarrow(self, ds1_schema, ds1_dataset):
        """A full scan read as arrow batches returns exactly the written table."""
        batches: Iterator[RecordBatch] = self.dataset.scan(QueryExpression()).to_arrow()
        read_records = []
        for batch in batches:
            read_records.extend(batch.to_pylist())
        compare_mvp_table_to_scan_results(
            ds1_dataset, read_records, ds1_schema.get_merge_key()
        )
88
+
89
+
90
class TestMultiLayerCompactionEndToEnd:
    """Merge-on-read resolution across manifests at several tree levels.

    For records sharing a primary key, the expected precedence is:

    1. Higher layers win over lower layers (L0 over L1 over L2 in this test)
    2. Newer SSTs win over older SSTs (described by the original author as
       really only relevant for L0)

    Four manifests are written, oldest first:
    1. L0 manifest A (oldest, perhaps because of compaction) with ids {x}
    2. L1 manifest B with ids {x} and {y}
    3. L1 manifest C with ids {y}
    4. L2 manifest D with ids {y} (not strictly required for the demonstration)

    The scan output should therefore contain:
    - {x} from manifest A (its layer outranks manifest B's)
    - {y} from manifest C (it is newer than manifest B)
    """

    temp_dir = None
    file_store: FileStore
    manifest_io: ManifestIO

    @classmethod
    def setup_class(cls):
        """Create a scratch directory and a dataset whose metadata lives in it."""
        cls.temp_dir = tempfile.mkdtemp()
        resolved_path, _filesystem = FileStore.filesystem(cls.temp_dir)
        cls.dataset = Dataset(metadata_uri=resolved_path, dataset_name="test")
        cls.file_store = cls.dataset._file_store
        cls.manifest_io = DeltacatManifestIO(cls.temp_dir, cls.dataset._locator)

    @classmethod
    def teardown_class(cls):
        """Delete the scratch directory created in ``setup_class``."""
        shutil.rmtree(cls.temp_dir)

    @pytest.fixture
    def l0_overwrite(self, ds1_dataset):
        """Second half of the rows: name set to "overwritten", age set to None."""
        midpoint = math.floor(FIXTURE_ROW_COUNT / 2)
        return self._transform_dataset(
            ds1_dataset,
            midpoint,
            FIXTURE_ROW_COUNT,
            transform_name=lambda _name: "overwritten",
            transform_age=lambda _age: None,
        )

    @pytest.fixture
    def l1_overwrite(self, ds1_dataset):
        """First half of the rows with name set to "overwritten"."""
        midpoint = math.floor(FIXTURE_ROW_COUNT / 2)
        return self._transform_dataset(
            ds1_dataset,
            0,
            midpoint,
            transform_name=lambda _name: "overwritten",
        )

    @pytest.fixture
    def l2_ignored(self, ds1_dataset):
        """First half of the rows with name set to "ignored"."""
        midpoint = math.floor(FIXTURE_ROW_COUNT / 2)
        return self._transform_dataset(
            ds1_dataset,
            0,
            midpoint,
            transform_name=lambda _name: "ignored",
        )

    def _transform_dataset(
        self,
        dataset,
        min_index=0,
        max_index=FIXTURE_ROW_COUNT,
        transform_id=lambda x: x,
        transform_name=lambda x: x,
        transform_age=lambda x: x,
    ):
        """Build a new MvpTable from the slice [min_index:max_index] of ``dataset``.

        Each of the "id", "name" and "age" columns is passed value-by-value
        through its transform; every transform defaults to the identity.
        """
        window = slice(min_index, max_index)
        source = dataset.data
        column_transforms = (
            ("id", transform_id),
            ("name", transform_name),
            ("age", transform_age),
        )
        return MvpTable(
            {
                column: [convert(value) for value in source[column][window]]
                for column, convert in column_transforms
            }
        )

    @pytest.fixture
    def expected_dataset(self, l1_overwrite, l0_overwrite):
        """Rows the scan should yield: the ``l1_overwrite`` columns followed by
        the ``l0_overwrite`` columns."""
        first_half, second_half = l1_overwrite.data, l0_overwrite.data
        return MvpTable(
            {
                column: first_half[column] + second_half[column]
                for column in ("id", "name", "age")
            }
        )

    @pytest.fixture
    def ds1_written_uri(
        self, ds1_schema, ds1_dataset, l2_ignored, l0_overwrite, l1_overwrite
    ):
        """Register the schema and write the four manifests, oldest first.

        Nothing is returned; tests request this fixture for its side effects.
        """
        print(f"Writing test data to directory {self.temp_dir}")
        self.dataset.add_schema(ds1_schema, "ds1_schema")
        # Oldest manifest; it is not rewritten, so it keeps the level the
        # writer gave it (L0 per the class docstring) and should take precedence.
        self.write_dataset("ds1_schema", l0_overwrite)
        # In write order: the original dataset at L1, a newer overwrite at L1
        # (should beat the original), and a still newer table at L2 whose rows
        # are superseded by the L1 overwrite (see ``expected_dataset``).
        for table, level in ((ds1_dataset, 1), (l1_overwrite, 1), (l2_ignored, 2)):
            manifest_uri = self.write_dataset("ds1_schema", table)
            self.rewrite_at_level(manifest_uri, level)

    def test_end_to_end_scan(self, ds1_written_uri, ds1_schema, expected_dataset):
        """A full scan must equal ``expected_dataset`` row for row; a key-range
        scan and a single-key scan are then spot-checked."""
        scanned: List[Dict] = list(self.dataset.scan(QueryExpression()).to_pydict())
        merge_key = ds1_schema.get_merge_key()
        expected_by_key: Dict[str, MvpRow] = expected_dataset.to_rows_by_key(
            merge_key
        )
        assert len(scanned) == len(expected_by_key)
        for row in scanned:
            assert row == expected_by_key[row[merge_key]].data

        # Key-range scan
        range_query = QueryExpression().with_range(100, 500)
        in_range: List[Dict] = list(self.dataset.scan(range_query).to_pydict())
        assert len(in_range) == 401

        # Single-key scan
        point_query = QueryExpression().with_key(600)
        single: List[Dict] = list(self.dataset.scan(point_query).to_pydict())
        assert len(single) == 1
        assert single[0]["id"] == 600

    def write_dataset(self, schema_name: str, dataset) -> str:
        """Write ``dataset`` through a new writer for ``schema_name`` and return
        the value produced by the writer's ``flush()``."""
        writer = self.dataset.writer(schema_name)
        write_mvp_table(writer, dataset)
        return writer.flush()

    def rewrite_at_level(self, uri: str, level: TreeLevel):
        """Overwrite the "level" entry of the msgpack file at ``uri`` in place.

        TODO: replace this with a compaction operation
        """
        with open(uri, "rb") as source:
            manifest = msgpack.unpack(source)
            manifest["level"] = level

        with open(uri, "wb") as sink:
            msgpack.pack(manifest, sink)
243
+
244
+
245
class TestZipperMergeEndToEnd:
    """Write two schemas into one dataset and check that scans return, for
    each key, the union of the fields written under both schemas."""

    temp_dir = None
    file_store: FileStore

    @classmethod
    def setup_class(cls):
        """Create a scratch directory, a dataset on it, and a FileStore for it."""
        cls.temp_dir = tempfile.mkdtemp()
        resolved_path, fs = FileStore.filesystem(cls.temp_dir)
        cls.dataset = Dataset(metadata_uri=cls.temp_dir, dataset_name="test")
        cls.file_store = FileStore(resolved_path, filesystem=fs)

    @classmethod
    def teardown_class(cls):
        """Delete the scratch directory created in ``setup_class``."""
        shutil.rmtree(cls.temp_dir)

    @pytest.fixture
    def schema1(self, ds1_dataset: MvpTable, ds1_schema: Schema):
        """Register ``ds1_schema``, write ``ds1_dataset`` under it, return the schema."""
        self.dataset.add_schema(ds1_schema, "ds1_schema")
        with self.dataset.writer("ds1_schema") as sink:
            write_mvp_table(sink, ds1_dataset)
        return ds1_schema

    @pytest.fixture
    def schema2(self, ds2_dataset: MvpTable, ds2_schema: Schema):
        """Register ``ds2_schema``, write ``ds2_dataset`` under it, return the schema."""
        self.dataset.add_schema(ds2_schema, "ds2_schema")
        with self.dataset.writer("ds2_schema") as sink:
            write_mvp_table(sink, ds2_dataset)
        return ds2_schema

    def test_end_to_end_scan(
        self,
        schema1,
        schema2,
        ds1_schema,
        ds1_dataset,
        ds2_dataset,
        ds2_schema,
        combined_schema,
    ):
        """Every scanned record must equal the ds1 row merged with the ds2 row
        for the same key; range and single-key scans are then spot-checked."""
        scanned: List[Dict] = list(self.dataset.scan(QueryExpression()).to_pydict())

        merge_key = ds1_schema.get_merge_key()
        left_by_key: Dict[str, MvpRow] = ds1_dataset.to_rows_by_key(merge_key)
        right_by_key: Dict[str, MvpRow] = ds2_dataset.to_rows_by_key(merge_key)

        assert len(scanned) == len(left_by_key)
        for record in scanned:
            key_value = record[merge_key]
            left_row = left_by_key[key_value]
            right_row = right_by_key[key_value]
            # On overlapping fields the ds2 values win (dict union semantics).
            assert left_row.data | right_row.data == record

        # Key-range scan
        range_query = QueryExpression().with_range(100, 500)
        in_range: List[Dict] = list(self.dataset.scan(range_query).to_pydict())
        assert len(in_range) == 401

        # Single-key scan
        point_query = QueryExpression().with_key(600)
        single: List[Dict] = list(self.dataset.scan(point_query).to_pydict())
        assert len(single) == 1
        assert single[0]["id"] == 600
312
+
313
+
314
class TestDataFormatSupport:
    """Checks that one dataset can hold data files written in different formats."""

    temp_dir = None
    file_store: FileStore

    @classmethod
    def setup_class(cls):
        """Create a scratch directory and a FileStore pointing at it."""
        cls.temp_dir = tempfile.mkdtemp()
        resolved_path, fs = FileStore.filesystem(cls.temp_dir)
        cls.file_store = FileStore(resolved_path, filesystem=fs)

    @classmethod
    def teardown_class(cls):
        """Delete the scratch directory created in ``setup_class``."""
        shutil.rmtree(cls.temp_dir)

    # TODO expand coverage - below test is more like smoke test since dataset rows the same across types
    def test_mixed_content_dataset(self, dataset_images_with_label):
        """Write the same table once as feather and once as parquet, then check
        the full scan and the set of data-file extensions."""
        dataset = create_dataset_for_method(self.temp_dir)
        table, schema = dataset_images_with_label
        dataset.add_schema(schema, "schema")

        # Same rows written twice, feather first and parquet second.
        for file_format in ("feather", "parquet"):
            with dataset.writer("schema", file_format) as writer:
                batches = mvp_table_to_record_batches(table, schema)
                writer.write([batches])

        validate_with_full_scan(dataset, table, schema)
        assert_data_file_extension_set(dataset, {".feather", ".parquet"})
@@ -0,0 +1,79 @@
1
+ import pytest
2
+ import shutil
3
+ import tempfile
4
+
5
+ from deltacat.experimental.storage.rivulet.fs.file_store import FileStore
6
+ from deltacat.experimental.storage.rivulet.writer.memtable_dataset_writer import (
7
+ MemtableDatasetWriter,
8
+ )
9
+ from ..test_utils import (
10
+ write_mvp_table,
11
+ mvp_table_to_record_batches,
12
+ validate_with_full_scan,
13
+ create_dataset_for_method,
14
+ assert_data_file_extension,
15
+ )
16
+
17
# Override MemtableDatasetWriter's class-level MAX_ROW_SIZE when this module is
# imported. The assignment mutates the class itself, so the value applies to
# every use of MemtableDatasetWriter in the same process, not just these tests.
# NOTE(review): presumably chosen so the small fixtures cross the writer's
# size threshold -- confirm against MemtableDatasetWriter.
MemtableDatasetWriter.MAX_ROW_SIZE = 100
18
+
19
+
20
class TestWriter:
    """Write-path tests: each test creates its own dataset under a shared
    scratch directory, writes through ``dataset.writer(...)`` and validates
    the result with a full scan."""

    temp_dir = None
    file_store: FileStore

    @classmethod
    def setup_class(cls):
        """Create the shared scratch directory and a FileStore pointing at it."""
        cls.temp_dir = tempfile.mkdtemp()
        path, filesystem = FileStore.filesystem(cls.temp_dir)
        cls.file_store = FileStore(path, filesystem)

    @classmethod
    def teardown_class(cls):
        """Delete the shared scratch directory."""
        shutil.rmtree(cls.temp_dir)

    def test_write_unsupported_data_type(self, ds1_dataset, ds1_schema):
        """Writing a plain string must raise ``ValueError``."""
        dataset = create_dataset_for_method(self.temp_dir)
        dataset.add_schema(ds1_schema, "ds1_schema")
        with dataset.writer("ds1_schema") as writer:
            with pytest.raises(ValueError):
                writer.write("a string")

    def test_write_pydict(self, ds1_dataset, ds1_schema):
        """Write the table via ``write_mvp_table`` and validate with a full scan."""
        dataset = create_dataset_for_method(self.temp_dir)
        dataset.add_schema(ds1_schema, "ds1_schema")
        with dataset.writer("ds1_schema") as writer:
            write_mvp_table(writer, ds1_dataset)

        validate_with_full_scan(dataset, ds1_dataset, ds1_schema)

    def test_write_record_batch(self, ds1_dataset, ds1_schema):
        """Pass the converted record batch to ``write`` directly (not in a list)."""
        dataset = create_dataset_for_method(self.temp_dir)
        dataset.add_schema(ds1_schema, "ds1_schema")
        with dataset.writer("ds1_schema") as writer:
            record_batch = mvp_table_to_record_batches(ds1_dataset, ds1_schema)
            writer.write(record_batch)

        validate_with_full_scan(dataset, ds1_dataset, ds1_schema)

    def test_write_list_of_record_batch(self, ds1_dataset, ds1_schema):
        """Pass the converted record batch wrapped in a list, written as feather."""
        dataset = create_dataset_for_method(self.temp_dir)
        dataset.add_schema(ds1_schema, "ds1_schema")
        with dataset.writer("ds1_schema", "feather") as writer:
            record_batch = mvp_table_to_record_batches(ds1_dataset, ds1_schema)
            writer.write([record_batch])

        validate_with_full_scan(dataset, ds1_dataset, ds1_schema)
        assert_data_file_extension(dataset, ".feather")

    def test_write_feather(self, dataset_images_with_label):
        """Write the image/label fixture as feather and check the file extension."""
        dataset = create_dataset_for_method(self.temp_dir)

        table, schema = dataset_images_with_label
        dataset.add_schema(schema, "schema")
        with dataset.writer("schema", "feather") as writer:
            record_batch = mvp_table_to_record_batches(table, schema)
            writer.write([record_batch])

        validate_with_full_scan(dataset, table, schema)
        # Use the dotted form, matching the other extension assertions in these
        # tests; a bare "feather" would also accept names that merely end in
        # those letters.
        assert_data_file_extension(dataset, ".feather")
@@ -0,0 +1,75 @@
1
+ import pytest
2
+
3
+ from deltacat.experimental.storage.rivulet.dataset import Dataset
4
+ from deltacat.experimental.storage.rivulet.fs.file_provider import FileProvider
5
+ from deltacat.experimental.storage.rivulet.fs.file_store import FileStore
6
+ from deltacat.experimental.storage.rivulet.metastore.delta import DeltacatManifestIO
7
+ from deltacat.experimental.storage.rivulet import Schema
8
+ from deltacat.experimental.storage.rivulet.schema.datatype import Datatype
9
+ from deltacat.experimental.storage.rivulet.writer.memtable_dataset_writer import (
10
+ MemtableDatasetWriter,
11
+ )
12
+
13
+
14
@pytest.fixture
def test_schema():
    """Schema with an int32 ``id`` field and a string ``name`` field, merged on ``id``."""
    columns = [
        ("id", Datatype.int32()),
        ("name", Datatype.string()),
    ]
    return Schema(fields=columns, merge_keys="id")
23
+
24
+
25
@pytest.fixture
def resolve_path_and_filesystem(tmp_path):
    """Return what ``FileStore.filesystem`` resolves for pytest's ``tmp_path``.

    Dependent fixtures unpack the result as ``(path, filesystem)``.
    """
    resolved = FileStore.filesystem(tmp_path)
    return resolved
28
+
29
+
30
@pytest.fixture
def file_provider(resolve_path_and_filesystem):
    """FileProvider for the resolved temp path, backed by a new FileStore."""
    root, fs = resolve_path_and_filesystem
    return FileProvider(root, FileStore(root, fs))
35
+
36
+
37
@pytest.fixture
def file_store(resolve_path_and_filesystem):
    """FileStore built from the resolved temp path and filesystem."""
    root, fs = resolve_path_and_filesystem
    store = FileStore(root, filesystem=fs)
    return store
41
+
42
+
43
def test_write_after_flush(tmp_path, test_schema):
    """A writer remains usable after ``flush()``: the second flush yields a
    manifest with its own SST files and the same schema as the writer."""
    dataset = Dataset(dataset_name="dataset", metadata_uri=tmp_path)
    store = dataset._file_store
    writer = MemtableDatasetWriter(
        file_provider=dataset._file_provider,
        schema=test_schema,
        locator=dataset._locator,
    )

    # First write + flush
    writer.write_dict({"id": 100, "name": "alpha"})
    first_uri = writer.flush()

    manifest_io = DeltacatManifestIO(writer.file_provider.uri, dataset._locator)
    first_manifest = manifest_io.read(first_uri)
    first_ssts = first_manifest.sst_files

    assert len(first_ssts) > 0, "First flush: no SST files found."
    assert (
        first_manifest.context.schema == writer.schema
    ), "Schema mismatch in first flush."

    # Second write + flush on the same writer instance
    writer.write_dict({"id": 200, "name": "gamma"})
    second_uri = writer.flush()

    # This time the manifest location is resolved through the file store.
    second_location = store.create_input_file(second_uri).location
    second_manifest = manifest_io.read(second_location)
    second_ssts = second_manifest.sst_files

    assert len(second_ssts) > 0, "Second flush: no SST files found."

    # SST files from the first flush must not reappear in the second manifest.
    assert set(first_ssts).isdisjoint(
        set(second_ssts)
    ), "Expected no overlap of SST files between first and second flush."
    assert (
        second_manifest.context.schema == writer.schema
    ), "Schema mismatch in second flush."
File without changes
File without changes