deltacat 1.1.38__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (367) hide show
  1. deltacat/__init__.py +150 -12
  2. deltacat/annotations.py +36 -0
  3. deltacat/api.py +578 -0
  4. deltacat/aws/constants.py +0 -23
  5. deltacat/aws/s3u.py +4 -631
  6. deltacat/benchmarking/benchmark_engine.py +84 -0
  7. deltacat/benchmarking/benchmark_report.py +86 -0
  8. deltacat/benchmarking/benchmark_suite.py +11 -0
  9. deltacat/benchmarking/conftest.py +22 -19
  10. deltacat/benchmarking/data/random_row_generator.py +94 -0
  11. deltacat/benchmarking/data/row_generator.py +10 -0
  12. deltacat/benchmarking/test_benchmark_pipeline.py +108 -0
  13. deltacat/catalog/__init__.py +73 -0
  14. deltacat/catalog/delegate.py +615 -140
  15. deltacat/catalog/interface.py +404 -81
  16. deltacat/catalog/main/impl.py +2882 -0
  17. deltacat/catalog/model/catalog.py +348 -46
  18. deltacat/catalog/model/properties.py +155 -0
  19. deltacat/catalog/model/table_definition.py +32 -1
  20. deltacat/compute/__init__.py +14 -0
  21. deltacat/compute/compactor/compaction_session.py +97 -75
  22. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  23. deltacat/compute/compactor/model/compaction_session_audit_info.py +23 -30
  24. deltacat/compute/compactor/model/delta_annotated.py +3 -3
  25. deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
  26. deltacat/compute/compactor/model/delta_file_locator.py +3 -1
  27. deltacat/compute/compactor/model/round_completion_info.py +19 -9
  28. deltacat/compute/compactor/model/table_object_store.py +3 -2
  29. deltacat/compute/compactor/repartition_session.py +9 -22
  30. deltacat/compute/compactor/steps/dedupe.py +11 -4
  31. deltacat/compute/compactor/steps/hash_bucket.py +6 -6
  32. deltacat/compute/compactor/steps/materialize.py +15 -9
  33. deltacat/compute/compactor/steps/repartition.py +12 -11
  34. deltacat/compute/compactor/utils/io.py +7 -6
  35. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  36. deltacat/compute/compactor/utils/sort_key.py +9 -2
  37. deltacat/compute/compactor/utils/system_columns.py +3 -1
  38. deltacat/compute/compactor_v2/compaction_session.py +13 -14
  39. deltacat/compute/compactor_v2/deletes/utils.py +3 -3
  40. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  41. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  42. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  43. deltacat/compute/compactor_v2/model/merge_input.py +28 -9
  44. deltacat/compute/compactor_v2/private/compaction_utils.py +171 -73
  45. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  46. deltacat/compute/compactor_v2/steps/merge.py +156 -53
  47. deltacat/compute/compactor_v2/utils/content_type_params.py +17 -6
  48. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  49. deltacat/compute/compactor_v2/utils/io.py +10 -3
  50. deltacat/compute/compactor_v2/utils/merge.py +14 -2
  51. deltacat/compute/compactor_v2/utils/task_options.py +2 -10
  52. deltacat/compute/converter/constants.py +9 -0
  53. deltacat/compute/converter/converter_session.py +298 -0
  54. deltacat/compute/converter/model/convert_input.py +96 -0
  55. deltacat/compute/converter/model/convert_input_files.py +78 -0
  56. deltacat/compute/converter/model/convert_result.py +80 -0
  57. deltacat/compute/converter/model/converter_session_params.py +144 -0
  58. deltacat/compute/converter/pyiceberg/catalog.py +78 -0
  59. deltacat/compute/converter/pyiceberg/overrides.py +263 -0
  60. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +299 -0
  61. deltacat/compute/converter/steps/convert.py +366 -0
  62. deltacat/compute/converter/steps/dedupe.py +94 -0
  63. deltacat/compute/converter/utils/__init__.py +0 -0
  64. deltacat/compute/converter/utils/convert_task_options.py +132 -0
  65. deltacat/compute/converter/utils/converter_session_utils.py +175 -0
  66. deltacat/compute/converter/utils/iceberg_columns.py +87 -0
  67. deltacat/compute/converter/utils/io.py +203 -0
  68. deltacat/compute/converter/utils/s3u.py +148 -0
  69. deltacat/compute/janitor.py +205 -0
  70. deltacat/compute/jobs/__init__.py +0 -0
  71. deltacat/compute/jobs/client.py +417 -0
  72. deltacat/compute/resource_estimation/delta.py +11 -1
  73. deltacat/constants.py +90 -1
  74. deltacat/docs/__init__.py +0 -0
  75. deltacat/docs/autogen/__init__.py +0 -0
  76. deltacat/docs/autogen/schema/__init__.py +0 -0
  77. deltacat/docs/autogen/schema/inference/__init__.py +0 -0
  78. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  79. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  80. deltacat/env.py +61 -0
  81. deltacat/examples/__init__.py +0 -0
  82. deltacat/examples/basic_logging.py +101 -0
  83. deltacat/examples/compactor/__init__.py +0 -0
  84. deltacat/examples/compactor/aws/__init__.py +1 -0
  85. deltacat/examples/compactor/bootstrap.py +863 -0
  86. deltacat/examples/compactor/compactor.py +373 -0
  87. deltacat/examples/compactor/explorer.py +473 -0
  88. deltacat/examples/compactor/gcp/__init__.py +1 -0
  89. deltacat/examples/compactor/job_runner.py +439 -0
  90. deltacat/examples/compactor/utils/__init__.py +1 -0
  91. deltacat/examples/compactor/utils/common.py +261 -0
  92. deltacat/examples/experimental/__init__.py +0 -0
  93. deltacat/examples/experimental/iceberg/__init__.py +0 -0
  94. deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
  95. deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
  96. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  97. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  98. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  99. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  100. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  101. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  102. deltacat/examples/experimental/iceberg/iceberg_bucket_writer.py +184 -0
  103. deltacat/examples/experimental/iceberg/iceberg_reader.py +147 -0
  104. deltacat/examples/hello_world.py +29 -0
  105. deltacat/examples/indexer/__init__.py +0 -0
  106. deltacat/examples/indexer/aws/__init__.py +0 -0
  107. deltacat/examples/indexer/gcp/__init__.py +0 -0
  108. deltacat/examples/indexer/indexer.py +163 -0
  109. deltacat/examples/indexer/job_runner.py +198 -0
  110. deltacat/exceptions.py +116 -12
  111. deltacat/experimental/__init__.py +0 -0
  112. deltacat/experimental/catalog/__init__.py +0 -0
  113. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  114. deltacat/experimental/catalog/iceberg/iceberg_catalog_config.py +26 -0
  115. deltacat/experimental/catalog/iceberg/impl.py +399 -0
  116. deltacat/experimental/catalog/iceberg/overrides.py +72 -0
  117. deltacat/experimental/compatibility/__init__.py +0 -0
  118. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  119. deltacat/experimental/converter_agent/__init__.py +0 -0
  120. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  121. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  122. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  123. deltacat/experimental/daft/__init__.py +4 -0
  124. deltacat/experimental/daft/daft_catalog.py +229 -0
  125. deltacat/experimental/storage/__init__.py +0 -0
  126. deltacat/experimental/storage/iceberg/__init__.py +0 -0
  127. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  128. deltacat/experimental/storage/iceberg/impl.py +739 -0
  129. deltacat/experimental/storage/iceberg/model.py +713 -0
  130. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  131. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  132. deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
  133. deltacat/experimental/storage/rivulet/arrow/serializer.py +78 -0
  134. deltacat/experimental/storage/rivulet/dataset.py +745 -0
  135. deltacat/experimental/storage/rivulet/dataset_executor.py +79 -0
  136. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  137. deltacat/experimental/storage/rivulet/feather/file_reader.py +138 -0
  138. deltacat/experimental/storage/rivulet/feather/serializer.py +35 -0
  139. deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
  140. deltacat/experimental/storage/rivulet/fs/file_provider.py +105 -0
  141. deltacat/experimental/storage/rivulet/fs/file_store.py +130 -0
  142. deltacat/experimental/storage/rivulet/fs/input_file.py +76 -0
  143. deltacat/experimental/storage/rivulet/fs/output_file.py +86 -0
  144. deltacat/experimental/storage/rivulet/logical_plan.py +105 -0
  145. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  146. deltacat/experimental/storage/rivulet/metastore/delta.py +188 -0
  147. deltacat/experimental/storage/rivulet/metastore/json_sst.py +105 -0
  148. deltacat/experimental/storage/rivulet/metastore/sst.py +82 -0
  149. deltacat/experimental/storage/rivulet/metastore/sst_interval_tree.py +260 -0
  150. deltacat/experimental/storage/rivulet/mvp/Table.py +101 -0
  151. deltacat/experimental/storage/rivulet/mvp/__init__.py +5 -0
  152. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  153. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  154. deltacat/experimental/storage/rivulet/parquet/file_reader.py +129 -0
  155. deltacat/experimental/storage/rivulet/parquet/serializer.py +37 -0
  156. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  157. deltacat/experimental/storage/rivulet/reader/block_scanner.py +389 -0
  158. deltacat/experimental/storage/rivulet/reader/data_reader.py +136 -0
  159. deltacat/experimental/storage/rivulet/reader/data_scan.py +65 -0
  160. deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +179 -0
  161. deltacat/experimental/storage/rivulet/reader/dataset_reader.py +158 -0
  162. deltacat/experimental/storage/rivulet/reader/pyarrow_data_reader.py +124 -0
  163. deltacat/experimental/storage/rivulet/reader/query_expression.py +99 -0
  164. deltacat/experimental/storage/rivulet/reader/reader_type_registrar.py +84 -0
  165. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  166. deltacat/experimental/storage/rivulet/schema/datatype.py +128 -0
  167. deltacat/experimental/storage/rivulet/schema/schema.py +251 -0
  168. deltacat/experimental/storage/rivulet/serializer.py +40 -0
  169. deltacat/experimental/storage/rivulet/serializer_factory.py +46 -0
  170. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  171. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  172. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  173. deltacat/experimental/storage/rivulet/writer/dataset_writer.py +29 -0
  174. deltacat/experimental/storage/rivulet/writer/memtable_dataset_writer.py +305 -0
  175. deltacat/io/__init__.py +13 -0
  176. deltacat/io/dataset/__init__.py +0 -0
  177. deltacat/io/dataset/deltacat_dataset.py +91 -0
  178. deltacat/io/datasink/__init__.py +0 -0
  179. deltacat/io/datasink/deltacat_datasink.py +207 -0
  180. deltacat/io/datasource/__init__.py +0 -0
  181. deltacat/io/datasource/deltacat_datasource.py +579 -0
  182. deltacat/io/reader/__init__.py +0 -0
  183. deltacat/io/reader/deltacat_read_api.py +172 -0
  184. deltacat/logs.py +4 -1
  185. deltacat/storage/__init__.py +138 -28
  186. deltacat/storage/interface.py +260 -155
  187. deltacat/storage/main/__init__.py +0 -0
  188. deltacat/storage/main/impl.py +3030 -0
  189. deltacat/storage/model/delta.py +142 -71
  190. deltacat/storage/model/expression/__init__.py +47 -0
  191. deltacat/storage/model/expression/expression.py +656 -0
  192. deltacat/storage/model/expression/visitor.py +248 -0
  193. deltacat/storage/model/interop.py +24 -0
  194. deltacat/storage/model/list_result.py +8 -0
  195. deltacat/storage/model/locator.py +93 -9
  196. deltacat/storage/model/manifest.py +643 -0
  197. deltacat/storage/model/metafile.py +1421 -0
  198. deltacat/storage/model/namespace.py +41 -18
  199. deltacat/storage/model/partition.py +443 -43
  200. deltacat/storage/model/scan/__init__.py +0 -0
  201. deltacat/storage/model/scan/push_down.py +46 -0
  202. deltacat/storage/model/scan/scan_plan.py +10 -0
  203. deltacat/storage/model/scan/scan_task.py +34 -0
  204. deltacat/storage/model/schema.py +3160 -0
  205. deltacat/storage/model/shard.py +51 -0
  206. deltacat/storage/model/sort_key.py +210 -13
  207. deltacat/storage/model/stream.py +215 -80
  208. deltacat/storage/model/table.py +134 -29
  209. deltacat/storage/model/table_version.py +333 -46
  210. deltacat/storage/model/transaction.py +1733 -0
  211. deltacat/storage/model/transform.py +274 -58
  212. deltacat/storage/model/types.py +138 -16
  213. deltacat/storage/util/__init__.py +0 -0
  214. deltacat/storage/util/scan_planner.py +26 -0
  215. deltacat/tests/_io/__init__.py +1 -0
  216. deltacat/tests/_io/reader/__init__.py +0 -0
  217. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  218. deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +8 -4
  219. deltacat/tests/aws/test_s3u.py +2 -31
  220. deltacat/tests/catalog/data/__init__.py +0 -0
  221. deltacat/tests/catalog/main/__init__.py +0 -0
  222. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  223. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  224. deltacat/tests/catalog/model/__init__.py +0 -0
  225. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  226. deltacat/tests/catalog/test_catalogs.py +321 -0
  227. deltacat/tests/catalog/test_default_catalog_impl.py +12154 -66
  228. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
  229. deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
  230. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
  231. deltacat/tests/compute/compact_partition_test_cases.py +23 -30
  232. deltacat/tests/compute/compactor/steps/test_repartition.py +14 -14
  233. deltacat/tests/compute/compactor/utils/test_io.py +125 -123
  234. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  235. deltacat/tests/compute/compactor_v2/test_compaction_session.py +387 -830
  236. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +70 -57
  237. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +0 -3
  238. deltacat/tests/compute/conftest.py +39 -0
  239. deltacat/tests/compute/converter/__init__.py +0 -0
  240. deltacat/tests/compute/converter/conftest.py +80 -0
  241. deltacat/tests/compute/converter/test_convert_session.py +826 -0
  242. deltacat/tests/compute/converter/utils.py +132 -0
  243. deltacat/tests/compute/resource_estimation/test_delta.py +88 -104
  244. deltacat/tests/compute/test_compact_partition_incremental.py +91 -98
  245. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +79 -97
  246. deltacat/tests/compute/test_compact_partition_params.py +16 -11
  247. deltacat/tests/compute/test_compact_partition_rebase.py +63 -93
  248. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +249 -220
  249. deltacat/tests/compute/test_janitor.py +236 -0
  250. deltacat/tests/compute/test_util_common.py +726 -46
  251. deltacat/tests/compute/test_util_constant.py +0 -1
  252. deltacat/tests/conftest.py +25 -0
  253. deltacat/tests/daft/__init__.py +0 -0
  254. deltacat/tests/daft/test_model.py +97 -0
  255. deltacat/tests/experimental/__init__.py +1 -0
  256. deltacat/tests/experimental/catalog/__init__.py +0 -0
  257. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  258. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  259. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  260. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  261. deltacat/tests/experimental/daft/__init__.py +0 -0
  262. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  263. deltacat/tests/experimental/storage/__init__.py +0 -0
  264. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  265. deltacat/tests/experimental/storage/rivulet/conftest.py +149 -0
  266. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  267. deltacat/tests/experimental/storage/rivulet/fs/test_file_location_provider.py +94 -0
  268. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  269. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  270. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  271. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  272. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  273. deltacat/tests/experimental/storage/rivulet/schema/test_schema.py +241 -0
  274. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  275. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  276. deltacat/tests/experimental/storage/rivulet/test_dataset.py +408 -0
  277. deltacat/tests/experimental/storage/rivulet/test_manifest.py +67 -0
  278. deltacat/tests/experimental/storage/rivulet/test_sst_interval_tree.py +232 -0
  279. deltacat/tests/experimental/storage/rivulet/test_utils.py +124 -0
  280. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  281. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_write_then_read.py +343 -0
  282. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_writer.py +79 -0
  283. deltacat/tests/experimental/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
  284. deltacat/tests/storage/__init__.py +0 -0
  285. deltacat/tests/storage/main/__init__.py +0 -0
  286. deltacat/tests/storage/main/test_main_storage.py +8204 -0
  287. deltacat/tests/storage/model/__init__.py +0 -0
  288. deltacat/tests/storage/model/test_delete_parameters.py +21 -0
  289. deltacat/tests/storage/model/test_expression.py +327 -0
  290. deltacat/tests/storage/model/test_manifest.py +129 -0
  291. deltacat/tests/storage/model/test_metafile_io.py +2440 -0
  292. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  293. deltacat/tests/storage/model/test_schema.py +479 -0
  294. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  295. deltacat/tests/storage/model/test_shard.py +24 -0
  296. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  297. deltacat/tests/storage/model/test_table_version.py +110 -0
  298. deltacat/tests/storage/model/test_transaction.py +653 -0
  299. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  300. deltacat/tests/test_deltacat_api.py +1064 -0
  301. deltacat/tests/test_exceptions.py +9 -5
  302. deltacat/tests/test_utils/filesystem.py +14 -0
  303. deltacat/tests/test_utils/message_pack_utils.py +54 -0
  304. deltacat/tests/test_utils/pyarrow.py +50 -26
  305. deltacat/tests/test_utils/storage.py +256 -4
  306. deltacat/tests/types/__init__.py +0 -0
  307. deltacat/tests/types/test_tables.py +104 -0
  308. deltacat/tests/utils/exceptions.py +22 -0
  309. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  310. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  311. deltacat/tests/utils/test_daft.py +124 -34
  312. deltacat/tests/utils/test_numpy.py +1193 -0
  313. deltacat/tests/utils/test_pandas.py +1106 -0
  314. deltacat/tests/utils/test_polars.py +1040 -0
  315. deltacat/tests/utils/test_pyarrow.py +1107 -258
  316. deltacat/types/media.py +345 -37
  317. deltacat/types/partial_download.py +1 -1
  318. deltacat/types/tables.py +2345 -47
  319. deltacat/utils/arguments.py +33 -1
  320. deltacat/utils/daft.py +824 -40
  321. deltacat/utils/export.py +61 -0
  322. deltacat/utils/filesystem.py +450 -0
  323. deltacat/utils/metafile_locator.py +74 -0
  324. deltacat/utils/numpy.py +118 -26
  325. deltacat/utils/pandas.py +577 -48
  326. deltacat/utils/polars.py +759 -0
  327. deltacat/utils/pyarrow.py +1212 -178
  328. deltacat/utils/ray_utils/concurrency.py +1 -1
  329. deltacat/utils/ray_utils/dataset.py +101 -10
  330. deltacat/utils/ray_utils/runtime.py +56 -4
  331. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  332. deltacat/utils/url.py +1325 -0
  333. deltacat-2.0.0.dist-info/METADATA +1163 -0
  334. deltacat-2.0.0.dist-info/RECORD +439 -0
  335. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
  336. deltacat/aws/redshift/__init__.py +0 -19
  337. deltacat/aws/redshift/model/manifest.py +0 -394
  338. deltacat/catalog/default_catalog_impl/__init__.py +0 -369
  339. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  340. deltacat/compute/merge_on_read/__init__.py +0 -4
  341. deltacat/compute/merge_on_read/daft.py +0 -40
  342. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  343. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  344. deltacat/io/dataset.py +0 -73
  345. deltacat/io/read_api.py +0 -143
  346. deltacat/storage/model/delete_parameters.py +0 -40
  347. deltacat/storage/model/partition_spec.py +0 -71
  348. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  349. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -397
  350. deltacat/tests/local_deltacat_storage/__init__.py +0 -1262
  351. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  352. deltacat/utils/s3fs.py +0 -21
  353. deltacat-1.1.38.dist-info/METADATA +0 -64
  354. deltacat-1.1.38.dist-info/RECORD +0 -219
  355. /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
  356. /deltacat/{compute/merge_on_read/model → catalog/main}/__init__.py +0 -0
  357. /deltacat/compute/{merge_on_read/utils → converter}/__init__.py +0 -0
  358. /deltacat/{io/aws → compute/converter/model}/__init__.py +0 -0
  359. /deltacat/{io/aws/redshift → compute/converter/pyiceberg}/__init__.py +0 -0
  360. /deltacat/{tests/io → compute/converter/steps}/__init__.py +0 -0
  361. /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
  362. /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
  363. /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
  364. /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
  365. /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
  366. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
  367. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,136 @@
1
+ import daft
2
+ from daft import Table, Identifier
3
+ import pytest
4
+ import uuid
5
+
6
+ from deltacat.catalog import Catalog as DeltaCATCatalog
7
+ from deltacat.catalog import CatalogProperties
8
+ from deltacat.experimental.daft.daft_catalog import DaftCatalog
9
+ import shutil
10
+ import tempfile
11
+
12
+ from deltacat.experimental.catalog.iceberg import impl as IcebergCatalog
13
+ from deltacat.experimental.catalog.iceberg import IcebergCatalogConfig
14
+
15
+ from pyiceberg.catalog import CatalogType
16
+
17
+
18
+ class TestCatalogIntegration:
19
+ @classmethod
20
+ def setup_method(cls):
21
+ cls.tmpdir = tempfile.mkdtemp()
22
+
23
+ @classmethod
24
+ def teardown_method(cls):
25
+ shutil.rmtree(cls.tmpdir)
26
+
27
+ def test_create_table(self):
28
+ """Demonstrate DeltaCAT-Daft integration."""
29
+ # Create a DeltaCAT catalog
30
+ catalog_props = CatalogProperties(root=self.tmpdir)
31
+ dc_catalog = DeltaCATCatalog(catalog_props)
32
+
33
+ # Use a random catalog name to prevent namespacing conflicts with other tests
34
+ # Convert the DeltaCAT catalog to a Daft catalog
35
+ catalog_name = f"deltacat_{uuid.uuid4().hex[:8]}"
36
+
37
+ daft_catalog = DaftCatalog(catalog=dc_catalog, name=catalog_name)
38
+
39
+ # Register the catalog with Daft's catalog system
40
+ daft.attach_catalog(daft_catalog, catalog_name)
41
+
42
+ # Create a sample DataFrame
43
+ df = daft.from_pydict({"id": [1, 2, 3], "value": ["a", "b", "c"]})
44
+ # Create then get table
45
+ daft_catalog.create_table(Identifier("example_table"), df)
46
+ table: Table = daft_catalog.get_table(Identifier("example_table"))
47
+ assert table.name == "example_table"
48
+
49
+ def test_get_table(self):
50
+ """Test getting a table from the DeltaCAT-Daft catalog."""
51
+ # Create a DeltaCAT catalog using the existing tmpdir
52
+ catalog_props = CatalogProperties(root=self.tmpdir)
53
+ dc_catalog = DeltaCATCatalog(catalog_props)
54
+
55
+ # Convert to DaftCatalog and attach to Daft
56
+ catalog_name = f"deltacat_{uuid.uuid4().hex[:8]}"
57
+ daft_catalog = DaftCatalog(dc_catalog, catalog_name)
58
+ daft.attach_catalog(daft_catalog, catalog_name)
59
+
60
+ # Create a sample DataFrame and table
61
+ df = daft.from_pydict({"id": [1, 2, 3], "value": ["a", "b", "c"]})
62
+ table_name = "test_get_table"
63
+ daft_catalog.create_table(Identifier(table_name), df)
64
+
65
+ # Get the table using different forms of identifiers
66
+ table2 = daft_catalog.get_table(Identifier(table_name))
67
+ assert table2 is not None
68
+ assert table2.name == table_name
69
+
70
+ # With an explicit namespace: DeltaCAT used the "default" namespace since none was provided at creation
71
+ table3 = daft_catalog.get_table(Identifier("default", table_name))
72
+ assert table3 is not None
73
+ assert table3.name == table_name
74
+
75
+ # Test non-existent table raises an appropriate error
76
+ with pytest.raises(ValueError, match="Table nonexistent_table not found"):
77
+ daft_catalog.get_table(Identifier("nonexistent_table"))
78
+
79
+
80
+ class TestIcebergCatalogIntegration:
81
+ @classmethod
82
+ def setup_method(cls):
83
+ cls.tmpdir = tempfile.mkdtemp()
84
+
85
+ @classmethod
86
+ def teardown_method(cls):
87
+ shutil.rmtree(cls.tmpdir)
88
+
89
+ def test_iceberg_catalog_integration(self):
90
+ # Create a unique warehouse path for this test
91
+ warehouse_path = self.tmpdir
92
+
93
+ # Configure an Iceberg catalog with the warehouse path
94
+ config = IcebergCatalogConfig(
95
+ type=CatalogType.SQL,
96
+ properties={
97
+ "warehouse": warehouse_path,
98
+ "uri": f"sqlite:////{warehouse_path}/sql-catalog.db",
99
+ },
100
+ )
101
+ dc_catalog = IcebergCatalog.from_config(config)
102
+
103
+ # Convert the DeltaCAT catalog to a Daft catalog
104
+ catalog_name = f"deltacat_iceberg_{uuid.uuid4().hex[:8]}"
105
+ daft_catalog = DaftCatalog(dc_catalog, catalog_name)
106
+ daft.attach_catalog(daft_catalog, catalog_name)
107
+
108
+ # Create a sample DataFrame
109
+ df = daft.from_pydict({"id": [1, 2, 3], "value": ["a", "b", "c"]})
110
+
111
+ # Create a table with the Daft catalog
112
+ table_name = "example_table"
113
+ namespace = "example_namespace"
114
+ daft_catalog.create_table(Identifier(namespace, table_name), df)
115
+
116
+ # Query that Iceberg table exists using PyIceberg
117
+ iceberg_catalog = dc_catalog.inner
118
+
119
+ # Verify the table exists in the Iceberg catalog
120
+ tables = iceberg_catalog.list_tables(namespace)
121
+
122
+ assert any(
123
+ t[0] == namespace and t[1] == table_name for t in tables
124
+ ), f"Table {table_name} not found in Iceberg catalog"
125
+
126
+ # Load the table from Iceberg catalog and verify its properties
127
+ iceberg_table = iceberg_catalog.load_table(f"{namespace}.{table_name}")
128
+
129
+ # Check that the schema matches our DataFrame
130
+ schema = iceberg_table.schema()
131
+ assert (
132
+ schema.find_field("id") is not None
133
+ ), "Field 'id' not found in table schema"
134
+ assert (
135
+ schema.find_field("value") is not None
136
+ ), "Field 'value' not found in table schema"
File without changes
@@ -0,0 +1,149 @@
1
+ import io
2
+
3
+ import pytest
4
+ from faker import Faker
5
+
6
+ from deltacat.experimental.storage.rivulet.schema.datatype import Datatype
7
+ from deltacat.experimental.storage.rivulet.mvp.Table import MvpTable
8
+ from deltacat.experimental.storage.rivulet.schema.schema import Schema
9
+ import random
10
+ import string
11
+ from PIL import Image
12
+
13
+ FIXTURE_ROW_COUNT = 10000
14
+
15
+
16
+ @pytest.fixture
17
+ def ds1_dataset() -> MvpTable:
18
+ """
19
+ dataset with FIXTURE_ROW_COUNT (10,000) rows
19
+ primary key is an integer between 1 and FIXTURE_ROW_COUNT
21
+
22
+ TODO change to use Faker instead of int ranges
23
+ """
24
+
25
+ # Function to generate random names
26
+ def generate_random_name():
27
+ return "".join(
28
+ random.choices(
29
+ string.ascii_uppercase + string.ascii_lowercase, k=random.randint(3, 10)
30
+ )
31
+ )
32
+
33
+ # Create a list of numbers from 1 to FIXTURE_ROW_COUNT
34
+ ids = list(range(1, FIXTURE_ROW_COUNT + 1))
35
+ random.shuffle(ids)
36
+
37
+ # Generate FIXTURE_ROW_COUNT rows
38
+ return MvpTable(
39
+ {
40
+ "id": ids,
41
+ "name": [generate_random_name() for _ in range(FIXTURE_ROW_COUNT)],
42
+ "age": [random.randint(18, 100) for _ in range(FIXTURE_ROW_COUNT)],
43
+ }
44
+ )
45
+
46
+
47
+ @pytest.fixture
48
+ def ds1_schema():
49
+ return Schema(
50
+ {
51
+ ("id", Datatype.int32()),
52
+ ("name", Datatype.string()),
53
+ ("age", Datatype.int32()),
54
+ },
55
+ "id",
56
+ )
57
+
58
+
59
+ @pytest.fixture
60
+ def ds2_dataset():
61
+ """
62
+ dataset2 with FIXTURE_ROW_COUNT (10,000) rows that can be joined to ds1
63
+ primary key is an integer between 1 and FIXTURE_ROW_COUNT
64
+ """
65
+ # Create a list of numbers from 1 to FIXTURE_ROW_COUNT
66
+ ids = list(range(1, FIXTURE_ROW_COUNT + 1))
67
+ random.shuffle(ids)
68
+
69
+ fake = Faker()
70
+
71
+ # Generate FIXTURE_ROW_COUNT rows
72
+ return MvpTable(
73
+ {
74
+ "id": ids,
75
+ "address": [fake.address() for _ in range(FIXTURE_ROW_COUNT)],
76
+ "zip": [fake.zipcode() for _ in range(FIXTURE_ROW_COUNT)],
77
+ }
78
+ )
79
+
80
+
81
+ @pytest.fixture
82
+ def ds2_schema():
83
+ return Schema(
84
+ {
85
+ ("id", Datatype.int32()),
86
+ ("address", Datatype.string()),
87
+ ("zip", Datatype.string()),
88
+ },
89
+ "id",
90
+ )
91
+
92
+
93
+ @pytest.fixture
94
+ def combined_schema(ds1_schema, ds2_schema):
95
+ return Schema(
96
+ {
97
+ ("id", Datatype.int32()),
98
+ ("address", Datatype.string()),
99
+ ("zip", Datatype.string()),
100
+ ("name", Datatype.string()),
101
+ ("age", Datatype.int32()),
102
+ },
103
+ "id",
104
+ )
105
+
106
+
107
+ @pytest.fixture
108
+ def dataset_images_with_label() -> (MvpTable, Schema):
109
+ """
110
+ dataset with one thousand images and labels, generated dynamically
111
+ primary key is integer between 1 and 1,000
112
+ """
113
+ ROW_COUNT = 1000
114
+ fake = Faker()
115
+ schema = Schema(
116
+ {
117
+ ("id", Datatype.int32()),
118
+ ("image", Datatype.image("jpg")),
119
+ ("label", Datatype.string()),
120
+ },
121
+ "id",
122
+ )
123
+
124
+ # Create a list of numbers from 1 to ROW_COUNT
125
+ ids = list(range(1, ROW_COUNT + 1))
126
+ random.shuffle(ids)
127
+
128
+ fake_image = Image.new(
129
+ "RGB",
130
+ (512, 512),
131
+ color=(random.randint(0, 255), random.randint(0, 255), random.randint(0, 255)),
132
+ )
133
+ # get bytes from image encoded as png
134
+ buffer = io.BytesIO()
135
+ fake_image.save(buffer, format="PNG")
136
+ # seek to start of buffer since we just wrote to it
137
+ buffer.seek(0)
138
+ image_bytes = buffer.read()
139
+ # Generate ROW_COUNT rows, all sharing the same image bytes
140
+ return (
141
+ MvpTable(
142
+ {
143
+ "id": ids,
144
+ "image": [image_bytes for _ in range(ROW_COUNT)],
145
+ "label": [fake.name() for _ in range(ROW_COUNT)],
146
+ }
147
+ ),
148
+ schema,
149
+ )
@@ -0,0 +1,94 @@
1
+ import pytest
2
+
3
+ import pyarrow as pa
4
+ import pyarrow.parquet as pq
5
+ from deltacat.experimental.storage.rivulet.schema.datatype import Datatype
6
+ from deltacat.experimental.storage.rivulet.dataset import Dataset
7
+ from deltacat.experimental.storage.rivulet import Schema, Field
8
+ from deltacat.utils.metafile_locator import _find_partition_path
9
+
10
+
11
@pytest.fixture
def sample_schema():
    """Schema with fields ``id``, ``name`` and ``age``; ``id`` is the merge key."""
    id_field = Field("id", Datatype.int32(), is_merge_key=True)
    name_field = Field("name", Datatype.string())
    age_field = Field("age", Datatype.int32())
    return Schema(fields=[id_field, name_field, age_field])
20
+
21
+
22
@pytest.fixture
def sample_pydict():
    """Column-oriented sample data: three rows of ``id``, ``name`` and ``age``."""
    ids = [1, 2, 3]
    names = ["Alice", "Bob", "Charlie"]
    ages = [25, 30, 35]
    return {"id": ids, "name": names, "age": ages}
25
+
26
+
27
@pytest.fixture
def temp_storage_path(tmp_path):
    """Expose pytest's per-test ``tmp_path`` under a storage-specific name."""
    storage_root = tmp_path
    return storage_root
30
+
31
+
32
@pytest.fixture
def sample_parquet_data(temp_storage_path, sample_pydict):
    """Write ``sample_pydict`` to ``test.parquet`` in the temp dir; return its path."""
    target = temp_storage_path / "test.parquet"
    pq.write_table(pa.Table.from_pydict(sample_pydict), target)
    return target
38
+
39
+
40
@pytest.fixture
def dataset(sample_parquet_data):
    """Create a ``Dataset`` named "dataset" from the sample Parquet file."""
    from_parquet_kwargs = {
        "file_uri": sample_parquet_data,
        "name": "dataset",
        "merge_keys": "id",
    }
    return Dataset.from_parquet(**from_parquet_kwargs)
45
+
46
+
47
@pytest.fixture
def file_provider(dataset):
    """Return the dataset's private ``_file_provider`` for direct testing."""
    provider = dataset._file_provider
    return provider
50
+
51
+
52
def test_provide_data_file(file_provider):
    """Two requested Parquet data files must be well-formed and distinct."""
    produced = []
    for _ in range(2):
        data_file = file_provider.provide_data_file("parquet")
        assert "data" in data_file.location
        assert data_file.location.endswith(".parquet")
        produced.append(data_file)

    first, second = produced
    assert (
        first.location != second.location
    ), "Two output files should have different locations."
64
+
65
+
66
def test_provide_manifest_file(file_provider):
    """A manifest file is expected under "metadata/manifests" with a ".json" suffix."""
    manifest_file = file_provider.provide_manifest_file()
    location = manifest_file.location
    assert "metadata/manifests" in location
    assert location.endswith(".json")
70
+
71
+
72
def test_provide_l0_sst_file(file_provider):
    """An L0 SST file is expected under "metadata/ssts/0" with a ".json" suffix."""
    sst_file = file_provider.provide_l0_sst_file()
    location = sst_file.location
    assert "metadata/ssts/0" in location
    assert location.endswith(".json")
76
+
77
+
78
def test_provide_input_file(file_provider, sample_parquet_data):
    """The input file's location must equal the path string it was requested with."""
    requested_uri = str(sample_parquet_data)
    provided = file_provider.provide_input_file(requested_uri)
    assert provided.location == str(sample_parquet_data)
81
+
82
+
83
def test_generate_sst_uris(file_provider):
    """Every generated SST file must sit under "metadata/ssts/0" and end in ".json"."""
    # NOTE(review): if nothing is generated the loop body never runs and the
    # test passes without checking anything.
    sst_files = [*file_provider.generate_sst_uris()]
    for sst_file in sst_files:
        assert "metadata/ssts/0" in sst_file.location
        assert sst_file.location.endswith(".json")
88
+
89
+
90
def test_get_scan_directories(file_provider):
    """The only SST scan directory is "<partition path>/metadata/ssts/0/"."""
    partition_root = _find_partition_path(file_provider.uri, file_provider._locator)
    scan_dirs = file_provider.get_sst_scan_directories()
    expected_dirs = [f"{partition_root}/metadata/ssts/0/"]
    assert scan_dirs == expected_dirs
@@ -0,0 +1,80 @@
1
+ import pytest
2
+ from deltacat.experimental.storage.rivulet.reader.query_expression import (
3
+ QueryExpression,
4
+ )
5
+ from deltacat.experimental.storage.rivulet.shard.range_shard import RangeShard
6
+
7
+
8
@pytest.fixture
def sample_range_shard():
    """Integer ``RangeShard`` with ``min_key=5`` and ``max_key=15``."""
    lower, upper = 5, 15
    return RangeShard(min_key=lower, max_key=upper)
11
+
12
+
13
@pytest.fixture
def sample_string_shard():
    """String ``RangeShard`` with ``min_key="apple"`` and ``max_key="zebra"``."""
    lower, upper = "apple", "zebra"
    return RangeShard(min_key=lower, max_key=upper)
16
+
17
+
18
def test_with_key():
    """``with_key`` pins both bounds to the key; a second call must raise ``ValueError``."""
    expr = QueryExpression[int]()
    expr.with_key(5)
    assert expr.min_key == 5
    assert expr.max_key == 5

    # The expression already has a key, so setting another is expected to fail.
    with pytest.raises(ValueError):
        expr.with_key(10)
25
+
26
+
27
def test_with_range():
    """``with_range`` orders reversed bounds; a second call must raise ``ValueError``."""
    expr = QueryExpression[int]()
    # Bounds are deliberately given high-to-low; min/max must come out sorted.
    expr.with_range(10, 5)
    assert expr.min_key == 5
    assert expr.max_key == 10

    # The expression already has a range, so setting another is expected to fail.
    with pytest.raises(ValueError):
        expr.with_range(20, 25)
34
+
35
+
36
def test_matches_query():
    """An unbounded query matches any key; a ranged query matches only keys in range."""
    expr = QueryExpression[int]()
    # No range set yet: both keys are expected to match.
    for key in (5, -999):
        assert expr.matches_query(key)

    expr.with_range(10, 20)
    assert expr.matches_query(15)
    # Keys above and below the range must be rejected.
    for key in (25, 5):
        assert not expr.matches_query(key)
44
+
45
+
46
def test_below_query_range():
    """``below_query_range`` is true only for keys under the lower bound of a set range."""
    expr = QueryExpression[int]()
    # With no range set, no key is reported as below the range.
    assert not expr.below_query_range(5)

    expr.with_range(10, 20)
    assert expr.below_query_range(5)
    # Keys inside or above the range are not "below" it.
    for key in (15, 25):
        assert not expr.below_query_range(key)
53
+
54
+
55
def test_with_shard_existing_query(sample_range_shard):
    """Combining a 10..20 query with the 5..15 shard is expected to give 5..20."""
    base_query = QueryExpression[int]().with_range(10, 20)
    combined = QueryExpression.with_shard(base_query, sample_range_shard)
    assert combined.min_key == 5
    assert combined.max_key == 20
60
+
61
+
62
def test_with_shard_none_shard():
    """Passing ``None`` as the shard must leave the query's bounds at 10..20."""
    base_query = QueryExpression[int]().with_range(10, 20)
    unchanged = QueryExpression.with_shard(base_query, None)
    assert unchanged.min_key == 10
    assert unchanged.max_key == 20
67
+
68
+
69
def test_with_shard_existing_query_string(sample_string_shard):
    """Combining "banana".."yellow" with the "apple".."zebra" shard gives the shard's bounds."""
    base_query = QueryExpression[str]().with_range("banana", "yellow")
    combined = QueryExpression.with_shard(base_query, sample_string_shard)
    assert combined.min_key == "apple"
    assert combined.max_key == "zebra"
74
+
75
+
76
def test_query_expression_string_matches():
    """A string range "apple".."cat" matches keys within it and rejects "dog"."""
    expr = QueryExpression[str]().with_range("apple", "cat")
    for key in ("apple", "banana"):
        assert expr.matches_query(key)
    assert not expr.matches_query("dog")
@@ -0,0 +1,119 @@
1
+ import pytest
2
+ from deltacat.tests.experimental.storage.rivulet.test_utils import verify_pyarrow_scan
3
+ import pyarrow as pa
4
+ from deltacat.experimental.storage.rivulet import Schema, Field, Datatype
5
+ from deltacat.experimental.storage.rivulet.dataset import Dataset
6
+
7
+
8
@pytest.fixture
def combined_schema():
    """Schema with ``id`` (int64 merge key), ``name``, ``age``, ``height`` and ``gender``."""
    merge_key = Field("id", Datatype.int64(), is_merge_key=True)
    other_fields = [
        Field("name", Datatype.string()),
        Field("age", Datatype.int32()),
        Field("height", Datatype.int64()),
        Field("gender", Datatype.string()),
    ]
    return Schema(fields=[merge_key, *other_fields])
19
+
20
+
21
@pytest.fixture
def initial_schema():
    """Schema with ``id`` (int32 merge key), ``name`` and ``age``."""
    merge_key = Field("id", Datatype.int32(), is_merge_key=True)
    name_field = Field("name", Datatype.string())
    age_field = Field("age", Datatype.int32())
    return Schema(fields=[merge_key, name_field, age_field])
30
+
31
+
32
@pytest.fixture
def extended_schema():
    """Schema with ``id`` (int64 merge key), ``height`` and ``gender``."""
    merge_key = Field("id", Datatype.int64(), is_merge_key=True)
    height_field = Field("height", Datatype.int64())
    gender_field = Field("gender", Datatype.string())
    return Schema(fields=[merge_key, height_field, gender_field])
41
+
42
+
43
@pytest.fixture
def sample_data():
    """Column-oriented rows for ``id``, ``name`` and ``age``."""
    ids = [1, 2, 3]
    names = ["Alice", "Bob", "Charlie"]
    ages = [25, 30, 35]
    return {"id": ids, "name": names, "age": ages}
50
+
51
+
52
@pytest.fixture
def extended_data():
    """Column-oriented rows for ``id``, ``height`` and ``gender``."""
    ids = [1, 2, 3]
    heights = [150, 160, 159]
    genders = ["male", "female", "male"]
    return {"id": ids, "height": heights, "gender": genders}
59
+
60
+
61
@pytest.fixture
def combined_data(sample_data, extended_data):
    """Merge both column dicts into a new dict; ``extended_data`` wins on shared keys."""
    # Unpacking builds a fresh dict, so neither input fixture is mutated.
    return {**sample_data, **extended_data}
66
+
67
+
68
@pytest.fixture
def parquet_data(tmp_path, sample_data):
    """Write ``sample_data`` to ``test.parquet`` under ``tmp_path`` and return its path."""
    # Import the submodule explicitly: ``import pyarrow as pa`` on its own does
    # not guarantee that ``pa.parquet`` exists as an attribute; it only works
    # when some other module has already imported ``pyarrow.parquet``.
    import pyarrow.parquet as pq

    parquet_path = tmp_path / "test.parquet"
    table = pa.Table.from_pydict(sample_data)
    pq.write_table(table, parquet_path)
    return parquet_path
74
+
75
+
76
@pytest.fixture
def sample_dataset(parquet_data, tmp_path):
    """Create "test_dataset" from the Parquet file, with metadata stored under ``tmp_path``."""
    source_uri = str(parquet_data)
    metadata_root = str(tmp_path)
    return Dataset.from_parquet(
        name="test_dataset",
        file_uri=source_uri,
        metadata_uri=metadata_root,
        merge_keys="id",
    )
84
+
85
+
86
def test_end_to_end_scan_with_multiple_schemas(
    sample_dataset,
    initial_schema,
    extended_schema,
    combined_schema,
    sample_data,
    extended_data,
    combined_data,
):
    """Scan a dataset before and after a second schema is added and written to."""
    # A default scan of the fresh dataset is checked against the initial schema.
    initial_scan = sample_dataset.scan().to_arrow()
    verify_pyarrow_scan(initial_scan, initial_schema, sample_data)

    # Register the second schema under the name "schema2".
    sample_dataset.add_schema(schema=extended_schema, schema_name="schema2")

    # Write and flush rows through a writer bound to "schema2".
    schema2_rows = [
        {"id": 1, "height": 150, "gender": "male"},
        {"id": 2, "height": 160, "gender": "female"},
        {"id": 3, "height": 159, "gender": "male"},
    ]
    schema2_writer = sample_dataset.writer(schema_name="schema2")
    schema2_writer.write(schema2_rows)
    schema2_writer.flush()

    # A scan restricted to "schema2" must retrieve only the extended data.
    extended_scan = sample_dataset.scan(schema_name="schema2").to_arrow()
    verify_pyarrow_scan(extended_scan, extended_schema, extended_data)

    # A default scan must now retrieve data matching the combined schema.
    combined_scan = sample_dataset.scan().to_arrow()
    verify_pyarrow_scan(combined_scan, combined_schema, combined_data)
@@ -0,0 +1,71 @@
1
+ import pytest
2
+ import os
3
+
4
+ from deltacat.experimental.storage.rivulet.metastore.delta import DeltacatManifestIO
5
+ from deltacat.experimental.storage.rivulet.reader.dataset_metastore import (
6
+ DatasetMetastore,
7
+ )
8
+ from deltacat.experimental.storage.rivulet.schema.datatype import Datatype
9
+ from deltacat.experimental.storage.rivulet.dataset import Dataset
10
+ from deltacat.experimental.storage.rivulet import Schema
11
+
12
+
13
@pytest.fixture
def sample_schema():
    """
    Schema with an int32 ``id`` column and a string ``name`` column.

    The fields are passed as a list instead of a set literal so that the
    field order is the same on every run; the iteration order of a set of
    tuples containing strings changes with hash randomization.
    """
    return Schema(
        [("id", Datatype.int32()), ("name", Datatype.string())],
        "id",  # presumably the merge key -- confirm against Schema's signature
    )
19
+
20
+
21
@pytest.fixture
def sample_pydict():
    """Column-oriented sample data: three rows of ``id`` and ``name``."""
    ids = [1, 2, 3]
    names = ["Alice", "Bob", "Charlie"]
    return {"id": ids, "name": names}
24
+
25
+
26
def test_dataset_metastore_e2e(sample_schema, tmp_path):
    """Write two manifests and check that ``DatasetMetastore`` yields both back."""
    # Setup: a dataset rooted at tmp_path and a manifest writer for it.
    dataset = Dataset(metadata_uri=tmp_path, dataset_name="dataset")
    provider = dataset._file_provider
    manifest_io = DeltacatManifestIO(provider.uri, dataset._locator)

    # One entry per manifest to create: its SST file names and its level.
    manifests_data = [
        {"sst_files": ["sst1.sst", "sst2.sst"], "level": 1},
        {"sst_files": ["sst3.sst", "sst4.sst"], "level": 2},
    ]

    # Create placeholder SST files on disk, then write a manifest for each entry.
    # The returned paths are collected but not read later in this test.
    manifest_paths = []
    for spec in manifests_data:
        sst_names = spec["sst_files"]
        for sst_name in sst_names:
            with open(os.path.join(provider.uri, sst_name), "w") as handle:
                handle.write("test data")

        written_path = manifest_io.write(sst_names, sample_schema, spec["level"])
        manifest_paths.append(written_path)

    # Initialize DatasetMetastore on the same location and manifest IO.
    metastore = DatasetMetastore(
        provider.uri,
        provider,
        provider._locator,
        manifest_io=manifest_io,
    )

    # Every written manifest must be generated back.
    accessors = list(metastore.generate_manifests())
    assert len(accessors) == len(manifests_data)

    # Match each accessor to its expected entry: level 1 is the first entry,
    # anything else is compared against the second.
    for accessor in accessors:
        assert accessor.context.schema == sample_schema
        expected = manifests_data[0 if accessor.context.level == 1 else 1]
        assert accessor.context.level == expected["level"]
        assert accessor.manifest.sst_files == expected["sst_files"]