deltacat 2.0__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (324)
  1. deltacat/__init__.py +117 -18
  2. deltacat/api.py +536 -126
  3. deltacat/aws/constants.py +0 -23
  4. deltacat/aws/s3u.py +4 -631
  5. deltacat/benchmarking/benchmark_engine.py +4 -2
  6. deltacat/benchmarking/conftest.py +1 -19
  7. deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
  8. deltacat/catalog/__init__.py +64 -5
  9. deltacat/catalog/delegate.py +445 -63
  10. deltacat/catalog/interface.py +188 -62
  11. deltacat/catalog/main/impl.py +2444 -282
  12. deltacat/catalog/model/catalog.py +208 -113
  13. deltacat/catalog/model/properties.py +63 -24
  14. deltacat/compute/__init__.py +14 -0
  15. deltacat/compute/compactor/compaction_session.py +97 -75
  16. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  17. deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
  18. deltacat/compute/compactor/model/round_completion_info.py +16 -6
  19. deltacat/compute/compactor/repartition_session.py +8 -21
  20. deltacat/compute/compactor/steps/hash_bucket.py +5 -5
  21. deltacat/compute/compactor/steps/materialize.py +9 -7
  22. deltacat/compute/compactor/steps/repartition.py +12 -11
  23. deltacat/compute/compactor/utils/io.py +6 -5
  24. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  25. deltacat/compute/compactor/utils/system_columns.py +3 -1
  26. deltacat/compute/compactor_v2/compaction_session.py +17 -14
  27. deltacat/compute/compactor_v2/constants.py +30 -1
  28. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  29. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  30. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  31. deltacat/compute/compactor_v2/model/merge_input.py +33 -8
  32. deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
  33. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  34. deltacat/compute/compactor_v2/steps/merge.py +267 -55
  35. deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
  36. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  37. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  38. deltacat/compute/compactor_v2/utils/io.py +11 -4
  39. deltacat/compute/compactor_v2/utils/merge.py +15 -2
  40. deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
  41. deltacat/compute/compactor_v2/utils/task_options.py +45 -33
  42. deltacat/compute/converter/constants.py +5 -0
  43. deltacat/compute/converter/converter_session.py +207 -52
  44. deltacat/compute/converter/model/convert_input.py +43 -16
  45. deltacat/compute/converter/model/convert_input_files.py +33 -16
  46. deltacat/compute/converter/model/convert_result.py +80 -0
  47. deltacat/compute/converter/model/converter_session_params.py +64 -19
  48. deltacat/compute/converter/pyiceberg/catalog.py +21 -18
  49. deltacat/compute/converter/pyiceberg/overrides.py +193 -65
  50. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
  51. deltacat/compute/converter/steps/convert.py +230 -75
  52. deltacat/compute/converter/steps/dedupe.py +46 -12
  53. deltacat/compute/converter/utils/convert_task_options.py +66 -22
  54. deltacat/compute/converter/utils/converter_session_utils.py +126 -60
  55. deltacat/compute/converter/utils/iceberg_columns.py +13 -8
  56. deltacat/compute/converter/utils/io.py +173 -13
  57. deltacat/compute/converter/utils/s3u.py +42 -27
  58. deltacat/compute/janitor.py +205 -0
  59. deltacat/compute/jobs/client.py +417 -0
  60. deltacat/compute/resource_estimation/delta.py +38 -6
  61. deltacat/compute/resource_estimation/model.py +8 -0
  62. deltacat/constants.py +49 -6
  63. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  64. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  65. deltacat/env.py +10 -0
  66. deltacat/examples/basic_logging.py +6 -6
  67. deltacat/examples/compactor/aws/__init__.py +1 -0
  68. deltacat/examples/compactor/bootstrap.py +863 -0
  69. deltacat/examples/compactor/compactor.py +373 -0
  70. deltacat/examples/compactor/explorer.py +473 -0
  71. deltacat/examples/compactor/gcp/__init__.py +1 -0
  72. deltacat/examples/compactor/job_runner.py +439 -0
  73. deltacat/examples/compactor/utils/__init__.py +1 -0
  74. deltacat/examples/compactor/utils/common.py +261 -0
  75. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  76. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  77. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  78. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  79. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  80. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  81. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +66 -21
  82. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
  83. deltacat/examples/hello_world.py +4 -2
  84. deltacat/examples/indexer/indexer.py +163 -0
  85. deltacat/examples/indexer/job_runner.py +198 -0
  86. deltacat/exceptions.py +66 -4
  87. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  88. deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
  89. deltacat/{catalog → experimental/catalog}/iceberg/impl.py +43 -12
  90. deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +12 -14
  91. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  92. deltacat/experimental/converter_agent/__init__.py +0 -0
  93. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  94. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  95. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  96. deltacat/experimental/daft/__init__.py +4 -0
  97. deltacat/experimental/daft/daft_catalog.py +229 -0
  98. deltacat/experimental/storage/__init__.py +0 -0
  99. deltacat/experimental/storage/iceberg/__init__.py +0 -0
  100. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  101. deltacat/{storage → experimental/storage}/iceberg/impl.py +6 -4
  102. deltacat/{storage → experimental/storage}/iceberg/model.py +7 -3
  103. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  104. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  105. deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
  106. deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
  107. deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -12
  108. deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
  109. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  110. deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
  111. deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
  112. deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
  113. deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
  114. deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
  115. deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
  116. deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
  117. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  118. deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -3
  119. deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
  120. deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
  121. deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
  122. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  123. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  124. deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
  125. deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
  126. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  127. deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
  128. deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
  129. deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
  130. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +7 -6
  131. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
  132. deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
  133. deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
  134. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  135. deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
  136. deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
  137. deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
  138. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  139. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  140. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  141. deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
  142. deltacat/io/__init__.py +13 -0
  143. deltacat/io/dataset/__init__.py +0 -0
  144. deltacat/io/dataset/deltacat_dataset.py +91 -0
  145. deltacat/io/datasink/__init__.py +0 -0
  146. deltacat/io/datasink/deltacat_datasink.py +207 -0
  147. deltacat/io/datasource/__init__.py +0 -0
  148. deltacat/io/datasource/deltacat_datasource.py +579 -0
  149. deltacat/io/reader/__init__.py +0 -0
  150. deltacat/io/reader/deltacat_read_api.py +172 -0
  151. deltacat/storage/__init__.py +22 -2
  152. deltacat/storage/interface.py +54 -32
  153. deltacat/storage/main/impl.py +1494 -541
  154. deltacat/storage/model/delta.py +27 -3
  155. deltacat/storage/model/expression/__init__.py +47 -0
  156. deltacat/storage/model/expression/expression.py +656 -0
  157. deltacat/storage/model/expression/visitor.py +248 -0
  158. deltacat/storage/model/locator.py +6 -12
  159. deltacat/storage/model/manifest.py +231 -6
  160. deltacat/storage/model/metafile.py +224 -119
  161. deltacat/storage/model/namespace.py +8 -1
  162. deltacat/storage/model/partition.py +117 -42
  163. deltacat/storage/model/scan/push_down.py +32 -5
  164. deltacat/storage/model/schema.py +2427 -159
  165. deltacat/storage/model/shard.py +6 -2
  166. deltacat/storage/model/sort_key.py +40 -0
  167. deltacat/storage/model/stream.py +9 -2
  168. deltacat/storage/model/table.py +12 -1
  169. deltacat/storage/model/table_version.py +11 -0
  170. deltacat/storage/model/transaction.py +1184 -208
  171. deltacat/storage/model/transform.py +81 -2
  172. deltacat/storage/model/types.py +53 -29
  173. deltacat/storage/util/__init__.py +0 -0
  174. deltacat/storage/util/scan_planner.py +26 -0
  175. deltacat/tests/_io/reader/__init__.py +0 -0
  176. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  177. deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
  178. deltacat/tests/aws/test_s3u.py +2 -31
  179. deltacat/tests/catalog/data/__init__.py +0 -0
  180. deltacat/tests/catalog/main/__init__.py +0 -0
  181. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  182. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  183. deltacat/tests/catalog/model/__init__.py +0 -0
  184. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  185. deltacat/tests/catalog/test_catalogs.py +103 -106
  186. deltacat/tests/catalog/test_default_catalog_impl.py +12152 -72
  187. deltacat/tests/compute/compact_partition_test_cases.py +35 -8
  188. deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
  189. deltacat/tests/compute/compactor/utils/test_io.py +124 -120
  190. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  191. deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
  192. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
  193. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
  194. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
  195. deltacat/tests/compute/conftest.py +8 -44
  196. deltacat/tests/compute/converter/test_convert_session.py +697 -349
  197. deltacat/tests/compute/converter/utils.py +15 -6
  198. deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
  199. deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
  200. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
  201. deltacat/tests/compute/test_compact_partition_params.py +13 -8
  202. deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
  203. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
  204. deltacat/tests/compute/test_janitor.py +236 -0
  205. deltacat/tests/compute/test_util_common.py +716 -43
  206. deltacat/tests/compute/test_util_constant.py +0 -1
  207. deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
  208. deltacat/tests/daft/__init__.py +0 -0
  209. deltacat/tests/daft/test_model.py +97 -0
  210. deltacat/tests/experimental/__init__.py +1 -0
  211. deltacat/tests/experimental/catalog/__init__.py +0 -0
  212. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  213. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  214. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  215. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  216. deltacat/tests/experimental/daft/__init__.py +0 -0
  217. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  218. deltacat/tests/experimental/storage/__init__.py +0 -0
  219. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  220. deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
  221. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  222. deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -2
  223. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  224. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  225. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  226. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  227. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  228. deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
  229. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  230. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  231. deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +6 -4
  232. deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
  233. deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
  234. deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
  235. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  236. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
  237. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
  238. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
  239. deltacat/tests/storage/main/test_main_storage.py +6900 -95
  240. deltacat/tests/storage/model/test_expression.py +327 -0
  241. deltacat/tests/storage/model/test_manifest.py +129 -0
  242. deltacat/tests/storage/model/test_metafile_io.py +78 -173
  243. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  244. deltacat/tests/storage/model/test_schema.py +171 -0
  245. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  246. deltacat/tests/storage/model/test_shard.py +3 -1
  247. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  248. deltacat/tests/storage/model/test_transaction.py +393 -48
  249. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  250. deltacat/tests/test_deltacat_api.py +1036 -11
  251. deltacat/tests/test_exceptions.py +9 -5
  252. deltacat/tests/test_utils/pyarrow.py +52 -21
  253. deltacat/tests/test_utils/storage.py +23 -34
  254. deltacat/tests/types/__init__.py +0 -0
  255. deltacat/tests/types/test_tables.py +104 -0
  256. deltacat/tests/utils/exceptions.py +22 -0
  257. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  258. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  259. deltacat/tests/utils/test_daft.py +121 -31
  260. deltacat/tests/utils/test_numpy.py +1193 -0
  261. deltacat/tests/utils/test_pandas.py +1106 -0
  262. deltacat/tests/utils/test_polars.py +1040 -0
  263. deltacat/tests/utils/test_pyarrow.py +1370 -89
  264. deltacat/types/media.py +345 -37
  265. deltacat/types/tables.py +2344 -46
  266. deltacat/utils/arguments.py +33 -1
  267. deltacat/utils/daft.py +824 -40
  268. deltacat/utils/export.py +3 -1
  269. deltacat/utils/filesystem.py +139 -9
  270. deltacat/utils/metafile_locator.py +2 -1
  271. deltacat/utils/numpy.py +118 -26
  272. deltacat/utils/pandas.py +577 -48
  273. deltacat/utils/polars.py +759 -0
  274. deltacat/utils/pyarrow.py +1373 -192
  275. deltacat/utils/ray_utils/concurrency.py +1 -1
  276. deltacat/utils/ray_utils/dataset.py +101 -10
  277. deltacat/utils/ray_utils/runtime.py +56 -4
  278. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  279. deltacat/utils/url.py +1325 -0
  280. deltacat-2.0.0.dist-info/METADATA +1163 -0
  281. deltacat-2.0.0.dist-info/RECORD +439 -0
  282. {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
  283. deltacat/catalog/iceberg/__init__.py +0 -4
  284. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  285. deltacat/compute/merge_on_read/__init__.py +0 -4
  286. deltacat/compute/merge_on_read/daft.py +0 -40
  287. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  288. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  289. deltacat/examples/common/fixtures.py +0 -15
  290. deltacat/storage/iceberg/iceberg_scan_planner.py +0 -28
  291. deltacat/storage/rivulet/__init__.py +0 -11
  292. deltacat/storage/rivulet/feather/__init__.py +0 -5
  293. deltacat/storage/rivulet/parquet/__init__.py +0 -5
  294. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  295. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
  296. deltacat/tests/local_deltacat_storage/__init__.py +0 -1235
  297. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  298. deltacat/utils/s3fs.py +0 -21
  299. deltacat-2.0.dist-info/METADATA +0 -65
  300. deltacat-2.0.dist-info/RECORD +0 -347
  301. /deltacat/compute/{merge_on_read/model → jobs}/__init__.py +0 -0
  302. /deltacat/{compute/merge_on_read/utils → docs}/__init__.py +0 -0
  303. /deltacat/{examples/common → docs/autogen}/__init__.py +0 -0
  304. /deltacat/{examples/iceberg → docs/autogen/schema}/__init__.py +0 -0
  305. /deltacat/{storage/iceberg → docs/autogen/schema/inference}/__init__.py +0 -0
  306. /deltacat/{storage/rivulet/arrow → examples/compactor}/__init__.py +0 -0
  307. /deltacat/{storage/rivulet/fs → examples/experimental}/__init__.py +0 -0
  308. /deltacat/{storage/rivulet/metastore → examples/experimental/iceberg}/__init__.py +0 -0
  309. /deltacat/{storage/rivulet/reader → examples/experimental/iceberg/converter}/__init__.py +0 -0
  310. /deltacat/{storage/rivulet/schema → examples/experimental/iceberg/converter/beam}/__init__.py +0 -0
  311. /deltacat/{storage/rivulet/writer → examples/indexer}/__init__.py +0 -0
  312. /deltacat/{tests/storage/rivulet → examples/indexer/aws}/__init__.py +0 -0
  313. /deltacat/{tests/storage/rivulet/fs → examples/indexer/gcp}/__init__.py +0 -0
  314. /deltacat/{tests/storage/rivulet/schema → experimental}/__init__.py +0 -0
  315. /deltacat/{tests/storage/rivulet/writer → experimental/catalog}/__init__.py +0 -0
  316. /deltacat/{storage/rivulet/parquet/data_reader.py → experimental/compatibility/__init__.py} +0 -0
  317. /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
  318. /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
  319. /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
  320. /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
  321. /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
  322. /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
  323. {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
  324. {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
@@ -1,39 +1,1064 @@
 import shutil
 import tempfile
+from collections import defaultdict
+
 import deltacat as dc
+from deltacat.constants import METAFILE_FORMAT_MSGPACK
+from deltacat import (
+    ContentType,
+    DeltaCatUrl,
+    DatasetType,
+    Namespace,
+    TableProperties,
+    TableWriteMode,
+    TableProperty,
+    TableReadOptimizationLevel,
+)
+from deltacat.storage import (
+    Metafile,
+    Table,
+    TableVersion,
+    Stream,
+    Partition,
+    Delta,
+)
+from deltacat.storage.model.partition import UNPARTITIONED_SCHEME_ID
+from deltacat.catalog import write_to_table
+import pandas as pd
+
+from deltacat.io import (
+    METAFILE_TYPE_COLUMN_NAME,
+    METAFILE_DATA_COLUMN_NAME,
+)
 
 
 class TestDeltaCAT:
     @classmethod
-    def setup_class(cls):
+    def setup_method(cls):
         cls.temp_dir_1 = tempfile.mkdtemp()
         cls.temp_dir_2 = tempfile.mkdtemp()
         # Initialize DeltaCAT with two local catalogs.
-        dc.put("test_catalog_1", root=cls.temp_dir_1)
-        dc.put("test_catalog_2", root=cls.temp_dir_2)
+        dc.init()
+        dc.put(DeltaCatUrl("dc://test_catalog_1"), root=cls.temp_dir_1)
+        dc.put(DeltaCatUrl("dc://test_catalog_2"), root=cls.temp_dir_2)
 
     @classmethod
-    def teardown_class(cls):
+    def teardown_method(cls):
         shutil.rmtree(cls.temp_dir_1)
         shutil.rmtree(cls.temp_dir_2)
 
     def test_cross_catalog_namespace_copy(self):
         # Given two empty DeltaCAT catalogs.
         # When a namespace is copied across catalogs.
-        namespace_src = dc.put("test_catalog_1/test_namespace")
+        namespace_src = dc.put(DeltaCatUrl("dc://test_catalog_1/test_namespace"))
         namespace_dst = dc.copy(
-            "test_catalog_1/test_namespace",
-            "test_catalog_2",
+            DeltaCatUrl("dc://test_catalog_1/test_namespace"),
+            DeltaCatUrl("dc://test_catalog_2/test_namespace"),
         )
         # Expect the catalog namespace created in each catalog
-        # method to be equivalent and equal to the source namespace.
+        # method to be equivalent but not equal to the source namespace
+        # (due to different metafile IDs).
         assert namespace_src.equivalent_to(namespace_dst)
-        assert namespace_src == namespace_dst
+        assert not namespace_src == namespace_dst
 
         # When each catalog namespace is fetched explicitly
         # Expect them to be equivalent but not equal
         # (due to different metafile IDs).
-        actual_namespace_src = dc.get("test_catalog_1/test_namespace")
-        actual_namespace_dst = dc.get("test_catalog_2/test_namespace")
+        actual_namespace_src = dc.get(DeltaCatUrl("dc://test_catalog_1/test_namespace"))
+        actual_namespace_dst = dc.get(DeltaCatUrl("dc://test_catalog_2/test_namespace"))
         assert actual_namespace_src.equivalent_to(actual_namespace_dst)
+        assert actual_namespace_src == namespace_src
         assert not actual_namespace_src == actual_namespace_dst
+        assert namespace_dst == actual_namespace_dst
+
+    def test_catalog_listing_shallow_local_metafiles(self):
+        # Given two empty DeltaCAT catalogs.
+        # When a namespace is put in the catalog.
+        namespace_src: Namespace = dc.put(
+            DeltaCatUrl("dc://test_catalog_1/test_namespace")
+        )
+        # Expect the namespace to be listed.
+        assert any(
+            namespace_src.equivalent_to(other)
+            for other in dc.list(DeltaCatUrl("dc://test_catalog_1"))
+        )
+
+    def test_catalog_listing_shallow_ray_dataset(self):
+        # Given two empty DeltaCAT catalogs.
+        # When a namespace is put in the catalog.
+        namespace_src: Namespace = dc.put(
+            DeltaCatUrl("dc://test_catalog_1/test_namespace")
+        )
+        # Expect the namespace to be listed.
+        dataset = dc.list(
+            DeltaCatUrl("dc://test_catalog_1"),
+            dataset_type=DatasetType.RAY_DATASET,
+        )
+        actual_namespace = Metafile.deserialize(
+            serialized=dataset.take(1)[0][METAFILE_DATA_COLUMN_NAME],
+            meta_format=METAFILE_FORMAT_MSGPACK,
+        )
+        assert actual_namespace.equivalent_to(namespace_src)
+        namespace_type = dataset.take(1)[0][METAFILE_TYPE_COLUMN_NAME]
+        assert namespace_type == "Namespace"
+
+    def test_recursive_listing_multiple_namespaces_with_tables(self):
+        """
+        Test that recursive listing correctly processes namespaces, tables, and deltas.
+        """
+        # Create multiple namespaces with tables and data
+        dc.put(DeltaCatUrl("dc://test_catalog_1/namespace_alpha"))
+        dc.put(DeltaCatUrl("dc://test_catalog_1/namespace_beta"))
+        dc.put(DeltaCatUrl("dc://test_catalog_1/namespace_gamma"))
+
+        # Create tables with data in each namespace
+        test_data = pd.DataFrame({"id": [1, 2, 3], "value": ["a", "b", "c"]})
+
+        # Create tables in each namespace
+        write_to_table(
+            data=test_data,
+            table="table1",
+            namespace="namespace_alpha",
+            mode=TableWriteMode.CREATE,
+            content_type=ContentType.PARQUET,
+            catalog="test_catalog_1",
+        )
+
+        write_to_table(
+            data=test_data,
+            table="table2",
+            namespace="namespace_beta",
+            mode=TableWriteMode.CREATE,
+            content_type=ContentType.PARQUET,
+            catalog="test_catalog_1",
+        )
+
+        write_to_table(
+            data=test_data,
+            table="table3",
+            namespace="namespace_gamma",
+            mode=TableWriteMode.CREATE,
+            content_type=ContentType.PARQUET,
+            catalog="test_catalog_1",
+        )
+
+        # Test recursive listing
+        all_objects = dc.list(DeltaCatUrl("dc://test_catalog_1"), recursive=True)
+
+        # Verify we found objects from ALL namespaces
+        object_types_to_names = defaultdict(list)
+
+        # Verify we found all namespaces, tables, and deltas
+        for obj in all_objects:
+            obj_type = Metafile.get_class(obj)
+            object_types_to_names[obj_type].append(obj.name)
+
+        # Assert we found all namespaces
+        expected_namespaces = {"namespace_alpha", "namespace_beta", "namespace_gamma"}
+        assert (
+            len(object_types_to_names[Namespace]) == 3
+        ), f"Expected 3 namespaces, found {len(object_types_to_names[Namespace])}"
+        assert (
+            set(object_types_to_names[Namespace]) == expected_namespaces
+        ), f"Expected namespaces: {expected_namespaces}, found: {object_types_to_names[Namespace]}"
+
+        # Assert we found all tables
+        expected_tables = {"table1", "table2", "table3"}
+        assert (
+            len(object_types_to_names[Table]) == 3
+        ), f"Expected 3 tables, found {len(object_types_to_names[Table])}"
+        assert (
+            set(object_types_to_names[Table]) == expected_tables
+        ), f"Expected tables: {expected_tables}, found: {object_types_to_names[Table]}"
+
+        # Assert we found all deltas
+        assert (
+            len(object_types_to_names[Delta]) == 3
+        ), f"Expected 3 deltas, found {len(object_types_to_names[Delta])}"
+        expected_deltas = {
+            "1"
+        } # all 3 deltas should have the same stream position in their respective partitions
+        assert (
+            set(object_types_to_names[Delta]) == expected_deltas
+        ), f"Expected deltas: {expected_deltas}, found: {object_types_to_names[Delta]}"
+
+    def test_recursive_listing_multiple_tables_per_namespace(self):
+        """
+        Test that recursive listing finds all tables within a namespace.
+        """
+        # Create one namespace with multiple tables
+        dc.put(DeltaCatUrl("dc://test_catalog_1/multi_table_namespace"))
+
+        test_data = pd.DataFrame({"id": [1, 2], "value": ["x", "y"]})
+
+        # Create multiple tables in the same namespace
+        table_names = ["events", "users", "products", "orders"]
+        for table_name in table_names:
+            write_to_table(
+                data=test_data,
+                table=table_name,
+                namespace="multi_table_namespace",
+                mode=TableWriteMode.CREATE,
+                content_type=ContentType.PARQUET,
+                catalog="test_catalog_1",
+            )
+
+        # Test recursive listing
+        all_objects = dc.list(DeltaCatUrl("dc://test_catalog_1"), recursive=True)
+
+        # Extract table names from results
+        object_types_to_names = defaultdict(list)
+        for obj in all_objects:
+            obj_type = Metafile.get_class(obj)
+            object_types_to_names[obj_type].append(obj.name)
+
+        # Assert we found all tables
+        assert len(object_types_to_names[Table]) == len(
+            table_names
+        ), f"Expected {len(table_names)} tables, found {len(object_types_to_names[Table])}"
+        assert set(object_types_to_names[Table]) == set(
+            table_names
+        ), f"Expected tables: {table_names}, found: {object_types_to_names[Table]}"
+
+    def test_recursive_listing_multiple_deltas_per_table(self):
+        """
+        Test that recursive listing finds all deltas within a table.
+        """
+        # Create namespace and table
+        dc.put(DeltaCatUrl("dc://test_catalog_1/delta_test_namespace"))
+
+        # Create table with multiple deltas
+        batch1 = pd.DataFrame({"id": [1, 2, 3], "value": ["a", "b", "c"]})
+
+        batch2 = pd.DataFrame({"id": [4, 5, 6], "value": ["d", "e", "f"]})
+
+        # Write first batch (CREATE)
+        write_to_table(
+            data=batch1,
+            table="multi_delta_table",
+            namespace="delta_test_namespace",
+            mode=TableWriteMode.CREATE,
+            content_type=ContentType.PARQUET,
+            catalog="test_catalog_1",
+        )
+
+        # Write second batch (APPEND - creates second delta)
+        write_to_table(
+            data=batch2,
+            table="multi_delta_table",
+            namespace="delta_test_namespace",
+            mode=TableWriteMode.APPEND,
+            content_type=ContentType.PARQUET,
+            catalog="test_catalog_1",
+        )
+
+        # Test recursive listing
+        all_objects = dc.list(DeltaCatUrl("dc://test_catalog_1"), recursive=True)
+
+        # Extract table names from results
+        object_types_to_names = defaultdict(list)
+        for obj in all_objects:
+            obj_type = Metafile.get_class(obj)
+            object_types_to_names[obj_type].append(obj.name)
+
+        # Assert we found all deltas
+        expected_deltas = {
+            "1",
+            "2",
+        } # all deltas should have the same stream position in their respective partitions
+        assert (
+            len(object_types_to_names[Delta]) == 2
+        ), f"Expected 2 deltas, found {len(object_types_to_names[Delta])}"
+        assert (
+            set(object_types_to_names[Delta]) == expected_deltas
+        ), f"Expected deltas: {expected_deltas}, found: {object_types_to_names[Delta]}"
+
+    def test_recursive_listing_empty_namespaces_mixed_with_populated(self):
+        """
+        Test that recursive listing handles a mix of empty and populated namespaces correctly.
+        """
+        # Create mix of empty and populated namespaces
+        dc.put(DeltaCatUrl("dc://test_catalog_1/empty_namespace_1"))
+        dc.put(DeltaCatUrl("dc://test_catalog_1/empty_namespace_2"))
+        dc.put(DeltaCatUrl("dc://test_catalog_1/populated_namespace"))
+
+        # Add data only to the populated namespace
+        test_data = pd.DataFrame({"id": [1, 2], "data": ["test1", "test2"]})
+
+        write_to_table(
+            data=test_data,
+            table="test_table",
+            namespace="populated_namespace",
+            mode=TableWriteMode.CREATE,
+            content_type=ContentType.PARQUET,
+            catalog="test_catalog_1",
+        )
+
+        # Test recursive listing
+        all_objects = dc.list(DeltaCatUrl("dc://test_catalog_1"), recursive=True)
+
+        object_types_to_names = defaultdict(list)
+        for obj in all_objects:
+            obj_type = Metafile.get_class(obj)
+            object_types_to_names[obj_type].append(obj.name)
+
+        # Verify we found all namespaces
+        expected_namespaces = {
+            "empty_namespace_1",
+            "empty_namespace_2",
+            "populated_namespace",
+        }
+        assert (
+            len(object_types_to_names[Namespace]) == 3
+        ), f"Expected 3 namespaces, found {len(object_types_to_names[Namespace])}"
+        assert (
+            set(object_types_to_names[Namespace]) == expected_namespaces
+        ), f"Expected namespaces: {expected_namespaces}, found: {object_types_to_names[Namespace]}"
+
+        # Verify we found the table in the populated namespace
+        expected_tables = {"test_table"}
+        assert (
+            len(object_types_to_names[Table]) == 1
+        ), f"Expected 1 table, found {len(object_types_to_names[Table])}"
+        assert (
+            set(object_types_to_names[Table]) == expected_tables
+        ), f"Expected tables: {expected_tables}, found: {object_types_to_names[Table]}"
+
+    def test_non_recursive_listing_vs_recursive_listing(self):
+        """
+        Test that non-recursive listing only returns top-level objects while recursive returns all.
+        """
+        # Create nested structure
+        dc.put(DeltaCatUrl("dc://test_catalog_1/namespace_one"))
+        dc.put(DeltaCatUrl("dc://test_catalog_1/namespace_two"))
+
+        test_data = pd.DataFrame({"id": [1], "value": ["test"]})
+
+        write_to_table(
+            data=test_data,
+            table="table_in_ns1",
+            namespace="namespace_one",
+            mode=TableWriteMode.CREATE,
+            content_type=ContentType.PARQUET,
+            catalog="test_catalog_1",
+        )
+
+        # Non-recursive listing (should only get namespaces)
+        shallow_objects = dc.list(DeltaCatUrl("dc://test_catalog_1"), recursive=False)
+
+        # Recursive listing (should get everything)
+        deep_objects = dc.list(DeltaCatUrl("dc://test_catalog_1"), recursive=True)
+
+        # Shallow should have fewer objects than deep
+        assert len(shallow_objects) < len(deep_objects)
+
+        # Shallow should only contain namespaces
+        shallow_object_types_to_names = defaultdict(list)
+        for obj in shallow_objects:
+            obj_type = Metafile.get_class(obj)
+            shallow_object_types_to_names[obj_type].append(obj.name)
+
+        # Assert we found all namespaces
+        expected_namespaces = {"namespace_one", "namespace_two"}
+        assert (
+            len(shallow_object_types_to_names[Namespace]) == 2
+        ), f"Expected 2 namespaces, found {len(shallow_object_types_to_names[Namespace])}"
+        assert (
+            set(shallow_object_types_to_names[Namespace]) == expected_namespaces
+        ), f"Expected namespaces: {expected_namespaces}, found: {shallow_object_types_to_names[Namespace]}"
+        assert (
+            len(shallow_object_types_to_names) == 1
+        ), f"Expected 1 object type, found {len(shallow_object_types_to_names)}"
+        assert set(shallow_object_types_to_names.keys()) == {
+            Namespace
+        }, f"Expected only Namespace object type, found: {shallow_object_types_to_names.keys()}"
+
+        # Deep should contain multiple types (namespaces, tables, streams, partitions, deltas)
+        deep_object_types_to_names = defaultdict(list)
+        for obj in deep_objects:
+            deep_object_types_to_names[Metafile.get_class(obj)].append(obj.name)
+
+        expected_namespaces = {"namespace_one", "namespace_two"}
+        assert (
+            len(deep_object_types_to_names[Namespace]) == 2
+        ), f"Expected 2 namespaces, found {len(deep_object_types_to_names[Namespace])}"
+        assert (
+            set(deep_object_types_to_names[Namespace]) == expected_namespaces
+        ), f"Expected namespaces: {expected_namespaces}, found: {deep_object_types_to_names[Namespace]}"
+
+        expected_tables = {"table_in_ns1"}
+        assert (
+            len(deep_object_types_to_names[Table]) == 1
+        ), f"Expected 1 table, found {len(deep_object_types_to_names[Table])}"
+        assert (
+            set(deep_object_types_to_names[Table]) == expected_tables
+        ), f"Expected tables: {expected_tables}, found: {deep_object_types_to_names[Table]}"
+
+        expected_table_versions = {"1"}
+        assert (
+            len(deep_object_types_to_names[TableVersion]) == 1
+        ), f"Expected 1 table version, found {len(deep_object_types_to_names[TableVersion])}"
+        assert (
+            set(deep_object_types_to_names[TableVersion]) == expected_table_versions
+        ), f"Expected table versions: {expected_table_versions}, found: {deep_object_types_to_names[TableVersion]}"
+
+        expected_streams = {"deltacat"}
+        assert (
+            len(deep_object_types_to_names[Stream]) == 1
+        ), f"Expected 1 stream, found {len(deep_object_types_to_names[Stream])}"
+        assert (
+            set(deep_object_types_to_names[Stream]) == expected_streams
+        ), f"Expected streams: {expected_streams}, found: {deep_object_types_to_names[Stream]}"
+
+        expected_partitions = {f"None|{UNPARTITIONED_SCHEME_ID}"}
+        assert (
+            len(deep_object_types_to_names[Partition]) == 1
+        ), f"Expected 1 partition, found {len(deep_object_types_to_names[Partition])}"
+        assert (
+            set(deep_object_types_to_names[Partition]) == expected_partitions
+        ), f"Expected partitions: {expected_partitions}, found: {deep_object_types_to_names[Partition]}"
+
+        expected_deltas = {"1"}
+        assert (
+            len(deep_object_types_to_names[Delta]) == 1
+        ), f"Expected 1 delta, found {len(deep_object_types_to_names[Delta])}"
+        assert (
+            set(deep_object_types_to_names[Delta]) == expected_deltas
+        ), f"Expected deltas: {expected_deltas}, found: {deep_object_types_to_names[Delta]}"
+
+    def test_recursive_listing_all_children_processed(self):
+        """
+        Ensure that all children are processed at each level of recursive listings.
+        """
+        # Create 3 namespaces
+        dc.put(DeltaCatUrl("dc://test_catalog_1/alpha_namespace"))
+        dc.put(DeltaCatUrl("dc://test_catalog_1/beta_namespace"))
+        dc.put(DeltaCatUrl("dc://test_catalog_1/gamma_namespace"))
+
+        # Create test data
+        test_data = pd.DataFrame(
+            {"id": [1, 2], "name": ["test1", "test2"], "value": [100, 200]}
+        )
+
+        # Create tables in EACH namespace
+        write_to_table(
+            data=test_data,
+            table="alpha_table",
+            namespace="alpha_namespace",
+            mode=TableWriteMode.CREATE,
+            content_type=ContentType.PARQUET,
+            catalog="test_catalog_1",
+        )
+
+        write_to_table(
+            data=test_data,
+            table="beta_table",
+            namespace="beta_namespace",
+            mode=TableWriteMode.CREATE,
+            content_type=ContentType.PARQUET,
+            catalog="test_catalog_1",
+        )
+
+        write_to_table(
+            data=test_data,
+            table="gamma_table",
+            namespace="gamma_namespace",
+            mode=TableWriteMode.CREATE,
+            content_type=ContentType.PARQUET,
+            catalog="test_catalog_1",
+        )
+
+        # Perform recursive listing
+        all_objects = dc.list(DeltaCatUrl("dc://test_catalog_1"), recursive=True)
+
+        # Extract all objects found
+        object_types_to_names = defaultdict(list)
+        for obj in all_objects:
+            obj_type = Metafile.get_class(obj)
+            object_types_to_names[obj_type].append(obj.name)
+
+        # All namespaces should be found
+        expected_namespaces = {"alpha_namespace", "beta_namespace", "gamma_namespace"}
+        assert (
+            len(object_types_to_names[Namespace]) == 3
+        ), f"Expected 3 namespaces, found {len(object_types_to_names[Namespace])}"
+        assert (
+            set(object_types_to_names[Namespace]) == expected_namespaces
+        ), f"Expected namespaces: {expected_namespaces}, found: {object_types_to_names[Namespace]}"
+
+        # All tables should be found
+        expected_tables = {"alpha_table", "beta_table", "gamma_table"}
+        assert (
+            len(object_types_to_names[Table]) == 3
+        ), f"Expected 3 tables, found {len(object_types_to_names[Table])}"
+        assert (
+            set(object_types_to_names[Table]) == expected_tables
+        ), f"Expected tables: {expected_tables}, found: {object_types_to_names[Table]}"
+
+        # All table versions should be found
+        expected_table_versions = {"1"}
+        assert (
+            len(object_types_to_names[TableVersion]) == 3
+        ), f"Expected 3 table versions, found {len(object_types_to_names[TableVersion])}"
+        assert (
+            set(object_types_to_names[TableVersion]) == expected_table_versions
+        ), f"Expected table versions: {expected_table_versions}, found: {object_types_to_names[TableVersion]}"
+
+        # All streams should be found
+        expected_streams = {"deltacat"}
+        assert (
+            len(object_types_to_names[Stream]) == 3
+        ), f"Expected 1 stream, found {len(object_types_to_names[Stream])}"
+        assert (
+            set(object_types_to_names[Stream]) == expected_streams
+        ), f"Expected streams: {expected_streams}, found: {object_types_to_names[Stream]}"
+
+        # All partitions should be found
+        expected_partitions = {f"None|{UNPARTITIONED_SCHEME_ID}"}
+        assert (
+            len(object_types_to_names[Partition]) == 3
+        ), f"Expected 1 partition, found {len(object_types_to_names[Partition])}"
+        assert (
+            set(object_types_to_names[Partition]) == expected_partitions
+        ), f"Expected partitions: {expected_partitions}, found: {object_types_to_names[Partition]}"
+
+        # All deltas should be found
+        expected_deltas = {"1"}
+        assert (
+            len(object_types_to_names[Delta]) == 3
+        ), f"Expected 3 deltas, found {len(object_types_to_names[Delta])}"
+        assert (
+            set(object_types_to_names[Delta]) == expected_deltas
+        ), f"Expected deltas: {expected_deltas}, found: {object_types_to_names[Delta]}"
+
+        # Ensure we found the expected objects across all levels of hierarchy
+        total_objects = len(all_objects)
+        assert (
+            total_objects == 18
+        ), f"Expected 18 objects from deep traversal, found only {total_objects}."
+
+    def test_recursive_cross_catalog_copy(self):
+        """
+        Test comprehensive cross-catalog copy using dc.copy with ** pattern.
+        This test validates complete catalog copying with all metadata types:
+        namespaces, tables, table versions, streams, partitions, and deltas.
+        """
+        # Create multiple namespaces, multiple tables, versions, streams, partitions, and deltas
+
+        # Namespace 1: Analytics data with multiple table versions
+        dc.put(DeltaCatUrl("dc://test_catalog_1/analytics"))
+
+        # Create table with multiple versions
+        events_data_v1 = pd.DataFrame(
+            {
+                "event_id": [1, 2, 3],
+                "user_id": ["user_1", "user_2", "user_3"],
+                "event_type": ["click", "view", "purchase"],
+                "timestamp": pd.to_datetime(["2023-01-01", "2023-01-02", "2023-01-03"]),
+                "value": [10.5, 20.0, 150.75],
+            }
+        )
+
+        table_properties: TableProperties = {
+            TableProperty.READ_OPTIMIZATION_LEVEL: TableReadOptimizationLevel.MAX,
+            TableProperty.APPENDED_RECORD_COUNT_COMPACTION_TRIGGER: 1,
+            TableProperty.APPENDED_FILE_COUNT_COMPACTION_TRIGGER: 1,
+            TableProperty.APPENDED_DELTA_COUNT_COMPACTION_TRIGGER: 1,
+        }
+
+        write_to_table(
+            data=events_data_v1,
+            table="events",
+            namespace="analytics",
+            mode=TableWriteMode.CREATE,
+            content_type=ContentType.PARQUET,
+            catalog="test_catalog_1",
+            table_properties=table_properties,
+        )
+
+        # Add more data to create additional deltas
+        events_data_v2 = pd.DataFrame(
+            {
+                "event_id": [4, 5, 6, 7],
+                "user_id": ["user_4", "user_1", "user_5", "user_2"],
+                "event_type": ["view", "click", "purchase", "refund"],
+                "timestamp": pd.to_datetime(
+                    ["2023-01-04", "2023-01-05", "2023-01-06", "2023-01-07"]
+                ),
+                "value": [0.0, 5.25, 299.99, -150.75],
+            }
+        )
+
+        write_to_table(
+            data=events_data_v2,
+            table="events",
+            namespace="analytics",
+            mode=TableWriteMode.APPEND,
+            content_type=ContentType.PARQUET,
+            catalog="test_catalog_1",
+        )
+
+        # Create second table in analytics namespace
+        users_data = pd.DataFrame(
+            {
+                "user_id": ["user_1", "user_2", "user_3", "user_4", "user_5"],
+                "username": ["alice", "bob", "charlie", "diana", "eve"],
+                "email": [
+                    "alice@test.com",
+                    "bob@test.com",
+                    "charlie@test.com",
+                    "diana@test.com",
+                    "eve@test.com",
+                ],
+                "created_at": pd.to_datetime(
+                    [
+                        "2022-12-01",
+                        "2022-12-15",
+                        "2022-12-20",
+                        "2023-01-01",
+                        "2023-01-03",
+                    ]
+                ),
+                "is_active": [True, True, False, True, True],
+            }
+        )
+
+        write_to_table(
+            data=users_data,
+            table="users",
+            namespace="analytics",
+            mode=TableWriteMode.CREATE,
+            content_type=ContentType.PARQUET,
+            catalog="test_catalog_1",
+        )
+
+        # Create version 2 of the events table to test table version ordering in recursive copy
+        events_data_v3 = pd.DataFrame(
+            {
+                "event_id": [8, 9, 10],
+                "user_id": ["user_3", "user_4", "user_5"],
+                "event_type": ["signup", "login", "logout"],
+                "timestamp": pd.to_datetime(["2023-01-08", "2023-01-09", "2023-01-10"]),
+                "value": [0.0, 0.0, 0.0],
+            }
+        )
+
+        write_to_table(
+            data=events_data_v3,
+            table="events",
+            namespace="analytics",
+            table_version="2", # Explicitly create version 2
+            mode=TableWriteMode.CREATE,
+            content_type=ContentType.PARQUET,
+            catalog="test_catalog_1",
+        )
+
+        # Namespace 2: Product data with different schema
+        dc.put(DeltaCatUrl("dc://test_catalog_1/products"))
+
+        products_data = pd.DataFrame(
+            {
+                "product_id": ["prod_1", "prod_2", "prod_3"],
+                "name": ["Widget A", "Widget B", "Super Widget"],
+                "category": ["widgets", "widgets", "premium"],
+                "price": [19.99, 29.99, 149.99],
+                "in_stock": [True, False, True],
+                "metadata": [
+                    {"color": "red"},
+                    {"color": "blue", "size": "large"},
+                    {"color": "gold", "premium": True},
+                ],
+            }
+        )
+
+        write_to_table(
+            data=products_data,
+            table="inventory",
+            namespace="products",
+            mode=TableWriteMode.CREATE,
+            content_type=ContentType.PARQUET,
+            catalog="test_catalog_1",
+        )
+
+        # Create product categories table
+        categories_data = pd.DataFrame(
+            {
+                "category_id": ["widgets", "premium", "accessories"],
+                "display_name": ["Standard Widgets", "Premium Products", "Accessories"],
+                "description": [
+                    "Basic widget products",
+                    "High-end premium items",
+                    "Additional accessories",
+                ],
+                "active": [True, True, False],
+            }
+        )
+
+        write_to_table(
+            data=categories_data,
+            table="categories",
+            namespace="products",
+            mode=TableWriteMode.CREATE,
+            content_type=ContentType.PARQUET,
+            catalog="test_catalog_1",
+        )
+
+        # Namespace 3: Empty namespace (edge case testing)
+        dc.put(DeltaCatUrl("dc://test_catalog_1/empty_data"))
+
+        # Namespace 4: Orders with complex nested data
+        dc.put(DeltaCatUrl("dc://test_catalog_1/orders"))
+
+        orders_data = pd.DataFrame(
+            {
+                "order_id": ["order_1", "order_2", "order_3"],
+                "user_id": ["user_1", "user_2", "user_1"],
+                "product_ids": [["prod_1"], ["prod_2", "prod_3"], ["prod_1", "prod_2"]],
+                "order_date": pd.to_datetime(
+                    ["2023-01-05", "2023-01-06", "2023-01-07"]
+                ),
+                "total_amount": [19.99, 179.98, 49.98],
+                "status": ["completed", "pending", "completed"],
+            }
+        )
+
+        write_to_table(
+            data=orders_data,
+            table="transactions",
+            namespace="orders",
+            mode=TableWriteMode.CREATE,
+            content_type=ContentType.PARQUET,
+            catalog="test_catalog_1",
+        )
+
+        # Verify source catalog structure before copy
+        source_objects = dc.list(DeltaCatUrl("dc://test_catalog_1"), recursive=True)
+        source_urls_by_type = defaultdict(list)
+        source_by_type = defaultdict(list)
+
+        for obj in source_objects:
+            obj_class = Metafile.get_class(obj.to_serializable())
+            source_urls_by_type[obj_class].append(obj.url())
+            source_by_type[obj_class].append(obj)
+
+        assert (
+            len(source_urls_by_type[Namespace]) == 4
+        ), f"Expected 4 namespaces, got {len(source_urls_by_type[Namespace])}"
+        assert (
+            len(source_urls_by_type[Table]) == 5
+        ), f"Expected 5 tables, got {len(source_urls_by_type[Table])}"
+        assert (
+            len(source_urls_by_type[TableVersion]) == 6
+        ), f"Expected 6 table versions, got {len(source_urls_by_type[TableVersion])}"
+        assert (
+            len(source_urls_by_type[Stream]) == 6
+        ), f"Expected 6 streams, got {len(source_urls_by_type[Stream])}"
+        assert (
+            len(source_urls_by_type[Partition]) == 6
+        ), f"Expected 6 partitions, got {len(source_urls_by_type[Partition])}"
+        assert (
+            len(source_urls_by_type[Delta]) == 6
+        ), f"Expected 6 deltas, got {len(source_urls_by_type[Delta])}"
+
+        # Test the /** recursive copy pattern.
+        dc.copy(
+            DeltaCatUrl("dc://test_catalog_1/**"), # ** means recursive copy all
+            DeltaCatUrl("dc://test_catalog_2/"),
+        )
+
+        # Verify destination catalog has same structure
+        dest_objects = dc.list(DeltaCatUrl("dc://test_catalog_2"), recursive=True)
+        dest_urls_by_type = defaultdict(list)
+        dest_by_type = defaultdict(list)
+
+        assert len(dest_objects) == len(
+            source_objects
+        ), f"Expected {len(source_objects)} objects, got {len(dest_objects)}"
+
+        for obj in dest_objects:
+            obj_class = Metafile.get_class(obj.to_serializable())
+            dest_urls_by_type[obj_class].append(obj.url())
+            dest_by_type[obj_class].append(obj)
+
+        assert sorted(dest_urls_by_type[Namespace]) == sorted(
+            source_urls_by_type[Namespace]
+        ), f"Namespace mismatch: {dest_urls_by_type[Namespace]} vs {source_urls_by_type[Namespace]}"
+        assert sorted(dest_urls_by_type[Table]) == sorted(
+            source_urls_by_type[Table]
+        ), f"Table mismatch: {dest_urls_by_type[Table]} vs {source_urls_by_type[Table]}"
+        assert sorted(dest_urls_by_type[TableVersion]) == sorted(
+            source_urls_by_type[TableVersion]
+        ), f"Table version mismatch: {dest_urls_by_type[TableVersion]} vs {source_urls_by_type[TableVersion]}"
+        assert sorted(dest_urls_by_type[Stream]) == sorted(
+            source_urls_by_type[Stream]
+        ), f"Stream mismatch: {dest_urls_by_type[Stream]} vs {source_urls_by_type[Stream]}"
+        assert sorted(dest_urls_by_type[Partition]) == sorted(
+            source_urls_by_type[Partition]
+        ), f"Partition mismatch: {dest_urls_by_type[Partition]} vs {source_urls_by_type[Partition]}"
+        assert sorted(dest_urls_by_type[Delta]) == sorted(
+            source_urls_by_type[Delta]
+        ), f"Delta mismatch: {dest_urls_by_type[Delta]} vs {source_urls_by_type[Delta]}"
+
+        # Validate each hierarchy level
+        for obj_type in source_by_type.keys():
+            source_count = len(source_by_type.get(obj_type))
+            dest_count = len(dest_by_type.get(obj_type, []))
+            assert (
+                dest_count == source_count
+            ), f"{obj_type} count mismatch: {dest_count} vs {source_count}"
+
+            # Spot check equivalence of each type
+            if obj_type == Namespace and source_count > 0:
+                # Check namespace properties are preserved
+                source_ns = source_by_type[obj_type][0] # NamespaceModel
+                dest_ns = next(
+                    (
+                        ns
+                        for ns in dest_by_type[obj_type]
+                        if ns.namespace == source_ns.namespace
+                    ),
+                    None,
+                )
+                assert (
+                    dest_ns is not None
+                ), f"Namespace {source_ns.namespace} not found in destination"
+                assert source_ns.equivalent_to(
+                    dest_ns
+                ), f"Namespace {source_ns.namespace} not equivalent to {dest_ns.namespace}"
+            elif obj_type == Table:
+                source_table = source_by_type[obj_type][0] # TableModel
+                dest_table = next(
+                    (
+                        t
+                        for t in dest_by_type[obj_type]
+                        if t.namespace == source_table.namespace
+                        and t.table_name == source_table.table_name
+                    ),
+                    None,
+                )
+                assert (
+                    dest_table is not None
+                ), f"Table {source_table.namespace}.{source_table.table_name} not found in destination"
+                assert source_table.equivalent_to(
+                    dest_table
+                ), f"Table {source_table.namespace}.{source_table.table_name} not equivalent to {dest_table.namespace}.{dest_table.table_name}"
+            elif obj_type == TableVersion and source_count > 0:
+                # Check table version properties are preserved
+                source_tv = source_by_type[obj_type][0] # TableVersionModel
+                dest_tv = next(
+                    (
+                        tv
+                        for tv in dest_by_type[obj_type]
+                        if tv.namespace == source_tv.namespace
+                        and tv.table_name == source_tv.table_name
+                        and tv.table_version == source_tv.table_version
+                    ),
+                    None,
+                )
+                assert (
+                    dest_tv is not None
+                ), f"TableVersion {source_tv.namespace}.{source_tv.table_name}.{source_tv.table_version} not found in destination"
+                assert dest_tv.equivalent_to(
+                    source_tv
+                ), f"TableVersion {source_tv.namespace}.{source_tv.table_name}.{source_tv.table_version} not equivalent to {dest_tv.namespace}.{dest_tv.table_name}.{dest_tv.table_version}"
+
+                # Special validation for table version ordering - check that analytics.events has versions 1 and 2
+                analytics_events_versions = [
+                    tv
+                    for tv in dest_by_type[obj_type]
+                    if tv.namespace == "analytics" and tv.table_name == "events"
+                ]
+                if analytics_events_versions:
+                    versions = sorted(
+                        [tv.table_version for tv in analytics_events_versions]
+                    )
+                    assert versions == [
+                        "1",
+                        "2",
+                    ], f"Expected analytics.events versions ['1', '2'], got {versions}"
+            elif obj_type == Stream and source_count > 0:
+                # Check stream properties are preserved
+                source_stream = source_by_type[obj_type][0] # StreamModel
+                dest_stream = next(
+                    (
+                        s
+                        for s in dest_by_type[obj_type]
+                        if s.namespace == source_stream.namespace
+                        and s.table_name == source_stream.table_name
+                        and s.stream_format == source_stream.stream_format
+                    ),
+                    None,
+                )
+                assert (
+                    dest_stream is not None
+                ), f"Stream {source_stream.namespace}.{source_stream.table_name}.{source_stream.stream_format} not found in destination"
+                assert dest_stream.equivalent_to(
+                    source_stream
+                ), f"Stream {source_stream.namespace}.{source_stream.table_name}.{source_stream.stream_format} not equivalent to {dest_stream.namespace}.{dest_stream.table_name}.{dest_stream.stream_format}"
+            elif obj_type == Partition and source_count > 0:
+                # Check partition properties are preserved (with new partition IDs)
+                source_partition = source_by_type[obj_type][0] # PartitionModel
+                dest_partition = next(
+                    (
+                        p
+                        for p in dest_by_type[obj_type]
+                        if p.namespace == source_partition.namespace
+                        and p.table_name == source_partition.table_name
+                    ),
+                    None,
+                )
+                assert (
+                    dest_partition is not None
+                ), f"Partition for {source_partition.namespace}.{source_partition.table_name} not found in destination"
+                assert dest_partition.equivalent_to(
+                    source_partition
+                ), f"Partition {source_partition.namespace}.{source_partition.table_name} not equivalent to {dest_partition.namespace}.{dest_partition.table_name}"
+            elif obj_type == Delta and source_count > 0:
+                # Check delta properties are preserved (with same stream positions)
+                source_delta = source_by_type[obj_type][0] # DeltaModel
+                dest_delta = next(
+                    (
+                        d
+                        for d in dest_by_type[obj_type]
+                        if d.namespace == source_delta.namespace
+                        and d.table_name == source_delta.table_name
+                        and d.stream_position == source_delta.stream_position
+                    ),
+                    None,
+                )
+                assert (
+                    dest_delta is not None
+                ), f"Delta for {source_delta.namespace}.{source_delta.table_name} at position {source_delta.stream_position} not found in destination"
+                assert dest_delta.equivalent_to(
+                    source_delta
+                ), f"Delta {source_delta.namespace}.{source_delta.table_name} at position {source_delta.stream_position} not equivalent to {dest_delta.namespace}.{dest_delta.table_name} at position {dest_delta.stream_position}"
+
+        # Validate each table's data integrity
+        test_cases = [
+            ("analytics", "events"),
+            ("analytics", "users"),
+            ("products", "inventory"),
+            ("products", "categories"),
+            ("orders", "transactions"),
+        ]
+
+        for namespace, table in test_cases:
+            # Check table exists in destination
+            assert dc.table_exists(
+                table=table,
+                namespace=namespace,
+                catalog="test_catalog_2",
+            ), f"Table {namespace}/{table} should exist in destination catalog"
+
+            # Verify table data equivalence using read_table
+            source_df = dc.read_table(
+                table=table,
+                namespace=namespace,
+                catalog="test_catalog_1",
+                read_as=DatasetType.PANDAS,
+            )
+
+            dest_df = dc.read_table(
+                table=table,
+                namespace=namespace,
+                catalog="test_catalog_2",
+                read_as=DatasetType.PANDAS,
+            )
+
+            # Verify both datasets are valid pandas DataFrames
+            assert (
+                source_df is not None
+            ), f"Source data should not be None for {namespace}.{table}"
+            assert (
+                dest_df is not None
+            ), f"Destination data should not be None for {namespace}.{table}"
+
+            # Compare DataFrame properties
+            assert len(source_df) == len(
+                dest_df
+            ), f"Row count mismatch for {namespace}.{table}: {len(source_df)} vs {len(dest_df)}"
+            assert list(source_df.columns) == list(
+                dest_df.columns
+            ), f"Column mismatch for {namespace}.{table}"
+
+            # Sort both dataframes by first column for comparison (to handle potential row ordering differences)
+            _assert_data_equivalence(source_df, dest_df)
+
+            # Verify that writing to the source table doesn't affect the destination table
+            dc.write_to_table(
+                data=source_df,
+                table=table,
+                namespace=namespace,
+                catalog="test_catalog_1",
+                mode=TableWriteMode.APPEND,
+            )
+
+            # Verify that the destination table's data hasn't changed
+            dest_df = dc.read_table(
+                table=table,
+                namespace=namespace,
+                catalog="test_catalog_2",
+                read_as=DatasetType.PANDAS,
+            )
+            _assert_data_equivalence(source_df, dest_df)
+
+            # Verify that the source table has source_df repeated twice
+            source_df_repeated = dc.read_table(
+                table=table,
+                namespace=namespace,
+                catalog="test_catalog_1",
+                read_as=DatasetType.PANDAS,
+            )
+            assert (
+                len(source_df_repeated) == len(source_df) * 2
+            ), f"Source table {namespace}.{table} should have {len(source_df) * 2} rows"
+
+            # Verify that writing to the destination table doesn't affect the source table
+            dc.write_to_table(
+                data=dest_df,
+                table=table,
+                namespace=namespace,
+                catalog="test_catalog_2",
+                mode=TableWriteMode.APPEND,
+            )
+
+            # Verify that the source table's data hasn't changed
+            source_df_unchanged = dc.read_table(
+                table=table,
+                namespace=namespace,
+                catalog="test_catalog_1",
+                read_as=DatasetType.PANDAS,
+            )
+            _assert_data_equivalence(source_df_repeated, source_df_unchanged)
+
+            # Verify that the destination table's data has dest_df repeated twice
+            dest_df_repeated = dc.read_table(
+                table=table,
+                namespace=namespace,
+                catalog="test_catalog_2",
+                read_as=DatasetType.PANDAS,
+            )
+            assert (
+                len(dest_df_repeated) == len(dest_df) * 2
+            ), f"Destination table {namespace}.{table} should have {len(dest_df) * 2} rows"
+
+        # Verify empty namespace was copied correctly
+        assert dc.namespace_exists(
+            namespace="empty_data",
+            catalog="test_catalog_2",
+        ), "Empty namespace should exist in destination catalog"
+
+
+def _assert_data_equivalence(source_df: pd.DataFrame, dest_df: pd.DataFrame):
+    # Sort both dataframes by first column for comparison (to handle potential row ordering differences)
+    if len(source_df) > 0:
+        first_col = source_df.columns[0]
+        # Handle sorting with potential complex data types
+        source_sorted = source_df.sort_values(first_col).reset_index(drop=True)
+        dest_sorted = dest_df.sort_values(first_col).reset_index(drop=True)
+
+        # Compare data values using pandas testing
+        pd.testing.assert_frame_equal(
+            source_sorted,
+            dest_sorted,
+        )