deltacat 1.1.38__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (367) hide show
  1. deltacat/__init__.py +150 -12
  2. deltacat/annotations.py +36 -0
  3. deltacat/api.py +578 -0
  4. deltacat/aws/constants.py +0 -23
  5. deltacat/aws/s3u.py +4 -631
  6. deltacat/benchmarking/benchmark_engine.py +84 -0
  7. deltacat/benchmarking/benchmark_report.py +86 -0
  8. deltacat/benchmarking/benchmark_suite.py +11 -0
  9. deltacat/benchmarking/conftest.py +22 -19
  10. deltacat/benchmarking/data/random_row_generator.py +94 -0
  11. deltacat/benchmarking/data/row_generator.py +10 -0
  12. deltacat/benchmarking/test_benchmark_pipeline.py +108 -0
  13. deltacat/catalog/__init__.py +73 -0
  14. deltacat/catalog/delegate.py +615 -140
  15. deltacat/catalog/interface.py +404 -81
  16. deltacat/catalog/main/impl.py +2882 -0
  17. deltacat/catalog/model/catalog.py +348 -46
  18. deltacat/catalog/model/properties.py +155 -0
  19. deltacat/catalog/model/table_definition.py +32 -1
  20. deltacat/compute/__init__.py +14 -0
  21. deltacat/compute/compactor/compaction_session.py +97 -75
  22. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  23. deltacat/compute/compactor/model/compaction_session_audit_info.py +23 -30
  24. deltacat/compute/compactor/model/delta_annotated.py +3 -3
  25. deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
  26. deltacat/compute/compactor/model/delta_file_locator.py +3 -1
  27. deltacat/compute/compactor/model/round_completion_info.py +19 -9
  28. deltacat/compute/compactor/model/table_object_store.py +3 -2
  29. deltacat/compute/compactor/repartition_session.py +9 -22
  30. deltacat/compute/compactor/steps/dedupe.py +11 -4
  31. deltacat/compute/compactor/steps/hash_bucket.py +6 -6
  32. deltacat/compute/compactor/steps/materialize.py +15 -9
  33. deltacat/compute/compactor/steps/repartition.py +12 -11
  34. deltacat/compute/compactor/utils/io.py +7 -6
  35. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  36. deltacat/compute/compactor/utils/sort_key.py +9 -2
  37. deltacat/compute/compactor/utils/system_columns.py +3 -1
  38. deltacat/compute/compactor_v2/compaction_session.py +13 -14
  39. deltacat/compute/compactor_v2/deletes/utils.py +3 -3
  40. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  41. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  42. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  43. deltacat/compute/compactor_v2/model/merge_input.py +28 -9
  44. deltacat/compute/compactor_v2/private/compaction_utils.py +171 -73
  45. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  46. deltacat/compute/compactor_v2/steps/merge.py +156 -53
  47. deltacat/compute/compactor_v2/utils/content_type_params.py +17 -6
  48. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  49. deltacat/compute/compactor_v2/utils/io.py +10 -3
  50. deltacat/compute/compactor_v2/utils/merge.py +14 -2
  51. deltacat/compute/compactor_v2/utils/task_options.py +2 -10
  52. deltacat/compute/converter/constants.py +9 -0
  53. deltacat/compute/converter/converter_session.py +298 -0
  54. deltacat/compute/converter/model/convert_input.py +96 -0
  55. deltacat/compute/converter/model/convert_input_files.py +78 -0
  56. deltacat/compute/converter/model/convert_result.py +80 -0
  57. deltacat/compute/converter/model/converter_session_params.py +144 -0
  58. deltacat/compute/converter/pyiceberg/catalog.py +78 -0
  59. deltacat/compute/converter/pyiceberg/overrides.py +263 -0
  60. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +299 -0
  61. deltacat/compute/converter/steps/convert.py +366 -0
  62. deltacat/compute/converter/steps/dedupe.py +94 -0
  63. deltacat/compute/converter/utils/__init__.py +0 -0
  64. deltacat/compute/converter/utils/convert_task_options.py +132 -0
  65. deltacat/compute/converter/utils/converter_session_utils.py +175 -0
  66. deltacat/compute/converter/utils/iceberg_columns.py +87 -0
  67. deltacat/compute/converter/utils/io.py +203 -0
  68. deltacat/compute/converter/utils/s3u.py +148 -0
  69. deltacat/compute/janitor.py +205 -0
  70. deltacat/compute/jobs/__init__.py +0 -0
  71. deltacat/compute/jobs/client.py +417 -0
  72. deltacat/compute/resource_estimation/delta.py +11 -1
  73. deltacat/constants.py +90 -1
  74. deltacat/docs/__init__.py +0 -0
  75. deltacat/docs/autogen/__init__.py +0 -0
  76. deltacat/docs/autogen/schema/__init__.py +0 -0
  77. deltacat/docs/autogen/schema/inference/__init__.py +0 -0
  78. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  79. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  80. deltacat/env.py +61 -0
  81. deltacat/examples/__init__.py +0 -0
  82. deltacat/examples/basic_logging.py +101 -0
  83. deltacat/examples/compactor/__init__.py +0 -0
  84. deltacat/examples/compactor/aws/__init__.py +1 -0
  85. deltacat/examples/compactor/bootstrap.py +863 -0
  86. deltacat/examples/compactor/compactor.py +373 -0
  87. deltacat/examples/compactor/explorer.py +473 -0
  88. deltacat/examples/compactor/gcp/__init__.py +1 -0
  89. deltacat/examples/compactor/job_runner.py +439 -0
  90. deltacat/examples/compactor/utils/__init__.py +1 -0
  91. deltacat/examples/compactor/utils/common.py +261 -0
  92. deltacat/examples/experimental/__init__.py +0 -0
  93. deltacat/examples/experimental/iceberg/__init__.py +0 -0
  94. deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
  95. deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
  96. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  97. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  98. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  99. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  100. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  101. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  102. deltacat/examples/experimental/iceberg/iceberg_bucket_writer.py +184 -0
  103. deltacat/examples/experimental/iceberg/iceberg_reader.py +147 -0
  104. deltacat/examples/hello_world.py +29 -0
  105. deltacat/examples/indexer/__init__.py +0 -0
  106. deltacat/examples/indexer/aws/__init__.py +0 -0
  107. deltacat/examples/indexer/gcp/__init__.py +0 -0
  108. deltacat/examples/indexer/indexer.py +163 -0
  109. deltacat/examples/indexer/job_runner.py +198 -0
  110. deltacat/exceptions.py +116 -12
  111. deltacat/experimental/__init__.py +0 -0
  112. deltacat/experimental/catalog/__init__.py +0 -0
  113. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  114. deltacat/experimental/catalog/iceberg/iceberg_catalog_config.py +26 -0
  115. deltacat/experimental/catalog/iceberg/impl.py +399 -0
  116. deltacat/experimental/catalog/iceberg/overrides.py +72 -0
  117. deltacat/experimental/compatibility/__init__.py +0 -0
  118. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  119. deltacat/experimental/converter_agent/__init__.py +0 -0
  120. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  121. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  122. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  123. deltacat/experimental/daft/__init__.py +4 -0
  124. deltacat/experimental/daft/daft_catalog.py +229 -0
  125. deltacat/experimental/storage/__init__.py +0 -0
  126. deltacat/experimental/storage/iceberg/__init__.py +0 -0
  127. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  128. deltacat/experimental/storage/iceberg/impl.py +739 -0
  129. deltacat/experimental/storage/iceberg/model.py +713 -0
  130. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  131. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  132. deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
  133. deltacat/experimental/storage/rivulet/arrow/serializer.py +78 -0
  134. deltacat/experimental/storage/rivulet/dataset.py +745 -0
  135. deltacat/experimental/storage/rivulet/dataset_executor.py +79 -0
  136. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  137. deltacat/experimental/storage/rivulet/feather/file_reader.py +138 -0
  138. deltacat/experimental/storage/rivulet/feather/serializer.py +35 -0
  139. deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
  140. deltacat/experimental/storage/rivulet/fs/file_provider.py +105 -0
  141. deltacat/experimental/storage/rivulet/fs/file_store.py +130 -0
  142. deltacat/experimental/storage/rivulet/fs/input_file.py +76 -0
  143. deltacat/experimental/storage/rivulet/fs/output_file.py +86 -0
  144. deltacat/experimental/storage/rivulet/logical_plan.py +105 -0
  145. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  146. deltacat/experimental/storage/rivulet/metastore/delta.py +188 -0
  147. deltacat/experimental/storage/rivulet/metastore/json_sst.py +105 -0
  148. deltacat/experimental/storage/rivulet/metastore/sst.py +82 -0
  149. deltacat/experimental/storage/rivulet/metastore/sst_interval_tree.py +260 -0
  150. deltacat/experimental/storage/rivulet/mvp/Table.py +101 -0
  151. deltacat/experimental/storage/rivulet/mvp/__init__.py +5 -0
  152. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  153. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  154. deltacat/experimental/storage/rivulet/parquet/file_reader.py +129 -0
  155. deltacat/experimental/storage/rivulet/parquet/serializer.py +37 -0
  156. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  157. deltacat/experimental/storage/rivulet/reader/block_scanner.py +389 -0
  158. deltacat/experimental/storage/rivulet/reader/data_reader.py +136 -0
  159. deltacat/experimental/storage/rivulet/reader/data_scan.py +65 -0
  160. deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +179 -0
  161. deltacat/experimental/storage/rivulet/reader/dataset_reader.py +158 -0
  162. deltacat/experimental/storage/rivulet/reader/pyarrow_data_reader.py +124 -0
  163. deltacat/experimental/storage/rivulet/reader/query_expression.py +99 -0
  164. deltacat/experimental/storage/rivulet/reader/reader_type_registrar.py +84 -0
  165. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  166. deltacat/experimental/storage/rivulet/schema/datatype.py +128 -0
  167. deltacat/experimental/storage/rivulet/schema/schema.py +251 -0
  168. deltacat/experimental/storage/rivulet/serializer.py +40 -0
  169. deltacat/experimental/storage/rivulet/serializer_factory.py +46 -0
  170. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  171. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  172. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  173. deltacat/experimental/storage/rivulet/writer/dataset_writer.py +29 -0
  174. deltacat/experimental/storage/rivulet/writer/memtable_dataset_writer.py +305 -0
  175. deltacat/io/__init__.py +13 -0
  176. deltacat/io/dataset/__init__.py +0 -0
  177. deltacat/io/dataset/deltacat_dataset.py +91 -0
  178. deltacat/io/datasink/__init__.py +0 -0
  179. deltacat/io/datasink/deltacat_datasink.py +207 -0
  180. deltacat/io/datasource/__init__.py +0 -0
  181. deltacat/io/datasource/deltacat_datasource.py +579 -0
  182. deltacat/io/reader/__init__.py +0 -0
  183. deltacat/io/reader/deltacat_read_api.py +172 -0
  184. deltacat/logs.py +4 -1
  185. deltacat/storage/__init__.py +138 -28
  186. deltacat/storage/interface.py +260 -155
  187. deltacat/storage/main/__init__.py +0 -0
  188. deltacat/storage/main/impl.py +3030 -0
  189. deltacat/storage/model/delta.py +142 -71
  190. deltacat/storage/model/expression/__init__.py +47 -0
  191. deltacat/storage/model/expression/expression.py +656 -0
  192. deltacat/storage/model/expression/visitor.py +248 -0
  193. deltacat/storage/model/interop.py +24 -0
  194. deltacat/storage/model/list_result.py +8 -0
  195. deltacat/storage/model/locator.py +93 -9
  196. deltacat/storage/model/manifest.py +643 -0
  197. deltacat/storage/model/metafile.py +1421 -0
  198. deltacat/storage/model/namespace.py +41 -18
  199. deltacat/storage/model/partition.py +443 -43
  200. deltacat/storage/model/scan/__init__.py +0 -0
  201. deltacat/storage/model/scan/push_down.py +46 -0
  202. deltacat/storage/model/scan/scan_plan.py +10 -0
  203. deltacat/storage/model/scan/scan_task.py +34 -0
  204. deltacat/storage/model/schema.py +3160 -0
  205. deltacat/storage/model/shard.py +51 -0
  206. deltacat/storage/model/sort_key.py +210 -13
  207. deltacat/storage/model/stream.py +215 -80
  208. deltacat/storage/model/table.py +134 -29
  209. deltacat/storage/model/table_version.py +333 -46
  210. deltacat/storage/model/transaction.py +1733 -0
  211. deltacat/storage/model/transform.py +274 -58
  212. deltacat/storage/model/types.py +138 -16
  213. deltacat/storage/util/__init__.py +0 -0
  214. deltacat/storage/util/scan_planner.py +26 -0
  215. deltacat/tests/_io/__init__.py +1 -0
  216. deltacat/tests/_io/reader/__init__.py +0 -0
  217. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  218. deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +8 -4
  219. deltacat/tests/aws/test_s3u.py +2 -31
  220. deltacat/tests/catalog/data/__init__.py +0 -0
  221. deltacat/tests/catalog/main/__init__.py +0 -0
  222. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  223. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  224. deltacat/tests/catalog/model/__init__.py +0 -0
  225. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  226. deltacat/tests/catalog/test_catalogs.py +321 -0
  227. deltacat/tests/catalog/test_default_catalog_impl.py +12154 -66
  228. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
  229. deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
  230. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
  231. deltacat/tests/compute/compact_partition_test_cases.py +23 -30
  232. deltacat/tests/compute/compactor/steps/test_repartition.py +14 -14
  233. deltacat/tests/compute/compactor/utils/test_io.py +125 -123
  234. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  235. deltacat/tests/compute/compactor_v2/test_compaction_session.py +387 -830
  236. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +70 -57
  237. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +0 -3
  238. deltacat/tests/compute/conftest.py +39 -0
  239. deltacat/tests/compute/converter/__init__.py +0 -0
  240. deltacat/tests/compute/converter/conftest.py +80 -0
  241. deltacat/tests/compute/converter/test_convert_session.py +826 -0
  242. deltacat/tests/compute/converter/utils.py +132 -0
  243. deltacat/tests/compute/resource_estimation/test_delta.py +88 -104
  244. deltacat/tests/compute/test_compact_partition_incremental.py +91 -98
  245. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +79 -97
  246. deltacat/tests/compute/test_compact_partition_params.py +16 -11
  247. deltacat/tests/compute/test_compact_partition_rebase.py +63 -93
  248. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +249 -220
  249. deltacat/tests/compute/test_janitor.py +236 -0
  250. deltacat/tests/compute/test_util_common.py +726 -46
  251. deltacat/tests/compute/test_util_constant.py +0 -1
  252. deltacat/tests/conftest.py +25 -0
  253. deltacat/tests/daft/__init__.py +0 -0
  254. deltacat/tests/daft/test_model.py +97 -0
  255. deltacat/tests/experimental/__init__.py +1 -0
  256. deltacat/tests/experimental/catalog/__init__.py +0 -0
  257. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  258. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  259. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  260. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  261. deltacat/tests/experimental/daft/__init__.py +0 -0
  262. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  263. deltacat/tests/experimental/storage/__init__.py +0 -0
  264. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  265. deltacat/tests/experimental/storage/rivulet/conftest.py +149 -0
  266. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  267. deltacat/tests/experimental/storage/rivulet/fs/test_file_location_provider.py +94 -0
  268. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  269. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  270. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  271. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  272. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  273. deltacat/tests/experimental/storage/rivulet/schema/test_schema.py +241 -0
  274. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  275. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  276. deltacat/tests/experimental/storage/rivulet/test_dataset.py +408 -0
  277. deltacat/tests/experimental/storage/rivulet/test_manifest.py +67 -0
  278. deltacat/tests/experimental/storage/rivulet/test_sst_interval_tree.py +232 -0
  279. deltacat/tests/experimental/storage/rivulet/test_utils.py +124 -0
  280. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  281. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_write_then_read.py +343 -0
  282. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_writer.py +79 -0
  283. deltacat/tests/experimental/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
  284. deltacat/tests/storage/__init__.py +0 -0
  285. deltacat/tests/storage/main/__init__.py +0 -0
  286. deltacat/tests/storage/main/test_main_storage.py +8204 -0
  287. deltacat/tests/storage/model/__init__.py +0 -0
  288. deltacat/tests/storage/model/test_delete_parameters.py +21 -0
  289. deltacat/tests/storage/model/test_expression.py +327 -0
  290. deltacat/tests/storage/model/test_manifest.py +129 -0
  291. deltacat/tests/storage/model/test_metafile_io.py +2440 -0
  292. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  293. deltacat/tests/storage/model/test_schema.py +479 -0
  294. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  295. deltacat/tests/storage/model/test_shard.py +24 -0
  296. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  297. deltacat/tests/storage/model/test_table_version.py +110 -0
  298. deltacat/tests/storage/model/test_transaction.py +653 -0
  299. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  300. deltacat/tests/test_deltacat_api.py +1064 -0
  301. deltacat/tests/test_exceptions.py +9 -5
  302. deltacat/tests/test_utils/filesystem.py +14 -0
  303. deltacat/tests/test_utils/message_pack_utils.py +54 -0
  304. deltacat/tests/test_utils/pyarrow.py +50 -26
  305. deltacat/tests/test_utils/storage.py +256 -4
  306. deltacat/tests/types/__init__.py +0 -0
  307. deltacat/tests/types/test_tables.py +104 -0
  308. deltacat/tests/utils/exceptions.py +22 -0
  309. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  310. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  311. deltacat/tests/utils/test_daft.py +124 -34
  312. deltacat/tests/utils/test_numpy.py +1193 -0
  313. deltacat/tests/utils/test_pandas.py +1106 -0
  314. deltacat/tests/utils/test_polars.py +1040 -0
  315. deltacat/tests/utils/test_pyarrow.py +1107 -258
  316. deltacat/types/media.py +345 -37
  317. deltacat/types/partial_download.py +1 -1
  318. deltacat/types/tables.py +2345 -47
  319. deltacat/utils/arguments.py +33 -1
  320. deltacat/utils/daft.py +824 -40
  321. deltacat/utils/export.py +61 -0
  322. deltacat/utils/filesystem.py +450 -0
  323. deltacat/utils/metafile_locator.py +74 -0
  324. deltacat/utils/numpy.py +118 -26
  325. deltacat/utils/pandas.py +577 -48
  326. deltacat/utils/polars.py +759 -0
  327. deltacat/utils/pyarrow.py +1212 -178
  328. deltacat/utils/ray_utils/concurrency.py +1 -1
  329. deltacat/utils/ray_utils/dataset.py +101 -10
  330. deltacat/utils/ray_utils/runtime.py +56 -4
  331. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  332. deltacat/utils/url.py +1325 -0
  333. deltacat-2.0.0.dist-info/METADATA +1163 -0
  334. deltacat-2.0.0.dist-info/RECORD +439 -0
  335. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
  336. deltacat/aws/redshift/__init__.py +0 -19
  337. deltacat/aws/redshift/model/manifest.py +0 -394
  338. deltacat/catalog/default_catalog_impl/__init__.py +0 -369
  339. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  340. deltacat/compute/merge_on_read/__init__.py +0 -4
  341. deltacat/compute/merge_on_read/daft.py +0 -40
  342. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  343. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  344. deltacat/io/dataset.py +0 -73
  345. deltacat/io/read_api.py +0 -143
  346. deltacat/storage/model/delete_parameters.py +0 -40
  347. deltacat/storage/model/partition_spec.py +0 -71
  348. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  349. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -397
  350. deltacat/tests/local_deltacat_storage/__init__.py +0 -1262
  351. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  352. deltacat/utils/s3fs.py +0 -21
  353. deltacat-1.1.38.dist-info/METADATA +0 -64
  354. deltacat-1.1.38.dist-info/RECORD +0 -219
  355. /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
  356. /deltacat/{compute/merge_on_read/model → catalog/main}/__init__.py +0 -0
  357. /deltacat/compute/{merge_on_read/utils → converter}/__init__.py +0 -0
  358. /deltacat/{io/aws → compute/converter/model}/__init__.py +0 -0
  359. /deltacat/{io/aws/redshift → compute/converter/pyiceberg}/__init__.py +0 -0
  360. /deltacat/{tests/io → compute/converter/steps}/__init__.py +0 -0
  361. /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
  362. /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
  363. /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
  364. /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
  365. /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
  366. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
  367. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
@@ -11,15 +11,15 @@ from deltacat.exceptions import (
11
11
  UnclassifiedDeltaCatError,
12
12
  )
13
13
  from daft.exceptions import DaftTransientError
14
- from deltacat.tests.local_deltacat_storage.exceptions import (
14
+ from deltacat.tests.utils.exceptions import (
15
15
  InvalidNamespaceError,
16
- LocalStorageValidationError,
16
+ MainStorageValidationError,
17
17
  )
18
+ from deltacat.tests.utils import main_deltacat_storage_mock as ds
18
19
  from botocore.exceptions import NoCredentialsError
19
20
  from tenacity import retry, retry_if_exception_type, stop_after_attempt
20
21
 
21
22
  from pyarrow.lib import ArrowCapacityError
22
- import deltacat.tests.local_deltacat_storage as ds
23
23
 
24
24
 
25
25
  class MockUnknownException(Exception):
@@ -41,7 +41,7 @@ def mock_remote_task(exception_to_raise):
41
41
  mock_raise_exception(exception_to_raise)
42
42
 
43
43
 
44
- class TestCategorizeErrors(unittest.TestCase):
44
+ class TestCategorizeErrorsMain(unittest.TestCase):
45
45
  def test_pyarrow_exception_categorizer(self):
46
46
  self.assertRaises(
47
47
  DependencyPyarrowCapacityError,
@@ -50,7 +50,7 @@ class TestCategorizeErrors(unittest.TestCase):
50
50
 
51
51
  def test_storage_exception_categorizer(self):
52
52
  self.assertRaises(
53
- LocalStorageValidationError,
53
+ MainStorageValidationError,
54
54
  lambda: mock_raise_exception(InvalidNamespaceError, deltacat_storage=ds),
55
55
  )
56
56
 
@@ -98,3 +98,7 @@ class TestCategorizeErrors(unittest.TestCase):
98
98
  return
99
99
 
100
100
  self.assertFalse(True)
101
+
102
+
103
+ if __name__ == "__main__":
104
+ unittest.main()
@@ -0,0 +1,14 @@
1
+ import shutil
2
+ import tempfile
3
+
4
+
5
+ from contextlib import contextmanager
6
+
7
+
8
+ @contextmanager
9
+ def temp_dir_autocleanup():
10
+ tmpdir = tempfile.mkdtemp()
11
+ try:
12
+ yield tmpdir
13
+ finally:
14
+ shutil.rmtree(tmpdir)
@@ -0,0 +1,54 @@
1
+ import base64
2
+ import msgpack
3
+ import json
4
+ import os
5
+ import shutil
6
+
7
+ from tempfile import mkdtemp
8
+
9
+
10
+ def _convert_bytes_to_base64_str(obj):
11
+ if isinstance(obj, dict):
12
+ for key, value in obj.items():
13
+ if isinstance(value, bytes):
14
+ obj[key] = base64.b64encode(value).decode("utf-8")
15
+ elif isinstance(value, list):
16
+ _convert_bytes_to_base64_str(value)
17
+ elif isinstance(value, dict):
18
+ _convert_bytes_to_base64_str(value)
19
+ elif isinstance(obj, list):
20
+ for i, item in enumerate(obj):
21
+ if isinstance(item, bytes):
22
+ obj[i] = base64.b64encode(item).decode("utf-8")
23
+ elif isinstance(item, (dict, list)):
24
+ _convert_bytes_to_base64_str(item)
25
+
26
+
27
+ def copy_and_convert(src_dir, dst_dir=None):
28
+ """
29
+ Helper function for copying a metastore recursively and converting all
30
+ messagepack files to json. This can be used manually to more easily
31
+ introspect metastore metadata.
32
+ """
33
+ if dst_dir is None:
34
+ dst_dir = mkdtemp()
35
+ print(f"destination is: {dst_dir}")
36
+ if not os.path.exists(dst_dir):
37
+ os.makedirs(dst_dir)
38
+
39
+ for item in os.listdir(src_dir):
40
+ src_path = os.path.join(src_dir, item)
41
+ dst_path = os.path.join(dst_dir, item)
42
+
43
+ if os.path.isdir(src_path):
44
+ copy_and_convert(src_path, dst_path)
45
+ else:
46
+ if item.endswith(".mpk"):
47
+ with open(src_path, "rb") as f:
48
+ data = msgpack.unpackb(f.read(), raw=False)
49
+ _convert_bytes_to_base64_str(data)
50
+ dst_path = dst_path[:-4] + ".json"
51
+ with open(dst_path, "w") as f:
52
+ json.dump(data, f)
53
+ else:
54
+ shutil.copy2(src_path, dst_path)
@@ -1,8 +1,9 @@
1
1
  from typing import List, Optional, Union
2
2
  import pyarrow as pa
3
3
  from deltacat.storage import Delta, Partition, PartitionLocator, DeltaLocator
4
- import deltacat.tests.local_deltacat_storage as ds
4
+ from deltacat.storage import metastore
5
5
  from deltacat.types.media import StorageType, ContentType
6
+ from deltacat.storage.model.schema import Schema
6
7
 
7
8
 
8
9
  def create_delta_from_csv_file(
@@ -14,60 +15,89 @@ def create_delta_from_csv_file(
14
15
  *args,
15
16
  **kwargs,
16
17
  ) -> Delta:
18
+ assert file_paths is not None, "file_paths cannot be empty"
19
+ pa_table = create_table_from_csv_file_paths(file_paths)
20
+ schema = Schema.of(pa_table.schema)
17
21
  staged_partition = stage_partition_from_file_paths(
18
22
  namespace,
19
23
  file_paths,
24
+ schema,
20
25
  *args,
21
26
  table_name=table_name,
22
27
  table_version=table_version,
23
28
  **kwargs,
24
29
  )
25
30
  committed_delta = commit_delta_to_staged_partition(
26
- staged_partition, file_paths, content_type=content_type, *args, **kwargs
31
+ staged_partition,
32
+ pa_table,
33
+ content_type,
34
+ *args,
35
+ **kwargs,
27
36
  )
28
37
  return committed_delta
29
38
 
30
39
 
40
+ def create_table_from_csv_file_paths(
41
+ file_paths: List[str],
42
+ ) -> pa.Table:
43
+ tables = []
44
+ for file_path in file_paths:
45
+ table = pa.csv.read_csv(file_path)
46
+ tables.append(table)
47
+ return pa.concat_tables(tables)
48
+
49
+
31
50
  def stage_partition_from_file_paths(
32
51
  namespace: str,
33
52
  file_paths: List[str],
53
+ schema: Schema,
34
54
  table_name: Optional[str] = None,
35
55
  table_version: int = 1,
36
56
  *args,
37
57
  **kwargs,
38
58
  ) -> Partition:
39
- ds.create_namespace(namespace, {}, **kwargs)
59
+ if not metastore.namespace_exists(namespace, **kwargs):
60
+ metastore.create_namespace(namespace, **kwargs)
40
61
  if table_name is None:
41
62
  table_name = "-".join(file_paths).replace("/", "_")
42
- ds.create_table_version(namespace, table_name, str(table_version), **kwargs)
43
- stream = ds.get_stream(namespace, table_name, str(table_version), **kwargs)
44
- staged_partition = ds.stage_partition(stream, [], **kwargs)
63
+ metastore.create_table_version(
64
+ namespace,
65
+ table_name,
66
+ str(table_version),
67
+ schema=schema,
68
+ **kwargs,
69
+ )
70
+ stream = metastore.get_stream(
71
+ namespace,
72
+ table_name,
73
+ str(table_version),
74
+ **kwargs,
75
+ )
76
+ staged_partition = metastore.stage_partition(stream, **kwargs)
45
77
  return staged_partition
46
78
 
47
79
 
48
80
  def commit_delta_to_staged_partition(
49
81
  staged_partition,
50
- file_paths: List[str] = None,
51
- pa_table: pa.Table = None,
82
+ pa_table: pa.Table,
52
83
  content_type: ContentType = ContentType.PARQUET,
53
84
  *args,
54
85
  **kwargs,
55
86
  ) -> Delta:
56
87
  committed_delta = commit_delta_to_partition(
57
88
  staged_partition,
89
+ pa_table,
90
+ content_type,
58
91
  *args,
59
- file_paths=file_paths,
60
- content_type=content_type,
61
- pa_table=pa_table,
62
92
  **kwargs,
63
93
  )
64
- ds.commit_partition(staged_partition, **kwargs)
94
+ metastore.commit_partition(staged_partition, **kwargs)
65
95
  return committed_delta
66
96
 
67
97
 
68
98
  def download_delta(delta_like: Union[Delta, DeltaLocator], *args, **kwargs) -> Delta:
69
99
  return pa.concat_tables(
70
- ds.download_delta(
100
+ metastore.download_delta(
71
101
  delta_like,
72
102
  storage_type=StorageType.LOCAL,
73
103
  *args,
@@ -78,7 +108,6 @@ def download_delta(delta_like: Union[Delta, DeltaLocator], *args, **kwargs) -> D
78
108
 
79
109
  def commit_delta_to_partition(
80
110
  partition: Union[Partition, PartitionLocator],
81
- file_paths: List[str] = None,
82
111
  pa_table: pa.Table = None,
83
112
  content_type: ContentType = ContentType.PARQUET,
84
113
  *args,
@@ -86,20 +115,15 @@ def commit_delta_to_partition(
86
115
  ) -> Delta:
87
116
 
88
117
  if isinstance(partition, PartitionLocator):
89
- partition = ds.get_partition(
118
+ partition = metastore.get_partition(
90
119
  partition.stream_locator, partition.partition_values, *args, **kwargs
91
120
  )
92
- if pa_table is None:
93
- assert file_paths is not None, "One of pa_table or file_paths must be passed."
94
- tables = []
95
- for file_path in file_paths:
96
- table = pa.csv.read_csv(file_path)
97
- tables.append(table)
98
-
99
- pa_table = pa.concat_tables(tables)
100
121
 
101
- staged_delta = ds.stage_delta(
102
- pa_table, partition, content_type=content_type, **kwargs
122
+ staged_delta = metastore.stage_delta(
123
+ pa_table,
124
+ partition,
125
+ content_type=content_type,
126
+ **kwargs,
103
127
  )
104
128
 
105
- return ds.commit_delta(staged_delta, **kwargs)
129
+ return metastore.commit_delta(staged_delta, **kwargs)
@@ -1,7 +1,57 @@
1
1
  from typing import Optional, Dict
2
2
 
3
- from deltacat.aws.redshift import Manifest, ManifestMeta
4
- from deltacat.storage import Partition, DeltaType, DeltaLocator, Delta
3
+ import pyarrow as pa
4
+
5
+ from deltacat import (
6
+ ContentEncoding,
7
+ ContentType,
8
+ )
9
+ from deltacat.storage import (
10
+ BucketTransform,
11
+ BucketTransformParameters,
12
+ BucketingStrategy,
13
+ CommitState,
14
+ Delta,
15
+ DeltaLocator,
16
+ DeltaType,
17
+ EntryParams,
18
+ EntryType,
19
+ Field,
20
+ LifecycleState,
21
+ ManifestAuthor,
22
+ ManifestEntry,
23
+ Namespace,
24
+ NamespaceLocator,
25
+ NullOrder,
26
+ Partition,
27
+ PartitionKey,
28
+ PartitionKeyList,
29
+ PartitionLocator,
30
+ PartitionScheme,
31
+ PartitionSchemeList,
32
+ Schema,
33
+ SchemaList,
34
+ SortScheme,
35
+ SortSchemeList,
36
+ SortKey,
37
+ SortKeyList,
38
+ SortOrder,
39
+ StreamLocator,
40
+ StreamFormat,
41
+ Stream,
42
+ Table,
43
+ TableLocator,
44
+ TableVersionLocator,
45
+ TableVersion,
46
+ TruncateTransform,
47
+ TruncateTransformParameters,
48
+ )
49
+
50
+ from deltacat.storage.model.manifest import (
51
+ Manifest,
52
+ ManifestMeta,
53
+ ManifestEntryList,
54
+ )
5
55
  from deltacat.utils.common import current_time_ms
6
56
 
7
57
 
@@ -13,11 +63,14 @@ def create_empty_delta(
13
63
  manifest_entry_id: Optional[str] = None,
14
64
  ) -> Delta:
15
65
  stream_position = current_time_ms()
16
- delta_locator = DeltaLocator.of(partition.locator, stream_position=stream_position)
66
+ delta_locator = DeltaLocator.of(
67
+ partition.locator,
68
+ stream_position=stream_position,
69
+ )
17
70
 
18
71
  if manifest_entry_id:
19
72
  manifest = Manifest.of(
20
- entries=[],
73
+ entries=ManifestEntryList.of([]),
21
74
  author=author,
22
75
  uuid=manifest_entry_id,
23
76
  )
@@ -32,3 +85,202 @@ def create_empty_delta(
32
85
  manifest=manifest,
33
86
  previous_stream_position=partition.stream_position,
34
87
  )
88
+
89
+
90
def create_test_namespace():
    """Build the ``Namespace`` test fixture named ``test_namespace``."""
    return Namespace.of(
        locator=NamespaceLocator.of(namespace="test_namespace"),
    )
93
+
94
+
95
def create_test_table():
    """Build a ``Table`` test fixture for ``test_namespace.test_table``."""
    locator = TableLocator.at(namespace="test_namespace", table_name="test_table")
    return Table.of(locator=locator, description="test table description")
104
+
105
+
106
def create_test_table_version():
    """Build a fully populated ``TableVersion`` test fixture.

    The fixture is version ``v.1`` of ``test_namespace.test_table``. It carries
    a three-field schema, a partition scheme with one bucket-transformed key,
    and a sort scheme with one truncate-transformed key.
    """
    version_locator = TableVersionLocator.at(
        namespace="test_namespace",
        table_name="test_table",
        table_version="v.1",
    )

    # (field_id, arrow field, is_merge_key) for every column of the schema.
    column_specs = (
        (1, pa.field("some_string", pa.string(), nullable=False), True),
        (2, pa.field("some_int32", pa.int32(), nullable=False), True),
        (3, pa.field("some_float64", pa.float64()), False),
    )
    test_schema = Schema.of(
        [
            Field.of(field=arrow_field, field_id=field_id, is_merge_key=merge_key)
            for field_id, arrow_field, merge_key in column_specs
        ]
    )

    # Single partition key over both merge-key columns, bucketed two ways.
    bucketed_key = PartitionKey.of(
        key=["some_string", "some_int32"],
        name="test_partition_key",
        field_id=1,
        transform=BucketTransform.of(
            BucketTransformParameters.of(
                num_buckets=2,
                bucketing_strategy=BucketingStrategy.DEFAULT,
            )
        ),
    )
    test_partition_scheme = PartitionScheme.of(
        keys=PartitionKeyList.of([bucketed_key]),
        name="test_partition_scheme",
        scheme_id="test_partition_scheme_id",
    )

    # Single descending sort key on the int column, truncated to width 3.
    truncated_key = SortKey.of(
        key=["some_int32"],
        sort_order=SortOrder.DESCENDING,
        null_order=NullOrder.AT_START,
        transform=TruncateTransform.of(
            TruncateTransformParameters.of(width=3),
        ),
    )
    test_sort_scheme = SortScheme.of(
        keys=SortKeyList.of([truncated_key]),
        name="test_sort_scheme",
        scheme_id="test_sort_scheme_id",
    )

    # The "current" schema/schemes are also registered as the only entries of
    # their respective history lists.
    return TableVersion.of(
        locator=version_locator,
        schema=test_schema,
        partition_scheme=test_partition_scheme,
        description="test table version description",
        properties={"test_property_key": "test_property_value"},
        content_types=[ContentType.PARQUET],
        sort_scheme=test_sort_scheme,
        watermark=None,
        lifecycle_state=LifecycleState.CREATED,
        schemas=SchemaList.of([test_schema]),
        partition_schemes=PartitionSchemeList.of([test_partition_scheme]),
        sort_schemes=SortSchemeList.of([test_sort_scheme]),
    )
179
+
180
+
181
def create_test_stream():
    """Build a staged ``Stream`` test fixture.

    The stream is ``test_stream_id`` (DELTACAT format) under version ``v.1``
    of ``test_namespace.test_table`` and uses a partition scheme with one
    bucket-transformed key.
    """
    locator = StreamLocator.at(
        namespace="test_namespace",
        table_name="test_table",
        table_version="v.1",
        stream_id="test_stream_id",
        stream_format=StreamFormat.DELTACAT,
    )

    # Single partition key over two columns, bucketed two ways.
    bucketed_key = PartitionKey.of(
        key=["some_string", "some_int32"],
        name="test_partition_key",
        field_id=1,
        transform=BucketTransform.of(
            BucketTransformParameters.of(
                num_buckets=2,
                bucketing_strategy=BucketingStrategy.DEFAULT,
            )
        ),
    )
    scheme = PartitionScheme.of(
        keys=PartitionKeyList.of([bucketed_key]),
        name="test_partition_scheme",
        scheme_id="test_partition_scheme_id",
    )

    return Stream.of(
        locator=locator,
        partition_scheme=scheme,
        state=CommitState.STAGED,
        previous_stream_id="test_previous_stream_id",
        watermark=1,
    )
215
+
216
+
217
def create_test_partition():
    """Build a staged ``Partition`` test fixture.

    The partition is ``test_partition_id`` with values ``["a", 1]`` inside
    stream ``test_stream_id`` of ``test_namespace.test_table`` version ``v.1``.
    """
    locator = PartitionLocator.at(
        namespace="test_namespace",
        table_name="test_table",
        table_version="v.1",
        stream_id="test_stream_id",
        stream_format=StreamFormat.DELTACAT,
        partition_values=["a", 1],
        partition_id="test_partition_id",
    )
    partition_fields = dict(
        locator=locator,
        content_types=[ContentType.PARQUET],
        state=CommitState.STAGED,
        previous_stream_position=0,
        previous_partition_id="test_previous_partition_id",
        stream_position=1,
        partition_scheme_id="test_partition_scheme_id",
    )
    return Partition.of(**partition_fields)
236
+
237
+
238
def create_test_delta():
    """Build an APPEND ``Delta`` test fixture with a one-entry manifest.

    The delta sits at stream position 1 of partition ``test_partition_id``.
    Its manifest and manifest metadata are both tagged as equality deletes
    keyed on ``some_string`` and ``some_int32``.
    """
    locator = DeltaLocator.at(
        namespace="test_namespace",
        table_name="test_table",
        table_version="v.1",
        stream_id="test_stream_id",
        stream_format=StreamFormat.DELTACAT,
        partition_values=["a", 1],
        partition_id="test_partition_id",
        stream_position=1,
    )

    # The same entry params object is shared by the metadata and the manifest.
    delete_params = EntryParams.of(
        equality_field_locators=["some_string", "some_int32"],
    )
    meta = ManifestMeta.of(
        record_count=1,
        content_length=10,
        content_type=ContentType.PARQUET.value,
        content_encoding=ContentEncoding.IDENTITY.value,
        source_content_length=100,
        credentials={"foo": "bar"},
        content_type_parameters=[{"param1": "value1"}],
        entry_type=EntryType.EQUALITY_DELETE,
        entry_params=delete_params,
    )

    only_entry = ManifestEntry.of(url="s3://test/url", meta=meta)
    author = ManifestAuthor.of(name="deltacat", version="2.0")
    test_manifest = Manifest.of(
        entries=ManifestEntryList([only_entry]),
        author=author,
        entry_type=EntryType.EQUALITY_DELETE,
        entry_params=delete_params,
    )

    return Delta.of(
        locator=locator,
        delta_type=DeltaType.APPEND,
        meta=meta,
        properties={"property1": "value1"},
        manifest=test_manifest,
        previous_stream_position=0,
    )
File without changes
@@ -0,0 +1,104 @@
1
+ import pytest
2
+ import pandas as pd
3
+ import pyarrow as pa
4
+
5
+ from deltacat.types.tables import (
6
+ to_pandas,
7
+ to_pyarrow,
8
+ get_table_length,
9
+ )
10
+
11
+
12
def test_convert_to_pandas_error_cases():
    """Check that ``to_pandas`` raises ``ValueError`` for non-table inputs."""
    expected_message = "No pandas conversion function found for table type"
    # Checked in order: None, an arbitrary string, and a plain dict.
    invalid_inputs = (None, "invalid_string", {"not": "a_dataframe"})
    for invalid_input in invalid_inputs:
        with pytest.raises(ValueError, match=expected_message):
            to_pandas(invalid_input)
31
+
32
+
33
def test_convert_to_arrow_error_cases():
    """Check that ``to_pyarrow`` raises ``ValueError`` for non-table inputs."""
    expected_message = "No pyarrow conversion function found for table type"
    # Checked in order: None, an arbitrary string, and a plain dict.
    invalid_inputs = (None, "invalid_string", {"not": "a_table"})
    for invalid_input in invalid_inputs:
        with pytest.raises(ValueError, match=expected_message):
            to_pyarrow(invalid_input)
52
+
53
+
54
def test_conversion_functions_with_real_data():
    """Exercise ``to_pandas`` and ``to_pyarrow`` on a real DataFrame and Table."""
    frame = pd.DataFrame({"id": [1, 2], "name": ["test1", "test2"]})
    table = pa.Table.from_pandas(frame)

    # pandas in, pandas out: the result must equal the input frame.
    frame_out = to_pandas(frame)
    assert isinstance(frame_out, pd.DataFrame)
    assert frame_out.equals(frame)

    # arrow in, arrow out: the result must equal the input table.
    table_out = to_pyarrow(table)
    assert isinstance(table_out, pa.Table)
    assert table_out.equals(table)

    # Cross-conversion: only the resulting container type is checked here.
    frame_from_table, table_from_frame = to_pandas(table), to_pyarrow(frame)
    assert isinstance(frame_from_table, pd.DataFrame)
    assert isinstance(table_from_frame, pa.Table)
75
+
76
+
77
def test_conversion_roundtrip_consistency():
    """Check that a pandas -> arrow -> pandas roundtrip keeps the data intact."""
    source_df = pd.DataFrame(
        {
            "id": [1, 2, 3, 4, 5],
            "name": ["Alice", "Bob", "Charlie", "Dave", "Eve"],
            "age": [25, 30, 35, 40, 45],
            "city": ["NYC", "LA", "Chicago", "Houston", "Phoenix"],
        }
    )

    # pandas -> arrow -> pandas
    as_arrow = to_pyarrow(source_df)
    restored_df = to_pandas(as_arrow)

    # Shape-level checks only; column dtypes are deliberately not compared.
    source_rows = get_table_length(source_df)
    restored_rows = get_table_length(restored_df)
    assert source_rows == restored_rows, "Row count should be preserved"

    source_columns = list(source_df.columns)
    restored_columns = list(restored_df.columns)
    assert source_columns == restored_columns, "Column names should be preserved"

    # The id values must survive exactly; row order is ignored by sorting.
    source_ids = sorted(source_df["id"].tolist())
    restored_ids = sorted(restored_df["id"].tolist())
    assert source_ids == restored_ids, "ID column should be preserved exactly"
@@ -0,0 +1,22 @@
1
+ """
2
+ Exception classes for main storage testing that mirror the local storage exceptions.
3
+ These are used to test the main metastore error categorization functionality.
4
+ """
5
+
6
+
7
class InvalidNamespaceError(Exception):
    """Signal that main storage was given an invalid namespace."""

    # Name under which this error is reported; matches the class name.
    error_name = "InvalidNamespaceError"
11
+
12
+
13
class MainStorageValidationError(Exception):
    """Signal that a main storage validation check failed."""

    # Name under which this error is reported; matches the class name.
    error_name = "MainStorageValidationError"
17
+
18
+
19
class MainStorageError(Exception):
    """Catch-all error for main storage operations."""

    # Name under which this error is reported; matches the class name.
    error_name = "MainStorageError"
@@ -0,0 +1,31 @@
1
+ """
2
+ Mock module that provides storage-specific error categorization functions for main storage testing.
3
+ """
4
+
5
+ from deltacat.tests.utils.exceptions import (
6
+ InvalidNamespaceError,
7
+ MainStorageValidationError,
8
+ )
9
+
10
+
11
def can_categorize(e: BaseException, **kwargs) -> bool:
    """
    Mock ``can_categorize`` hook used by the main storage tests.

    Args:
        e: The exception to inspect.
        **kwargs: Accepted and ignored.

    Returns:
        True when ``e`` is an ``InvalidNamespaceError``, False otherwise.
    """
    return isinstance(e, InvalidNamespaceError)
20
+
21
+
22
def raise_categorized_error(e: BaseException, **kwargs):
    """
    Mock ``raise_categorized_error`` hook used by the main storage tests.

    This function never returns normally; it always raises.

    Args:
        e: The exception to categorize.
        **kwargs: Accepted and ignored.

    Raises:
        MainStorageValidationError: When ``e`` is an ``InvalidNamespaceError``.
            The original error is attached as ``__cause__``.
        BaseException: ``e`` itself, unchanged, for any other exception.
    """
    if not isinstance(e, InvalidNamespaceError):
        # If we can't categorize it, re-raise the original exception
        raise e
    # Chain explicitly so the triggering error stays reachable from the
    # categorized one instead of being dropped.
    raise MainStorageValidationError("Namespace provided is invalid!") from e