deltacat 1.1.38__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (367)
  1. deltacat/__init__.py +150 -12
  2. deltacat/annotations.py +36 -0
  3. deltacat/api.py +578 -0
  4. deltacat/aws/constants.py +0 -23
  5. deltacat/aws/s3u.py +4 -631
  6. deltacat/benchmarking/benchmark_engine.py +84 -0
  7. deltacat/benchmarking/benchmark_report.py +86 -0
  8. deltacat/benchmarking/benchmark_suite.py +11 -0
  9. deltacat/benchmarking/conftest.py +22 -19
  10. deltacat/benchmarking/data/random_row_generator.py +94 -0
  11. deltacat/benchmarking/data/row_generator.py +10 -0
  12. deltacat/benchmarking/test_benchmark_pipeline.py +108 -0
  13. deltacat/catalog/__init__.py +73 -0
  14. deltacat/catalog/delegate.py +615 -140
  15. deltacat/catalog/interface.py +404 -81
  16. deltacat/catalog/main/impl.py +2882 -0
  17. deltacat/catalog/model/catalog.py +348 -46
  18. deltacat/catalog/model/properties.py +155 -0
  19. deltacat/catalog/model/table_definition.py +32 -1
  20. deltacat/compute/__init__.py +14 -0
  21. deltacat/compute/compactor/compaction_session.py +97 -75
  22. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  23. deltacat/compute/compactor/model/compaction_session_audit_info.py +23 -30
  24. deltacat/compute/compactor/model/delta_annotated.py +3 -3
  25. deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
  26. deltacat/compute/compactor/model/delta_file_locator.py +3 -1
  27. deltacat/compute/compactor/model/round_completion_info.py +19 -9
  28. deltacat/compute/compactor/model/table_object_store.py +3 -2
  29. deltacat/compute/compactor/repartition_session.py +9 -22
  30. deltacat/compute/compactor/steps/dedupe.py +11 -4
  31. deltacat/compute/compactor/steps/hash_bucket.py +6 -6
  32. deltacat/compute/compactor/steps/materialize.py +15 -9
  33. deltacat/compute/compactor/steps/repartition.py +12 -11
  34. deltacat/compute/compactor/utils/io.py +7 -6
  35. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  36. deltacat/compute/compactor/utils/sort_key.py +9 -2
  37. deltacat/compute/compactor/utils/system_columns.py +3 -1
  38. deltacat/compute/compactor_v2/compaction_session.py +13 -14
  39. deltacat/compute/compactor_v2/deletes/utils.py +3 -3
  40. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  41. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  42. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  43. deltacat/compute/compactor_v2/model/merge_input.py +28 -9
  44. deltacat/compute/compactor_v2/private/compaction_utils.py +171 -73
  45. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  46. deltacat/compute/compactor_v2/steps/merge.py +156 -53
  47. deltacat/compute/compactor_v2/utils/content_type_params.py +17 -6
  48. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  49. deltacat/compute/compactor_v2/utils/io.py +10 -3
  50. deltacat/compute/compactor_v2/utils/merge.py +14 -2
  51. deltacat/compute/compactor_v2/utils/task_options.py +2 -10
  52. deltacat/compute/converter/constants.py +9 -0
  53. deltacat/compute/converter/converter_session.py +298 -0
  54. deltacat/compute/converter/model/convert_input.py +96 -0
  55. deltacat/compute/converter/model/convert_input_files.py +78 -0
  56. deltacat/compute/converter/model/convert_result.py +80 -0
  57. deltacat/compute/converter/model/converter_session_params.py +144 -0
  58. deltacat/compute/converter/pyiceberg/catalog.py +78 -0
  59. deltacat/compute/converter/pyiceberg/overrides.py +263 -0
  60. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +299 -0
  61. deltacat/compute/converter/steps/convert.py +366 -0
  62. deltacat/compute/converter/steps/dedupe.py +94 -0
  63. deltacat/compute/converter/utils/__init__.py +0 -0
  64. deltacat/compute/converter/utils/convert_task_options.py +132 -0
  65. deltacat/compute/converter/utils/converter_session_utils.py +175 -0
  66. deltacat/compute/converter/utils/iceberg_columns.py +87 -0
  67. deltacat/compute/converter/utils/io.py +203 -0
  68. deltacat/compute/converter/utils/s3u.py +148 -0
  69. deltacat/compute/janitor.py +205 -0
  70. deltacat/compute/jobs/__init__.py +0 -0
  71. deltacat/compute/jobs/client.py +417 -0
  72. deltacat/compute/resource_estimation/delta.py +11 -1
  73. deltacat/constants.py +90 -1
  74. deltacat/docs/__init__.py +0 -0
  75. deltacat/docs/autogen/__init__.py +0 -0
  76. deltacat/docs/autogen/schema/__init__.py +0 -0
  77. deltacat/docs/autogen/schema/inference/__init__.py +0 -0
  78. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  79. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  80. deltacat/env.py +61 -0
  81. deltacat/examples/__init__.py +0 -0
  82. deltacat/examples/basic_logging.py +101 -0
  83. deltacat/examples/compactor/__init__.py +0 -0
  84. deltacat/examples/compactor/aws/__init__.py +1 -0
  85. deltacat/examples/compactor/bootstrap.py +863 -0
  86. deltacat/examples/compactor/compactor.py +373 -0
  87. deltacat/examples/compactor/explorer.py +473 -0
  88. deltacat/examples/compactor/gcp/__init__.py +1 -0
  89. deltacat/examples/compactor/job_runner.py +439 -0
  90. deltacat/examples/compactor/utils/__init__.py +1 -0
  91. deltacat/examples/compactor/utils/common.py +261 -0
  92. deltacat/examples/experimental/__init__.py +0 -0
  93. deltacat/examples/experimental/iceberg/__init__.py +0 -0
  94. deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
  95. deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
  96. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  97. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  98. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  99. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  100. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  101. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  102. deltacat/examples/experimental/iceberg/iceberg_bucket_writer.py +184 -0
  103. deltacat/examples/experimental/iceberg/iceberg_reader.py +147 -0
  104. deltacat/examples/hello_world.py +29 -0
  105. deltacat/examples/indexer/__init__.py +0 -0
  106. deltacat/examples/indexer/aws/__init__.py +0 -0
  107. deltacat/examples/indexer/gcp/__init__.py +0 -0
  108. deltacat/examples/indexer/indexer.py +163 -0
  109. deltacat/examples/indexer/job_runner.py +198 -0
  110. deltacat/exceptions.py +116 -12
  111. deltacat/experimental/__init__.py +0 -0
  112. deltacat/experimental/catalog/__init__.py +0 -0
  113. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  114. deltacat/experimental/catalog/iceberg/iceberg_catalog_config.py +26 -0
  115. deltacat/experimental/catalog/iceberg/impl.py +399 -0
  116. deltacat/experimental/catalog/iceberg/overrides.py +72 -0
  117. deltacat/experimental/compatibility/__init__.py +0 -0
  118. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  119. deltacat/experimental/converter_agent/__init__.py +0 -0
  120. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  121. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  122. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  123. deltacat/experimental/daft/__init__.py +4 -0
  124. deltacat/experimental/daft/daft_catalog.py +229 -0
  125. deltacat/experimental/storage/__init__.py +0 -0
  126. deltacat/experimental/storage/iceberg/__init__.py +0 -0
  127. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  128. deltacat/experimental/storage/iceberg/impl.py +739 -0
  129. deltacat/experimental/storage/iceberg/model.py +713 -0
  130. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  131. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  132. deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
  133. deltacat/experimental/storage/rivulet/arrow/serializer.py +78 -0
  134. deltacat/experimental/storage/rivulet/dataset.py +745 -0
  135. deltacat/experimental/storage/rivulet/dataset_executor.py +79 -0
  136. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  137. deltacat/experimental/storage/rivulet/feather/file_reader.py +138 -0
  138. deltacat/experimental/storage/rivulet/feather/serializer.py +35 -0
  139. deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
  140. deltacat/experimental/storage/rivulet/fs/file_provider.py +105 -0
  141. deltacat/experimental/storage/rivulet/fs/file_store.py +130 -0
  142. deltacat/experimental/storage/rivulet/fs/input_file.py +76 -0
  143. deltacat/experimental/storage/rivulet/fs/output_file.py +86 -0
  144. deltacat/experimental/storage/rivulet/logical_plan.py +105 -0
  145. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  146. deltacat/experimental/storage/rivulet/metastore/delta.py +188 -0
  147. deltacat/experimental/storage/rivulet/metastore/json_sst.py +105 -0
  148. deltacat/experimental/storage/rivulet/metastore/sst.py +82 -0
  149. deltacat/experimental/storage/rivulet/metastore/sst_interval_tree.py +260 -0
  150. deltacat/experimental/storage/rivulet/mvp/Table.py +101 -0
  151. deltacat/experimental/storage/rivulet/mvp/__init__.py +5 -0
  152. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  153. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  154. deltacat/experimental/storage/rivulet/parquet/file_reader.py +129 -0
  155. deltacat/experimental/storage/rivulet/parquet/serializer.py +37 -0
  156. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  157. deltacat/experimental/storage/rivulet/reader/block_scanner.py +389 -0
  158. deltacat/experimental/storage/rivulet/reader/data_reader.py +136 -0
  159. deltacat/experimental/storage/rivulet/reader/data_scan.py +65 -0
  160. deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +179 -0
  161. deltacat/experimental/storage/rivulet/reader/dataset_reader.py +158 -0
  162. deltacat/experimental/storage/rivulet/reader/pyarrow_data_reader.py +124 -0
  163. deltacat/experimental/storage/rivulet/reader/query_expression.py +99 -0
  164. deltacat/experimental/storage/rivulet/reader/reader_type_registrar.py +84 -0
  165. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  166. deltacat/experimental/storage/rivulet/schema/datatype.py +128 -0
  167. deltacat/experimental/storage/rivulet/schema/schema.py +251 -0
  168. deltacat/experimental/storage/rivulet/serializer.py +40 -0
  169. deltacat/experimental/storage/rivulet/serializer_factory.py +46 -0
  170. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  171. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  172. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  173. deltacat/experimental/storage/rivulet/writer/dataset_writer.py +29 -0
  174. deltacat/experimental/storage/rivulet/writer/memtable_dataset_writer.py +305 -0
  175. deltacat/io/__init__.py +13 -0
  176. deltacat/io/dataset/__init__.py +0 -0
  177. deltacat/io/dataset/deltacat_dataset.py +91 -0
  178. deltacat/io/datasink/__init__.py +0 -0
  179. deltacat/io/datasink/deltacat_datasink.py +207 -0
  180. deltacat/io/datasource/__init__.py +0 -0
  181. deltacat/io/datasource/deltacat_datasource.py +579 -0
  182. deltacat/io/reader/__init__.py +0 -0
  183. deltacat/io/reader/deltacat_read_api.py +172 -0
  184. deltacat/logs.py +4 -1
  185. deltacat/storage/__init__.py +138 -28
  186. deltacat/storage/interface.py +260 -155
  187. deltacat/storage/main/__init__.py +0 -0
  188. deltacat/storage/main/impl.py +3030 -0
  189. deltacat/storage/model/delta.py +142 -71
  190. deltacat/storage/model/expression/__init__.py +47 -0
  191. deltacat/storage/model/expression/expression.py +656 -0
  192. deltacat/storage/model/expression/visitor.py +248 -0
  193. deltacat/storage/model/interop.py +24 -0
  194. deltacat/storage/model/list_result.py +8 -0
  195. deltacat/storage/model/locator.py +93 -9
  196. deltacat/storage/model/manifest.py +643 -0
  197. deltacat/storage/model/metafile.py +1421 -0
  198. deltacat/storage/model/namespace.py +41 -18
  199. deltacat/storage/model/partition.py +443 -43
  200. deltacat/storage/model/scan/__init__.py +0 -0
  201. deltacat/storage/model/scan/push_down.py +46 -0
  202. deltacat/storage/model/scan/scan_plan.py +10 -0
  203. deltacat/storage/model/scan/scan_task.py +34 -0
  204. deltacat/storage/model/schema.py +3160 -0
  205. deltacat/storage/model/shard.py +51 -0
  206. deltacat/storage/model/sort_key.py +210 -13
  207. deltacat/storage/model/stream.py +215 -80
  208. deltacat/storage/model/table.py +134 -29
  209. deltacat/storage/model/table_version.py +333 -46
  210. deltacat/storage/model/transaction.py +1733 -0
  211. deltacat/storage/model/transform.py +274 -58
  212. deltacat/storage/model/types.py +138 -16
  213. deltacat/storage/util/__init__.py +0 -0
  214. deltacat/storage/util/scan_planner.py +26 -0
  215. deltacat/tests/_io/__init__.py +1 -0
  216. deltacat/tests/_io/reader/__init__.py +0 -0
  217. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  218. deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +8 -4
  219. deltacat/tests/aws/test_s3u.py +2 -31
  220. deltacat/tests/catalog/data/__init__.py +0 -0
  221. deltacat/tests/catalog/main/__init__.py +0 -0
  222. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  223. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  224. deltacat/tests/catalog/model/__init__.py +0 -0
  225. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  226. deltacat/tests/catalog/test_catalogs.py +321 -0
  227. deltacat/tests/catalog/test_default_catalog_impl.py +12154 -66
  228. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
  229. deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
  230. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
  231. deltacat/tests/compute/compact_partition_test_cases.py +23 -30
  232. deltacat/tests/compute/compactor/steps/test_repartition.py +14 -14
  233. deltacat/tests/compute/compactor/utils/test_io.py +125 -123
  234. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  235. deltacat/tests/compute/compactor_v2/test_compaction_session.py +387 -830
  236. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +70 -57
  237. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +0 -3
  238. deltacat/tests/compute/conftest.py +39 -0
  239. deltacat/tests/compute/converter/__init__.py +0 -0
  240. deltacat/tests/compute/converter/conftest.py +80 -0
  241. deltacat/tests/compute/converter/test_convert_session.py +826 -0
  242. deltacat/tests/compute/converter/utils.py +132 -0
  243. deltacat/tests/compute/resource_estimation/test_delta.py +88 -104
  244. deltacat/tests/compute/test_compact_partition_incremental.py +91 -98
  245. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +79 -97
  246. deltacat/tests/compute/test_compact_partition_params.py +16 -11
  247. deltacat/tests/compute/test_compact_partition_rebase.py +63 -93
  248. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +249 -220
  249. deltacat/tests/compute/test_janitor.py +236 -0
  250. deltacat/tests/compute/test_util_common.py +726 -46
  251. deltacat/tests/compute/test_util_constant.py +0 -1
  252. deltacat/tests/conftest.py +25 -0
  253. deltacat/tests/daft/__init__.py +0 -0
  254. deltacat/tests/daft/test_model.py +97 -0
  255. deltacat/tests/experimental/__init__.py +1 -0
  256. deltacat/tests/experimental/catalog/__init__.py +0 -0
  257. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  258. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  259. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  260. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  261. deltacat/tests/experimental/daft/__init__.py +0 -0
  262. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  263. deltacat/tests/experimental/storage/__init__.py +0 -0
  264. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  265. deltacat/tests/experimental/storage/rivulet/conftest.py +149 -0
  266. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  267. deltacat/tests/experimental/storage/rivulet/fs/test_file_location_provider.py +94 -0
  268. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  269. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  270. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  271. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  272. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  273. deltacat/tests/experimental/storage/rivulet/schema/test_schema.py +241 -0
  274. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  275. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  276. deltacat/tests/experimental/storage/rivulet/test_dataset.py +408 -0
  277. deltacat/tests/experimental/storage/rivulet/test_manifest.py +67 -0
  278. deltacat/tests/experimental/storage/rivulet/test_sst_interval_tree.py +232 -0
  279. deltacat/tests/experimental/storage/rivulet/test_utils.py +124 -0
  280. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  281. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_write_then_read.py +343 -0
  282. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_writer.py +79 -0
  283. deltacat/tests/experimental/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
  284. deltacat/tests/storage/__init__.py +0 -0
  285. deltacat/tests/storage/main/__init__.py +0 -0
  286. deltacat/tests/storage/main/test_main_storage.py +8204 -0
  287. deltacat/tests/storage/model/__init__.py +0 -0
  288. deltacat/tests/storage/model/test_delete_parameters.py +21 -0
  289. deltacat/tests/storage/model/test_expression.py +327 -0
  290. deltacat/tests/storage/model/test_manifest.py +129 -0
  291. deltacat/tests/storage/model/test_metafile_io.py +2440 -0
  292. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  293. deltacat/tests/storage/model/test_schema.py +479 -0
  294. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  295. deltacat/tests/storage/model/test_shard.py +24 -0
  296. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  297. deltacat/tests/storage/model/test_table_version.py +110 -0
  298. deltacat/tests/storage/model/test_transaction.py +653 -0
  299. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  300. deltacat/tests/test_deltacat_api.py +1064 -0
  301. deltacat/tests/test_exceptions.py +9 -5
  302. deltacat/tests/test_utils/filesystem.py +14 -0
  303. deltacat/tests/test_utils/message_pack_utils.py +54 -0
  304. deltacat/tests/test_utils/pyarrow.py +50 -26
  305. deltacat/tests/test_utils/storage.py +256 -4
  306. deltacat/tests/types/__init__.py +0 -0
  307. deltacat/tests/types/test_tables.py +104 -0
  308. deltacat/tests/utils/exceptions.py +22 -0
  309. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  310. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  311. deltacat/tests/utils/test_daft.py +124 -34
  312. deltacat/tests/utils/test_numpy.py +1193 -0
  313. deltacat/tests/utils/test_pandas.py +1106 -0
  314. deltacat/tests/utils/test_polars.py +1040 -0
  315. deltacat/tests/utils/test_pyarrow.py +1107 -258
  316. deltacat/types/media.py +345 -37
  317. deltacat/types/partial_download.py +1 -1
  318. deltacat/types/tables.py +2345 -47
  319. deltacat/utils/arguments.py +33 -1
  320. deltacat/utils/daft.py +824 -40
  321. deltacat/utils/export.py +61 -0
  322. deltacat/utils/filesystem.py +450 -0
  323. deltacat/utils/metafile_locator.py +74 -0
  324. deltacat/utils/numpy.py +118 -26
  325. deltacat/utils/pandas.py +577 -48
  326. deltacat/utils/polars.py +759 -0
  327. deltacat/utils/pyarrow.py +1212 -178
  328. deltacat/utils/ray_utils/concurrency.py +1 -1
  329. deltacat/utils/ray_utils/dataset.py +101 -10
  330. deltacat/utils/ray_utils/runtime.py +56 -4
  331. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  332. deltacat/utils/url.py +1325 -0
  333. deltacat-2.0.0.dist-info/METADATA +1163 -0
  334. deltacat-2.0.0.dist-info/RECORD +439 -0
  335. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
  336. deltacat/aws/redshift/__init__.py +0 -19
  337. deltacat/aws/redshift/model/manifest.py +0 -394
  338. deltacat/catalog/default_catalog_impl/__init__.py +0 -369
  339. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  340. deltacat/compute/merge_on_read/__init__.py +0 -4
  341. deltacat/compute/merge_on_read/daft.py +0 -40
  342. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  343. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  344. deltacat/io/dataset.py +0 -73
  345. deltacat/io/read_api.py +0 -143
  346. deltacat/storage/model/delete_parameters.py +0 -40
  347. deltacat/storage/model/partition_spec.py +0 -71
  348. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  349. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -397
  350. deltacat/tests/local_deltacat_storage/__init__.py +0 -1262
  351. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  352. deltacat/utils/s3fs.py +0 -21
  353. deltacat-1.1.38.dist-info/METADATA +0 -64
  354. deltacat-1.1.38.dist-info/RECORD +0 -219
  355. /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
  356. /deltacat/{compute/merge_on_read/model → catalog/main}/__init__.py +0 -0
  357. /deltacat/compute/{merge_on_read/utils → converter}/__init__.py +0 -0
  358. /deltacat/{io/aws → compute/converter/model}/__init__.py +0 -0
  359. /deltacat/{io/aws/redshift → compute/converter/pyiceberg}/__init__.py +0 -0
  360. /deltacat/{tests/io → compute/converter/steps}/__init__.py +0 -0
  361. /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
  362. /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
  363. /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
  364. /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
  365. /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
  366. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
  367. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,241 @@
1
+ import pytest
2
+ import pyarrow as pa
3
+ from deltacat.experimental.storage.rivulet import Schema, Field, Datatype
4
+
5
+
6
+ def test_field_initialization():
7
+ field = Field(name="test_field", datatype=Datatype.string(), is_merge_key=True)
8
+ assert field.name == "test_field"
9
+ assert field.datatype == Datatype.string()
10
+ assert field.is_merge_key
11
+
12
+
13
+ def test_schema_initialization():
14
+ fields = [("id", Datatype.int64()), ("name", Datatype.string())]
15
+ schema = Schema(fields, merge_keys=["id"])
16
+ assert len(schema) == 2
17
+ assert "id" in schema.keys()
18
+ assert schema["id"].datatype == Datatype.int64()
19
+ assert "name" in schema.keys()
20
+ assert schema["name"].datatype == Datatype.string()
21
+
22
+
23
+ def test_merge_key_conflict_on_init():
24
+ fields = [
25
+ Field("id", Datatype.int64(), is_merge_key=False), # Merge key off here
26
+ ("name", Datatype.string()),
27
+ ]
28
+ with pytest.raises(TypeError):
29
+ Schema(fields, merge_keys=["id"]) # Merge key on here
30
+
31
+
32
+ def test_simultaneous_duplicate_field():
33
+ with pytest.raises(ValueError):
34
+ Schema(
35
+ [
36
+ ("id", Datatype.int32()),
37
+ ("name", Datatype.string()),
38
+ ("age", Datatype.int32()),
39
+ ("age", Datatype.string()),
40
+ ],
41
+ merge_keys=["id"],
42
+ )
43
+
44
+
45
+ def test_add_field():
46
+ schema = Schema()
47
+ field = Field("new_field", Datatype.float(), True)
48
+ schema.add_field(field)
49
+ assert len(schema) == 1
50
+ assert "new_field" in schema.keys()
51
+ assert schema["new_field"].datatype == Datatype.float()
52
+
53
+ field2 = Field("another_field", Datatype.string(), True)
54
+ schema.add_field(field2)
55
+ assert len(schema) == 2
56
+ assert "another_field" in schema.keys()
57
+ assert schema["another_field"].datatype == Datatype.string()
58
+
59
+ with pytest.raises(ValueError):
60
+ schema.add_field(field2)
61
+
62
+
63
+ def test_setitem_field():
64
+ schema = Schema()
65
+ field = Field("test_field", Datatype.int64(), is_merge_key=True)
66
+ schema["test_field"] = field
67
+ assert schema["test_field"] == field
68
+
69
+
70
+ def test_setitem_datatype():
71
+ schema = Schema()
72
+ schema["id"] = (Datatype.int64(), True)
73
+ schema["test_field"] = Datatype.int64()
74
+ assert schema["test_field"].name == "test_field"
75
+ assert schema["test_field"].datatype == Datatype.int64()
76
+ assert not schema["test_field"].is_merge_key
77
+
78
+
79
+ def test_setitem_tuple_with_merge_key():
80
+ schema = Schema()
81
+ schema["test_field"] = (Datatype.int64(), True)
82
+ assert schema["test_field"].name == "test_field"
83
+ assert schema["test_field"].datatype == Datatype.int64()
84
+ assert schema["test_field"].is_merge_key
85
+
86
+
87
+ def test_setitem_invalid_type():
88
+ schema = Schema()
89
+ with pytest.raises(TypeError):
90
+ schema["test_field"] = "invalid"
91
+
92
+
93
+ def test_non_empty_merge_key():
94
+ with pytest.raises(TypeError):
95
+ _ = Schema([], merge_keys=["id"])
96
+
97
+
98
+ def test_merge_schemas():
99
+ schema1 = Schema([("id", Datatype.int64())], merge_keys=["id"])
100
+ schema2 = Schema(
101
+ [("other_id", Datatype.string()), ("name", Datatype.string())],
102
+ merge_keys="other_id",
103
+ )
104
+ schema1.merge(schema2)
105
+ assert len(schema1) == 3
106
+ assert "id" in schema1.keys()
107
+ assert "name" in schema1.keys()
108
+ assert "other_id" in schema1.keys()
109
+
110
+
111
+ def test_merge_schemas_same_merge_key():
112
+ schema1 = Schema(
113
+ [("id", Datatype.int64()), ("name", Datatype.string())], merge_keys=["id"]
114
+ )
115
+ schema2 = Schema(
116
+ [("id", Datatype.int64()), ("other_name", Datatype.string())],
117
+ merge_keys="id",
118
+ )
119
+ schema1.merge(schema2)
120
+ assert len(schema1) == 3
121
+ assert "id" in schema1.keys()
122
+ assert "name" in schema1.keys()
123
+ assert "other_name" in schema1.keys()
124
+
125
+
126
+ def test_merge_schema_conflict():
127
+ schema1 = Schema([("id", Datatype.int64())], merge_keys=["id"])
128
+ schema1_dup = Schema([("id", Datatype.int64())], merge_keys=["id"])
129
+ schema2 = Schema([("id", Datatype.string())], merge_keys=["id"])
130
+
131
+ with pytest.raises(ValueError):
132
+ schema1.merge(schema2)
133
+
134
+ schema1.merge(
135
+ schema1_dup
136
+ ) # Merging the same field is allowed (unlike using add_field)
137
+ assert schema1["id"].datatype == Datatype.int64()
138
+ assert len(schema1) == 1
139
+
140
+
141
+ def test_to_pyarrow_schema():
142
+ fields = [("id", Datatype.int64()), ("name", Datatype.string())]
143
+ schema = Schema(fields, merge_keys=["id"])
144
+ pa_schema = schema.to_pyarrow()
145
+ assert isinstance(pa_schema, pa.Schema)
146
+ assert len(pa_schema) == 2
147
+ assert pa_schema.field("id").type == pa.int64()
148
+ assert pa_schema.field("name").type == pa.string()
149
+
150
+
151
+ def test_from_pyarrow_schema():
152
+ pa_schema = pa.schema([("id", pa.int64()), ("name", pa.string())])
153
+ schema = Schema.from_pyarrow(pa_schema, merge_keys=["id"])
154
+ assert len(schema) == 2
155
+ assert schema["id"].is_merge_key
156
+
157
+
158
+ def test_from_pyarrow_schema_invalid_merge_keys():
159
+ pa_schema = pa.schema([("id", pa.int64()), ("name", pa.string())])
160
+ with pytest.raises(ValueError):
161
+ Schema.from_pyarrow(pa_schema, merge_keys=["bad_key"])
162
+
163
+
164
+ def test_get_field():
165
+ schema = Schema([("id", Datatype.int64())], merge_keys=["id"])
166
+ field = schema["id"]
167
+ assert field.name == "id"
168
+ assert field.datatype == Datatype.int64()
169
+
170
+
171
+ def test_set_field():
172
+ schema = Schema([("id", Datatype.int64())], merge_keys=["id"])
173
+ schema["name"] = Field("name", Datatype.string())
174
+ assert len(schema) == 2
175
+ assert "name" in schema.keys()
176
+ assert schema["name"].datatype == Datatype.string()
177
+
178
+
179
+ def test_delete_field():
180
+ schema = Schema(
181
+ [("name", Datatype.string()), ("zip", Datatype.int32())], merge_keys=["name"]
182
+ )
183
+ del schema["zip"]
184
+ assert "zip" not in schema.keys()
185
+ assert "name" in schema.keys()
186
+
187
+
188
+ def test_delete_merge_key_field():
189
+ schema = Schema([("id", Datatype.int64())], merge_keys=["id"])
190
+ with pytest.raises(ValueError):
191
+ del schema["id"]
192
+
193
+
194
+ def test_schema_iter():
195
+ fields = [
196
+ Field("id", Datatype.int32(), is_merge_key=True),
197
+ Field("name", Datatype.string()),
198
+ ]
199
+ schema = Schema(fields)
200
+ iter_result = list(iter(schema))
201
+ assert len(iter_result) == 2
202
+ assert all(isinstance(item, str) for item in iter_result)
203
+
204
+
205
+ def test_merge_all():
206
+ schema1 = Schema(
207
+ [
208
+ Field("id", Datatype.int64(), is_merge_key=True),
209
+ Field("name", Datatype.string()),
210
+ ]
211
+ )
212
+ schema2 = Schema(
213
+ [
214
+ Field("age", Datatype.int32()),
215
+ Field("email", Datatype.string(), is_merge_key=True),
216
+ ]
217
+ )
218
+ merged_schema = Schema.merge_all([schema1, schema2])
219
+ assert len(merged_schema) == 4
220
+
221
+
222
+ def test_schema_values():
223
+ fields = [
224
+ Field("id", Datatype.int64(), is_merge_key=True),
225
+ Field("name", Datatype.string()),
226
+ ]
227
+ schema = Schema(fields)
228
+ values = list(schema.values())
229
+ assert len(values) == 2
230
+ assert all(isinstance(v, Field) for v in values)
231
+
232
+
233
+ def test_schema_items():
234
+ fields = [
235
+ Field("id", Datatype.int64(), is_merge_key=True),
236
+ Field("name", Datatype.string()),
237
+ ]
238
+ schema = Schema(fields)
239
+ items = list(schema.items())
240
+ assert len(items) == 2
241
+ assert all(isinstance(k, str) and isinstance(v, Field) for k, v in items)
@@ -0,0 +1,162 @@
1
+ import pytest
2
+ import pyarrow as pa
3
+ import pyarrow.parquet as pq
4
+
5
+ from deltacat.experimental.storage.rivulet.dataset import Dataset
6
+ from deltacat.experimental.storage.rivulet.shard.range_shard import (
7
+ RangeShard,
8
+ RangeShardingStrategy,
9
+ )
10
+
11
+
12
+ @pytest.fixture
13
+ def sample_numeric_dataset(tmp_path):
14
+ """
15
+ Creates a small Parquet file with integer-based min/max keys and
16
+ initializes a Dataset from it. Merge key is 'id' with values [1,2,3].
17
+ So min_key=1, max_key=3.
18
+ """
19
+ data = {
20
+ "id": [1, 2, 3],
21
+ "name": ["Alice", "Bob", "Charlie"],
22
+ "age": [25, 30, 35],
23
+ }
24
+ table = pa.Table.from_pydict(data)
25
+ parquet_file = tmp_path / "numeric_data.parquet"
26
+ pq.write_table(table, parquet_file)
27
+
28
+ ds = Dataset.from_parquet(
29
+ name="numeric_dataset",
30
+ file_uri=str(parquet_file),
31
+ metadata_uri=tmp_path,
32
+ merge_keys="id",
33
+ )
34
+ return ds
35
+
36
+
37
+ @pytest.fixture
38
+ def sample_string_dataset(tmp_path):
39
+ """
40
+ Creates a small Parquet file with a string-based merge key ('name')
41
+ and initializes a Dataset from it. Merge key has values
42
+ ['Alice', 'Bob', 'Charlie'] => min_key='Alice', max_key='Charlie'.
43
+ """
44
+ data = {
45
+ "name": ["Alice", "Charlie", "Bob"], # random order
46
+ "value": [100, 200, 150],
47
+ }
48
+ table = pa.Table.from_pydict(data)
49
+ parquet_file = tmp_path / "string_data.parquet"
50
+ pq.write_table(table, parquet_file)
51
+
52
+ ds = Dataset.from_parquet(
53
+ name="string_dataset",
54
+ file_uri=str(parquet_file),
55
+ metadata_uri=tmp_path,
56
+ merge_keys="name",
57
+ )
58
+ return ds
59
+
60
+
61
+ def test_shards(sample_numeric_dataset, sample_string_dataset):
62
+ shards = sample_numeric_dataset.shards(num_shards=2)
63
+
64
+ num_shards = len(list(shards))
65
+ assert num_shards == 2
66
+
67
+ shard = shards[0]
68
+ records = list(sample_numeric_dataset.scan(shard=shard).to_pydict())
69
+ num_records = len(records)
70
+ assert num_records == 2
71
+
72
+ assert records[0]["id"] == 1
73
+ assert records[0]["name"] == "Alice"
74
+
75
+ assert records[1]["id"] == 2
76
+ assert records[1]["name"] == "Bob"
77
+
78
+
79
+ def test_range_shard_repr():
80
+ shard = RangeShard(min_key=5, max_key=15)
81
+ assert repr(shard) == "Shard(type=range, min_key=5, max_key=15)"
82
+
83
+
84
+ def test_range_shard_split_integers():
85
+ shards = RangeShard.split(global_min=1, global_max=10, num_shards=2)
86
+ assert len(shards) == 2
87
+
88
+ assert shards[0].min_key == 1
89
+ assert shards[0].max_key == 5
90
+ assert shards[1].min_key == 6
91
+ assert shards[1].max_key == 10
92
+
93
+
94
+ def test_range_shard_split_integers_single_shard():
95
+ shards = RangeShard.split(global_min=1, global_max=10, num_shards=1)
96
+ assert len(shards) == 1
97
+ assert shards[0].min_key == 1
98
+ assert shards[0].max_key == 10
99
+
100
+
101
+ def test_range_shard_split_integers_same_value():
102
+ shards = RangeShard.split(global_min=5, global_max=5, num_shards=3)
103
+ assert len(shards) == 1
104
+
105
+
106
+ def test_range_sharding_strategy_integers(sample_numeric_dataset):
107
+ strategy = RangeShardingStrategy()
108
+ shards = list(
109
+ strategy.shards(num_shards=2, metastore=sample_numeric_dataset._metastore)
110
+ )
111
+
112
+ assert len(shards) == 2, "Expected 2 shards for dataset with keys [1,2,3]"
113
+
114
+ shard1, shard2 = shards
115
+ assert isinstance(shard1, RangeShard)
116
+ assert isinstance(shard2, RangeShard)
117
+ assert shard1.min_key == 1
118
+ assert shard1.max_key == 2
119
+ assert shard2.min_key == 3
120
+ assert shard2.max_key == 3
121
+
122
+
123
+ def test_range_sharding_strategy_integers_single_shard(sample_numeric_dataset):
124
+ strategy = RangeShardingStrategy()
125
+ shards = list(
126
+ strategy.shards(num_shards=1, metastore=sample_numeric_dataset._metastore)
127
+ )
128
+ assert len(shards) == 1
129
+ shard = shards[0]
130
+ assert shard.min_key == 1
131
+ assert shard.max_key == 3
132
+
133
+
134
+ def test_range_sharding_strategy_strings(sample_string_dataset):
135
+ strategy = RangeShardingStrategy()
136
+ shards = list(
137
+ strategy.shards(num_shards=2, metastore=sample_string_dataset._metastore)
138
+ )
139
+
140
+ assert len(shards) == 2, "Expected 2 shards for string-based dataset"
141
+ shard1, shard2 = shards
142
+ assert isinstance(shard1, RangeShard)
143
+ assert isinstance(shard2, RangeShard)
144
+
145
+ assert shard1.min_key == "Alice"
146
+ assert shard1.max_key < "Charlie"
147
+
148
+ assert shard2.min_key == shard1.max_key
149
+ assert shard2.max_key == "Charlie"
150
+
151
+
152
+ def test_range_sharding_strategy_strings_single_shard(sample_string_dataset):
153
+ strategy = RangeShardingStrategy()
154
+ shards = list(
155
+ strategy.shards(num_shards=1, metastore=sample_string_dataset._metastore)
156
+ )
157
+
158
+ assert len(shards) == 1
159
+
160
+ shard = shards[0]
161
+ assert shard.min_key == "Alice"
162
+ assert shard.max_key == "Charlie"