deltacat 1.1.38__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (367) hide show
  1. deltacat/__init__.py +150 -12
  2. deltacat/annotations.py +36 -0
  3. deltacat/api.py +578 -0
  4. deltacat/aws/constants.py +0 -23
  5. deltacat/aws/s3u.py +4 -631
  6. deltacat/benchmarking/benchmark_engine.py +84 -0
  7. deltacat/benchmarking/benchmark_report.py +86 -0
  8. deltacat/benchmarking/benchmark_suite.py +11 -0
  9. deltacat/benchmarking/conftest.py +22 -19
  10. deltacat/benchmarking/data/random_row_generator.py +94 -0
  11. deltacat/benchmarking/data/row_generator.py +10 -0
  12. deltacat/benchmarking/test_benchmark_pipeline.py +108 -0
  13. deltacat/catalog/__init__.py +73 -0
  14. deltacat/catalog/delegate.py +615 -140
  15. deltacat/catalog/interface.py +404 -81
  16. deltacat/catalog/main/impl.py +2882 -0
  17. deltacat/catalog/model/catalog.py +348 -46
  18. deltacat/catalog/model/properties.py +155 -0
  19. deltacat/catalog/model/table_definition.py +32 -1
  20. deltacat/compute/__init__.py +14 -0
  21. deltacat/compute/compactor/compaction_session.py +97 -75
  22. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  23. deltacat/compute/compactor/model/compaction_session_audit_info.py +23 -30
  24. deltacat/compute/compactor/model/delta_annotated.py +3 -3
  25. deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
  26. deltacat/compute/compactor/model/delta_file_locator.py +3 -1
  27. deltacat/compute/compactor/model/round_completion_info.py +19 -9
  28. deltacat/compute/compactor/model/table_object_store.py +3 -2
  29. deltacat/compute/compactor/repartition_session.py +9 -22
  30. deltacat/compute/compactor/steps/dedupe.py +11 -4
  31. deltacat/compute/compactor/steps/hash_bucket.py +6 -6
  32. deltacat/compute/compactor/steps/materialize.py +15 -9
  33. deltacat/compute/compactor/steps/repartition.py +12 -11
  34. deltacat/compute/compactor/utils/io.py +7 -6
  35. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  36. deltacat/compute/compactor/utils/sort_key.py +9 -2
  37. deltacat/compute/compactor/utils/system_columns.py +3 -1
  38. deltacat/compute/compactor_v2/compaction_session.py +13 -14
  39. deltacat/compute/compactor_v2/deletes/utils.py +3 -3
  40. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  41. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  42. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  43. deltacat/compute/compactor_v2/model/merge_input.py +28 -9
  44. deltacat/compute/compactor_v2/private/compaction_utils.py +171 -73
  45. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  46. deltacat/compute/compactor_v2/steps/merge.py +156 -53
  47. deltacat/compute/compactor_v2/utils/content_type_params.py +17 -6
  48. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  49. deltacat/compute/compactor_v2/utils/io.py +10 -3
  50. deltacat/compute/compactor_v2/utils/merge.py +14 -2
  51. deltacat/compute/compactor_v2/utils/task_options.py +2 -10
  52. deltacat/compute/converter/constants.py +9 -0
  53. deltacat/compute/converter/converter_session.py +298 -0
  54. deltacat/compute/converter/model/convert_input.py +96 -0
  55. deltacat/compute/converter/model/convert_input_files.py +78 -0
  56. deltacat/compute/converter/model/convert_result.py +80 -0
  57. deltacat/compute/converter/model/converter_session_params.py +144 -0
  58. deltacat/compute/converter/pyiceberg/catalog.py +78 -0
  59. deltacat/compute/converter/pyiceberg/overrides.py +263 -0
  60. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +299 -0
  61. deltacat/compute/converter/steps/convert.py +366 -0
  62. deltacat/compute/converter/steps/dedupe.py +94 -0
  63. deltacat/compute/converter/utils/__init__.py +0 -0
  64. deltacat/compute/converter/utils/convert_task_options.py +132 -0
  65. deltacat/compute/converter/utils/converter_session_utils.py +175 -0
  66. deltacat/compute/converter/utils/iceberg_columns.py +87 -0
  67. deltacat/compute/converter/utils/io.py +203 -0
  68. deltacat/compute/converter/utils/s3u.py +148 -0
  69. deltacat/compute/janitor.py +205 -0
  70. deltacat/compute/jobs/__init__.py +0 -0
  71. deltacat/compute/jobs/client.py +417 -0
  72. deltacat/compute/resource_estimation/delta.py +11 -1
  73. deltacat/constants.py +90 -1
  74. deltacat/docs/__init__.py +0 -0
  75. deltacat/docs/autogen/__init__.py +0 -0
  76. deltacat/docs/autogen/schema/__init__.py +0 -0
  77. deltacat/docs/autogen/schema/inference/__init__.py +0 -0
  78. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  79. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  80. deltacat/env.py +61 -0
  81. deltacat/examples/__init__.py +0 -0
  82. deltacat/examples/basic_logging.py +101 -0
  83. deltacat/examples/compactor/__init__.py +0 -0
  84. deltacat/examples/compactor/aws/__init__.py +1 -0
  85. deltacat/examples/compactor/bootstrap.py +863 -0
  86. deltacat/examples/compactor/compactor.py +373 -0
  87. deltacat/examples/compactor/explorer.py +473 -0
  88. deltacat/examples/compactor/gcp/__init__.py +1 -0
  89. deltacat/examples/compactor/job_runner.py +439 -0
  90. deltacat/examples/compactor/utils/__init__.py +1 -0
  91. deltacat/examples/compactor/utils/common.py +261 -0
  92. deltacat/examples/experimental/__init__.py +0 -0
  93. deltacat/examples/experimental/iceberg/__init__.py +0 -0
  94. deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
  95. deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
  96. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  97. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  98. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  99. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  100. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  101. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  102. deltacat/examples/experimental/iceberg/iceberg_bucket_writer.py +184 -0
  103. deltacat/examples/experimental/iceberg/iceberg_reader.py +147 -0
  104. deltacat/examples/hello_world.py +29 -0
  105. deltacat/examples/indexer/__init__.py +0 -0
  106. deltacat/examples/indexer/aws/__init__.py +0 -0
  107. deltacat/examples/indexer/gcp/__init__.py +0 -0
  108. deltacat/examples/indexer/indexer.py +163 -0
  109. deltacat/examples/indexer/job_runner.py +198 -0
  110. deltacat/exceptions.py +116 -12
  111. deltacat/experimental/__init__.py +0 -0
  112. deltacat/experimental/catalog/__init__.py +0 -0
  113. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  114. deltacat/experimental/catalog/iceberg/iceberg_catalog_config.py +26 -0
  115. deltacat/experimental/catalog/iceberg/impl.py +399 -0
  116. deltacat/experimental/catalog/iceberg/overrides.py +72 -0
  117. deltacat/experimental/compatibility/__init__.py +0 -0
  118. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  119. deltacat/experimental/converter_agent/__init__.py +0 -0
  120. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  121. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  122. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  123. deltacat/experimental/daft/__init__.py +4 -0
  124. deltacat/experimental/daft/daft_catalog.py +229 -0
  125. deltacat/experimental/storage/__init__.py +0 -0
  126. deltacat/experimental/storage/iceberg/__init__.py +0 -0
  127. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  128. deltacat/experimental/storage/iceberg/impl.py +739 -0
  129. deltacat/experimental/storage/iceberg/model.py +713 -0
  130. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  131. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  132. deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
  133. deltacat/experimental/storage/rivulet/arrow/serializer.py +78 -0
  134. deltacat/experimental/storage/rivulet/dataset.py +745 -0
  135. deltacat/experimental/storage/rivulet/dataset_executor.py +79 -0
  136. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  137. deltacat/experimental/storage/rivulet/feather/file_reader.py +138 -0
  138. deltacat/experimental/storage/rivulet/feather/serializer.py +35 -0
  139. deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
  140. deltacat/experimental/storage/rivulet/fs/file_provider.py +105 -0
  141. deltacat/experimental/storage/rivulet/fs/file_store.py +130 -0
  142. deltacat/experimental/storage/rivulet/fs/input_file.py +76 -0
  143. deltacat/experimental/storage/rivulet/fs/output_file.py +86 -0
  144. deltacat/experimental/storage/rivulet/logical_plan.py +105 -0
  145. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  146. deltacat/experimental/storage/rivulet/metastore/delta.py +188 -0
  147. deltacat/experimental/storage/rivulet/metastore/json_sst.py +105 -0
  148. deltacat/experimental/storage/rivulet/metastore/sst.py +82 -0
  149. deltacat/experimental/storage/rivulet/metastore/sst_interval_tree.py +260 -0
  150. deltacat/experimental/storage/rivulet/mvp/Table.py +101 -0
  151. deltacat/experimental/storage/rivulet/mvp/__init__.py +5 -0
  152. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  153. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  154. deltacat/experimental/storage/rivulet/parquet/file_reader.py +129 -0
  155. deltacat/experimental/storage/rivulet/parquet/serializer.py +37 -0
  156. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  157. deltacat/experimental/storage/rivulet/reader/block_scanner.py +389 -0
  158. deltacat/experimental/storage/rivulet/reader/data_reader.py +136 -0
  159. deltacat/experimental/storage/rivulet/reader/data_scan.py +65 -0
  160. deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +179 -0
  161. deltacat/experimental/storage/rivulet/reader/dataset_reader.py +158 -0
  162. deltacat/experimental/storage/rivulet/reader/pyarrow_data_reader.py +124 -0
  163. deltacat/experimental/storage/rivulet/reader/query_expression.py +99 -0
  164. deltacat/experimental/storage/rivulet/reader/reader_type_registrar.py +84 -0
  165. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  166. deltacat/experimental/storage/rivulet/schema/datatype.py +128 -0
  167. deltacat/experimental/storage/rivulet/schema/schema.py +251 -0
  168. deltacat/experimental/storage/rivulet/serializer.py +40 -0
  169. deltacat/experimental/storage/rivulet/serializer_factory.py +46 -0
  170. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  171. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  172. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  173. deltacat/experimental/storage/rivulet/writer/dataset_writer.py +29 -0
  174. deltacat/experimental/storage/rivulet/writer/memtable_dataset_writer.py +305 -0
  175. deltacat/io/__init__.py +13 -0
  176. deltacat/io/dataset/__init__.py +0 -0
  177. deltacat/io/dataset/deltacat_dataset.py +91 -0
  178. deltacat/io/datasink/__init__.py +0 -0
  179. deltacat/io/datasink/deltacat_datasink.py +207 -0
  180. deltacat/io/datasource/__init__.py +0 -0
  181. deltacat/io/datasource/deltacat_datasource.py +579 -0
  182. deltacat/io/reader/__init__.py +0 -0
  183. deltacat/io/reader/deltacat_read_api.py +172 -0
  184. deltacat/logs.py +4 -1
  185. deltacat/storage/__init__.py +138 -28
  186. deltacat/storage/interface.py +260 -155
  187. deltacat/storage/main/__init__.py +0 -0
  188. deltacat/storage/main/impl.py +3030 -0
  189. deltacat/storage/model/delta.py +142 -71
  190. deltacat/storage/model/expression/__init__.py +47 -0
  191. deltacat/storage/model/expression/expression.py +656 -0
  192. deltacat/storage/model/expression/visitor.py +248 -0
  193. deltacat/storage/model/interop.py +24 -0
  194. deltacat/storage/model/list_result.py +8 -0
  195. deltacat/storage/model/locator.py +93 -9
  196. deltacat/storage/model/manifest.py +643 -0
  197. deltacat/storage/model/metafile.py +1421 -0
  198. deltacat/storage/model/namespace.py +41 -18
  199. deltacat/storage/model/partition.py +443 -43
  200. deltacat/storage/model/scan/__init__.py +0 -0
  201. deltacat/storage/model/scan/push_down.py +46 -0
  202. deltacat/storage/model/scan/scan_plan.py +10 -0
  203. deltacat/storage/model/scan/scan_task.py +34 -0
  204. deltacat/storage/model/schema.py +3160 -0
  205. deltacat/storage/model/shard.py +51 -0
  206. deltacat/storage/model/sort_key.py +210 -13
  207. deltacat/storage/model/stream.py +215 -80
  208. deltacat/storage/model/table.py +134 -29
  209. deltacat/storage/model/table_version.py +333 -46
  210. deltacat/storage/model/transaction.py +1733 -0
  211. deltacat/storage/model/transform.py +274 -58
  212. deltacat/storage/model/types.py +138 -16
  213. deltacat/storage/util/__init__.py +0 -0
  214. deltacat/storage/util/scan_planner.py +26 -0
  215. deltacat/tests/_io/__init__.py +1 -0
  216. deltacat/tests/_io/reader/__init__.py +0 -0
  217. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  218. deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +8 -4
  219. deltacat/tests/aws/test_s3u.py +2 -31
  220. deltacat/tests/catalog/data/__init__.py +0 -0
  221. deltacat/tests/catalog/main/__init__.py +0 -0
  222. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  223. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  224. deltacat/tests/catalog/model/__init__.py +0 -0
  225. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  226. deltacat/tests/catalog/test_catalogs.py +321 -0
  227. deltacat/tests/catalog/test_default_catalog_impl.py +12154 -66
  228. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
  229. deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
  230. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
  231. deltacat/tests/compute/compact_partition_test_cases.py +23 -30
  232. deltacat/tests/compute/compactor/steps/test_repartition.py +14 -14
  233. deltacat/tests/compute/compactor/utils/test_io.py +125 -123
  234. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  235. deltacat/tests/compute/compactor_v2/test_compaction_session.py +387 -830
  236. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +70 -57
  237. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +0 -3
  238. deltacat/tests/compute/conftest.py +39 -0
  239. deltacat/tests/compute/converter/__init__.py +0 -0
  240. deltacat/tests/compute/converter/conftest.py +80 -0
  241. deltacat/tests/compute/converter/test_convert_session.py +826 -0
  242. deltacat/tests/compute/converter/utils.py +132 -0
  243. deltacat/tests/compute/resource_estimation/test_delta.py +88 -104
  244. deltacat/tests/compute/test_compact_partition_incremental.py +91 -98
  245. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +79 -97
  246. deltacat/tests/compute/test_compact_partition_params.py +16 -11
  247. deltacat/tests/compute/test_compact_partition_rebase.py +63 -93
  248. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +249 -220
  249. deltacat/tests/compute/test_janitor.py +236 -0
  250. deltacat/tests/compute/test_util_common.py +726 -46
  251. deltacat/tests/compute/test_util_constant.py +0 -1
  252. deltacat/tests/conftest.py +25 -0
  253. deltacat/tests/daft/__init__.py +0 -0
  254. deltacat/tests/daft/test_model.py +97 -0
  255. deltacat/tests/experimental/__init__.py +1 -0
  256. deltacat/tests/experimental/catalog/__init__.py +0 -0
  257. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  258. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  259. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  260. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  261. deltacat/tests/experimental/daft/__init__.py +0 -0
  262. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  263. deltacat/tests/experimental/storage/__init__.py +0 -0
  264. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  265. deltacat/tests/experimental/storage/rivulet/conftest.py +149 -0
  266. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  267. deltacat/tests/experimental/storage/rivulet/fs/test_file_location_provider.py +94 -0
  268. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  269. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  270. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  271. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  272. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  273. deltacat/tests/experimental/storage/rivulet/schema/test_schema.py +241 -0
  274. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  275. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  276. deltacat/tests/experimental/storage/rivulet/test_dataset.py +408 -0
  277. deltacat/tests/experimental/storage/rivulet/test_manifest.py +67 -0
  278. deltacat/tests/experimental/storage/rivulet/test_sst_interval_tree.py +232 -0
  279. deltacat/tests/experimental/storage/rivulet/test_utils.py +124 -0
  280. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  281. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_write_then_read.py +343 -0
  282. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_writer.py +79 -0
  283. deltacat/tests/experimental/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
  284. deltacat/tests/storage/__init__.py +0 -0
  285. deltacat/tests/storage/main/__init__.py +0 -0
  286. deltacat/tests/storage/main/test_main_storage.py +8204 -0
  287. deltacat/tests/storage/model/__init__.py +0 -0
  288. deltacat/tests/storage/model/test_delete_parameters.py +21 -0
  289. deltacat/tests/storage/model/test_expression.py +327 -0
  290. deltacat/tests/storage/model/test_manifest.py +129 -0
  291. deltacat/tests/storage/model/test_metafile_io.py +2440 -0
  292. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  293. deltacat/tests/storage/model/test_schema.py +479 -0
  294. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  295. deltacat/tests/storage/model/test_shard.py +24 -0
  296. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  297. deltacat/tests/storage/model/test_table_version.py +110 -0
  298. deltacat/tests/storage/model/test_transaction.py +653 -0
  299. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  300. deltacat/tests/test_deltacat_api.py +1064 -0
  301. deltacat/tests/test_exceptions.py +9 -5
  302. deltacat/tests/test_utils/filesystem.py +14 -0
  303. deltacat/tests/test_utils/message_pack_utils.py +54 -0
  304. deltacat/tests/test_utils/pyarrow.py +50 -26
  305. deltacat/tests/test_utils/storage.py +256 -4
  306. deltacat/tests/types/__init__.py +0 -0
  307. deltacat/tests/types/test_tables.py +104 -0
  308. deltacat/tests/utils/exceptions.py +22 -0
  309. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  310. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  311. deltacat/tests/utils/test_daft.py +124 -34
  312. deltacat/tests/utils/test_numpy.py +1193 -0
  313. deltacat/tests/utils/test_pandas.py +1106 -0
  314. deltacat/tests/utils/test_polars.py +1040 -0
  315. deltacat/tests/utils/test_pyarrow.py +1107 -258
  316. deltacat/types/media.py +345 -37
  317. deltacat/types/partial_download.py +1 -1
  318. deltacat/types/tables.py +2345 -47
  319. deltacat/utils/arguments.py +33 -1
  320. deltacat/utils/daft.py +824 -40
  321. deltacat/utils/export.py +61 -0
  322. deltacat/utils/filesystem.py +450 -0
  323. deltacat/utils/metafile_locator.py +74 -0
  324. deltacat/utils/numpy.py +118 -26
  325. deltacat/utils/pandas.py +577 -48
  326. deltacat/utils/polars.py +759 -0
  327. deltacat/utils/pyarrow.py +1212 -178
  328. deltacat/utils/ray_utils/concurrency.py +1 -1
  329. deltacat/utils/ray_utils/dataset.py +101 -10
  330. deltacat/utils/ray_utils/runtime.py +56 -4
  331. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  332. deltacat/utils/url.py +1325 -0
  333. deltacat-2.0.0.dist-info/METADATA +1163 -0
  334. deltacat-2.0.0.dist-info/RECORD +439 -0
  335. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
  336. deltacat/aws/redshift/__init__.py +0 -19
  337. deltacat/aws/redshift/model/manifest.py +0 -394
  338. deltacat/catalog/default_catalog_impl/__init__.py +0 -369
  339. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  340. deltacat/compute/merge_on_read/__init__.py +0 -4
  341. deltacat/compute/merge_on_read/daft.py +0 -40
  342. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  343. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  344. deltacat/io/dataset.py +0 -73
  345. deltacat/io/read_api.py +0 -143
  346. deltacat/storage/model/delete_parameters.py +0 -40
  347. deltacat/storage/model/partition_spec.py +0 -71
  348. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  349. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -397
  350. deltacat/tests/local_deltacat_storage/__init__.py +0 -1262
  351. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  352. deltacat/utils/s3fs.py +0 -21
  353. deltacat-1.1.38.dist-info/METADATA +0 -64
  354. deltacat-1.1.38.dist-info/RECORD +0 -219
  355. /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
  356. /deltacat/{compute/merge_on_read/model → catalog/main}/__init__.py +0 -0
  357. /deltacat/compute/{merge_on_read/utils → converter}/__init__.py +0 -0
  358. /deltacat/{io/aws → compute/converter/model}/__init__.py +0 -0
  359. /deltacat/{io/aws/redshift → compute/converter/pyiceberg}/__init__.py +0 -0
  360. /deltacat/{tests/io → compute/converter/steps}/__init__.py +0 -0
  361. /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
  362. /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
  363. /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
  364. /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
  365. /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
  366. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
  367. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
@@ -1,1262 +0,0 @@
1
- from typing import Any, Callable, Dict, List, Optional, Set, Union, Tuple
2
-
3
- import pyarrow as pa
4
- import daft
5
- import json
6
- import sqlite3
7
- from sqlite3 import Cursor, Connection
8
- import uuid
9
- import ray
10
- import io
11
-
12
- from deltacat.tests.test_utils.storage import create_empty_delta
13
- from deltacat.utils.common import current_time_ms
14
-
15
-
16
- from deltacat.storage import (
17
- Delta,
18
- DeltaLocator,
19
- DeltaType,
20
- DistributedDataset,
21
- LifecycleState,
22
- ListResult,
23
- LocalDataset,
24
- LocalTable,
25
- Manifest,
26
- ManifestAuthor,
27
- Namespace,
28
- NamespaceLocator,
29
- Partition,
30
- SchemaConsistencyType,
31
- Stream,
32
- StreamLocator,
33
- Table,
34
- TableVersion,
35
- TableVersionLocator,
36
- TableLocator,
37
- CommitState,
38
- SortKey,
39
- PartitionLocator,
40
- ManifestMeta,
41
- ManifestEntry,
42
- ManifestEntryList,
43
- DeleteParameters,
44
- PartitionFilter,
45
- PartitionValues,
46
- DeltaPartitionSpec,
47
- StreamPartitionSpec,
48
- TransformName,
49
- IdentityTransformParameters,
50
- )
51
- from deltacat.types.media import (
52
- ContentType,
53
- StorageType,
54
- TableType,
55
- ContentEncoding,
56
- DistributedDatasetType,
57
- )
58
- from deltacat.utils.common import ReadKwargsProvider
59
- from deltacat.tests.local_deltacat_storage.exceptions import (
60
- InvalidNamespaceError,
61
- LocalStorageValidationError,
62
- )
63
-
64
- SQLITE_CUR_ARG = "sqlite3_cur"
65
- SQLITE_CON_ARG = "sqlite3_con"
66
- DB_FILE_PATH_ARG = "db_file_path"
67
-
68
- STORAGE_TYPE = "SQLITE3"
69
- STREAM_ID_PROPERTY = "stream_id"
70
- CREATE_NAMESPACES_TABLE = (
71
- "CREATE TABLE IF NOT EXISTS namespaces(locator, value, PRIMARY KEY (locator))"
72
- )
73
- CREATE_TABLES_TABLE = (
74
- "CREATE TABLE IF NOT EXISTS tables(locator, namespace_locator, value, PRIMARY KEY (locator), "
75
- "FOREIGN KEY (namespace_locator) REFERENCES namespaces(locator))"
76
- )
77
- CREATE_TABLE_VERSIONS_TABLE = (
78
- "CREATE TABLE IF NOT EXISTS table_versions(locator, table_locator, value, PRIMARY KEY (locator), "
79
- "FOREIGN KEY (table_locator) REFERENCES tables(locator))"
80
- )
81
- CREATE_STREAMS_TABLE = (
82
- "CREATE TABLE IF NOT EXISTS streams(locator, table_version_locator, value, PRIMARY KEY(locator), "
83
- "FOREIGN KEY (table_version_locator) REFERENCES table_versions(locator))"
84
- )
85
- CREATE_PARTITIONS_TABLE = (
86
- "CREATE TABLE IF NOT EXISTS partitions(locator, stream_locator, value, PRIMARY KEY(locator), "
87
- "FOREIGN KEY (stream_locator) REFERENCES streams(locator))"
88
- )
89
- CREATE_DELTAS_TABLE = (
90
- "CREATE TABLE IF NOT EXISTS deltas(locator, partition_locator, value, PRIMARY KEY(locator), "
91
- "FOREIGN KEY (partition_locator) REFERENCES partitions(locator))"
92
- )
93
- CREATE_DATA_TABLE = "CREATE TABLE IF NOT EXISTS data(uri, value, PRIMARY KEY(uri))"
94
-
95
-
96
- def _get_sqlite3_cursor_con(kwargs) -> Tuple[Cursor, Connection]:
97
- if SQLITE_CUR_ARG in kwargs and SQLITE_CON_ARG in kwargs:
98
- return kwargs[SQLITE_CUR_ARG], kwargs[SQLITE_CON_ARG]
99
- elif DB_FILE_PATH_ARG in kwargs:
100
- con = sqlite3.connect(kwargs[DB_FILE_PATH_ARG])
101
- cur = con.cursor()
102
- return cur, con
103
-
104
- raise ValueError(f"Invalid local db connection kwargs: {kwargs}")
105
-
106
-
107
- def _get_manifest_entry_uri(manifest_entry_id: str) -> str:
108
- return f"cloudpickle://{manifest_entry_id}"
109
-
110
-
111
- def _merge_and_promote(
112
- partition_deltas: List[Delta], previous_partition_deltas: List[Delta]
113
- ):
114
- previous_partition_deltas_spos_gt: List[Delta] = [
115
- delta
116
- for delta in previous_partition_deltas
117
- if delta.stream_position > partition_deltas[0].stream_position
118
- ]
119
- # handle the case if the previous partition deltas have a greater stream position than the partition_delta
120
- partition_deltas = previous_partition_deltas_spos_gt + partition_deltas
121
- return partition_deltas
122
-
123
-
124
- def list_namespaces(*args, **kwargs) -> ListResult[Namespace]:
125
- cur, con = _get_sqlite3_cursor_con(kwargs)
126
- res = cur.execute("SELECT * FROM namespaces")
127
- fetched = res.fetchall()
128
- result = []
129
-
130
- for item in fetched:
131
- result.append(Namespace(json.loads(item[1])))
132
-
133
- return ListResult.of(result, None, None)
134
-
135
-
136
- def list_tables(namespace: str, *args, **kwargs) -> ListResult[Table]:
137
- cur, con = _get_sqlite3_cursor_con(kwargs)
138
- params = (NamespaceLocator.of(namespace).canonical_string(),)
139
- res = cur.execute("SELECT * FROM tables WHERE namespace_locator = ?", params)
140
- fetched = res.fetchall()
141
- result = []
142
-
143
- for item in fetched:
144
- result.append(Table(json.loads(item[2])))
145
-
146
- return ListResult.of(result, None, None)
147
-
148
-
149
- def list_table_versions(
150
- namespace: str, table_name: str, *args, **kwargs
151
- ) -> ListResult[TableVersion]:
152
- cur, con = _get_sqlite3_cursor_con(kwargs)
153
- table_locator = TableLocator.of(NamespaceLocator.of(namespace), table_name)
154
-
155
- res = cur.execute(
156
- "SELECT * FROM table_versions WHERE table_locator = ?",
157
- (table_locator.canonical_string(),),
158
- )
159
- fetched = res.fetchall()
160
- result = []
161
-
162
- for item in fetched:
163
- result.append(TableVersion(json.loads(item[2])))
164
-
165
- return ListResult.of(result, None, None)
166
-
167
-
168
- def list_partitions(
169
- namespace: str,
170
- table_name: str,
171
- table_version: Optional[str] = None,
172
- *args,
173
- **kwargs,
174
- ) -> ListResult[Partition]:
175
- cur, con = _get_sqlite3_cursor_con(kwargs)
176
-
177
- stream = get_stream(namespace, table_name, table_version, *args, **kwargs)
178
-
179
- res = cur.execute(
180
- "SELECT * FROM partitions WHERE stream_locator = ?",
181
- (stream.locator.canonical_string(),),
182
- )
183
-
184
- fetched = res.fetchall()
185
- result = []
186
- for item in fetched:
187
- partition = Partition(json.loads(item[2]))
188
- if partition.state == CommitState.COMMITTED:
189
- result.append(partition)
190
-
191
- return ListResult.of(result, None, None)
192
-
193
-
194
- def list_stream_partitions(stream: Stream, *args, **kwargs) -> ListResult[Partition]:
195
- return list_partitions(
196
- stream.namespace, stream.table_name, stream.table_version, *args, **kwargs
197
- )
198
-
199
-
200
- def list_deltas(
201
- namespace: str,
202
- table_name: str,
203
- partition_values: Optional[PartitionValues] = None,
204
- table_version: Optional[str] = None,
205
- first_stream_position: Optional[int] = None,
206
- last_stream_position: Optional[int] = None,
207
- ascending_order: Optional[bool] = None,
208
- include_manifest: bool = False,
209
- partition_filter: Optional[PartitionFilter] = None,
210
- *args,
211
- **kwargs,
212
- ) -> ListResult[Delta]:
213
- stream = get_stream(namespace, table_name, table_version, *args, **kwargs)
214
- if stream is None:
215
- return ListResult.of([], None, None)
216
-
217
- if partition_values is not None and partition_filter is not None:
218
- raise ValueError(
219
- "Only one of partition_values or partition_filter must be provided"
220
- )
221
- if partition_filter is not None:
222
- partition_values = partition_filter.partition_values
223
-
224
- partition = get_partition(stream.locator, partition_values, *args, **kwargs)
225
-
226
- all_deltas = list_partition_deltas(
227
- partition,
228
- first_stream_position=first_stream_position,
229
- last_stream_position=last_stream_position,
230
- ascending_order=ascending_order,
231
- include_manifest=include_manifest,
232
- *args,
233
- **kwargs,
234
- ).all_items()
235
-
236
- result = []
237
-
238
- for delta in all_deltas:
239
- if (
240
- not first_stream_position or first_stream_position < delta.stream_position
241
- ) and (
242
- not last_stream_position or delta.stream_position <= last_stream_position
243
- ):
244
- result.append(delta)
245
-
246
- if not include_manifest:
247
- delta.manifest = None
248
-
249
- result.sort(reverse=(not ascending_order), key=lambda d: d.stream_position)
250
- return ListResult.of(result, None, None)
251
-
252
-
253
def list_partition_deltas(
    partition_like: Union[Partition, PartitionLocator],
    first_stream_position: Optional[int] = None,
    last_stream_position: Optional[int] = None,
    ascending_order: bool = False,
    include_manifest: bool = False,
    *args,
    **kwargs,
) -> ListResult[Delta]:
    """List the deltas recorded for a partition, filtered by stream position.

    Args:
        partition_like: A partition or a partition locator. ``None`` yields
            an empty result.
        first_stream_position: Inclusive lower bound; ``None`` means 0.
        last_stream_position: Inclusive upper bound; ``None`` means unbounded.
        ascending_order: Sort by ascending stream position when true,
            descending otherwise.
        include_manifest: When false, the manifest of every returned delta is
            cleared.
        **kwargs: Forwarded to ``_get_sqlite3_cursor_con`` to obtain the
            SQLite cursor.

    Returns:
        A single-page ``ListResult`` holding the matching deltas.
    """
    cursor, _ = _get_sqlite3_cursor_con(kwargs)

    if partition_like is None:
        return ListResult.of([], None, None)

    lower_bound = 0 if first_stream_position is None else first_stream_position
    upper_bound = (
        float("inf") if last_stream_position is None else last_stream_position
    )

    assert isinstance(
        partition_like, (Partition, PartitionLocator)
    ), f"Expected a Partition or PartitionLocator as an input argument but found {partition_like}"

    locator = (
        partition_like.locator
        if isinstance(partition_like, Partition)
        else partition_like
    )

    rows = cursor.execute(
        "SELECT * FROM deltas WHERE partition_locator = ?",
        (locator.canonical_string(),),
    ).fetchall()
    if not rows:
        return ListResult.of([], None, None)

    matches = []
    for row in rows:
        # The third column of each row holds the JSON-serialized delta.
        candidate = Delta(json.loads(row[2]))
        if not include_manifest:
            candidate.manifest = None
        if lower_bound <= candidate.stream_position <= upper_bound:
            matches.append(candidate)

    matches.sort(key=lambda d: d.stream_position, reverse=not ascending_order)
    return ListResult.of(matches, None, None)
308
-
309
-
310
def get_delta(
    namespace: str,
    table_name: str,
    stream_position: int,
    partition_values: Optional[PartitionValues] = None,
    table_version: Optional[str] = None,
    include_manifest: bool = False,
    partition_filter: Optional[PartitionFilter] = None,
    *args,
    **kwargs,
) -> Optional[Delta]:
    """Fetch the delta stored at a stream position of a partition.

    Args:
        namespace: Namespace of the table.
        table_name: Name of the table.
        stream_position: Stream position of the delta to fetch.
        partition_values: Values identifying the partition. Mutually
            exclusive with ``partition_filter``.
        table_version: Table version whose stream is searched.
        include_manifest: When false, the manifest of the returned delta is
            cleared.
        partition_filter: Alternative way to supply the partition values.
        **kwargs: Forwarded to ``_get_sqlite3_cursor_con`` and to the
            stream/partition lookups.

    Returns:
        The delta, or ``None`` when the stream, the partition, or the delta
        itself cannot be found.

    Raises:
        ValueError: If both ``partition_values`` and ``partition_filter`` are
            given.
    """
    cur, con = _get_sqlite3_cursor_con(kwargs)

    stream = get_stream(namespace, table_name, table_version, *args, **kwargs)

    if partition_values is not None and partition_filter is not None:
        raise ValueError(
            "Only one of partition_values or partition_filter must be provided"
        )

    if partition_filter is not None:
        partition_values = partition_filter.partition_values

    # Both lookups below are Optional; without these guards a missing stream
    # or partition surfaced as an AttributeError instead of the documented
    # "not found" result.
    if stream is None:
        return None

    partition = get_partition(stream.locator, partition_values, *args, **kwargs)
    if partition is None:
        return None

    delta_locator = DeltaLocator.of(partition.locator, stream_position)

    res = cur.execute(
        "SELECT * FROM deltas WHERE locator = ?", (delta_locator.canonical_string(),)
    )

    serialized_delta = res.fetchone()
    if serialized_delta is None:
        return None

    # The third column of the row holds the JSON-serialized delta.
    delta = Delta(json.loads(serialized_delta[2]))

    if not include_manifest:
        delta.manifest = None

    return delta
350
-
351
-
352
def get_latest_delta(
    namespace: str,
    table_name: str,
    partition_values: Optional[PartitionValues] = None,
    table_version: Optional[str] = None,
    include_manifest: bool = False,
    partition_filter: Optional[PartitionFilter] = None,
    *args,
    **kwargs,
) -> Optional[Delta]:
    """Return the delta with the highest stream position for a partition.

    Delegates to ``list_deltas`` with no stream-position bounds and
    descending order, then takes the first item.

    Returns:
        The first listed delta, or ``None`` when the listing is empty.
    """
    newest_first = list_deltas(
        namespace=namespace,
        table_name=table_name,
        partition_values=partition_values,
        table_version=table_version,
        first_stream_position=None,
        last_stream_position=None,
        ascending_order=False,
        include_manifest=include_manifest,
        partition_filter=partition_filter,
        *args,
        **kwargs,
    ).all_items()

    return newest_first[0] if newest_first else None
381
-
382
-
383
def download_delta(
    delta_like: Union[Delta, DeltaLocator],
    table_type: TableType = TableType.PYARROW,
    storage_type: StorageType = StorageType.DISTRIBUTED,
    max_parallelism: Optional[int] = None,
    columns: Optional[List[str]] = None,
    file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
    ray_options_provider: Callable[[int, Any], Dict[str, Any]] = None,
    distributed_dataset_type: DistributedDatasetType = DistributedDatasetType.RAY_DATASET,
    partition_filter: Optional[PartitionFilter] = None,
    *args,
    **kwargs,
) -> Union[LocalDataset, DistributedDataset]:  # type: ignore
    """Download every manifest entry of a delta.

    Args:
        delta_like: A delta (whose embedded manifest is used when present)
            or a delta locator (whose manifest is looked up).
        table_type: Table type each entry is read as.
        storage_type: ``DISTRIBUTED`` wraps the tables in a distributed
            dataset; any other value returns the plain list.
        max_parallelism: Accepted but not used in this block.
        columns: Optional column projection forwarded to the entry reader.
        file_reader_kwargs_provider: Forwarded to the entry reader.
        ray_options_provider: Accepted but not used in this block.
        distributed_dataset_type: Which distributed wrapper to build.
        partition_filter: When given, only entries whose meta partition
            values equal the filter's partition values are downloaded.

    Returns:
        A list of tables, or a Daft / Ray dataset built from that list.

    Raises:
        ValueError: If ``distributed_dataset_type`` is not supported.
    """
    if isinstance(delta_like, Delta) and delta_like.manifest is not None:
        manifest = Delta(delta_like).manifest
    else:
        manifest = get_delta_manifest(delta_like, *args, **kwargs)

    wanted_values: PartitionValues = (
        partition_filter.partition_values if partition_filter is not None else None
    )

    tables = []
    for position, entry in enumerate(manifest.entries):
        # Skip entries belonging to a different partition than the filter.
        if wanted_values is not None and wanted_values != entry.meta.partition_values:
            continue
        tables.append(
            download_delta_manifest_entry(
                delta_like=delta_like,
                entry_index=position,
                table_type=table_type,
                columns=columns,
                file_reader_kwargs_provider=file_reader_kwargs_provider,
                *args,
                **kwargs,
            )
        )

    if storage_type != StorageType.DISTRIBUTED:
        return tables

    if distributed_dataset_type is DistributedDatasetType.DAFT:
        return daft.from_arrow(tables)
    if distributed_dataset_type is DistributedDatasetType.RAY_DATASET:
        return ray.data.from_arrow(tables)
    raise ValueError(f"Dataset type {distributed_dataset_type} not supported!")
432
-
433
-
434
def download_delta_manifest_entry(
    delta_like: Union[Delta, DeltaLocator],
    entry_index: int,
    table_type: TableType = TableType.PYARROW,
    columns: Optional[List[str]] = None,
    file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
    *args,
    **kwargs,
) -> LocalTable:
    """Read one manifest entry of a delta from the local ``data`` table.

    Args:
        delta_like: A delta (whose embedded manifest is used when present)
            or a delta locator (whose manifest is looked up).
        entry_index: Index of the manifest entry to read.
        table_type: Table type to return.
        columns: Optional column projection (not applied when a
            ``ParquetFile`` is returned).
        file_reader_kwargs_provider: Accepted but not used in this block.
        **kwargs: Forwarded to ``_get_sqlite3_cursor_con`` and the manifest
            lookup.

    Returns:
        A ``pyarrow.parquet.ParquetFile`` for ``PYARROW_PARQUET``, a pandas
        frame for ``PANDAS``, otherwise the PyArrow table.

    Raises:
        IndexError: If ``entry_index`` is past the last manifest entry.
        ValueError: If no data row exists for the entry URI, or the entry's
            content type is not Parquet / unescaped TSV.
        NotImplementedError: If ``table_type`` is ``NUMPY``.
    """
    cur, con = _get_sqlite3_cursor_con(kwargs)
    if isinstance(delta_like, Delta) and delta_like.manifest is not None:
        manifest = Delta(delta_like).manifest
    else:
        manifest = get_delta_manifest(delta_like, *args, **kwargs)

    entry_count = len(manifest.entries)
    if entry_index >= entry_count:
        # Half-open range: the previous message advertised the entry count
        # itself as a valid index.
        raise IndexError(
            f"Manifest entry index {entry_index} does not exist. "
            f"Valid values: [0, {entry_count})"
        )

    entry = manifest.entries[entry_index]

    res = cur.execute("SELECT value FROM data WHERE uri = ?", (entry.uri,))
    serialized_data = res.fetchone()

    if serialized_data is None:
        # canonical_string() is a locator method; unwrap a Delta first so the
        # intended ValueError is raised rather than an AttributeError.
        locator = delta_like.locator if isinstance(delta_like, Delta) else delta_like
        raise ValueError(
            f"Invalid value of delta locator: {locator.canonical_string()}"
        )

    serialized_data = serialized_data[0]
    if entry.meta.content_type == ContentType.PARQUET:
        if table_type == TableType.PYARROW_PARQUET:
            table = pa.parquet.ParquetFile(io.BytesIO(serialized_data))
        else:
            table = pa.parquet.read_table(io.BytesIO(serialized_data), columns=columns)
    elif entry.meta.content_type == ContentType.UNESCAPED_TSV:
        assert (
            table_type != TableType.PYARROW_PARQUET
        ), f"uTSV table cannot be read as {table_type}"
        parse_options = pa.csv.ParseOptions(delimiter="\t")
        convert_options = pa.csv.ConvertOptions(
            null_values=[""], strings_can_be_null=True, include_columns=columns
        )
        table = pa.csv.read_csv(
            io.BytesIO(serialized_data),
            parse_options=parse_options,
            convert_options=convert_options,
        )
    else:
        raise ValueError(f"Content type: {entry.meta.content_type} not supported.")

    if table_type == TableType.NUMPY:
        raise NotImplementedError(f"Table type={table_type} not supported")
    if table_type == TableType.PANDAS:
        return table.to_pandas()

    # PYARROW, PYARROW_PARQUET and any other table type return the object as read.
    return table
496
-
497
-
498
def get_delta_manifest(
    delta_like: Union[Delta, DeltaLocator], *args, **kwargs
) -> Optional[Manifest]:
    """Look up the stored delta for ``delta_like`` and return its manifest.

    The namespace, table name, stream position, partition values and table
    version are read from ``delta_like`` and passed to ``get_delta`` with
    ``include_manifest=True``.

    Returns:
        The manifest of the stored delta, or ``None`` when no delta is found.
    """
    stored = get_delta(
        namespace=delta_like.namespace,
        table_name=delta_like.table_name,
        stream_position=delta_like.stream_position,
        partition_values=delta_like.partition_values,
        table_version=delta_like.table_version,
        include_manifest=True,
        *args,
        **kwargs,
    )
    return stored.manifest if stored else None
515
-
516
-
517
def create_namespace(
    namespace: str, permissions: Dict[str, Any], *args, **kwargs
) -> Namespace:
    """Create a namespace row, first running all table-creation statements.

    The insert uses ``INSERT OR IGNORE``, so an existing row with the same
    locator is left untouched.

    Args:
        namespace: Name of the namespace.
        permissions: Permissions stored on the namespace object.
        **kwargs: Forwarded to ``_get_sqlite3_cursor_con``.

    Returns:
        The namespace object built from the arguments.
    """
    cursor, connection = _get_sqlite3_cursor_con(kwargs)

    namespace_locator = NamespaceLocator.of(namespace)
    created = Namespace.of(namespace_locator, permissions)
    row = (namespace_locator.canonical_string(), json.dumps(created))

    schema_statements = (
        CREATE_NAMESPACES_TABLE,
        CREATE_TABLES_TABLE,
        CREATE_TABLE_VERSIONS_TABLE,
        CREATE_STREAMS_TABLE,
        CREATE_PARTITIONS_TABLE,
        CREATE_DELTAS_TABLE,
        CREATE_DATA_TABLE,
    )
    for statement in schema_statements:
        cursor.execute(statement)

    cursor.execute("INSERT OR IGNORE INTO namespaces VALUES(?, ?)", row)
    connection.commit()
    return created
534
-
535
-
536
def update_namespace(
    namespace: str,
    permissions: Optional[Dict[str, Any]] = None,
    new_namespace: Optional[str] = None,
    *args,
    **kwargs,
) -> None:
    """Overwrite the stored value of a namespace with new permissions.

    Args:
        namespace: Name of the namespace to update.
        permissions: Permissions written to the rebuilt namespace object.
        new_namespace: Must be ``None``; renaming is rejected by assertion.
        **kwargs: Forwarded to ``_get_sqlite3_cursor_con``.
    """
    assert new_namespace is None, "namespace name cannot be changed"
    cursor, connection = _get_sqlite3_cursor_con(kwargs)

    namespace_locator = NamespaceLocator.of(namespace)
    replacement = Namespace.of(namespace_locator, permissions)

    cursor.execute(
        "UPDATE namespaces SET value = ? WHERE locator = ?",
        (json.dumps(replacement), namespace_locator.canonical_string()),
    )
    connection.commit()
550
-
551
-
552
def create_table_version(
    namespace: str,
    table_name: str,
    table_version: Optional[str] = None,
    schema: Optional[Union[pa.Schema, str, bytes]] = None,
    schema_consistency: Optional[Dict[str, SchemaConsistencyType]] = None,
    partition_keys: Optional[List[Dict[str, Any]]] = None,
    primary_key_column_names: Optional[Set[str]] = None,
    sort_keys: Optional[List[SortKey]] = None,
    table_version_description: Optional[str] = None,
    table_version_properties: Optional[Dict[str, str]] = None,
    table_permissions: Optional[Dict[str, Any]] = None,
    table_description: Optional[str] = None,
    table_properties: Optional[Dict[str, str]] = None,
    supported_content_types: Optional[List[ContentType]] = None,
    partition_spec: Optional[StreamPartitionSpec] = None,
    *args,
    **kwargs,
) -> Stream:
    """Create a table version together with its table row and a committed stream.

    Args:
        namespace: Namespace of the table.
        table_name: Name of the table.
        table_version: Version to create. When given and a previous version
            exists, it must be exactly the previous version plus one. When
            omitted, the next version (or ``"1"``) is assigned.
        schema: Schema stored on the table version.
        schema_consistency: Accepted but not used in this block.
        partition_keys: Partition keys; mutually exclusive with
            ``partition_spec``.
        primary_key_column_names: Primary key columns of the table version.
        sort_keys: Sort keys of the table version.
        table_version_description: Description of the table version.
        table_version_properties: Extra properties; the generated stream id
            is added under ``STREAM_ID_PROPERTY``.
        table_permissions: Permissions stored on the table.
        table_description: Description stored on the table.
        table_properties: Properties stored on the table.
        supported_content_types: Content types of the table version.
        partition_spec: Identity-transform partition spec, converted to
            partition keys.
        **kwargs: Forwarded to ``_get_sqlite3_cursor_con`` and the version
            lookup.

    Returns:
        The committed stream created for the new table version.

    Raises:
        ValueError: If both ``partition_keys`` and ``partition_spec`` are
            given.
        AssertionError: If ``table_version`` is not the previous version plus
            one, or the partition spec is unsupported.
    """
    cur, con = _get_sqlite3_cursor_con(kwargs)

    if partition_keys is not None and partition_spec is not None:
        raise ValueError(
            "Only one of partition_keys or partition_spec must be provided"
        )
    if partition_spec is not None:
        assert (
            partition_spec.ordered_transforms is not None
        ), "Ordered transforms must be specified when partition_spec is specified"
        partition_keys = []
        for transform in partition_spec.ordered_transforms:
            assert transform.name == TransformName.IDENTITY, (
                "Local DeltaCAT storage does not support creating table versions "
                "with non identity transform partition spec"
            )
            transform_params: IdentityTransformParameters = transform.parameters
            partition_keys.append(transform_params.column_name)

    latest_version = get_latest_table_version(namespace, table_name, *args, **kwargs)
    if (
        table_version is not None
        and latest_version
        and int(latest_version.table_version) + 1 != int(table_version)
    ):
        raise AssertionError(
            f"Table version can only be incremented. Last version={latest_version.table_version}"
        )
    elif table_version is None:
        # Always assign a string: previously the incremented version was an
        # int while the first version was "1", and get_stream rejects integer
        # table versions.
        table_version = (
            str(int(latest_version.table_version) + 1) if latest_version else "1"
        )

    table_locator = TableLocator.of(NamespaceLocator.of(namespace), table_name)
    table_obj = Table.of(
        table_locator, table_permissions, table_description, table_properties
    )
    table_version_locator = TableVersionLocator.of(
        table_locator=table_locator, table_version=table_version
    )

    stream_id = str(uuid.uuid4())

    if table_version_properties is None:
        table_version_properties = {}

    # The stream id is recorded on the table version so that get_stream can
    # find the stream from the table version alone.
    properties = {**table_version_properties, STREAM_ID_PROPERTY: stream_id}
    table_version_obj = TableVersion.of(
        table_version_locator,
        schema=schema,
        partition_keys=partition_keys,
        primary_key_columns=primary_key_column_names,
        description=table_version_description,
        properties=properties,
        sort_keys=sort_keys,
        content_types=supported_content_types,
    )
    stream_locator = StreamLocator.of(
        table_version_obj.locator, stream_id=stream_id, storage_type=STORAGE_TYPE
    )
    result_stream = Stream.of(
        stream_locator, partition_keys=partition_keys, state=CommitState.COMMITTED
    )

    params = (
        table_locator.canonical_string(),
        table_locator.namespace_locator.canonical_string(),
        json.dumps(table_obj),
    )
    cur.execute("INSERT OR IGNORE INTO tables VALUES (?, ?, ?)", params)
    params = (
        table_version_locator.canonical_string(),
        table_locator.canonical_string(),
        json.dumps(table_version_obj),
    )
    cur.execute("INSERT OR IGNORE INTO table_versions VALUES (?, ?, ?)", params)

    params = (
        stream_locator.canonical_string(),
        table_version_locator.canonical_string(),
        json.dumps(result_stream),
    )
    cur.execute("INSERT OR IGNORE INTO streams VALUES (?, ?, ?)", params)
    con.commit()
    return result_stream
656
-
657
-
658
def update_table(
    namespace: str,
    table_name: str,
    permissions: Optional[Dict[str, Any]] = None,
    description: Optional[str] = None,
    properties: Optional[Dict[str, str]] = None,
    new_table_name: Optional[str] = None,
    *args,
    **kwargs,
) -> None:
    """Replace the stored table row with one rebuilt from the arguments.

    The existing row is deleted and a fresh one inserted in its place.

    Args:
        namespace: Namespace of the table.
        table_name: Name of the table.
        permissions: Permissions written to the rebuilt table object.
        description: Description written to the rebuilt table object.
        properties: Properties written to the rebuilt table object.
        new_table_name: Accepted but not used in this block.
        **kwargs: Forwarded to ``_get_sqlite3_cursor_con``.
    """
    cursor, connection = _get_sqlite3_cursor_con(kwargs)

    locator = TableLocator.of(NamespaceLocator.of(namespace), table_name)
    rebuilt = Table.of(locator, permissions, description, properties)

    cursor.execute(
        "DELETE FROM tables WHERE locator = ?", (locator.canonical_string(),)
    )
    cursor.execute(
        "INSERT INTO tables VALUES (?, ?, ?)",
        (
            locator.canonical_string(),
            locator.namespace_locator.canonical_string(),
            json.dumps(rebuilt),
        ),
    )
    connection.commit()
681
-
682
-
683
def update_table_version(
    namespace: str,
    table_name: str,
    table_version: str,
    lifecycle_state: Optional[LifecycleState] = None,
    schema: Optional[Union[pa.Schema, str, bytes]] = None,
    schema_consistency: Optional[Dict[str, SchemaConsistencyType]] = None,
    description: Optional[str] = None,
    properties: Optional[Dict[str, str]] = None,
    *args,
    **kwargs,
) -> None:
    """Rewrite a stored table version with a new schema, description and properties.

    Partition keys, primary keys, sort keys and content types are carried
    over from the stored table version. ``schema`` and ``description`` are
    written as passed, including ``None``.

    Args:
        namespace: Namespace of the table.
        table_name: Name of the table.
        table_version: Version to update; it must already exist.
        lifecycle_state: Accepted but not used in this block.
        schema: Schema written to the table version.
        schema_consistency: Accepted but not used in this block.
        description: Description written to the table version.
        properties: Properties merged with the stored ones.
        **kwargs: Forwarded to ``_get_sqlite3_cursor_con``.

    Raises:
        AssertionError: If the table version does not exist.
    """
    cursor, connection = _get_sqlite3_cursor_con(kwargs)

    table_locator = TableLocator.of(NamespaceLocator.of(namespace), table_name)
    table_version_locator = TableVersionLocator.of(
        table_locator=table_locator, table_version=table_version
    )

    stored_row = cursor.execute(
        "SELECT * from table_versions WHERE locator = ?",
        (table_version_locator.canonical_string(),),
    ).fetchone()
    assert (
        stored_row is not None
    ), f"Table version not found with locator={table_version_locator.canonical_string()}"
    stored_version = TableVersion(json.loads(stored_row[2]))

    incoming_props = {} if properties is None else properties
    stored_props = stored_version.properties if stored_version.properties else {}

    # Stored properties are unpacked last, so on a key collision the stored
    # value wins over the incoming one.
    # NOTE(review): this keeps the stream id property intact but also means
    # existing keys cannot be changed here - confirm that is intended.
    merged_props = {**incoming_props, **stored_props}

    replacement = TableVersion.of(
        table_version_locator,
        schema=schema,
        partition_keys=stored_version.partition_keys,
        primary_key_columns=stored_version.primary_keys,
        description=description,
        properties=merged_props,
        sort_keys=stored_version.sort_keys,
        content_types=stored_version.content_types,
    )

    cursor.execute(
        "UPDATE table_versions SET table_locator = ?, value = ? WHERE locator = ?",
        (
            table_locator.canonical_string(),
            json.dumps(replacement),
            table_version_locator.canonical_string(),
        ),
    )
    connection.commit()
742
-
743
-
744
def stage_stream(
    namespace: str,
    table_name: str,
    table_version: Optional[str] = None,
    *args,
    **kwargs,
) -> Stream:
    """Insert a new staged stream for a table version.

    The new stream gets a fresh UUID, copies the partition keys of the
    table version's current stream, and records that stream's canonical
    locator string as its previous stream digest.

    Returns:
        The newly staged stream.
    """
    cursor, connection = _get_sqlite3_cursor_con(kwargs)

    table_version_obj = get_table_version(
        namespace, table_name, table_version, *args, **kwargs
    )
    current_stream = get_stream(namespace, table_name, table_version, *args, **kwargs)

    staged_locator = StreamLocator.of(
        table_version_obj.locator, str(uuid.uuid4()), STORAGE_TYPE
    )
    staged = Stream.of(
        staged_locator,
        current_stream.partition_keys,
        CommitState.STAGED,
        current_stream.locator.canonical_string(),
    )

    cursor.execute(
        "INSERT INTO streams VALUES (?, ?, ?)",
        (
            staged_locator.canonical_string(),
            table_version_obj.locator.canonical_string(),
            json.dumps(staged),
        ),
    )
    connection.commit()

    return staged
778
-
779
-
780
def commit_stream(stream: Stream, *args, **kwargs) -> Stream:
    """Mark a stream as committed and point its table version at it.

    A committed copy of ``stream`` is built, its stream id is written to the
    table version under ``STREAM_ID_PROPERTY``, and both rows are updated.

    Returns:
        The committed copy of the stream.
    """
    cursor, connection = _get_sqlite3_cursor_con(kwargs)

    table_version_obj = get_table_version(
        stream.namespace, stream.table_name, stream.table_version, *args, **kwargs
    )
    committed = Stream.of(
        stream.locator,
        stream.partition_keys,
        CommitState.COMMITTED,
        stream.previous_stream_digest,
    )

    # get_stream resolves the active stream through this property.
    table_version_obj.properties[STREAM_ID_PROPERTY] = committed.locator.stream_id

    cursor.execute(
        "UPDATE table_versions SET value = ? WHERE locator = ?",
        (json.dumps(table_version_obj), table_version_obj.locator.canonical_string()),
    )
    cursor.execute(
        "UPDATE streams SET value = ? WHERE locator = ?",
        (json.dumps(committed), committed.locator.canonical_string()),
    )
    connection.commit()

    return committed
807
-
808
-
809
def delete_stream(
    namespace: str,
    table_name: str,
    table_version: Optional[str] = None,
    *args,
    **kwargs,
) -> None:
    """Delete every stream of a table version, and the table version row itself.

    Args:
        namespace: Namespace of the table.
        table_name: Name of the table.
        table_version: Version whose streams are removed.
        **kwargs: Forwarded to ``_get_sqlite3_cursor_con``.
    """
    cursor, connection = _get_sqlite3_cursor_con(kwargs)

    version_key = TableVersionLocator.of(
        TableLocator.of(NamespaceLocator.of(namespace), table_name), table_version
    ).canonical_string()

    # Each fetched row is a 1-tuple, which is the parameter shape
    # executemany expects.
    stream_rows = cursor.execute(
        "SELECT locator FROM streams WHERE table_version_locator = ?",
        (version_key,),
    ).fetchall()
    cursor.executemany("DELETE FROM streams WHERE locator = ?", stream_rows)
    cursor.execute("DELETE FROM table_versions WHERE locator = ?", (version_key,))

    connection.commit()
834
-
835
-
836
def stage_partition(
    stream: Stream, partition_values: Optional[PartitionValues] = None, *args, **kwargs
) -> Partition:
    """Insert a new staged partition into a stream.

    The partition gets a fresh UUID, the schema and content types of the
    stream's table version, and the current time in milliseconds as its
    stream position. If a committed partition with the same values exists,
    its stream position and id are recorded as the previous ones.

    Returns:
        The newly staged partition.
    """
    cursor, connection = _get_sqlite3_cursor_con(kwargs)

    staged_locator = PartitionLocator.of(
        stream.locator,
        partition_values=partition_values,
        partition_id=str(uuid.uuid4()),
    )

    table_version_obj = get_table_version(
        stream.namespace, stream.table_name, stream.table_version, *args, **kwargs
    )
    predecessor = get_partition(
        stream.locator, partition_values=partition_values, *args, **kwargs
    )

    if predecessor:
        prior_position = predecessor.stream_position
        prior_id = predecessor.partition_id
    else:
        prior_position = None
        prior_id = None

    staged = Partition.of(
        staged_locator,
        schema=table_version_obj.schema,
        content_types=table_version_obj.content_types,
        state=CommitState.STAGED,
        previous_stream_position=prior_position,
        previous_partition_id=prior_id,
        stream_position=current_time_ms(),
    )

    cursor.execute(
        "INSERT INTO partitions VALUES (?, ?, ?)",
        (
            staged.locator.canonical_string(),
            staged.stream_locator.canonical_string(),
            json.dumps(staged),
        ),
    )
    connection.commit()

    return staged
873
-
874
-
875
def commit_partition(
    partition: Partition,
    previous_partition: Optional[Partition] = None,
    *args,
    **kwargs,
) -> Partition:
    """Commit a staged partition, deprecating the partition it replaces.

    Args:
        partition: The staged partition to commit.
        previous_partition: Partition to deprecate. When omitted, the
            committed partition with the same values in the same stream is
            looked up. When given, the deltas of both partitions are combined
            through ``_merge_and_promote``.
        **kwargs: Forwarded to ``_get_sqlite3_cursor_con`` and the lookups.

    Returns:
        ``partition``, updated in place and marked committed.
    """
    cursor, connection = _get_sqlite3_cursor_con(kwargs)
    update_sql = "UPDATE partitions SET value = ? WHERE locator = ?"

    predecessor: Optional[Partition] = previous_partition or get_partition(
        partition.stream_locator,
        partition_values=partition.partition_values,
        *args,
        **kwargs,
    )

    # Deprecate the old partition before committing the new one.
    if predecessor:
        predecessor.state = CommitState.DEPRECATED
        cursor.execute(
            update_sql,
            (json.dumps(predecessor), predecessor.locator.canonical_string()),
        )
        predecessor_deltas = (
            list_partition_deltas(
                predecessor, ascending_order=False, *args, **kwargs
            ).all_items()
            or []
        )

    committed_deltas: Optional[List[Delta]] = (
        list_partition_deltas(
            partition, ascending_order=False, *args, **kwargs
        ).all_items()
        or []
    )

    # An explicit previous partition triggers merge-and-promote; a truthy
    # previous_partition is also the predecessor, so predecessor_deltas is
    # bound on this path.
    if previous_partition:
        committed_deltas = _merge_and_promote(committed_deltas, predecessor_deltas)

    if committed_deltas:
        newest_delta = committed_deltas[0]
        partition.stream_position = newest_delta.stream_position
        partition.locator = newest_delta.partition_locator
    else:
        # Written back explicitly, as before, so the field is always assigned.
        partition.stream_position = partition.stream_position

    partition.state = CommitState.COMMITTED
    partition.previous_stream_position = (
        predecessor.stream_position if predecessor else None
    )

    cursor.execute(
        update_sql, (json.dumps(partition), partition.locator.canonical_string())
    )
    connection.commit()

    return partition
932
-
933
-
934
def delete_partition(
    namespace: str,
    table_name: str,
    table_version: Optional[str] = None,
    partition_values: Optional[PartitionValues] = None,
    *args,
    **kwargs,
) -> None:
    """Mark the committed partition with the given values as deprecated.

    The row is kept; only its stored state changes.

    Args:
        namespace: Namespace of the table.
        table_name: Name of the table.
        table_version: Table version whose stream holds the partition.
        partition_values: Values identifying the partition.
        **kwargs: Forwarded to ``_get_sqlite3_cursor_con`` and the lookups.
    """
    cursor, connection = _get_sqlite3_cursor_con(kwargs)

    owning_stream = get_stream(namespace, table_name, table_version, *args, **kwargs)
    target = get_partition(owning_stream.locator, partition_values, *args, **kwargs)
    target.state = CommitState.DEPRECATED

    cursor.execute(
        "UPDATE partitions SET value = ? WHERE locator = ?",
        (json.dumps(target), target.locator.canonical_string()),
    )
    connection.commit()
951
-
952
-
953
def get_partition(
    stream_locator: StreamLocator,
    partition_values: Optional[PartitionValues] = None,
    *args,
    **kwargs,
) -> Optional[Partition]:
    """Return the committed partition of a stream with the given values.

    Args:
        stream_locator: Locator of the stream to search.
        partition_values: Values identifying the partition; ``None`` is
            treated as an empty list (the unpartitioned case).
        **kwargs: Forwarded to ``_get_sqlite3_cursor_con``.

    Returns:
        The first stored partition whose values equal ``partition_values``
        and whose state is ``COMMITTED``, or ``None`` if there is none.
    """
    cur, con = _get_sqlite3_cursor_con(kwargs)

    res = cur.execute(
        "SELECT * FROM partitions WHERE stream_locator = ?",
        (stream_locator.canonical_string(),),
    )

    serialized_partitions = res.fetchall()

    if not serialized_partitions:
        return None

    # Compare value lists element-wise. Joining them into a comma-separated
    # string raised TypeError for non-string values and could not tell
    # ["a,b"] apart from ["a", "b"].
    wanted_values = list(partition_values) if partition_values is not None else []

    for item in serialized_partitions:
        # The third column of the row holds the JSON-serialized partition.
        partition = Partition(json.loads(item[2]))
        stored_values = list(partition.partition_values or [])

        if stored_values == wanted_values and partition.state == CommitState.COMMITTED:
            return partition

    return None
984
-
985
-
986
def stage_delta(
    data: Union[LocalTable, LocalDataset, DistributedDataset, Manifest],
    partition: Partition,
    delta_type: DeltaType = DeltaType.UPSERT,
    max_records_per_entry: Optional[int] = None,
    author: Optional[ManifestAuthor] = None,
    properties: Optional[Dict[str, str]] = None,
    s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
    content_type: ContentType = ContentType.PARQUET,
    delete_parameters: Optional[DeleteParameters] = None,
    partition_spec: Optional[DeltaPartitionSpec] = None,
    partition_values: Optional[PartitionValues] = None,
    *args,
    **kwargs,
) -> Delta:
    """Serialize ``data`` into the local store and record a staged delta.

    Args:
        data: Table to write. ``None`` stages an empty delta with a null
            data payload.
        partition: Partition the delta belongs to.
        delta_type: Type of the delta.
        max_records_per_entry: Accepted but not used in this block.
        author: Manifest author.
        properties: Properties stored on the delta.
        s3_table_writer_kwargs: Accepted but not used in this block.
        content_type: ``PARQUET`` or ``UNESCAPED_TSV``.
        delete_parameters: Delete parameters stored on the delta.
        partition_spec: When truthy, ``partition_values`` must be provided.
        partition_values: Partition values recorded in the manifest meta.
        **kwargs: Forwarded to ``_get_sqlite3_cursor_con``.

    Returns:
        The staged delta, with a single-entry manifest when data was given.

    Raises:
        ValueError: If ``content_type`` is not supported.
    """
    cursor, connection = _get_sqlite3_cursor_con(kwargs)
    manifest_id = str(uuid.uuid4())
    uri = _get_manifest_entry_uri(manifest_id)
    insert_data_sql = "INSERT OR IGNORE INTO data VALUES (?, ?)"
    insert_delta_sql = "INSERT OR IGNORE INTO deltas VALUES (?, ?, ?)"

    if data is None:
        empty_delta = create_empty_delta(
            partition,
            delta_type,
            author,
            properties=properties,
            manifest_entry_id=manifest_id,
        )
        cursor.execute(insert_data_sql, (uri, None))
        cursor.execute(
            insert_delta_sql,
            (
                empty_delta.locator.canonical_string(),
                "staged_delta",
                json.dumps(empty_delta),
            ),
        )
        connection.commit()
        return empty_delta

    if partition_spec:
        assert partition_values is not None, (
            "partition_values must be provided as local "
            "storage does not support computing it from input data"
        )

    if content_type == ContentType.PARQUET:
        sink = io.BytesIO()
        pa.parquet.write_table(data, sink)
    elif content_type == ContentType.UNESCAPED_TSV:
        sink = io.BytesIO()
        tsv_options = pa.csv.WriteOptions(
            include_header=True, delimiter="\t", quoting_style="none"
        )
        pa.csv.write_csv(data, sink, write_options=tsv_options)
    else:
        raise ValueError(f"Unsupported content type: {content_type}")
    payload = sink.getvalue()

    staged_locator = DeltaLocator.of(
        partition.locator, stream_position=current_time_ms()
    )

    entry_meta = ManifestMeta.of(
        len(data),
        len(payload),
        content_type=content_type,
        content_encoding=ContentEncoding.IDENTITY,
        source_content_length=data.nbytes,
        partition_values=partition_values,
    )

    single_entry = ManifestEntry.of(
        uri=uri, url=uri, meta=entry_meta, mandatory=True, uuid=manifest_id
    )
    staged_manifest = Manifest.of(
        entries=ManifestEntryList.of([single_entry]),
        author=author,
        uuid=manifest_id,
    )

    staged = Delta.of(
        staged_locator,
        delta_type=delta_type,
        meta=entry_meta,
        properties=properties,
        manifest=staged_manifest,
        previous_stream_position=partition.stream_position,
        delete_parameters=delete_parameters,
    )

    cursor.execute(insert_data_sql, (uri, payload))
    # "staged_delta" fills the partition_locator column until commit_delta
    # overwrites it.
    cursor.execute(
        insert_delta_sql,
        (staged_locator.canonical_string(), "staged_delta", json.dumps(staged)),
    )

    connection.commit()
    return staged
1082
-
1083
-
1084
def commit_delta(delta: Delta, *args, **kwargs) -> Delta:
    """Persist a delta under its partition locator.

    If the delta has no (truthy) stream position, the current time in
    milliseconds is assigned. The row is inserted if missing, then updated
    so a previously staged row gets the real partition locator and value.

    Returns:
        The same delta, with its stream position set.
    """
    cursor, connection = _get_sqlite3_cursor_con(kwargs)

    existing_position: Optional[int] = delta.stream_position
    delta.locator.stream_position = existing_position or current_time_ms()

    delta_key = delta.locator.canonical_string()
    partition_key = delta.partition_locator.canonical_string()
    serialized = json.dumps(delta)

    cursor.execute(
        "INSERT OR IGNORE INTO deltas VALUES (?, ?, ?)",
        (delta_key, partition_key, serialized),
    )
    cursor.execute(
        "UPDATE deltas SET partition_locator = ?, value = ? WHERE locator = ?",
        (partition_key, serialized, delta_key),
    )
    connection.commit()
    return delta
1107
-
1108
-
1109
def get_namespace(namespace: str, *args, **kwargs) -> Optional[Namespace]:
    """Load a namespace by name.

    Returns:
        The stored namespace, or ``None`` when no row matches its locator.
    """
    cursor, _ = _get_sqlite3_cursor_con(kwargs)
    key = NamespaceLocator.of(namespace).canonical_string()

    row = cursor.execute(
        "SELECT * FROM namespaces WHERE locator = ?", (key,)
    ).fetchone()

    # The second column of the row holds the JSON-serialized namespace.
    return None if row is None else Namespace(json.loads(row[1]))
1122
-
1123
-
1124
def namespace_exists(namespace: str, *args, **kwargs) -> bool:
    """Return whether ``get_namespace`` finds a namespace with this name."""
    return get_namespace(namespace, *args, **kwargs) is not None
1128
-
1129
-
1130
def get_table(namespace: str, table_name: str, *args, **kwargs) -> Optional[Table]:
    """Load a table by namespace and name.

    Returns:
        The stored table, or ``None`` when no row matches its locator.
    """
    cursor, _ = _get_sqlite3_cursor_con(kwargs)
    key = TableLocator.of(NamespaceLocator.of(namespace), table_name).canonical_string()

    row = cursor.execute("SELECT * FROM tables WHERE locator = ?", (key,)).fetchone()

    # The third column of the row holds the JSON-serialized table.
    return None if row is None else Table(json.loads(row[2]))
1143
-
1144
-
1145
def table_exists(namespace: str, table_name: str, *args, **kwargs) -> bool:
    """Return whether ``get_table`` finds a table with this namespace and name."""
    return get_table(namespace, table_name, *args, **kwargs) is not None
1149
-
1150
-
1151
def get_table_version(
    namespace: str, table_name: str, table_version: str, *args, **kwargs
) -> Optional[TableVersion]:
    """Load a table version by namespace, table name and version.

    Returns:
        The stored table version, or ``None`` when no row matches its locator.
    """
    cursor, _ = _get_sqlite3_cursor_con(kwargs)
    key = TableVersionLocator.of(
        TableLocator.of(NamespaceLocator.of(namespace), table_name), table_version
    ).canonical_string()

    row = cursor.execute(
        "SELECT * FROM table_versions WHERE locator = ?", (key,)
    ).fetchone()

    # The third column of the row holds the JSON-serialized table version.
    return None if row is None else TableVersion(json.loads(row[2]))
1168
-
1169
-
1170
def get_latest_table_version(
    namespace: str, table_name: str, *args, **kwargs
) -> Optional[TableVersion]:
    """Return the table version with the numerically highest version.

    Versions are compared as integers, so "10" ranks above "9".

    Returns:
        The highest table version, or ``None`` when the table has none.
    """
    candidates = list_table_versions(namespace, table_name, *args, **kwargs).all_items()
    if not candidates:
        return None

    # max() returns the first maximal item, which is the same element a
    # stable descending sort would place first.
    return max(candidates, key=lambda version: int(version.table_version))
1181
-
1182
-
1183
def get_latest_active_table_version(
    namespace: str, table_name: str, *args, **kwargs
) -> Optional[TableVersion]:
    """Return the latest table version.

    This module does not support table version lifecycle state, so the
    latest version is returned without any "active" filtering.
    """
    latest = get_latest_table_version(namespace, table_name, *args, **kwargs)
    return latest
1189
-
1190
-
1191
def get_table_version_schema(
    namespace: str,
    table_name: str,
    table_version: Optional[str] = None,
    *args,
    **kwargs,
) -> Optional[Union[pa.Schema, str, bytes]]:
    """Return the schema stored on a table version.

    Returns:
        The schema, or ``None`` when the table version does not exist or has
        no schema.
    """
    obj = get_table_version(namespace, table_name, table_version, *args, **kwargs)

    # get_table_version is Optional; a missing version used to surface as an
    # AttributeError instead of the declared Optional result.
    if obj is None:
        return None

    return obj.schema
1201
-
1202
-
1203
def table_version_exists(
    namespace: str, table_name: str, table_version: str, *args, **kwargs
) -> bool:
    """Return whether ``get_table_version`` finds the given table version."""
    return (
        get_table_version(namespace, table_name, table_version, *args, **kwargs)
        is not None
    )
1209
-
1210
-
1211
def get_stream(
    namespace: str,
    table_name: str,
    table_version: Optional[str] = None,
    *args,
    **kwargs,
) -> Optional[Stream]:
    """Load the stream referenced by a table version.

    The stream id is read from the table version's ``STREAM_ID_PROPERTY``
    property and used to build the stream locator.

    Returns:
        The stored stream, or ``None`` when the table version, its stream id
        property, or the stream row is missing.

    Raises:
        AssertionError: If ``table_version`` is an integer.
    """
    assert not isinstance(table_version, int), "Passed an integer as the table version"

    table_version_obj = get_table_version(
        namespace, table_name, table_version, *args, **kwargs
    )
    if table_version_obj is None:
        return None

    active_stream_id = table_version_obj.properties.get(STREAM_ID_PROPERTY)
    if active_stream_id is None:
        return None

    cursor, _ = _get_sqlite3_cursor_con(kwargs)
    key = StreamLocator.of(
        table_version_obj.locator, stream_id=active_stream_id, storage_type=STORAGE_TYPE
    ).canonical_string()

    row = cursor.execute("SELECT * FROM streams WHERE locator = ?", (key,)).fetchone()

    # The third column of the row holds the JSON-serialized stream.
    return None if row is None else Stream(json.loads(row[2]))
1241
-
1242
-
1243
def get_table_version_column_names(
    namespace: str,
    table_name: str,
    table_version: Optional[str] = None,
    *args,
    **kwargs,
) -> Optional[List[str]]:
    """Not supported by this storage implementation.

    Raises:
        NotImplementedError: Always.
    """
    unsupported = NotImplementedError("Fetching column names is not supported")
    raise unsupported
1251
-
1252
-
1253
def can_categorize(e: BaseException, **kwargs) -> bool:
    """Return whether ``e`` is an ``InvalidNamespaceError``."""
    return isinstance(e, InvalidNamespaceError)
1258
-
1259
-
1260
- def raise_categorized_error(e: BaseException, **kwargs):
1261
- if isinstance(e, InvalidNamespaceError):
1262
- raise LocalStorageValidationError("Namespace provided is invalid!")