deltacat 1.1.38__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (367) hide show
  1. deltacat/__init__.py +150 -12
  2. deltacat/annotations.py +36 -0
  3. deltacat/api.py +578 -0
  4. deltacat/aws/constants.py +0 -23
  5. deltacat/aws/s3u.py +4 -631
  6. deltacat/benchmarking/benchmark_engine.py +84 -0
  7. deltacat/benchmarking/benchmark_report.py +86 -0
  8. deltacat/benchmarking/benchmark_suite.py +11 -0
  9. deltacat/benchmarking/conftest.py +22 -19
  10. deltacat/benchmarking/data/random_row_generator.py +94 -0
  11. deltacat/benchmarking/data/row_generator.py +10 -0
  12. deltacat/benchmarking/test_benchmark_pipeline.py +108 -0
  13. deltacat/catalog/__init__.py +73 -0
  14. deltacat/catalog/delegate.py +615 -140
  15. deltacat/catalog/interface.py +404 -81
  16. deltacat/catalog/main/impl.py +2882 -0
  17. deltacat/catalog/model/catalog.py +348 -46
  18. deltacat/catalog/model/properties.py +155 -0
  19. deltacat/catalog/model/table_definition.py +32 -1
  20. deltacat/compute/__init__.py +14 -0
  21. deltacat/compute/compactor/compaction_session.py +97 -75
  22. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  23. deltacat/compute/compactor/model/compaction_session_audit_info.py +23 -30
  24. deltacat/compute/compactor/model/delta_annotated.py +3 -3
  25. deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
  26. deltacat/compute/compactor/model/delta_file_locator.py +3 -1
  27. deltacat/compute/compactor/model/round_completion_info.py +19 -9
  28. deltacat/compute/compactor/model/table_object_store.py +3 -2
  29. deltacat/compute/compactor/repartition_session.py +9 -22
  30. deltacat/compute/compactor/steps/dedupe.py +11 -4
  31. deltacat/compute/compactor/steps/hash_bucket.py +6 -6
  32. deltacat/compute/compactor/steps/materialize.py +15 -9
  33. deltacat/compute/compactor/steps/repartition.py +12 -11
  34. deltacat/compute/compactor/utils/io.py +7 -6
  35. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  36. deltacat/compute/compactor/utils/sort_key.py +9 -2
  37. deltacat/compute/compactor/utils/system_columns.py +3 -1
  38. deltacat/compute/compactor_v2/compaction_session.py +13 -14
  39. deltacat/compute/compactor_v2/deletes/utils.py +3 -3
  40. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  41. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  42. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  43. deltacat/compute/compactor_v2/model/merge_input.py +28 -9
  44. deltacat/compute/compactor_v2/private/compaction_utils.py +171 -73
  45. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  46. deltacat/compute/compactor_v2/steps/merge.py +156 -53
  47. deltacat/compute/compactor_v2/utils/content_type_params.py +17 -6
  48. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  49. deltacat/compute/compactor_v2/utils/io.py +10 -3
  50. deltacat/compute/compactor_v2/utils/merge.py +14 -2
  51. deltacat/compute/compactor_v2/utils/task_options.py +2 -10
  52. deltacat/compute/converter/constants.py +9 -0
  53. deltacat/compute/converter/converter_session.py +298 -0
  54. deltacat/compute/converter/model/convert_input.py +96 -0
  55. deltacat/compute/converter/model/convert_input_files.py +78 -0
  56. deltacat/compute/converter/model/convert_result.py +80 -0
  57. deltacat/compute/converter/model/converter_session_params.py +144 -0
  58. deltacat/compute/converter/pyiceberg/catalog.py +78 -0
  59. deltacat/compute/converter/pyiceberg/overrides.py +263 -0
  60. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +299 -0
  61. deltacat/compute/converter/steps/convert.py +366 -0
  62. deltacat/compute/converter/steps/dedupe.py +94 -0
  63. deltacat/compute/converter/utils/__init__.py +0 -0
  64. deltacat/compute/converter/utils/convert_task_options.py +132 -0
  65. deltacat/compute/converter/utils/converter_session_utils.py +175 -0
  66. deltacat/compute/converter/utils/iceberg_columns.py +87 -0
  67. deltacat/compute/converter/utils/io.py +203 -0
  68. deltacat/compute/converter/utils/s3u.py +148 -0
  69. deltacat/compute/janitor.py +205 -0
  70. deltacat/compute/jobs/__init__.py +0 -0
  71. deltacat/compute/jobs/client.py +417 -0
  72. deltacat/compute/resource_estimation/delta.py +11 -1
  73. deltacat/constants.py +90 -1
  74. deltacat/docs/__init__.py +0 -0
  75. deltacat/docs/autogen/__init__.py +0 -0
  76. deltacat/docs/autogen/schema/__init__.py +0 -0
  77. deltacat/docs/autogen/schema/inference/__init__.py +0 -0
  78. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  79. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  80. deltacat/env.py +61 -0
  81. deltacat/examples/__init__.py +0 -0
  82. deltacat/examples/basic_logging.py +101 -0
  83. deltacat/examples/compactor/__init__.py +0 -0
  84. deltacat/examples/compactor/aws/__init__.py +1 -0
  85. deltacat/examples/compactor/bootstrap.py +863 -0
  86. deltacat/examples/compactor/compactor.py +373 -0
  87. deltacat/examples/compactor/explorer.py +473 -0
  88. deltacat/examples/compactor/gcp/__init__.py +1 -0
  89. deltacat/examples/compactor/job_runner.py +439 -0
  90. deltacat/examples/compactor/utils/__init__.py +1 -0
  91. deltacat/examples/compactor/utils/common.py +261 -0
  92. deltacat/examples/experimental/__init__.py +0 -0
  93. deltacat/examples/experimental/iceberg/__init__.py +0 -0
  94. deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
  95. deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
  96. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  97. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  98. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  99. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  100. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  101. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  102. deltacat/examples/experimental/iceberg/iceberg_bucket_writer.py +184 -0
  103. deltacat/examples/experimental/iceberg/iceberg_reader.py +147 -0
  104. deltacat/examples/hello_world.py +29 -0
  105. deltacat/examples/indexer/__init__.py +0 -0
  106. deltacat/examples/indexer/aws/__init__.py +0 -0
  107. deltacat/examples/indexer/gcp/__init__.py +0 -0
  108. deltacat/examples/indexer/indexer.py +163 -0
  109. deltacat/examples/indexer/job_runner.py +198 -0
  110. deltacat/exceptions.py +116 -12
  111. deltacat/experimental/__init__.py +0 -0
  112. deltacat/experimental/catalog/__init__.py +0 -0
  113. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  114. deltacat/experimental/catalog/iceberg/iceberg_catalog_config.py +26 -0
  115. deltacat/experimental/catalog/iceberg/impl.py +399 -0
  116. deltacat/experimental/catalog/iceberg/overrides.py +72 -0
  117. deltacat/experimental/compatibility/__init__.py +0 -0
  118. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  119. deltacat/experimental/converter_agent/__init__.py +0 -0
  120. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  121. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  122. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  123. deltacat/experimental/daft/__init__.py +4 -0
  124. deltacat/experimental/daft/daft_catalog.py +229 -0
  125. deltacat/experimental/storage/__init__.py +0 -0
  126. deltacat/experimental/storage/iceberg/__init__.py +0 -0
  127. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  128. deltacat/experimental/storage/iceberg/impl.py +739 -0
  129. deltacat/experimental/storage/iceberg/model.py +713 -0
  130. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  131. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  132. deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
  133. deltacat/experimental/storage/rivulet/arrow/serializer.py +78 -0
  134. deltacat/experimental/storage/rivulet/dataset.py +745 -0
  135. deltacat/experimental/storage/rivulet/dataset_executor.py +79 -0
  136. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  137. deltacat/experimental/storage/rivulet/feather/file_reader.py +138 -0
  138. deltacat/experimental/storage/rivulet/feather/serializer.py +35 -0
  139. deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
  140. deltacat/experimental/storage/rivulet/fs/file_provider.py +105 -0
  141. deltacat/experimental/storage/rivulet/fs/file_store.py +130 -0
  142. deltacat/experimental/storage/rivulet/fs/input_file.py +76 -0
  143. deltacat/experimental/storage/rivulet/fs/output_file.py +86 -0
  144. deltacat/experimental/storage/rivulet/logical_plan.py +105 -0
  145. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  146. deltacat/experimental/storage/rivulet/metastore/delta.py +188 -0
  147. deltacat/experimental/storage/rivulet/metastore/json_sst.py +105 -0
  148. deltacat/experimental/storage/rivulet/metastore/sst.py +82 -0
  149. deltacat/experimental/storage/rivulet/metastore/sst_interval_tree.py +260 -0
  150. deltacat/experimental/storage/rivulet/mvp/Table.py +101 -0
  151. deltacat/experimental/storage/rivulet/mvp/__init__.py +5 -0
  152. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  153. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  154. deltacat/experimental/storage/rivulet/parquet/file_reader.py +129 -0
  155. deltacat/experimental/storage/rivulet/parquet/serializer.py +37 -0
  156. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  157. deltacat/experimental/storage/rivulet/reader/block_scanner.py +389 -0
  158. deltacat/experimental/storage/rivulet/reader/data_reader.py +136 -0
  159. deltacat/experimental/storage/rivulet/reader/data_scan.py +65 -0
  160. deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +179 -0
  161. deltacat/experimental/storage/rivulet/reader/dataset_reader.py +158 -0
  162. deltacat/experimental/storage/rivulet/reader/pyarrow_data_reader.py +124 -0
  163. deltacat/experimental/storage/rivulet/reader/query_expression.py +99 -0
  164. deltacat/experimental/storage/rivulet/reader/reader_type_registrar.py +84 -0
  165. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  166. deltacat/experimental/storage/rivulet/schema/datatype.py +128 -0
  167. deltacat/experimental/storage/rivulet/schema/schema.py +251 -0
  168. deltacat/experimental/storage/rivulet/serializer.py +40 -0
  169. deltacat/experimental/storage/rivulet/serializer_factory.py +46 -0
  170. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  171. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  172. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  173. deltacat/experimental/storage/rivulet/writer/dataset_writer.py +29 -0
  174. deltacat/experimental/storage/rivulet/writer/memtable_dataset_writer.py +305 -0
  175. deltacat/io/__init__.py +13 -0
  176. deltacat/io/dataset/__init__.py +0 -0
  177. deltacat/io/dataset/deltacat_dataset.py +91 -0
  178. deltacat/io/datasink/__init__.py +0 -0
  179. deltacat/io/datasink/deltacat_datasink.py +207 -0
  180. deltacat/io/datasource/__init__.py +0 -0
  181. deltacat/io/datasource/deltacat_datasource.py +579 -0
  182. deltacat/io/reader/__init__.py +0 -0
  183. deltacat/io/reader/deltacat_read_api.py +172 -0
  184. deltacat/logs.py +4 -1
  185. deltacat/storage/__init__.py +138 -28
  186. deltacat/storage/interface.py +260 -155
  187. deltacat/storage/main/__init__.py +0 -0
  188. deltacat/storage/main/impl.py +3030 -0
  189. deltacat/storage/model/delta.py +142 -71
  190. deltacat/storage/model/expression/__init__.py +47 -0
  191. deltacat/storage/model/expression/expression.py +656 -0
  192. deltacat/storage/model/expression/visitor.py +248 -0
  193. deltacat/storage/model/interop.py +24 -0
  194. deltacat/storage/model/list_result.py +8 -0
  195. deltacat/storage/model/locator.py +93 -9
  196. deltacat/storage/model/manifest.py +643 -0
  197. deltacat/storage/model/metafile.py +1421 -0
  198. deltacat/storage/model/namespace.py +41 -18
  199. deltacat/storage/model/partition.py +443 -43
  200. deltacat/storage/model/scan/__init__.py +0 -0
  201. deltacat/storage/model/scan/push_down.py +46 -0
  202. deltacat/storage/model/scan/scan_plan.py +10 -0
  203. deltacat/storage/model/scan/scan_task.py +34 -0
  204. deltacat/storage/model/schema.py +3160 -0
  205. deltacat/storage/model/shard.py +51 -0
  206. deltacat/storage/model/sort_key.py +210 -13
  207. deltacat/storage/model/stream.py +215 -80
  208. deltacat/storage/model/table.py +134 -29
  209. deltacat/storage/model/table_version.py +333 -46
  210. deltacat/storage/model/transaction.py +1733 -0
  211. deltacat/storage/model/transform.py +274 -58
  212. deltacat/storage/model/types.py +138 -16
  213. deltacat/storage/util/__init__.py +0 -0
  214. deltacat/storage/util/scan_planner.py +26 -0
  215. deltacat/tests/_io/__init__.py +1 -0
  216. deltacat/tests/_io/reader/__init__.py +0 -0
  217. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  218. deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +8 -4
  219. deltacat/tests/aws/test_s3u.py +2 -31
  220. deltacat/tests/catalog/data/__init__.py +0 -0
  221. deltacat/tests/catalog/main/__init__.py +0 -0
  222. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  223. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  224. deltacat/tests/catalog/model/__init__.py +0 -0
  225. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  226. deltacat/tests/catalog/test_catalogs.py +321 -0
  227. deltacat/tests/catalog/test_default_catalog_impl.py +12154 -66
  228. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
  229. deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
  230. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
  231. deltacat/tests/compute/compact_partition_test_cases.py +23 -30
  232. deltacat/tests/compute/compactor/steps/test_repartition.py +14 -14
  233. deltacat/tests/compute/compactor/utils/test_io.py +125 -123
  234. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  235. deltacat/tests/compute/compactor_v2/test_compaction_session.py +387 -830
  236. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +70 -57
  237. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +0 -3
  238. deltacat/tests/compute/conftest.py +39 -0
  239. deltacat/tests/compute/converter/__init__.py +0 -0
  240. deltacat/tests/compute/converter/conftest.py +80 -0
  241. deltacat/tests/compute/converter/test_convert_session.py +826 -0
  242. deltacat/tests/compute/converter/utils.py +132 -0
  243. deltacat/tests/compute/resource_estimation/test_delta.py +88 -104
  244. deltacat/tests/compute/test_compact_partition_incremental.py +91 -98
  245. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +79 -97
  246. deltacat/tests/compute/test_compact_partition_params.py +16 -11
  247. deltacat/tests/compute/test_compact_partition_rebase.py +63 -93
  248. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +249 -220
  249. deltacat/tests/compute/test_janitor.py +236 -0
  250. deltacat/tests/compute/test_util_common.py +726 -46
  251. deltacat/tests/compute/test_util_constant.py +0 -1
  252. deltacat/tests/conftest.py +25 -0
  253. deltacat/tests/daft/__init__.py +0 -0
  254. deltacat/tests/daft/test_model.py +97 -0
  255. deltacat/tests/experimental/__init__.py +1 -0
  256. deltacat/tests/experimental/catalog/__init__.py +0 -0
  257. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  258. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  259. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  260. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  261. deltacat/tests/experimental/daft/__init__.py +0 -0
  262. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  263. deltacat/tests/experimental/storage/__init__.py +0 -0
  264. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  265. deltacat/tests/experimental/storage/rivulet/conftest.py +149 -0
  266. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  267. deltacat/tests/experimental/storage/rivulet/fs/test_file_location_provider.py +94 -0
  268. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  269. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  270. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  271. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  272. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  273. deltacat/tests/experimental/storage/rivulet/schema/test_schema.py +241 -0
  274. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  275. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  276. deltacat/tests/experimental/storage/rivulet/test_dataset.py +408 -0
  277. deltacat/tests/experimental/storage/rivulet/test_manifest.py +67 -0
  278. deltacat/tests/experimental/storage/rivulet/test_sst_interval_tree.py +232 -0
  279. deltacat/tests/experimental/storage/rivulet/test_utils.py +124 -0
  280. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  281. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_write_then_read.py +343 -0
  282. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_writer.py +79 -0
  283. deltacat/tests/experimental/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
  284. deltacat/tests/storage/__init__.py +0 -0
  285. deltacat/tests/storage/main/__init__.py +0 -0
  286. deltacat/tests/storage/main/test_main_storage.py +8204 -0
  287. deltacat/tests/storage/model/__init__.py +0 -0
  288. deltacat/tests/storage/model/test_delete_parameters.py +21 -0
  289. deltacat/tests/storage/model/test_expression.py +327 -0
  290. deltacat/tests/storage/model/test_manifest.py +129 -0
  291. deltacat/tests/storage/model/test_metafile_io.py +2440 -0
  292. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  293. deltacat/tests/storage/model/test_schema.py +479 -0
  294. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  295. deltacat/tests/storage/model/test_shard.py +24 -0
  296. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  297. deltacat/tests/storage/model/test_table_version.py +110 -0
  298. deltacat/tests/storage/model/test_transaction.py +653 -0
  299. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  300. deltacat/tests/test_deltacat_api.py +1064 -0
  301. deltacat/tests/test_exceptions.py +9 -5
  302. deltacat/tests/test_utils/filesystem.py +14 -0
  303. deltacat/tests/test_utils/message_pack_utils.py +54 -0
  304. deltacat/tests/test_utils/pyarrow.py +50 -26
  305. deltacat/tests/test_utils/storage.py +256 -4
  306. deltacat/tests/types/__init__.py +0 -0
  307. deltacat/tests/types/test_tables.py +104 -0
  308. deltacat/tests/utils/exceptions.py +22 -0
  309. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  310. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  311. deltacat/tests/utils/test_daft.py +124 -34
  312. deltacat/tests/utils/test_numpy.py +1193 -0
  313. deltacat/tests/utils/test_pandas.py +1106 -0
  314. deltacat/tests/utils/test_polars.py +1040 -0
  315. deltacat/tests/utils/test_pyarrow.py +1107 -258
  316. deltacat/types/media.py +345 -37
  317. deltacat/types/partial_download.py +1 -1
  318. deltacat/types/tables.py +2345 -47
  319. deltacat/utils/arguments.py +33 -1
  320. deltacat/utils/daft.py +824 -40
  321. deltacat/utils/export.py +61 -0
  322. deltacat/utils/filesystem.py +450 -0
  323. deltacat/utils/metafile_locator.py +74 -0
  324. deltacat/utils/numpy.py +118 -26
  325. deltacat/utils/pandas.py +577 -48
  326. deltacat/utils/polars.py +759 -0
  327. deltacat/utils/pyarrow.py +1212 -178
  328. deltacat/utils/ray_utils/concurrency.py +1 -1
  329. deltacat/utils/ray_utils/dataset.py +101 -10
  330. deltacat/utils/ray_utils/runtime.py +56 -4
  331. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  332. deltacat/utils/url.py +1325 -0
  333. deltacat-2.0.0.dist-info/METADATA +1163 -0
  334. deltacat-2.0.0.dist-info/RECORD +439 -0
  335. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
  336. deltacat/aws/redshift/__init__.py +0 -19
  337. deltacat/aws/redshift/model/manifest.py +0 -394
  338. deltacat/catalog/default_catalog_impl/__init__.py +0 -369
  339. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  340. deltacat/compute/merge_on_read/__init__.py +0 -4
  341. deltacat/compute/merge_on_read/daft.py +0 -40
  342. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  343. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  344. deltacat/io/dataset.py +0 -73
  345. deltacat/io/read_api.py +0 -143
  346. deltacat/storage/model/delete_parameters.py +0 -40
  347. deltacat/storage/model/partition_spec.py +0 -71
  348. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  349. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -397
  350. deltacat/tests/local_deltacat_storage/__init__.py +0 -1262
  351. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  352. deltacat/utils/s3fs.py +0 -21
  353. deltacat-1.1.38.dist-info/METADATA +0 -64
  354. deltacat-1.1.38.dist-info/RECORD +0 -219
  355. /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
  356. /deltacat/{compute/merge_on_read/model → catalog/main}/__init__.py +0 -0
  357. /deltacat/compute/{merge_on_read/utils → converter}/__init__.py +0 -0
  358. /deltacat/{io/aws → compute/converter/model}/__init__.py +0 -0
  359. /deltacat/{io/aws/redshift → compute/converter/pyiceberg}/__init__.py +0 -0
  360. /deltacat/{tests/io → compute/converter/steps}/__init__.py +0 -0
  361. /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
  362. /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
  363. /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
  364. /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
  365. /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
  366. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
  367. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,172 @@
1
+ from typing import List, Optional, Union, Dict, Any
2
+
3
+ from ray.data import Dataset as RayDataset
4
+ from ray.data import read_datasource
5
+
6
+ from deltacat.io.datasource.deltacat_datasource import DeltaCatDatasource
7
+ from deltacat.io.dataset.deltacat_dataset import DeltaCatDataset
8
+ from deltacat.utils.common import ReadKwargsProvider
9
+ from deltacat.utils.url import DeltaCatUrl, DeltaCatUrlReader
10
+ from deltacat.io.datasource.deltacat_datasource import DeltacatReadType
11
+
12
+
13
class EmptyReadKwargsProvider(ReadKwargsProvider):
    """Read-kwargs provider that never contributes any keyword arguments.

    Used as the default provider for `read_deltacat` so that reads proceed
    without any caller-supplied reader options.
    """

    def _get_kwargs(
        self, datasource_type: str, kwargs: Dict[str, Any]
    ) -> Dict[str, Any]:
        """Return a new empty dict, ignoring both arguments.

        Args:
            datasource_type: Datasource type key (unused).
            kwargs: Incoming keyword arguments (unused and not modified).

        Returns:
            A fresh, empty dictionary.
        """
        return dict()
20
+
21
+
22
def read_deltacat(
    urls: Union[DeltaCatUrl, List[DeltaCatUrl]],
    *,
    deltacat_read_type: DeltacatReadType = DeltacatReadType.DATA,
    timestamp_as_of: Optional[int] = None,
    merge_on_read: Optional[bool] = False,
    read_kwargs_provider: Optional[ReadKwargsProvider] = EmptyReadKwargsProvider(),
) -> DeltaCatDataset:
    """Reads the given DeltaCAT URLs into a Ray Dataset. DeltaCAT URLs can
    either reference objects registered in a DeltaCAT catalog, or unregistered
    external objects that are readable into a Ray Dataset.

    Unless only metadata is being read (i.e., `deltacat_read_type` is
    METADATA or METADATA_RECURSIVE), all reads of registered DeltaCAT catalog
    object data must resolve to a single table version.

    When reading unregistered external objects, the keyword arguments
    resolved by `read_kwargs_provider` for the URL's datastore type are
    passed to the reader resolved for the given DeltaCAT URL.

    NOTE(review): the examples below pass plain URL strings for brevity, but
    this implementation calls `is_deltacat_catalog_url()` on every element of
    `urls`, so callers presumably need to wrap each string in a `DeltaCatUrl`
    first - confirm against `deltacat.utils.url`.

    Examples:
        >>> # Read the latest active DeltaCAT table version:
        >>> import deltacat as dc
        >>> dc.io.read_deltacat("dc://my_catalog/my_namespace/my_table")
        >>> # If `my_catalog` is the default catalog, this is equivalent to:
        >>> dc.io.read_deltacat("namespace://my_namespace/my_table")
        >>> # If `my_namespace` is the default namespace, this is equivalent to:
        >>> dc.io.read_deltacat("table://my_table")

        >>> # Read metadata from all partitions and deltas of the latest active
        >>> # DeltaCAT table version:
        >>> import deltacat as dc
        >>> from deltacat.io.datasource.deltacat_datasource import DeltacatReadType
        >>> dc.io.read_deltacat(
        ...     "dc://my_catalog/my_namespace/my_table",
        ...     deltacat_read_type=DeltacatReadType.METADATA_RECURSIVE,
        ... )
        >>> # Since "default" always resolves to the latest active table version.
        >>> # This is equivalent to:
        >>> dc.io.read_deltacat(
        ...     "dc://my_catalog/my_namespace/my_table/default",
        ...     deltacat_read_type=DeltacatReadType.METADATA_RECURSIVE,
        ... )

        >>> # Read only the latest active table version's top-level metadata:
        >>> dc.io.read_deltacat(
        ...     "dc://my_catalog/my_namespace/my_table/default",
        ...     deltacat_read_type=DeltacatReadType.METADATA,
        ... )

        >>> # Read only top-level metadata from a DeltaCAT table:
        >>> dc.io.read_deltacat(
        ...     "dc://my_catalog/my_namespace/my_table",
        ...     deltacat_read_type=DeltacatReadType.METADATA,
        ... )

        >>> # Read top-level table metadata from all table versions:
        >>> dc.io.read_deltacat(
        ...     "dc://my_catalog/my_namespace/my_table/*",
        ...     deltacat_read_type=DeltacatReadType.METADATA,
        ... )

        >>> # Read metadata from all partitions and deltas of all table versions:
        >>> dc.io.read_deltacat(
        ...     "dc://my_catalog/my_namespace/my_table/*",
        ...     deltacat_read_type=DeltacatReadType.METADATA_RECURSIVE,
        ... )

        >>> # Read metadata from all tables and table versions of the namespace:
        >>> dc.io.read_deltacat(
        ...     "dc://my_catalog/my_namespace/*",
        ...     deltacat_read_type=DeltacatReadType.METADATA_RECURSIVE,
        ... )

        >>> # Read metadata from the latest active table version for each
        >>> # table in the namespace:
        >>> dc.io.read_deltacat(
        ...     "dc://my_catalog/my_namespace",
        ...     deltacat_read_type=DeltacatReadType.METADATA_RECURSIVE,
        ... )

        >>> # Read metadata from the latest active table version for each
        >>> # table in the catalog's default namespace:
        >>> dc.io.read_deltacat(
        ...     "dc://my_catalog",
        ...     deltacat_read_type=DeltacatReadType.METADATA_RECURSIVE,
        ... )

        >>> # Read metadata from all table versions for each table in each
        >>> # catalog namespace:
        >>> dc.io.read_deltacat(
        ...     "dc://my_catalog/*",
        ...     deltacat_read_type=DeltacatReadType.METADATA_RECURSIVE,
        ... )

        >>> # Read the Iceberg stream of the latest active DeltaCAT table version:
        >>> import deltacat as dc
        >>> dc.io.read_deltacat("dc://my_catalog/my_namespace/my_table/default/iceberg")
        >>> # Or, if `my_catalog` is the default catalog, this is equivalent to:
        >>> dc.io.read_deltacat("namespace://my_namespace/my_table/default/iceberg")
        >>> # Or, if `my_namespace` is the default namespace, this is equivalent to:
        >>> dc.io.read_deltacat("table://my_table/default/iceberg")

        >>> # Read an external unregistered Iceberg table `my_db.my_table`:
        >>> import deltacat as dc
        >>> dc.io.read_deltacat("iceberg://my_db.my_table")

        >>> # Read an external unregistered audio file from /my/audio.mp4:
        >>> import deltacat as dc
        >>> dc.io.read_deltacat("audio+file:///my/audio.mp4")

        >>> # Read an external unregistered audio file from s3://my/audio.mp4:
        >>> import deltacat as dc
        >>> dc.io.read_deltacat("audio+s3://my/audio.mp4")

    Args:
        urls: The DeltaCAT URL, or list of DeltaCAT URLs, to read.
        deltacat_read_type: If METADATA, reads only DeltaCAT metadata for the
            given URL and skips both recursive metadata expansion and reads
            of the underlying data files. If METADATA_RECURSIVE then recursively
            expands child metadata but does not read underlying data files. If
            DATA then recursively expands child metadata to discover and read
            all underlying data files.
        timestamp_as_of: Reads a historic snapshot of the given paths as-of the
            given millisecond-precision epoch timestamp (only used when reading
            registered DeltaCAT catalog objects).
        merge_on_read: If True, merges all unmaterialized inserts, updates,
            and deletes in the registered DeltaCAT table version being read.
            Only applicable when reading data (i.e., `deltacat_read_type` is
            DATA).
        read_kwargs_provider: Resolves
            :class:`~deltacat.types.media.DatasourceType` string keys to
            kwarg dictionaries to pass to the resolved
            :class:`~ray.data.Datasource` implementation for each distinct
            DeltaCAT URL type. If None, external (unregistered) reads receive
            an empty kwargs dict.

    Returns:
        DeltaCatDataset holding the union of the records read from every
        specified URL, in the order the URLs were given.
    """
    # The signature permits a single URL; normalize it so the loop below
    # always iterates over URL objects rather than over the URL itself.
    if isinstance(urls, DeltaCatUrl):
        urls = [urls]

    # TODO(pdames): The below implementation serializes reads of each URL and
    #  then unions their respective datasets together. While this was an easy
    #  starting point to implement, a more efficient implementation should push
    #  all URLs down into `DeltacatDatasource` to parallelize all reads
    #  (i.e., by returning the `ReadTask` for all datasources in
    #  `get_read_tasks()` and estimating the corresponding memory size across
    #  all datasources in `estimate_inmemory_data_size()`).
    dataset: Optional[RayDataset] = None
    for url in urls:
        if not url.is_deltacat_catalog_url():
            # this URL points to an external unregistered Ray Datasource
            # TODO(pdames): Honor metadata only reads of external datasources
            #  by registering only file paths & metadata in delta manifests.
            reader = DeltaCatUrlReader(url)
            # `read_kwargs_provider` is typed Optional; fall back to no extra
            # kwargs instead of failing with "'NoneType' is not callable".
            if read_kwargs_provider is None:
                reader_kwargs = {}
            else:
                reader_kwargs = read_kwargs_provider(url.datastore_type, {})
            next_ds = reader.read(reader_kwargs)
        else:
            # this URL points to a registered DeltaCAT object
            next_ds = read_datasource(
                DeltaCatDatasource(
                    url=url,
                    deltacat_read_type=deltacat_read_type,
                    timestamp_as_of=timestamp_as_of,
                    merge_on_read=merge_on_read,
                    read_kwargs_provider=read_kwargs_provider,
                )
            )
        # union the last dataset read into the result set
        if dataset is None:
            dataset = next_ds
        else:
            # `Dataset.union` returns a new dataset rather than mutating in
            # place, so the result must be rebound or every URL after the
            # first is silently dropped.
            dataset = dataset.union(next_ds)
    # NOTE(review): if `urls` is empty, `dataset` is still None here and is
    # passed through unchanged, as in the original implementation - confirm
    # how `DeltaCatDataset.from_dataset` handles None.
    return DeltaCatDataset.from_dataset(dataset)
deltacat/logs.py CHANGED
@@ -18,6 +18,7 @@ from deltacat.constants import (
18
18
  DELTACAT_APP_DEBUG_LOG_BASE_FILE_NAME,
19
19
  DELTACAT_SYS_DEBUG_LOG_BASE_FILE_NAME,
20
20
  DELTACAT_LOGGER_CONTEXT,
21
+ DELTACAT_LOGGER_USE_SINGLE_HANDLER,
21
22
  )
22
23
 
23
24
  DEFAULT_LOG_LEVEL = "INFO"
@@ -226,6 +227,7 @@ def _configure_logger(
226
227
  # This maintains log level of rotating file handlers
227
228
  primary_log_level = log_level
228
229
  logger.propagate = False
230
+ needs_handler = True
229
231
  if log_level <= logging.getLevelName("DEBUG"):
230
232
  if not _file_handler_exists(logger, log_dir, debug_log_base_file_name):
231
233
  handler = _create_rotating_file_handler(
@@ -235,8 +237,9 @@ def _configure_logger(
235
237
  context_kwargs=context_kwargs,
236
238
  )
237
239
  _add_logger_handler(logger, handler)
240
+ needs_handler = not DELTACAT_LOGGER_USE_SINGLE_HANDLER
238
241
  primary_log_level = logging.getLevelName("INFO")
239
- if not _file_handler_exists(logger, log_dir, log_base_file_name):
242
+ if not _file_handler_exists(logger, log_dir, log_base_file_name) and needs_handler:
240
243
  handler = _create_rotating_file_handler(
241
244
  log_dir,
242
245
  log_base_file_name,
@@ -1,24 +1,70 @@
1
- from deltacat.aws.redshift import (
1
+ from deltacat.storage.model.manifest import (
2
+ EntryType,
3
+ EntryParams,
2
4
  Manifest,
3
5
  ManifestAuthor,
4
6
  ManifestEntry,
5
7
  ManifestEntryList,
6
8
  ManifestMeta,
7
9
  )
8
- from deltacat.storage.model.delta import Delta, DeltaLocator
10
+ from deltacat.storage.model.delta import (
11
+ Delta,
12
+ DeltaLocator,
13
+ DeltaProperties,
14
+ )
9
15
  from deltacat.storage.model.list_result import ListResult
10
16
  from deltacat.storage.model.locator import Locator
11
- from deltacat.storage.model.namespace import Namespace, NamespaceLocator
12
- from deltacat.storage.model.partition import Partition, PartitionLocator
13
- from deltacat.storage.model.stream import Stream, StreamLocator
14
- from deltacat.storage.model.table import Table, TableLocator
15
- from deltacat.storage.model.table_version import TableVersion, TableVersionLocator
16
- from deltacat.storage.model.delete_parameters import DeleteParameters
17
- from deltacat.storage.model.partition_spec import (
18
- PartitionFilter,
17
+ from deltacat.storage.model.metafile import (
18
+ Metafile,
19
+ )
20
+ from deltacat.storage.model.transaction import (
21
+ TransactionOperation,
22
+ Transaction,
23
+ read_transaction,
24
+ transactions,
25
+ transaction,
26
+ )
27
+ from deltacat.storage.model.namespace import (
28
+ Namespace,
29
+ NamespaceLocator,
30
+ NamespaceProperties,
31
+ )
32
+ from deltacat.storage.model.partition import (
33
+ Partition,
34
+ PartitionLocator,
35
+ PartitionLocatorAlias,
36
+ PartitionKey,
37
+ PartitionKeyList,
38
+ PartitionScheme,
39
+ PartitionSchemeList,
19
40
  PartitionValues,
20
- DeltaPartitionSpec,
21
- StreamPartitionSpec,
41
+ )
42
+ from deltacat.storage.model.schema import (
43
+ Field,
44
+ FieldId,
45
+ FieldLocator,
46
+ FieldName,
47
+ NestedFieldName,
48
+ Schema,
49
+ SchemaList,
50
+ SchemaUpdate,
51
+ SchemaUpdateOperation,
52
+ SchemaUpdateOperations,
53
+ )
54
+ from deltacat.storage.model.stream import (
55
+ Stream,
56
+ StreamLocator,
57
+ StreamLocatorAlias,
58
+ )
59
+ from deltacat.storage.model.table import (
60
+ Table,
61
+ TableLocator,
62
+ TableProperties,
63
+ )
64
+ from deltacat.storage.model.table_version import (
65
+ TableVersion,
66
+ TableVersionLocator,
67
+ TableVersionProperties,
22
68
  )
23
69
  from deltacat.storage.model.transform import (
24
70
  Transform,
@@ -26,28 +72,61 @@ from deltacat.storage.model.transform import (
26
72
  TransformParameters,
27
73
  BucketingStrategy,
28
74
  BucketTransformParameters,
29
- IdentityTransformParameters,
75
+ TruncateTransformParameters,
76
+ BucketTransform,
77
+ IdentityTransform,
78
+ VoidTransform,
79
+ UnknownTransform,
80
+ HourTransform,
81
+ DayTransform,
82
+ MonthTransform,
83
+ YearTransform,
84
+ TruncateTransform,
85
+ TruncateStrategy,
30
86
  )
31
-
32
87
  from deltacat.storage.model.types import (
33
88
  CommitState,
89
+ Dataset,
34
90
  DeltaType,
35
91
  DistributedDataset,
36
92
  LifecycleState,
37
93
  LocalDataset,
38
94
  LocalTable,
95
+ NullOrder,
39
96
  SchemaConsistencyType,
97
+ StreamFormat,
98
+ SortOrder,
99
+ TransactionOperationType,
100
+ TransactionStatus,
101
+ )
102
+ from deltacat.storage.model.sort_key import (
103
+ SortKey,
104
+ SortKeyList,
105
+ SortScheme,
106
+ SortSchemeList,
40
107
  )
41
- from deltacat.storage.model.sort_key import SortKey, SortOrder
108
+ from deltacat.storage.main import impl as metastore
42
109
 
43
110
  __all__ = [
111
+ "BucketingStrategy",
112
+ "BucketTransform",
113
+ "BucketTransformParameters",
44
114
  "CommitState",
115
+ "Dataset",
116
+ "DayTransform",
45
117
  "Delta",
46
118
  "DeltaLocator",
47
- "Partition",
48
- "DeleteParameters",
119
+ "DeltaProperties",
49
120
  "DeltaType",
50
121
  "DistributedDataset",
122
+ "EntryType",
123
+ "EntryParams",
124
+ "Field",
125
+ "FieldId",
126
+ "FieldLocator",
127
+ "FieldName",
128
+ "HourTransform",
129
+ "IdentityTransform",
51
130
  "LifecycleState",
52
131
  "ListResult",
53
132
  "LocalDataset",
@@ -56,28 +135,59 @@ __all__ = [
56
135
  "Manifest",
57
136
  "ManifestAuthor",
58
137
  "ManifestEntry",
59
- "ManifestMeta",
60
138
  "ManifestEntryList",
139
+ "ManifestMeta",
140
+ "Metafile",
141
+ "metastore",
142
+ "MonthTransform",
61
143
  "Namespace",
62
144
  "NamespaceLocator",
145
+ "NamespaceProperties",
146
+ "NestedFieldName",
147
+ "NullOrder",
148
+ "Partition",
149
+ "PartitionKey",
150
+ "PartitionKeyList",
63
151
  "PartitionLocator",
64
- "Stream",
152
+ "PartitionLocatorAlias",
153
+ "PartitionScheme",
154
+ "PartitionSchemeList",
155
+ "PartitionValues",
156
+ "Schema",
157
+ "SchemaList",
158
+ "SchemaUpdate",
159
+ "SchemaUpdateOperation",
160
+ "SchemaUpdateOperations",
65
161
  "SchemaConsistencyType",
162
+ "SortKey",
163
+ "SortKeyList",
164
+ "SortOrder",
165
+ "SortScheme",
166
+ "SortSchemeList",
167
+ "Stream",
168
+ "StreamFormat",
66
169
  "StreamLocator",
170
+ "StreamLocatorAlias",
67
171
  "Table",
68
172
  "TableLocator",
173
+ "TableProperties",
69
174
  "TableVersion",
70
175
  "TableVersionLocator",
71
- "SortKey",
72
- "SortOrder",
73
- "PartitionFilter",
74
- "PartitionValues",
75
- "DeltaPartitionSpec",
76
- "StreamPartitionSpec",
176
+ "TableVersionProperties",
177
+ "Transaction",
178
+ "TransactionOperation",
179
+ "TransactionOperationType",
180
+ "TransactionStatus",
77
181
  "Transform",
78
182
  "TransformName",
79
183
  "TransformParameters",
80
- "BucketingStrategy",
81
- "BucketTransformParameters",
82
- "IdentityTransformParameters",
184
+ "TruncateTransform",
185
+ "TruncateTransformParameters",
186
+ "TruncateStrategy",
187
+ "UnknownTransform",
188
+ "VoidTransform",
189
+ "YearTransform",
190
+ "read_transaction",
191
+ "transactions",
192
+ "transaction",
83
193
  ]