deltacat 1.1.38__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (367) hide show
  1. deltacat/__init__.py +150 -12
  2. deltacat/annotations.py +36 -0
  3. deltacat/api.py +578 -0
  4. deltacat/aws/constants.py +0 -23
  5. deltacat/aws/s3u.py +4 -631
  6. deltacat/benchmarking/benchmark_engine.py +84 -0
  7. deltacat/benchmarking/benchmark_report.py +86 -0
  8. deltacat/benchmarking/benchmark_suite.py +11 -0
  9. deltacat/benchmarking/conftest.py +22 -19
  10. deltacat/benchmarking/data/random_row_generator.py +94 -0
  11. deltacat/benchmarking/data/row_generator.py +10 -0
  12. deltacat/benchmarking/test_benchmark_pipeline.py +108 -0
  13. deltacat/catalog/__init__.py +73 -0
  14. deltacat/catalog/delegate.py +615 -140
  15. deltacat/catalog/interface.py +404 -81
  16. deltacat/catalog/main/impl.py +2882 -0
  17. deltacat/catalog/model/catalog.py +348 -46
  18. deltacat/catalog/model/properties.py +155 -0
  19. deltacat/catalog/model/table_definition.py +32 -1
  20. deltacat/compute/__init__.py +14 -0
  21. deltacat/compute/compactor/compaction_session.py +97 -75
  22. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  23. deltacat/compute/compactor/model/compaction_session_audit_info.py +23 -30
  24. deltacat/compute/compactor/model/delta_annotated.py +3 -3
  25. deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
  26. deltacat/compute/compactor/model/delta_file_locator.py +3 -1
  27. deltacat/compute/compactor/model/round_completion_info.py +19 -9
  28. deltacat/compute/compactor/model/table_object_store.py +3 -2
  29. deltacat/compute/compactor/repartition_session.py +9 -22
  30. deltacat/compute/compactor/steps/dedupe.py +11 -4
  31. deltacat/compute/compactor/steps/hash_bucket.py +6 -6
  32. deltacat/compute/compactor/steps/materialize.py +15 -9
  33. deltacat/compute/compactor/steps/repartition.py +12 -11
  34. deltacat/compute/compactor/utils/io.py +7 -6
  35. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  36. deltacat/compute/compactor/utils/sort_key.py +9 -2
  37. deltacat/compute/compactor/utils/system_columns.py +3 -1
  38. deltacat/compute/compactor_v2/compaction_session.py +13 -14
  39. deltacat/compute/compactor_v2/deletes/utils.py +3 -3
  40. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  41. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  42. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  43. deltacat/compute/compactor_v2/model/merge_input.py +28 -9
  44. deltacat/compute/compactor_v2/private/compaction_utils.py +171 -73
  45. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  46. deltacat/compute/compactor_v2/steps/merge.py +156 -53
  47. deltacat/compute/compactor_v2/utils/content_type_params.py +17 -6
  48. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  49. deltacat/compute/compactor_v2/utils/io.py +10 -3
  50. deltacat/compute/compactor_v2/utils/merge.py +14 -2
  51. deltacat/compute/compactor_v2/utils/task_options.py +2 -10
  52. deltacat/compute/converter/constants.py +9 -0
  53. deltacat/compute/converter/converter_session.py +298 -0
  54. deltacat/compute/converter/model/convert_input.py +96 -0
  55. deltacat/compute/converter/model/convert_input_files.py +78 -0
  56. deltacat/compute/converter/model/convert_result.py +80 -0
  57. deltacat/compute/converter/model/converter_session_params.py +144 -0
  58. deltacat/compute/converter/pyiceberg/catalog.py +78 -0
  59. deltacat/compute/converter/pyiceberg/overrides.py +263 -0
  60. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +299 -0
  61. deltacat/compute/converter/steps/convert.py +366 -0
  62. deltacat/compute/converter/steps/dedupe.py +94 -0
  63. deltacat/compute/converter/utils/__init__.py +0 -0
  64. deltacat/compute/converter/utils/convert_task_options.py +132 -0
  65. deltacat/compute/converter/utils/converter_session_utils.py +175 -0
  66. deltacat/compute/converter/utils/iceberg_columns.py +87 -0
  67. deltacat/compute/converter/utils/io.py +203 -0
  68. deltacat/compute/converter/utils/s3u.py +148 -0
  69. deltacat/compute/janitor.py +205 -0
  70. deltacat/compute/jobs/__init__.py +0 -0
  71. deltacat/compute/jobs/client.py +417 -0
  72. deltacat/compute/resource_estimation/delta.py +11 -1
  73. deltacat/constants.py +90 -1
  74. deltacat/docs/__init__.py +0 -0
  75. deltacat/docs/autogen/__init__.py +0 -0
  76. deltacat/docs/autogen/schema/__init__.py +0 -0
  77. deltacat/docs/autogen/schema/inference/__init__.py +0 -0
  78. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  79. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  80. deltacat/env.py +61 -0
  81. deltacat/examples/__init__.py +0 -0
  82. deltacat/examples/basic_logging.py +101 -0
  83. deltacat/examples/compactor/__init__.py +0 -0
  84. deltacat/examples/compactor/aws/__init__.py +1 -0
  85. deltacat/examples/compactor/bootstrap.py +863 -0
  86. deltacat/examples/compactor/compactor.py +373 -0
  87. deltacat/examples/compactor/explorer.py +473 -0
  88. deltacat/examples/compactor/gcp/__init__.py +1 -0
  89. deltacat/examples/compactor/job_runner.py +439 -0
  90. deltacat/examples/compactor/utils/__init__.py +1 -0
  91. deltacat/examples/compactor/utils/common.py +261 -0
  92. deltacat/examples/experimental/__init__.py +0 -0
  93. deltacat/examples/experimental/iceberg/__init__.py +0 -0
  94. deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
  95. deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
  96. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  97. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  98. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  99. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  100. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  101. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  102. deltacat/examples/experimental/iceberg/iceberg_bucket_writer.py +184 -0
  103. deltacat/examples/experimental/iceberg/iceberg_reader.py +147 -0
  104. deltacat/examples/hello_world.py +29 -0
  105. deltacat/examples/indexer/__init__.py +0 -0
  106. deltacat/examples/indexer/aws/__init__.py +0 -0
  107. deltacat/examples/indexer/gcp/__init__.py +0 -0
  108. deltacat/examples/indexer/indexer.py +163 -0
  109. deltacat/examples/indexer/job_runner.py +198 -0
  110. deltacat/exceptions.py +116 -12
  111. deltacat/experimental/__init__.py +0 -0
  112. deltacat/experimental/catalog/__init__.py +0 -0
  113. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  114. deltacat/experimental/catalog/iceberg/iceberg_catalog_config.py +26 -0
  115. deltacat/experimental/catalog/iceberg/impl.py +399 -0
  116. deltacat/experimental/catalog/iceberg/overrides.py +72 -0
  117. deltacat/experimental/compatibility/__init__.py +0 -0
  118. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  119. deltacat/experimental/converter_agent/__init__.py +0 -0
  120. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  121. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  122. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  123. deltacat/experimental/daft/__init__.py +4 -0
  124. deltacat/experimental/daft/daft_catalog.py +229 -0
  125. deltacat/experimental/storage/__init__.py +0 -0
  126. deltacat/experimental/storage/iceberg/__init__.py +0 -0
  127. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  128. deltacat/experimental/storage/iceberg/impl.py +739 -0
  129. deltacat/experimental/storage/iceberg/model.py +713 -0
  130. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  131. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  132. deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
  133. deltacat/experimental/storage/rivulet/arrow/serializer.py +78 -0
  134. deltacat/experimental/storage/rivulet/dataset.py +745 -0
  135. deltacat/experimental/storage/rivulet/dataset_executor.py +79 -0
  136. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  137. deltacat/experimental/storage/rivulet/feather/file_reader.py +138 -0
  138. deltacat/experimental/storage/rivulet/feather/serializer.py +35 -0
  139. deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
  140. deltacat/experimental/storage/rivulet/fs/file_provider.py +105 -0
  141. deltacat/experimental/storage/rivulet/fs/file_store.py +130 -0
  142. deltacat/experimental/storage/rivulet/fs/input_file.py +76 -0
  143. deltacat/experimental/storage/rivulet/fs/output_file.py +86 -0
  144. deltacat/experimental/storage/rivulet/logical_plan.py +105 -0
  145. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  146. deltacat/experimental/storage/rivulet/metastore/delta.py +188 -0
  147. deltacat/experimental/storage/rivulet/metastore/json_sst.py +105 -0
  148. deltacat/experimental/storage/rivulet/metastore/sst.py +82 -0
  149. deltacat/experimental/storage/rivulet/metastore/sst_interval_tree.py +260 -0
  150. deltacat/experimental/storage/rivulet/mvp/Table.py +101 -0
  151. deltacat/experimental/storage/rivulet/mvp/__init__.py +5 -0
  152. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  153. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  154. deltacat/experimental/storage/rivulet/parquet/file_reader.py +129 -0
  155. deltacat/experimental/storage/rivulet/parquet/serializer.py +37 -0
  156. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  157. deltacat/experimental/storage/rivulet/reader/block_scanner.py +389 -0
  158. deltacat/experimental/storage/rivulet/reader/data_reader.py +136 -0
  159. deltacat/experimental/storage/rivulet/reader/data_scan.py +65 -0
  160. deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +179 -0
  161. deltacat/experimental/storage/rivulet/reader/dataset_reader.py +158 -0
  162. deltacat/experimental/storage/rivulet/reader/pyarrow_data_reader.py +124 -0
  163. deltacat/experimental/storage/rivulet/reader/query_expression.py +99 -0
  164. deltacat/experimental/storage/rivulet/reader/reader_type_registrar.py +84 -0
  165. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  166. deltacat/experimental/storage/rivulet/schema/datatype.py +128 -0
  167. deltacat/experimental/storage/rivulet/schema/schema.py +251 -0
  168. deltacat/experimental/storage/rivulet/serializer.py +40 -0
  169. deltacat/experimental/storage/rivulet/serializer_factory.py +46 -0
  170. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  171. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  172. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  173. deltacat/experimental/storage/rivulet/writer/dataset_writer.py +29 -0
  174. deltacat/experimental/storage/rivulet/writer/memtable_dataset_writer.py +305 -0
  175. deltacat/io/__init__.py +13 -0
  176. deltacat/io/dataset/__init__.py +0 -0
  177. deltacat/io/dataset/deltacat_dataset.py +91 -0
  178. deltacat/io/datasink/__init__.py +0 -0
  179. deltacat/io/datasink/deltacat_datasink.py +207 -0
  180. deltacat/io/datasource/__init__.py +0 -0
  181. deltacat/io/datasource/deltacat_datasource.py +579 -0
  182. deltacat/io/reader/__init__.py +0 -0
  183. deltacat/io/reader/deltacat_read_api.py +172 -0
  184. deltacat/logs.py +4 -1
  185. deltacat/storage/__init__.py +138 -28
  186. deltacat/storage/interface.py +260 -155
  187. deltacat/storage/main/__init__.py +0 -0
  188. deltacat/storage/main/impl.py +3030 -0
  189. deltacat/storage/model/delta.py +142 -71
  190. deltacat/storage/model/expression/__init__.py +47 -0
  191. deltacat/storage/model/expression/expression.py +656 -0
  192. deltacat/storage/model/expression/visitor.py +248 -0
  193. deltacat/storage/model/interop.py +24 -0
  194. deltacat/storage/model/list_result.py +8 -0
  195. deltacat/storage/model/locator.py +93 -9
  196. deltacat/storage/model/manifest.py +643 -0
  197. deltacat/storage/model/metafile.py +1421 -0
  198. deltacat/storage/model/namespace.py +41 -18
  199. deltacat/storage/model/partition.py +443 -43
  200. deltacat/storage/model/scan/__init__.py +0 -0
  201. deltacat/storage/model/scan/push_down.py +46 -0
  202. deltacat/storage/model/scan/scan_plan.py +10 -0
  203. deltacat/storage/model/scan/scan_task.py +34 -0
  204. deltacat/storage/model/schema.py +3160 -0
  205. deltacat/storage/model/shard.py +51 -0
  206. deltacat/storage/model/sort_key.py +210 -13
  207. deltacat/storage/model/stream.py +215 -80
  208. deltacat/storage/model/table.py +134 -29
  209. deltacat/storage/model/table_version.py +333 -46
  210. deltacat/storage/model/transaction.py +1733 -0
  211. deltacat/storage/model/transform.py +274 -58
  212. deltacat/storage/model/types.py +138 -16
  213. deltacat/storage/util/__init__.py +0 -0
  214. deltacat/storage/util/scan_planner.py +26 -0
  215. deltacat/tests/_io/__init__.py +1 -0
  216. deltacat/tests/_io/reader/__init__.py +0 -0
  217. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  218. deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +8 -4
  219. deltacat/tests/aws/test_s3u.py +2 -31
  220. deltacat/tests/catalog/data/__init__.py +0 -0
  221. deltacat/tests/catalog/main/__init__.py +0 -0
  222. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  223. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  224. deltacat/tests/catalog/model/__init__.py +0 -0
  225. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  226. deltacat/tests/catalog/test_catalogs.py +321 -0
  227. deltacat/tests/catalog/test_default_catalog_impl.py +12154 -66
  228. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
  229. deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
  230. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
  231. deltacat/tests/compute/compact_partition_test_cases.py +23 -30
  232. deltacat/tests/compute/compactor/steps/test_repartition.py +14 -14
  233. deltacat/tests/compute/compactor/utils/test_io.py +125 -123
  234. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  235. deltacat/tests/compute/compactor_v2/test_compaction_session.py +387 -830
  236. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +70 -57
  237. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +0 -3
  238. deltacat/tests/compute/conftest.py +39 -0
  239. deltacat/tests/compute/converter/__init__.py +0 -0
  240. deltacat/tests/compute/converter/conftest.py +80 -0
  241. deltacat/tests/compute/converter/test_convert_session.py +826 -0
  242. deltacat/tests/compute/converter/utils.py +132 -0
  243. deltacat/tests/compute/resource_estimation/test_delta.py +88 -104
  244. deltacat/tests/compute/test_compact_partition_incremental.py +91 -98
  245. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +79 -97
  246. deltacat/tests/compute/test_compact_partition_params.py +16 -11
  247. deltacat/tests/compute/test_compact_partition_rebase.py +63 -93
  248. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +249 -220
  249. deltacat/tests/compute/test_janitor.py +236 -0
  250. deltacat/tests/compute/test_util_common.py +726 -46
  251. deltacat/tests/compute/test_util_constant.py +0 -1
  252. deltacat/tests/conftest.py +25 -0
  253. deltacat/tests/daft/__init__.py +0 -0
  254. deltacat/tests/daft/test_model.py +97 -0
  255. deltacat/tests/experimental/__init__.py +1 -0
  256. deltacat/tests/experimental/catalog/__init__.py +0 -0
  257. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  258. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  259. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  260. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  261. deltacat/tests/experimental/daft/__init__.py +0 -0
  262. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  263. deltacat/tests/experimental/storage/__init__.py +0 -0
  264. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  265. deltacat/tests/experimental/storage/rivulet/conftest.py +149 -0
  266. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  267. deltacat/tests/experimental/storage/rivulet/fs/test_file_location_provider.py +94 -0
  268. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  269. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  270. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  271. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  272. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  273. deltacat/tests/experimental/storage/rivulet/schema/test_schema.py +241 -0
  274. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  275. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  276. deltacat/tests/experimental/storage/rivulet/test_dataset.py +408 -0
  277. deltacat/tests/experimental/storage/rivulet/test_manifest.py +67 -0
  278. deltacat/tests/experimental/storage/rivulet/test_sst_interval_tree.py +232 -0
  279. deltacat/tests/experimental/storage/rivulet/test_utils.py +124 -0
  280. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  281. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_write_then_read.py +343 -0
  282. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_writer.py +79 -0
  283. deltacat/tests/experimental/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
  284. deltacat/tests/storage/__init__.py +0 -0
  285. deltacat/tests/storage/main/__init__.py +0 -0
  286. deltacat/tests/storage/main/test_main_storage.py +8204 -0
  287. deltacat/tests/storage/model/__init__.py +0 -0
  288. deltacat/tests/storage/model/test_delete_parameters.py +21 -0
  289. deltacat/tests/storage/model/test_expression.py +327 -0
  290. deltacat/tests/storage/model/test_manifest.py +129 -0
  291. deltacat/tests/storage/model/test_metafile_io.py +2440 -0
  292. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  293. deltacat/tests/storage/model/test_schema.py +479 -0
  294. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  295. deltacat/tests/storage/model/test_shard.py +24 -0
  296. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  297. deltacat/tests/storage/model/test_table_version.py +110 -0
  298. deltacat/tests/storage/model/test_transaction.py +653 -0
  299. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  300. deltacat/tests/test_deltacat_api.py +1064 -0
  301. deltacat/tests/test_exceptions.py +9 -5
  302. deltacat/tests/test_utils/filesystem.py +14 -0
  303. deltacat/tests/test_utils/message_pack_utils.py +54 -0
  304. deltacat/tests/test_utils/pyarrow.py +50 -26
  305. deltacat/tests/test_utils/storage.py +256 -4
  306. deltacat/tests/types/__init__.py +0 -0
  307. deltacat/tests/types/test_tables.py +104 -0
  308. deltacat/tests/utils/exceptions.py +22 -0
  309. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  310. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  311. deltacat/tests/utils/test_daft.py +124 -34
  312. deltacat/tests/utils/test_numpy.py +1193 -0
  313. deltacat/tests/utils/test_pandas.py +1106 -0
  314. deltacat/tests/utils/test_polars.py +1040 -0
  315. deltacat/tests/utils/test_pyarrow.py +1107 -258
  316. deltacat/types/media.py +345 -37
  317. deltacat/types/partial_download.py +1 -1
  318. deltacat/types/tables.py +2345 -47
  319. deltacat/utils/arguments.py +33 -1
  320. deltacat/utils/daft.py +824 -40
  321. deltacat/utils/export.py +61 -0
  322. deltacat/utils/filesystem.py +450 -0
  323. deltacat/utils/metafile_locator.py +74 -0
  324. deltacat/utils/numpy.py +118 -26
  325. deltacat/utils/pandas.py +577 -48
  326. deltacat/utils/polars.py +759 -0
  327. deltacat/utils/pyarrow.py +1212 -178
  328. deltacat/utils/ray_utils/concurrency.py +1 -1
  329. deltacat/utils/ray_utils/dataset.py +101 -10
  330. deltacat/utils/ray_utils/runtime.py +56 -4
  331. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  332. deltacat/utils/url.py +1325 -0
  333. deltacat-2.0.0.dist-info/METADATA +1163 -0
  334. deltacat-2.0.0.dist-info/RECORD +439 -0
  335. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
  336. deltacat/aws/redshift/__init__.py +0 -19
  337. deltacat/aws/redshift/model/manifest.py +0 -394
  338. deltacat/catalog/default_catalog_impl/__init__.py +0 -369
  339. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  340. deltacat/compute/merge_on_read/__init__.py +0 -4
  341. deltacat/compute/merge_on_read/daft.py +0 -40
  342. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  343. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  344. deltacat/io/dataset.py +0 -73
  345. deltacat/io/read_api.py +0 -143
  346. deltacat/storage/model/delete_parameters.py +0 -40
  347. deltacat/storage/model/partition_spec.py +0 -71
  348. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  349. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -397
  350. deltacat/tests/local_deltacat_storage/__init__.py +0 -1262
  351. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  352. deltacat/utils/s3fs.py +0 -21
  353. deltacat-1.1.38.dist-info/METADATA +0 -64
  354. deltacat-1.1.38.dist-info/RECORD +0 -219
  355. /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
  356. /deltacat/{compute/merge_on_read/model → catalog/main}/__init__.py +0 -0
  357. /deltacat/compute/{merge_on_read/utils → converter}/__init__.py +0 -0
  358. /deltacat/{io/aws → compute/converter/model}/__init__.py +0 -0
  359. /deltacat/{io/aws/redshift → compute/converter/pyiceberg}/__init__.py +0 -0
  360. /deltacat/{tests/io → compute/converter/steps}/__init__.py +0 -0
  361. /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
  362. /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
  363. /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
  364. /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
  365. /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
  366. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
  367. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,84 @@
1
+ import sys
2
+ import time
3
+ from contextlib import contextmanager
4
+ from typing import Generator, Tuple
5
+
6
+ from deltacat.benchmarking.benchmark_report import BenchmarkMetric, BenchmarkStep
7
+ from deltacat.experimental.storage.rivulet.dataset import Dataset
8
+ from deltacat.experimental.storage.rivulet.reader.query_expression import (
9
+ QueryExpression,
10
+ )
11
+
12
+
13
@contextmanager
def timed_step(description: str) -> Generator[BenchmarkStep, None, None]:
    """Convenience for computing elapsed time of a block of code as a metric.

    A new step is created and yielded to the ``with`` body so the caller can
    attach further metrics to it. When the body finishes, an ``elapsed_time``
    metric (in milliseconds) is added to the same step.

    :param description: description of the step
    :return: a benchmark step populated with the elapsed time
    """
    step = BenchmarkStep(description)
    # perf_counter is monotonic, so the measurement cannot jump or go negative
    # if the system wall clock is adjusted while the block runs.
    started = time.perf_counter()
    try:
        yield step
    finally:
        # Record the timing even when the timed block raises; the exception
        # still propagates to the caller after the metric has been added.
        elapsed_ms = 1000 * (time.perf_counter() - started)
        step.add(BenchmarkMetric("elapsed_time", elapsed_ms, "ms"))
25
+
26
+
27
class BenchmarkEngine:
    """Runs load, scan and query benchmark steps against one dataset."""

    def __init__(self, dataset: Dataset):
        self.dataset = dataset

    def load_and_commit(
        self, schema_name, generator, count
    ) -> Tuple[str, BenchmarkStep]:
        """Generate ``count`` rows, write them to the dataset and flush.

        :param schema_name: name handed to ``dataset.writer`` to obtain the writer
        :param generator: row generator; ``generate()`` is called once per row
        :param count: the number of rows to load into the dataset
        :return: tuple of the writer's flush result and the timed step
        """
        label = f"load {count} from {generator}"
        dataset_writer = self.dataset.writer(schema_name)
        with timed_step(label) as timed:
            batch = []
            for _ in range(count):
                batch.append(generator.generate())
            dataset_writer.write(batch)
            flushed = dataset_writer.flush()
            timed.add(BenchmarkMetric("loaded", count))
        return flushed, timed

    def scan(self) -> Tuple[set[any], BenchmarkStep]:
        """
        Scan every row in the dataset, collecting merge keys and basic statistics.

        :return: a tuple containing the set of merge key values seen and a
            benchmark step carrying the row count, size and elapsed time
        """
        # Note that we expect single col merge keys so we can return key set
        # this will fail with validation error if dataset has multiple merge keys
        key_column = self.dataset.schemas["all"].get_merge_key()
        seen_keys = set()
        rows_read = 0
        total_bytes = 0
        with timed_step("full scan") as timed:
            for record in self.dataset.scan(QueryExpression()).to_pydict():
                rows_read += 1
                # sys.getsizeof is shallow: only the top-level value objects count
                total_bytes += sum(sys.getsizeof(value) for value in record.values())
                seen_keys.add(record.get(key_column))
            # TODO replace with the actual metrics we want to measure
            timed.add(BenchmarkMetric("rows read", rows_read))
            timed.add(BenchmarkMetric("size", total_bytes / (1024 * 1024), "MB"))
        return seen_keys, timed

    def run_queries(
        self, description, manifest_uri, queries: list[QueryExpression]
    ) -> BenchmarkStep:
        """Run each query against the dataset inside a single timed step.

        :param description: description used for the timed step
        :param manifest_uri: not used by this method
        :param queries: query expressions to scan the dataset with, in order
        :return: a benchmark step carrying the row count, size and elapsed time
        """
        rows_read = 0
        total_bytes = 0
        with timed_step(description) as timed:
            for expression in queries:
                for record in self.dataset.scan(expression).to_pydict():
                    rows_read += 1
                    total_bytes += sum(
                        sys.getsizeof(value) for value in record.values()
                    )
            # TODO replace with the actual metrics we want to measure
            timed.add(BenchmarkMetric("rows read", rows_read))
            timed.add(BenchmarkMetric("size", total_bytes / (1024 * 1024), "MB"))
        return timed
@@ -0,0 +1,86 @@
1
+ from dataclasses import dataclass
2
+ from tabulate import tabulate
3
+ from typing import Union, Optional
4
+
5
+
6
@dataclass
class BenchmarkMetric:
    """A single named measurement with an optional unit."""

    # Name identifying the metric
    name: str
    # Measured value
    value: Union[float, int]
    # Unit label for the value; None when no unit is given
    unit: Optional[str] = None
11
+
12
+
13
class BenchmarkStep:
    """Holds the metrics measured for one described operation."""

    def __init__(self, description):
        # Metrics recorded so far, keyed by metric name
        self._metrics: dict[str, BenchmarkMetric] = {}
        # Description of the operation
        self.description: str = description

    def add(self, metric: BenchmarkMetric):
        """Store a metric, replacing any earlier metric with the same name."""
        self._metrics[metric.name] = metric

    def list_metrics(self):
        """Return the recorded metrics as a list ordered by name."""
        ordered = list(self._metrics.values())
        ordered.sort(key=lambda entry: entry.name)
        return ordered
28
+
29
+
30
class BenchmarkRun:
    """Measurements captured from one execution of a test suite.

    Runs of the same suite are meant to be compared against each other.
    """

    def __init__(self, suite: str, description: str):
        # List of steps and their metrics, in the order they were added
        self.steps: list[BenchmarkStep] = []
        # The test suite associated with this run
        self.suite = suite
        # Description of the run
        self.description = description

    def add(self, operation):
        """Append a measured step to this run."""
        self.steps.append(operation)
43
+
44
+
45
class BenchmarkReport:
    """Collects benchmark runs and renders them as a comparison table."""

    def __init__(self, name):
        self.name = name
        self.runs: list[BenchmarkRun] = []

    def add(self, run):
        """Append a run to the report."""
        self.runs.append(run)

    def __str__(self):
        """Pretty-print a table that compares the metrics across each run.

        We want to transpose these such that each run gets its own column and
        each metric gets its own row (ideally grouped by operation).

        :return: the formatted table, or a short explanatory message when there
            are no runs or the runs belong to more than one suite
        """
        # __str__ must always return a string: returning None here would make
        # str(report) raise TypeError instead of reporting the problem.
        if not self.runs:
            return "No runs to compare!"
        suites = {r.suite for r in self.runs}
        if len(suites) > 1:
            return "Found more than one type of suite"
        suite = self.runs[0].suite

        headers = [
            f"{suite} Operation",
            "Metric",
            "Unit",
            *[r.description for r in self.runs],
        ]
        rows = []
        # zip stops at the shortest input, so steps (and metrics below) that
        # are not present in every run are left out of the table.
        for step_tranche in zip(*(r.steps for r in self.runs)):
            # TODO zip by metric name instead of assuming all metrics are being measured
            step_name = step_tranche[0].description
            for metric_tuple in zip(*(x.list_metrics() for x in step_tranche)):
                rows.append(
                    [
                        step_name,
                        metric_tuple[0].name,
                        metric_tuple[0].unit,
                        *[p.value for p in metric_tuple],
                    ]
                )
        return tabulate(rows, headers=headers, tablefmt="fancy_outline")
@@ -0,0 +1,11 @@
1
+ from typing import Protocol
2
+
3
+ from deltacat.benchmarking.benchmark_report import BenchmarkRun
4
+
5
+
6
class BenchmarkSuite(Protocol):
    """Structural interface for anything that can be run as a benchmark suite."""

    def run(self) -> BenchmarkRun:
        """Run the benchmark suite and produce a report.

        Each report should be comparable against other reports by the same suite.
        """
@@ -4,12 +4,28 @@ import pyarrow as pa
4
4
  import pyarrow.fs as pafs
5
5
  import pyarrow.parquet as papq
6
6
  import pytest
7
+ from _pytest.terminal import TerminalReporter
7
8
 
8
- from deltacat.utils.pyarrow import s3_file_to_table
9
- from deltacat.types.media import (
10
- ContentEncoding,
11
- ContentType,
12
- )
9
+ from deltacat.benchmarking.benchmark_report import BenchmarkReport
10
+
11
+
12
@pytest.fixture(autouse=True, scope="function")
def report(request):
    """Provide a per-test BenchmarkReport and print it when the test finishes.

    The report is named after the requesting test node. A finalizer writes the
    rendered report to the terminal reporter under a section header, with
    output capturing disabled while it writes.
    """
    benchmark_report = BenchmarkReport(request.node.name)

    def _emit_report():
        plugins = request.config.pluginmanager
        reporter: TerminalReporter = plugins.get_plugin("terminalreporter")
        capture = plugins.get_plugin("capturemanager")
        # Capturing is switched off so the report is written straight through
        with capture.global_and_fixture_disabled():
            reporter.ensure_newline()
            reporter.section(request.node.name, sep="-", blue=True, bold=True)
            reporter.write(str(benchmark_report))
            reporter.ensure_newline()

    request.addfinalizer(_emit_report)
    return benchmark_report
13
29
 
14
30
 
15
31
  def pyarrow_read(path: str, columns: list[str] | None = None) -> pa.Table:
@@ -21,17 +37,6 @@ def pyarrow_read(path: str, columns: list[str] | None = None) -> pa.Table:
21
37
  return papq.read_table(path, columns=columns, filesystem=fs)
22
38
 
23
39
 
24
- def deltacat_read(path: str, columns: list[str] | None = None) -> pa.Table:
25
- assert path.startswith("s3://")
26
- return s3_file_to_table(
27
- path,
28
- content_type=ContentType.PARQUET,
29
- content_encoding=ContentEncoding.IDENTITY,
30
- column_names=None, # Parquet files are schemaful
31
- include_columns=columns,
32
- )
33
-
34
-
35
40
  def daft_table_read(path: str, columns: list[str] | None = None) -> pa.Table:
36
41
  try:
37
42
  import daft
@@ -40,7 +45,7 @@ def daft_table_read(path: str, columns: list[str] | None = None) -> pa.Table:
40
45
  "Daft not installed. Install Daft using pip to run these benchmarks: `pip install getdaft`"
41
46
  )
42
47
 
43
- tbl = daft.table.Table.read_parquet(path, columns=columns)
48
+ tbl = daft.read_parquet(path, columns=columns)
44
49
  return tbl.to_arrow()
45
50
 
46
51
 
@@ -48,12 +53,10 @@ def daft_table_read(path: str, columns: list[str] | None = None) -> pa.Table:
48
53
  params=[
49
54
  daft_table_read,
50
55
  pyarrow_read,
51
- deltacat_read,
52
56
  ],
53
57
  ids=[
54
58
  "daft_table",
55
59
  "pyarrow",
56
- "deltacat",
57
60
  ],
58
61
  )
59
62
  def read_fn(request):
@@ -0,0 +1,94 @@
1
+ import math
2
+ import os
3
+ from enum import Enum
4
+ from io import BytesIO
5
+ from typing import Any, Dict
6
+
7
+ import faker
8
+ from faker_file.providers.png_file import GraphicPngFileProvider
9
+ from faker_file.storages.filesystem import FileSystemStorage
10
+ from PIL import Image
11
+
12
+ from deltacat.benchmarking.data.row_generator import RowGenerator
13
+
14
+
15
class ImageStyle(Enum):
    """Strategies for producing the image payload of a generated row."""

    # Random bytes standing in for an image. This is the fastest option
    # (if you want to test correctness).
    RANDOM_BYTES = 1
    # Actual PNG files generated in-memory directly using Pillow.
    PILLOW = 2
    # PNG files generated on-disk, with some random elements.
    FAKER_FILE = 3
25
+
26
+
27
class RandomRowGenerator(RowGenerator):
    """Generate rows whose ``media`` field holds a synthetic image.

    The way image bytes are produced is selected by ``style``. With the
    default style the 'images' are just randomly-generated bytes.
    """

    def __init__(
        self, seed=0, tmp_dir=None, style: ImageStyle = ImageStyle.RANDOM_BYTES
    ):
        """Create a generator backed by a seeded Faker instance.

        Args:
            seed: Seed applied to the Faker instance. Bytes produced by the
                RANDOM_BYTES style come from ``os.urandom`` and are not
                affected by it.
            tmp_dir: Directory under which FAKER_FILE images are written.
                Only read by the FAKER_FILE style.
            style: Which image-generation strategy to use.
        """
        self.seed = seed
        self.fake = faker.Faker()
        self.fake.seed_instance(seed)
        self.fake.add_provider(GraphicPngFileProvider)
        self.temp_dir = tmp_dir
        self.style = style

    def __str__(self):
        return "random source"

    def _generate_image(self, width, height) -> bytes:
        """Dispatch to the generator matching ``self.style``.

        Raises:
            ValueError: If ``self.style`` is not a known ImageStyle member.
        """
        if self.style == ImageStyle.RANDOM_BYTES:
            return self._generate_with_random_bytes(width, height)
        if self.style == ImageStyle.PILLOW:
            return self._generate_with_pillow(width, height)
        if self.style == ImageStyle.FAKER_FILE:
            return self._generate_with_faker(width, height)
        raise ValueError("Unknown ImageStyle")

    @staticmethod
    def _generate_with_random_bytes(width, height) -> bytes:
        """Generate random bytes to simulate an image."""
        # This isn't actually how file size relates to image size.
        # Assumption: we don't actually need images. It suffices to generate
        # arbitrary-length bytes of random characters.
        target_size = math.floor(width * height / 50)
        return os.urandom(target_size)

    @staticmethod
    def _generate_with_pillow(width, height) -> bytes:
        """Generate an actual PNG in-memory using Pillow and return its bytes."""
        buffer = BytesIO()
        image = Image.new("RGBA", size=(width, height), color=(155, 0, 0))
        image.save(buffer, "png")
        # getvalue() returns the whole buffer regardless of stream position.
        return buffer.getvalue()

    def _generate_with_faker(self, width, height) -> bytes:
        """Generate a PNG on-disk via faker-file and return its bytes.

        Raises:
            ValueError: If no ``tmp_dir`` was supplied at construction time.
        """
        if self.temp_dir is None:
            # Without a directory the path below cannot be resolved; fail
            # with a clear message instead of trying to open "None/...".
            raise ValueError(
                "tmp_dir is required to generate images with ImageStyle.FAKER_FILE"
            )
        rel_name = self.fake.graphic_png_file(
            storage=FileSystemStorage(
                root_path=self.temp_dir,
                rel_path="tmp",
            ),
            size=(width, height),
        )
        # NOTE(review): assumes the provider returns a path relative to
        # root_path -- confirm against the faker-file documentation.
        file_name = os.path.join(self.temp_dir, rel_name)
        with open(file_name, "rb") as f:
            return f.read()

    def generate(self) -> Dict[str, Any]:
        """Produce one row with ``id``, ``source`` and ``media`` entries."""
        return {
            "id": self.fake.random_int(0, 10_000_000),
            "source": self.fake.image_url(),
            "media": (
                self._generate_image(
                    self.fake.random_int(512, 2048), self.fake.random_int(512, 4096)
                )
            ),
        }
@@ -0,0 +1,10 @@
1
+ from typing import Protocol, Iterator, Dict, Any
2
+
3
+
4
class RowGenerator(Protocol):
    """Structural interface for objects that produce benchmark rows."""

    def generate(self) -> Dict[str, Any]:
        """Produce a single row as a dict with string keys."""
        ...

    def generate_dataset(self, count) -> Iterator[Dict[str, Any]]:
        """Generate a dataset with a given number of records.

        Rows are produced lazily: ``generate`` is only called as the
        returned iterator is consumed.
        """
        return (self.generate() for _ in range(count))
@@ -0,0 +1,108 @@
1
+ import math
2
+ from random import shuffle
3
+ import pytest
4
+ from deltacat.experimental.storage.rivulet.dataset import Dataset
5
+ from deltacat.experimental.storage.rivulet.schema.datatype import Datatype
6
+ from deltacat.experimental.storage.rivulet.reader.query_expression import (
7
+ QueryExpression,
8
+ )
9
+ from deltacat.experimental.storage.rivulet.schema.schema import Schema
10
+ from deltacat.benchmarking.benchmark_engine import BenchmarkEngine
11
+ from deltacat.benchmarking.benchmark_report import BenchmarkRun, BenchmarkReport
12
+ from deltacat.benchmarking.benchmark_suite import BenchmarkSuite
13
+ from deltacat.benchmarking.data.random_row_generator import RandomRowGenerator
14
+ from deltacat.benchmarking.data.row_generator import RowGenerator
15
+ from deltacat.tests.test_utils.filesystem import temp_dir_autocleanup
16
+
17
+ pytestmark = pytest.mark.benchmark
18
+
19
+
20
@pytest.fixture
def schema():
    """Build the three-field schema (id, source, media) used by the suite."""
    fields = [
        ("id", Datatype.int32()),
        ("source", Datatype.string()),
        ("media", Datatype.image("png")),
    ]
    # NOTE(review): the second positional argument looks like the key
    # field name -- confirm against the Schema constructor.
    return Schema(fields, "id")
30
+
31
+
32
class LoadAndScanSuite(BenchmarkSuite):
    """Load some number of rows and scan"""

    schema_name = "LoadAndScanSuite"

    def __init__(self, dataset: Dataset, schema: Schema, generator, description=None):
        """Register ``schema`` on ``dataset`` and remember the row generator.

        Args:
            dataset: Dataset the benchmark loads into and reads from.
            schema: Schema added to the dataset under ``schema_name``.
            generator: Source of the rows to load.
            description: Label for the run; defaults to
                ``"{dataset} x {generator}"``.
        """
        self.suite = "ReadSuite"
        self.dataset: Dataset = dataset
        self.schema = schema
        self.dataset.add_schema(schema, LoadAndScanSuite.schema_name)
        self.generator: RowGenerator = generator
        self.description: str = description or f"{self.dataset} x {self.generator}"

    def run(self) -> BenchmarkRun:
        """Execute the load, scan, point-lookup and range-lookup steps.

        Returns:
            A BenchmarkRun holding one step per phase, in execution order.
        """
        container = BenchmarkEngine(self.dataset)
        run = BenchmarkRun(self.suite, self.description)
        # load a large number of rows
        manifest_uri, step = container.load_and_commit(
            LoadAndScanSuite.schema_name, self.generator, 1000
        )
        run.add(step)
        # do a full scan of all rows (and eagerly load them)
        keys, step = container.scan()
        run.add(step)
        # Materialize once so both lookups below see every key, even if
        # scan() handed back a one-shot iterable.
        all_keys = list(keys)
        # randomly retrieve all keys one-by-one from the dataset
        random_keys = list(all_keys)
        shuffle(random_keys)
        step = container.run_queries(
            "load all keys individually",
            manifest_uri,
            [QueryExpression().with_key(k) for k in random_keys],
        )
        run.add(step)
        # split into 4 key ranges and get them individually
        quartiles = self._generate_quartiles(all_keys)
        expressions = [
            QueryExpression().with_range(start, end) for (start, end) in quartiles
        ]
        step = container.run_queries(
            "load key ranges by quartile", manifest_uri, expressions
        )
        run.add(step)
        return run

    @staticmethod
    def _generate_quartiles(keys):
        """Split the sorted keys into up to four contiguous (first, last) ranges.

        Args:
            keys: Iterable of mutually comparable keys.

        Returns:
            A list of ``(start_key, end_key)`` tuples, both ends inclusive.
            Fewer than four tuples are returned when there are fewer than
            four keys; an empty input yields an empty list.
        """
        sorted_keys = sorted(keys)
        size = len(sorted_keys)
        if size == 0:
            # A zero step would make range() raise; nothing to split anyway.
            return []
        chunk = math.ceil(size / 4)
        return [
            (sorted_keys[start], sorted_keys[min(start + chunk, size) - 1])
            for start in range(0, size, chunk)
        ]
85
+
86
+
87
def test_suite1(schema: Schema, report: BenchmarkReport):
    """Run LoadAndScanSuite twice, each in a fresh temp dir, and record both runs."""
    # (dataset name, run description); both runs use the same generator seed.
    cases = (
        ("test_suite1_ds1", "SST (rand)"),
        ("test_suite1_ds2", "dupe"),
    )
    for dataset_name, description in cases:
        with temp_dir_autocleanup() as temp_dir:
            generator = RandomRowGenerator(123, temp_dir)
            dataset = Dataset(dataset_name=dataset_name, metadata_uri=temp_dir)
            suite = LoadAndScanSuite(dataset, schema, generator, description)
            report.add(suite.run())
@@ -0,0 +1,73 @@
1
+ from deltacat.catalog.delegate import (
2
+ alter_namespace,
3
+ alter_table,
4
+ create_namespace,
5
+ create_table,
6
+ default_namespace,
7
+ drop_namespace,
8
+ drop_table,
9
+ get_namespace,
10
+ get_table,
11
+ list_namespaces,
12
+ list_tables,
13
+ namespace_exists,
14
+ read_table,
15
+ refresh_table,
16
+ rename_table,
17
+ table_exists,
18
+ truncate_table,
19
+ write_to_table,
20
+ )
21
+ from deltacat.catalog.model.catalog import ( # noqa: F401
22
+ all_catalogs,
23
+ init,
24
+ init_local,
25
+ is_initialized,
26
+ clear_catalogs,
27
+ get_catalog,
28
+ pop_catalog,
29
+ put_catalog,
30
+ raise_if_not_initialized,
31
+ Catalog,
32
+ )
33
+ from deltacat.catalog.model.properties import ( # noqa: F401
34
+ CatalogProperties,
35
+ get_catalog_properties,
36
+ )
37
+ from deltacat.catalog.model.table_definition import TableDefinition
38
+ from deltacat.catalog.main import impl as dcat
39
+
40
# Public names of the deltacat.catalog package.
__all__ = [
    # Namespace and table operations imported from deltacat.catalog.delegate.
    "alter_namespace",
    "alter_table",
    "create_namespace",
    "create_table",
    "default_namespace",
    "drop_namespace",
    "drop_table",
    "get_namespace",
    "get_table",
    "list_namespaces",
    "list_tables",
    "namespace_exists",
    "read_table",
    "refresh_table",
    "rename_table",
    "table_exists",
    "truncate_table",
    "write_to_table",
    # Catalog helpers imported from deltacat.catalog.model.catalog
    # (get_catalog_properties comes from deltacat.catalog.model.properties).
    "all_catalogs",
    "init",
    "init_local",
    "is_initialized",
    "clear_catalogs",
    "get_catalog",
    "get_catalog_properties",
    "pop_catalog",
    "put_catalog",
    "raise_if_not_initialized",
    # The deltacat.catalog.main.impl module, imported under the alias `dcat`.
    "dcat",
    # Model classes.
    "Catalog",
    "CatalogProperties",
    "TableDefinition",
]