deltacat 2.0__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (324) hide show
  1. deltacat/__init__.py +117 -18
  2. deltacat/api.py +536 -126
  3. deltacat/aws/constants.py +0 -23
  4. deltacat/aws/s3u.py +4 -631
  5. deltacat/benchmarking/benchmark_engine.py +4 -2
  6. deltacat/benchmarking/conftest.py +1 -19
  7. deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
  8. deltacat/catalog/__init__.py +64 -5
  9. deltacat/catalog/delegate.py +445 -63
  10. deltacat/catalog/interface.py +188 -62
  11. deltacat/catalog/main/impl.py +2444 -282
  12. deltacat/catalog/model/catalog.py +208 -113
  13. deltacat/catalog/model/properties.py +63 -24
  14. deltacat/compute/__init__.py +14 -0
  15. deltacat/compute/compactor/compaction_session.py +97 -75
  16. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  17. deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
  18. deltacat/compute/compactor/model/round_completion_info.py +16 -6
  19. deltacat/compute/compactor/repartition_session.py +8 -21
  20. deltacat/compute/compactor/steps/hash_bucket.py +5 -5
  21. deltacat/compute/compactor/steps/materialize.py +9 -7
  22. deltacat/compute/compactor/steps/repartition.py +12 -11
  23. deltacat/compute/compactor/utils/io.py +6 -5
  24. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  25. deltacat/compute/compactor/utils/system_columns.py +3 -1
  26. deltacat/compute/compactor_v2/compaction_session.py +17 -14
  27. deltacat/compute/compactor_v2/constants.py +30 -1
  28. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  29. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  30. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  31. deltacat/compute/compactor_v2/model/merge_input.py +33 -8
  32. deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
  33. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  34. deltacat/compute/compactor_v2/steps/merge.py +267 -55
  35. deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
  36. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  37. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  38. deltacat/compute/compactor_v2/utils/io.py +11 -4
  39. deltacat/compute/compactor_v2/utils/merge.py +15 -2
  40. deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
  41. deltacat/compute/compactor_v2/utils/task_options.py +45 -33
  42. deltacat/compute/converter/constants.py +5 -0
  43. deltacat/compute/converter/converter_session.py +207 -52
  44. deltacat/compute/converter/model/convert_input.py +43 -16
  45. deltacat/compute/converter/model/convert_input_files.py +33 -16
  46. deltacat/compute/converter/model/convert_result.py +80 -0
  47. deltacat/compute/converter/model/converter_session_params.py +64 -19
  48. deltacat/compute/converter/pyiceberg/catalog.py +21 -18
  49. deltacat/compute/converter/pyiceberg/overrides.py +193 -65
  50. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
  51. deltacat/compute/converter/steps/convert.py +230 -75
  52. deltacat/compute/converter/steps/dedupe.py +46 -12
  53. deltacat/compute/converter/utils/convert_task_options.py +66 -22
  54. deltacat/compute/converter/utils/converter_session_utils.py +126 -60
  55. deltacat/compute/converter/utils/iceberg_columns.py +13 -8
  56. deltacat/compute/converter/utils/io.py +173 -13
  57. deltacat/compute/converter/utils/s3u.py +42 -27
  58. deltacat/compute/janitor.py +205 -0
  59. deltacat/compute/jobs/client.py +417 -0
  60. deltacat/compute/resource_estimation/delta.py +38 -6
  61. deltacat/compute/resource_estimation/model.py +8 -0
  62. deltacat/constants.py +49 -6
  63. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  64. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  65. deltacat/env.py +10 -0
  66. deltacat/examples/basic_logging.py +6 -6
  67. deltacat/examples/compactor/aws/__init__.py +1 -0
  68. deltacat/examples/compactor/bootstrap.py +863 -0
  69. deltacat/examples/compactor/compactor.py +373 -0
  70. deltacat/examples/compactor/explorer.py +473 -0
  71. deltacat/examples/compactor/gcp/__init__.py +1 -0
  72. deltacat/examples/compactor/job_runner.py +439 -0
  73. deltacat/examples/compactor/utils/__init__.py +1 -0
  74. deltacat/examples/compactor/utils/common.py +261 -0
  75. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  76. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  77. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  78. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  79. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  80. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  81. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +66 -21
  82. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
  83. deltacat/examples/hello_world.py +4 -2
  84. deltacat/examples/indexer/indexer.py +163 -0
  85. deltacat/examples/indexer/job_runner.py +198 -0
  86. deltacat/exceptions.py +66 -4
  87. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  88. deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
  89. deltacat/{catalog → experimental/catalog}/iceberg/impl.py +43 -12
  90. deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +12 -14
  91. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  92. deltacat/experimental/converter_agent/__init__.py +0 -0
  93. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  94. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  95. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  96. deltacat/experimental/daft/__init__.py +4 -0
  97. deltacat/experimental/daft/daft_catalog.py +229 -0
  98. deltacat/experimental/storage/__init__.py +0 -0
  99. deltacat/experimental/storage/iceberg/__init__.py +0 -0
  100. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  101. deltacat/{storage → experimental/storage}/iceberg/impl.py +6 -4
  102. deltacat/{storage → experimental/storage}/iceberg/model.py +7 -3
  103. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  104. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  105. deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
  106. deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
  107. deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -12
  108. deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
  109. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  110. deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
  111. deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
  112. deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
  113. deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
  114. deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
  115. deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
  116. deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
  117. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  118. deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -3
  119. deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
  120. deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
  121. deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
  122. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  123. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  124. deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
  125. deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
  126. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  127. deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
  128. deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
  129. deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
  130. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +7 -6
  131. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
  132. deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
  133. deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
  134. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  135. deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
  136. deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
  137. deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
  138. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  139. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  140. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  141. deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
  142. deltacat/io/__init__.py +13 -0
  143. deltacat/io/dataset/__init__.py +0 -0
  144. deltacat/io/dataset/deltacat_dataset.py +91 -0
  145. deltacat/io/datasink/__init__.py +0 -0
  146. deltacat/io/datasink/deltacat_datasink.py +207 -0
  147. deltacat/io/datasource/__init__.py +0 -0
  148. deltacat/io/datasource/deltacat_datasource.py +579 -0
  149. deltacat/io/reader/__init__.py +0 -0
  150. deltacat/io/reader/deltacat_read_api.py +172 -0
  151. deltacat/storage/__init__.py +22 -2
  152. deltacat/storage/interface.py +54 -32
  153. deltacat/storage/main/impl.py +1494 -541
  154. deltacat/storage/model/delta.py +27 -3
  155. deltacat/storage/model/expression/__init__.py +47 -0
  156. deltacat/storage/model/expression/expression.py +656 -0
  157. deltacat/storage/model/expression/visitor.py +248 -0
  158. deltacat/storage/model/locator.py +6 -12
  159. deltacat/storage/model/manifest.py +231 -6
  160. deltacat/storage/model/metafile.py +224 -119
  161. deltacat/storage/model/namespace.py +8 -1
  162. deltacat/storage/model/partition.py +117 -42
  163. deltacat/storage/model/scan/push_down.py +32 -5
  164. deltacat/storage/model/schema.py +2427 -159
  165. deltacat/storage/model/shard.py +6 -2
  166. deltacat/storage/model/sort_key.py +40 -0
  167. deltacat/storage/model/stream.py +9 -2
  168. deltacat/storage/model/table.py +12 -1
  169. deltacat/storage/model/table_version.py +11 -0
  170. deltacat/storage/model/transaction.py +1184 -208
  171. deltacat/storage/model/transform.py +81 -2
  172. deltacat/storage/model/types.py +53 -29
  173. deltacat/storage/util/__init__.py +0 -0
  174. deltacat/storage/util/scan_planner.py +26 -0
  175. deltacat/tests/_io/reader/__init__.py +0 -0
  176. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  177. deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
  178. deltacat/tests/aws/test_s3u.py +2 -31
  179. deltacat/tests/catalog/data/__init__.py +0 -0
  180. deltacat/tests/catalog/main/__init__.py +0 -0
  181. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  182. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  183. deltacat/tests/catalog/model/__init__.py +0 -0
  184. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  185. deltacat/tests/catalog/test_catalogs.py +103 -106
  186. deltacat/tests/catalog/test_default_catalog_impl.py +12152 -72
  187. deltacat/tests/compute/compact_partition_test_cases.py +35 -8
  188. deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
  189. deltacat/tests/compute/compactor/utils/test_io.py +124 -120
  190. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  191. deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
  192. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
  193. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
  194. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
  195. deltacat/tests/compute/conftest.py +8 -44
  196. deltacat/tests/compute/converter/test_convert_session.py +697 -349
  197. deltacat/tests/compute/converter/utils.py +15 -6
  198. deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
  199. deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
  200. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
  201. deltacat/tests/compute/test_compact_partition_params.py +13 -8
  202. deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
  203. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
  204. deltacat/tests/compute/test_janitor.py +236 -0
  205. deltacat/tests/compute/test_util_common.py +716 -43
  206. deltacat/tests/compute/test_util_constant.py +0 -1
  207. deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
  208. deltacat/tests/daft/__init__.py +0 -0
  209. deltacat/tests/daft/test_model.py +97 -0
  210. deltacat/tests/experimental/__init__.py +1 -0
  211. deltacat/tests/experimental/catalog/__init__.py +0 -0
  212. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  213. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  214. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  215. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  216. deltacat/tests/experimental/daft/__init__.py +0 -0
  217. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  218. deltacat/tests/experimental/storage/__init__.py +0 -0
  219. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  220. deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
  221. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  222. deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -2
  223. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  224. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  225. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  226. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  227. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  228. deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
  229. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  230. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  231. deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +6 -4
  232. deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
  233. deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
  234. deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
  235. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  236. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
  237. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
  238. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
  239. deltacat/tests/storage/main/test_main_storage.py +6900 -95
  240. deltacat/tests/storage/model/test_expression.py +327 -0
  241. deltacat/tests/storage/model/test_manifest.py +129 -0
  242. deltacat/tests/storage/model/test_metafile_io.py +78 -173
  243. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  244. deltacat/tests/storage/model/test_schema.py +171 -0
  245. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  246. deltacat/tests/storage/model/test_shard.py +3 -1
  247. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  248. deltacat/tests/storage/model/test_transaction.py +393 -48
  249. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  250. deltacat/tests/test_deltacat_api.py +1036 -11
  251. deltacat/tests/test_exceptions.py +9 -5
  252. deltacat/tests/test_utils/pyarrow.py +52 -21
  253. deltacat/tests/test_utils/storage.py +23 -34
  254. deltacat/tests/types/__init__.py +0 -0
  255. deltacat/tests/types/test_tables.py +104 -0
  256. deltacat/tests/utils/exceptions.py +22 -0
  257. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  258. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  259. deltacat/tests/utils/test_daft.py +121 -31
  260. deltacat/tests/utils/test_numpy.py +1193 -0
  261. deltacat/tests/utils/test_pandas.py +1106 -0
  262. deltacat/tests/utils/test_polars.py +1040 -0
  263. deltacat/tests/utils/test_pyarrow.py +1370 -89
  264. deltacat/types/media.py +345 -37
  265. deltacat/types/tables.py +2344 -46
  266. deltacat/utils/arguments.py +33 -1
  267. deltacat/utils/daft.py +824 -40
  268. deltacat/utils/export.py +3 -1
  269. deltacat/utils/filesystem.py +139 -9
  270. deltacat/utils/metafile_locator.py +2 -1
  271. deltacat/utils/numpy.py +118 -26
  272. deltacat/utils/pandas.py +577 -48
  273. deltacat/utils/polars.py +759 -0
  274. deltacat/utils/pyarrow.py +1373 -192
  275. deltacat/utils/ray_utils/concurrency.py +1 -1
  276. deltacat/utils/ray_utils/dataset.py +101 -10
  277. deltacat/utils/ray_utils/runtime.py +56 -4
  278. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  279. deltacat/utils/url.py +1325 -0
  280. deltacat-2.0.0.dist-info/METADATA +1163 -0
  281. deltacat-2.0.0.dist-info/RECORD +439 -0
  282. {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
  283. deltacat/catalog/iceberg/__init__.py +0 -4
  284. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  285. deltacat/compute/merge_on_read/__init__.py +0 -4
  286. deltacat/compute/merge_on_read/daft.py +0 -40
  287. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  288. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  289. deltacat/examples/common/fixtures.py +0 -15
  290. deltacat/storage/iceberg/iceberg_scan_planner.py +0 -28
  291. deltacat/storage/rivulet/__init__.py +0 -11
  292. deltacat/storage/rivulet/feather/__init__.py +0 -5
  293. deltacat/storage/rivulet/parquet/__init__.py +0 -5
  294. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  295. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
  296. deltacat/tests/local_deltacat_storage/__init__.py +0 -1235
  297. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  298. deltacat/utils/s3fs.py +0 -21
  299. deltacat-2.0.dist-info/METADATA +0 -65
  300. deltacat-2.0.dist-info/RECORD +0 -347
  301. /deltacat/compute/{merge_on_read/model → jobs}/__init__.py +0 -0
  302. /deltacat/{compute/merge_on_read/utils → docs}/__init__.py +0 -0
  303. /deltacat/{examples/common → docs/autogen}/__init__.py +0 -0
  304. /deltacat/{examples/iceberg → docs/autogen/schema}/__init__.py +0 -0
  305. /deltacat/{storage/iceberg → docs/autogen/schema/inference}/__init__.py +0 -0
  306. /deltacat/{storage/rivulet/arrow → examples/compactor}/__init__.py +0 -0
  307. /deltacat/{storage/rivulet/fs → examples/experimental}/__init__.py +0 -0
  308. /deltacat/{storage/rivulet/metastore → examples/experimental/iceberg}/__init__.py +0 -0
  309. /deltacat/{storage/rivulet/reader → examples/experimental/iceberg/converter}/__init__.py +0 -0
  310. /deltacat/{storage/rivulet/schema → examples/experimental/iceberg/converter/beam}/__init__.py +0 -0
  311. /deltacat/{storage/rivulet/writer → examples/indexer}/__init__.py +0 -0
  312. /deltacat/{tests/storage/rivulet → examples/indexer/aws}/__init__.py +0 -0
  313. /deltacat/{tests/storage/rivulet/fs → examples/indexer/gcp}/__init__.py +0 -0
  314. /deltacat/{tests/storage/rivulet/schema → experimental}/__init__.py +0 -0
  315. /deltacat/{tests/storage/rivulet/writer → experimental/catalog}/__init__.py +0 -0
  316. /deltacat/{storage/rivulet/parquet/data_reader.py → experimental/compatibility/__init__.py} +0 -0
  317. /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
  318. /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
  319. /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
  320. /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
  321. /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
  322. /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
  323. {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
  324. {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
@@ -6,15 +6,26 @@ from typing import Any, List, Set, Protocol, TypeVar, Dict, Iterable
6
6
 
7
7
  from pyarrow import RecordBatch, Table
8
8
  from deltacat.storage.model.partition import PartitionLocator
9
- from deltacat.storage.rivulet.metastore.delta import ManifestIO, DeltacatManifestIO
10
-
11
- from deltacat.storage.rivulet import Schema
12
- from deltacat.storage.rivulet.metastore.json_sst import JsonSstWriter
13
- from deltacat.storage.rivulet.serializer import MEMTABLE_DATA, DataSerializer
14
- from deltacat.storage.rivulet.serializer_factory import DataSerializerFactory
15
- from deltacat.storage.rivulet.writer.dataset_writer import DatasetWriter, DATA
16
- from deltacat.storage.rivulet.metastore.sst import SSTWriter
17
- from deltacat.storage.rivulet.fs.file_provider import FileProvider
9
+ from deltacat.experimental.storage.rivulet.metastore.delta import (
10
+ ManifestIO,
11
+ DeltacatManifestIO,
12
+ )
13
+
14
+ from deltacat.experimental.storage.rivulet import Schema
15
+ from deltacat.experimental.storage.rivulet.metastore.json_sst import JsonSstWriter
16
+ from deltacat.experimental.storage.rivulet.serializer import (
17
+ MEMTABLE_DATA,
18
+ DataSerializer,
19
+ )
20
+ from deltacat.experimental.storage.rivulet.serializer_factory import (
21
+ DataSerializerFactory,
22
+ )
23
+ from deltacat.experimental.storage.rivulet.writer.dataset_writer import (
24
+ DatasetWriter,
25
+ DATA,
26
+ )
27
+ from deltacat.experimental.storage.rivulet.metastore.sst import SSTWriter
28
+ from deltacat.experimental.storage.rivulet.fs.file_provider import FileProvider
18
29
 
19
30
  INPUT_ROW = TypeVar("INPUT_ROW")
20
31
 
deltacat/io/__init__.py CHANGED
@@ -0,0 +1,13 @@
1
+ from deltacat.io.reader.deltacat_read_api import read_deltacat
2
+ from deltacat.io.datasource.deltacat_datasource import DeltacatReadType
3
+ from deltacat.io.datasource.deltacat_datasource import (
4
+ METAFILE_DATA_COLUMN_NAME,
5
+ METAFILE_TYPE_COLUMN_NAME,
6
+ )
7
+
8
+ __all__ = [
9
+ "read_deltacat",
10
+ "DeltacatReadType",
11
+ "METAFILE_DATA_COLUMN_NAME",
12
+ "METAFILE_TYPE_COLUMN_NAME",
13
+ ]
File without changes
@@ -0,0 +1,91 @@
1
+ # Allow classes to use self-referencing Type hints in Python 3.7.
2
+ from __future__ import annotations
3
+
4
+ from typing import Any, Callable, Dict, Optional, cast
5
+
6
+ import pyarrow as pa
7
+ from ray.data import Dataset
8
+
9
+ from deltacat.utils.url import DeltaCatUrl
10
+ from deltacat.io.datasink.deltacat_datasink import DeltaCatDatasink
11
+
12
+
13
class DeltaCatDataset(Dataset):
    """Ray ``Dataset`` subclass that adds a ``write_deltacat`` method."""

    @staticmethod
    def from_dataset(dataset: Dataset) -> DeltaCatDataset:
        """Turn an existing Ray dataset into a ``DeltaCatDataset``.

        The conversion happens in place: the ``__class__`` of ``dataset`` is
        reassigned and the very same object is returned, so the caller's
        original reference also becomes a ``DeltaCatDataset``.

        Args:
            dataset: The Ray dataset to convert.

        Returns:
            The same ``dataset`` object, typed as ``DeltaCatDataset``.
        """
        # Swapping the class in place is enough because this subclass only
        # adds new methods.
        dataset.__class__ = DeltaCatDataset
        return cast(DeltaCatDataset, dataset)

    def write_deltacat(
        self,
        url: DeltaCatUrl,
        *,
        # if the source dataset only contains DeltaCAT metadata, then only
        # copy the metadata to the destination... if it contains external
        # source file paths, then register them in a new Delta.
        metadata_only: bool = False,
        # merge all deltas as part of the write operation
        copy_on_write: Optional[bool] = False,
        filesystem: Optional[pa.fs.S3FileSystem] = None,
        try_create_dir: bool = True,
        arrow_open_stream_args: Optional[Dict[str, Any]] = None,
        arrow_parquet_args_fn: Callable[[], Dict[str, Any]] = lambda: {},
        min_rows_per_file: Optional[int] = None,
        ray_remote_args: Optional[Dict[str, Any]] = None,
        concurrency: Optional[int] = None,
        **arrow_parquet_args,
    ) -> None:
        """Writes the dataset to files and commits DeltaCAT metadata indexing
        the files written.

        This is only supported for datasets convertible to Arrow records.
        To control the number of files, use ``.repartition()``.

        Unless a custom block path provider is given, the format of the output
        files will be {uuid}_{block_idx}.{extension}, where ``uuid`` is a
        unique id for the dataset.

        The DeltaCAT manifest will be written to ``f"{path}/manifest"``.

        Examples:
            >>> ds.write_deltacat("s3://catalog/root/path")

        Time complexity: O(dataset size / parallelism)

        Args:
            url: The path to the root directory where materialized files and
                DeltaCAT manifest will be written.
            metadata_only: If the source dataset only contains DeltaCAT
                metadata, then only copy the metadata to the destination. If
                it contains external source file paths, then register them in
                a new Delta.
            copy_on_write: Merge all deltas as part of the write operation.
            filesystem: The filesystem implementation to write to. This should
                be a PyArrow S3FileSystem.
            try_create_dir: Try to create all directories in destination path
                if True. Does nothing if all directories already exist.
            arrow_open_stream_args: kwargs passed to
                pyarrow.fs.S3FileSystem.open_output_stream
            arrow_parquet_args_fn: Callable that returns a dictionary of write
                arguments to use when writing each block to a file. Overrides
                any duplicate keys from arrow_parquet_args. This should be used
                instead of arrow_parquet_args if any of your write arguments
                cannot be pickled, or if you'd like to lazily resolve the write
                arguments for each dataset block.
            min_rows_per_file: Forwarded unchanged to the DeltaCAT datasink.
            ray_remote_args: Forwarded unchanged to ``write_datasink``.
            concurrency: Forwarded unchanged to ``write_datasink``.
            arrow_parquet_args: Options to pass to
                pyarrow.parquet.write_table(), which is used to write out each
                block to a file.
        """
        datasink = DeltaCatDatasink(
            url,
            metadata_only=metadata_only,
            copy_on_write=copy_on_write,
            arrow_parquet_args_fn=arrow_parquet_args_fn,
            arrow_parquet_args=arrow_parquet_args,
            min_rows_per_file=min_rows_per_file,
            filesystem=filesystem,
            try_create_dir=try_create_dir,
            open_stream_args=arrow_open_stream_args,
            dataset_uuid=self._uuid,
        )
        self.write_datasink(
            datasink,
            ray_remote_args=ray_remote_args,
            concurrency=concurrency,
        )
File without changes
@@ -0,0 +1,207 @@
1
+ import logging
2
+
3
+ from collections import OrderedDict
4
+ from typing import Dict, Any, Optional, List, Iterable
5
+
6
+ from ray.data import Datasink
7
+ from ray.data._internal.execution.interfaces import TaskContext
8
+ from ray.data.block import Block, BlockAccessor
9
+ from ray.data.datasource import WriteResult
10
+
11
+ from ray.data.datasource.filename_provider import (
12
+ FilenameProvider,
13
+ )
14
+
15
+ from deltacat import logs
16
+
17
+ from deltacat.constants import METAFILE_FORMAT_MSGPACK
18
+ from deltacat.storage import Metafile
19
+ from deltacat.io.datasource.deltacat_datasource import (
20
+ METAFILE_DATA_COLUMN_NAME,
21
+ METAFILE_TYPE_COLUMN_NAME,
22
+ )
23
+ from deltacat.utils.url import DeltaCatUrl, DeltaCatUrlWriter
24
+
25
+ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
26
+
27
+
28
class CapturingBlockWritePathProvider(FilenameProvider):
    """Delegating block write path provider that saves an ordered dictionary of
    input keyword arguments for every block write path returned."""

    def __init__(
        self,
        block_write_path_provider: FilenameProvider,
        base_path: Optional[str] = None,
    ):
        """Wrap ``block_write_path_provider`` and start with an empty capture.

        Args:
            block_write_path_provider: Provider that actually produces the
                filename for each block.
            base_path: Prefix joined with each produced filename to form the
                write path. Must be set before a filename is requested.
        """
        self.base_path = base_path
        self.block_write_path_provider = block_write_path_provider
        # Maps each returned write path to the keyword arguments that
        # produced it, in the order the paths were requested.
        self.write_path_kwargs: Dict[str, Dict[str, Any]] = OrderedDict()

    def get_filename_for_block(
        self,
        block: Any,
        task_index: int,
        block_index: int,
    ) -> str:
        """Return the write path for a block and record how it was derived.

        Args:
            block: The block about to be written.
            task_index: Index of the write task producing the block.
            block_index: Index of the block within its write task.

        Returns:
            ``f"{base_path}/{filename}"`` where ``filename`` comes from the
            wrapped provider.

        Raises:
            ValueError: If no base path was given at construction time.
        """
        if self.base_path is None:
            raise ValueError(
                "Base path must be provided to CapturingBlockWritePathProvider",
            )
        # Forward task_index as well: the delegate's get_filename_for_block
        # takes (block, task_index, block_index), so omitting it makes the
        # delegated call fail and leaves it out of the captured kwargs.
        return self._get_write_path_for_block(
            base_path=self.base_path,
            block=block,
            task_index=task_index,
            block_index=block_index,
        )

    def _get_write_path_for_block(
        self,
        base_path: str,
        *args,
        **kwargs,
    ) -> str:
        """Delegate filename creation, then capture the arguments used.

        Args:
            base_path: Prefix joined with the delegate's filename.
            *args: Positional arguments passed through to the delegate.
            **kwargs: Keyword arguments passed through to the delegate and
                saved (together with ``base_path``) under the write path.

        Returns:
            The full write path for the block.
        """
        filename = self.block_write_path_provider.get_filename_for_block(
            *args,
            **kwargs,
        )
        write_path = f"{base_path}/{filename}"
        # base_path is added only after delegating so that the delegate never
        # receives it as an unexpected keyword argument.
        kwargs["base_path"] = base_path
        self.write_path_kwargs[write_path] = kwargs
        return write_path
71
+
72
+
73
class DeltaCatWriteResult:
    """Attribute holder summarizing a write; every field starts as ``None``.

    Nothing in this class populates the fields itself.
    """

    def __init__(self):
        # Chained assignments bind left to right, so the attributes are
        # created in the same order as they are listed here.
        self.metadata = self.path = self.dataset_uuid = None
        self.block_write_path_provider = None
        self.content_type = self.content_encoding = None
        self.filesystem = None
82
+
83
+
84
class DeltaCatDatasink(Datasink[List[Metafile]]):
    """Ray Data sink that writes serialized DeltaCAT metafiles found in the
    input blocks to the destination DeltaCAT URL."""

    def __init__(
        self,
        url: DeltaCatUrl,
        *,
        metadata_only: bool = False,
        copy_on_write: Optional[bool] = False,
        arrow_parquet_args_fn: Optional[Any] = None,
        arrow_parquet_args: Optional[Dict[str, Any]] = None,
        min_rows_per_file: Optional[int] = None,
        filesystem: Optional[Any] = None,
        try_create_dir: bool = True,
        open_stream_args: Optional[Dict[str, Any]] = None,
        dataset_uuid: Optional[str] = None,
    ):
        """Create a sink targeting ``url``.

        Args:
            url: Destination DeltaCAT URL that metafiles are written to.
            metadata_only: Stored on the sink; not consulted by ``write``.
            copy_on_write: Stored on the sink; not consulted by ``write``.
            arrow_parquet_args_fn: Zero-argument callable returning Parquet
                write arguments. Stored only.
            arrow_parquet_args: Parquet write arguments. Stored only.
            min_rows_per_file: Stored only.
            filesystem: Filesystem to write to. Stored only.
            try_create_dir: Stored only.
            open_stream_args: Output stream arguments. Stored only.
            dataset_uuid: Identifier of the dataset being written. Stored
                only.

        The arguments after ``copy_on_write`` are accepted because
        ``DeltaCatDataset.write_deltacat`` passes them when it builds this
        sink; without them construction fails with ``TypeError``. ``write``
        currently handles metafile blocks only and does not read them.
        """
        self._url = url
        self._metadata_only = metadata_only
        self._copy_on_write = copy_on_write
        self._arrow_parquet_args_fn = arrow_parquet_args_fn
        self._arrow_parquet_args = arrow_parquet_args
        self._min_rows_per_file = min_rows_per_file
        self._filesystem = filesystem
        self._try_create_dir = try_create_dir
        self._open_stream_args = open_stream_args
        self._dataset_uuid = dataset_uuid

    def on_write_start(self) -> None:
        """No preparation is needed before writing."""
        pass

    def write(
        self,
        blocks: Iterable[Block],
        ctx: TaskContext,
    ) -> List[Metafile]:
        """Deserialize and write every metafile contained in ``blocks``.

        Args:
            blocks: Blocks whose Arrow form has both the metafile data column
                and the metafile type column.
            ctx: Ray task context (unused).

        Returns:
            The metafiles written, in the order they were encountered.

        Raises:
            NotImplementedError: If a block lacks either metafile column.
        """
        written: List[Metafile] = []
        for block in blocks:
            pa_table = BlockAccessor.for_block(block).to_arrow()
            column_names = pa_table.column_names
            if (
                METAFILE_DATA_COLUMN_NAME not in column_names
                or METAFILE_TYPE_COLUMN_NAME not in column_names
            ):
                raise NotImplementedError(
                    f"Expected {METAFILE_DATA_COLUMN_NAME} and "
                    f"{METAFILE_TYPE_COLUMN_NAME} columns in the input block, "
                    f"but found {pa_table.column_names}."
                )
            for pa_scalar in pa_table[METAFILE_DATA_COLUMN_NAME]:
                metafile_msgpack_bytes = pa_scalar.as_py()
                metafile = Metafile.deserialize(
                    serialized=metafile_msgpack_bytes,
                    meta_format=METAFILE_FORMAT_MSGPACK,
                )
                # TODO(pdames): Add `metafile` to writer as a kwarg instead
                #  of constructing a new URL with the metafile as input.
                writer_url = DeltaCatUrlWriter(self._url, metafile=metafile)
                # TODO(pdames): Run writes in order from catalog -> delta
                #  by truncating the URL down to just dc://{catalog-name}
                #  and rebuilding all path elements from there.
                writer_url.write(metafile)
                written.append(metafile)
        # Return the written metafiles so the result matches the declared
        # return type instead of implicitly returning None.
        return written

    def on_write_complete(
        self,
        write_result: WriteResult[List[Metafile]],
    ):
        """No finalization is performed after all write tasks finish."""
        pass
135
+
136
+
137
+ """
138
+ def write(
139
+ self,
140
+ blocks: Iterable[Block],
141
+ ctx: TaskContext,
142
+ ) -> List[ObjectRef[DeltacatWriteResult]]:
143
+ paths, filesystem = resolve_paths_and_filesystem(
144
+ self.path,
145
+ self.filesystem,
146
+ )
147
+ assert len(paths) == 1, f"Expected 1 write path, found {len(paths)}."
148
+ path = paths[0]
149
+ write_results = super().write(blocks)
150
+ # append a summary of this write operation in the last write result
151
+ metadata = [BlockAccessor.for_block(_).get_metadata() for _ in blocks]
152
+ rwr = DeltacatWriteResult()
153
+ rwr.metadata = metadata
154
+ rwr.path = path
155
+ rwr.dataset_uuid = self.dataset_uuid
156
+ rwr.block_write_path_provider = self.filename_provider
157
+ rwr.content_type = ContentType.PARQUET.value
158
+ rwr.content_encoding = ContentEncoding.IDENTITY.value
159
+ rwr.filesystem = filesystem
160
+ rwr_obj_ref = ray.put(rwr)
161
+ write_results.append(rwr_obj_ref)
162
+ return write_results
163
+
164
+ def on_write_complete(self, write_results: List[Any], **kwargs) -> None:
165
+ # TODO (pdames): time latency of this operation - overall s3 write times
166
+ # are 2-3x pure read_parquet_fast() times
167
+ # restore the write operation summary from the last write result
168
+ result: DeltacatWriteResult = write_results[len(write_results) - 1]
169
+ write_path_args = result.block_write_path_provider.write_path_kwargs
170
+ blocks_written = len(write_path_args)
171
+ expected_blocks_written = len(result.metadata)
172
+ # TODO(pdames): Corner cases where mismatch is expected? Empty blocks?
173
+ # Blocks filtered/split/merged to more/less write paths?
174
+ assert blocks_written == expected_blocks_written, (
175
+ f"Dataset write result validation failed. Found "
176
+ f"{blocks_written}/{expected_blocks_written} Dataset blocks "
177
+ f"written. Refusing to commit DeltaCAT Manifest."
178
+ )
179
+ manifest_entries = ManifestEntryList()
180
+ for block_idx, path in enumerate(write_path_args.keys()):
181
+ file_info = result.filesystem.get_file_info(path)
182
+ if file_info.type == pyarrow.fs.FileType.File:
183
+ content_length = file_info.size
184
+ else:
185
+ raise FileNotFoundError(ENOENT, strerror(ENOENT), path)
186
+ num_rows = result.metadata[block_idx].num_rows
187
+ source_content_length = result.metadata[block_idx].size_bytes
188
+ manifest_entry_meta = ManifestMeta.of(
189
+ int(num_rows) if num_rows is not None else None,
190
+ int(content_length) if content_length is not None else None,
191
+ result.content_type,
192
+ result.content_encoding,
193
+ int(source_content_length) if source_content_length else None,
194
+ )
195
+ parsed_url = parse_s3_url(path)
196
+ manifest_entry = ManifestEntry.of(
197
+ parsed_url.url,
198
+ manifest_entry_meta,
199
+ )
200
+ manifest_entries.append(manifest_entry)
201
+ manifest = Manifest.of(manifest_entries)
202
+ manifest_path = f"{result.path}/manifest"
203
+ logger.debug(f"Write succeeded for Dataset ID: {result.dataset_uuid}")
204
+ with result.filesystem.open_output_stream(manifest_path) as f:
205
+ f.write(json.dumps(manifest).encode("utf-8"))
206
+ logger.debug(f"Manifest committed to: {manifest_path}")
207
+ """
File without changes