deltacat 2.0__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (324) hide show
  1. deltacat/__init__.py +117 -18
  2. deltacat/api.py +536 -126
  3. deltacat/aws/constants.py +0 -23
  4. deltacat/aws/s3u.py +4 -631
  5. deltacat/benchmarking/benchmark_engine.py +4 -2
  6. deltacat/benchmarking/conftest.py +1 -19
  7. deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
  8. deltacat/catalog/__init__.py +64 -5
  9. deltacat/catalog/delegate.py +445 -63
  10. deltacat/catalog/interface.py +188 -62
  11. deltacat/catalog/main/impl.py +2444 -282
  12. deltacat/catalog/model/catalog.py +208 -113
  13. deltacat/catalog/model/properties.py +63 -24
  14. deltacat/compute/__init__.py +14 -0
  15. deltacat/compute/compactor/compaction_session.py +97 -75
  16. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  17. deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
  18. deltacat/compute/compactor/model/round_completion_info.py +16 -6
  19. deltacat/compute/compactor/repartition_session.py +8 -21
  20. deltacat/compute/compactor/steps/hash_bucket.py +5 -5
  21. deltacat/compute/compactor/steps/materialize.py +9 -7
  22. deltacat/compute/compactor/steps/repartition.py +12 -11
  23. deltacat/compute/compactor/utils/io.py +6 -5
  24. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  25. deltacat/compute/compactor/utils/system_columns.py +3 -1
  26. deltacat/compute/compactor_v2/compaction_session.py +17 -14
  27. deltacat/compute/compactor_v2/constants.py +30 -1
  28. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  29. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  30. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  31. deltacat/compute/compactor_v2/model/merge_input.py +33 -8
  32. deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
  33. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  34. deltacat/compute/compactor_v2/steps/merge.py +267 -55
  35. deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
  36. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  37. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  38. deltacat/compute/compactor_v2/utils/io.py +11 -4
  39. deltacat/compute/compactor_v2/utils/merge.py +15 -2
  40. deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
  41. deltacat/compute/compactor_v2/utils/task_options.py +45 -33
  42. deltacat/compute/converter/constants.py +5 -0
  43. deltacat/compute/converter/converter_session.py +207 -52
  44. deltacat/compute/converter/model/convert_input.py +43 -16
  45. deltacat/compute/converter/model/convert_input_files.py +33 -16
  46. deltacat/compute/converter/model/convert_result.py +80 -0
  47. deltacat/compute/converter/model/converter_session_params.py +64 -19
  48. deltacat/compute/converter/pyiceberg/catalog.py +21 -18
  49. deltacat/compute/converter/pyiceberg/overrides.py +193 -65
  50. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
  51. deltacat/compute/converter/steps/convert.py +230 -75
  52. deltacat/compute/converter/steps/dedupe.py +46 -12
  53. deltacat/compute/converter/utils/convert_task_options.py +66 -22
  54. deltacat/compute/converter/utils/converter_session_utils.py +126 -60
  55. deltacat/compute/converter/utils/iceberg_columns.py +13 -8
  56. deltacat/compute/converter/utils/io.py +173 -13
  57. deltacat/compute/converter/utils/s3u.py +42 -27
  58. deltacat/compute/janitor.py +205 -0
  59. deltacat/compute/jobs/client.py +417 -0
  60. deltacat/compute/resource_estimation/delta.py +38 -6
  61. deltacat/compute/resource_estimation/model.py +8 -0
  62. deltacat/constants.py +49 -6
  63. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  64. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  65. deltacat/env.py +10 -0
  66. deltacat/examples/basic_logging.py +6 -6
  67. deltacat/examples/compactor/aws/__init__.py +1 -0
  68. deltacat/examples/compactor/bootstrap.py +863 -0
  69. deltacat/examples/compactor/compactor.py +373 -0
  70. deltacat/examples/compactor/explorer.py +473 -0
  71. deltacat/examples/compactor/gcp/__init__.py +1 -0
  72. deltacat/examples/compactor/job_runner.py +439 -0
  73. deltacat/examples/compactor/utils/__init__.py +1 -0
  74. deltacat/examples/compactor/utils/common.py +261 -0
  75. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  76. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  77. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  78. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  79. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  80. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  81. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +66 -21
  82. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
  83. deltacat/examples/hello_world.py +4 -2
  84. deltacat/examples/indexer/indexer.py +163 -0
  85. deltacat/examples/indexer/job_runner.py +198 -0
  86. deltacat/exceptions.py +66 -4
  87. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  88. deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
  89. deltacat/{catalog → experimental/catalog}/iceberg/impl.py +43 -12
  90. deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +12 -14
  91. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  92. deltacat/experimental/converter_agent/__init__.py +0 -0
  93. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  94. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  95. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  96. deltacat/experimental/daft/__init__.py +4 -0
  97. deltacat/experimental/daft/daft_catalog.py +229 -0
  98. deltacat/experimental/storage/__init__.py +0 -0
  99. deltacat/experimental/storage/iceberg/__init__.py +0 -0
  100. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  101. deltacat/{storage → experimental/storage}/iceberg/impl.py +6 -4
  102. deltacat/{storage → experimental/storage}/iceberg/model.py +7 -3
  103. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  104. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  105. deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
  106. deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
  107. deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -12
  108. deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
  109. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  110. deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
  111. deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
  112. deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
  113. deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
  114. deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
  115. deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
  116. deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
  117. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  118. deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -3
  119. deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
  120. deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
  121. deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
  122. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  123. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  124. deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
  125. deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
  126. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  127. deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
  128. deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
  129. deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
  130. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +7 -6
  131. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
  132. deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
  133. deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
  134. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  135. deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
  136. deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
  137. deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
  138. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  139. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  140. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  141. deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
  142. deltacat/io/__init__.py +13 -0
  143. deltacat/io/dataset/__init__.py +0 -0
  144. deltacat/io/dataset/deltacat_dataset.py +91 -0
  145. deltacat/io/datasink/__init__.py +0 -0
  146. deltacat/io/datasink/deltacat_datasink.py +207 -0
  147. deltacat/io/datasource/__init__.py +0 -0
  148. deltacat/io/datasource/deltacat_datasource.py +579 -0
  149. deltacat/io/reader/__init__.py +0 -0
  150. deltacat/io/reader/deltacat_read_api.py +172 -0
  151. deltacat/storage/__init__.py +22 -2
  152. deltacat/storage/interface.py +54 -32
  153. deltacat/storage/main/impl.py +1494 -541
  154. deltacat/storage/model/delta.py +27 -3
  155. deltacat/storage/model/expression/__init__.py +47 -0
  156. deltacat/storage/model/expression/expression.py +656 -0
  157. deltacat/storage/model/expression/visitor.py +248 -0
  158. deltacat/storage/model/locator.py +6 -12
  159. deltacat/storage/model/manifest.py +231 -6
  160. deltacat/storage/model/metafile.py +224 -119
  161. deltacat/storage/model/namespace.py +8 -1
  162. deltacat/storage/model/partition.py +117 -42
  163. deltacat/storage/model/scan/push_down.py +32 -5
  164. deltacat/storage/model/schema.py +2427 -159
  165. deltacat/storage/model/shard.py +6 -2
  166. deltacat/storage/model/sort_key.py +40 -0
  167. deltacat/storage/model/stream.py +9 -2
  168. deltacat/storage/model/table.py +12 -1
  169. deltacat/storage/model/table_version.py +11 -0
  170. deltacat/storage/model/transaction.py +1184 -208
  171. deltacat/storage/model/transform.py +81 -2
  172. deltacat/storage/model/types.py +53 -29
  173. deltacat/storage/util/__init__.py +0 -0
  174. deltacat/storage/util/scan_planner.py +26 -0
  175. deltacat/tests/_io/reader/__init__.py +0 -0
  176. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  177. deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
  178. deltacat/tests/aws/test_s3u.py +2 -31
  179. deltacat/tests/catalog/data/__init__.py +0 -0
  180. deltacat/tests/catalog/main/__init__.py +0 -0
  181. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  182. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  183. deltacat/tests/catalog/model/__init__.py +0 -0
  184. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  185. deltacat/tests/catalog/test_catalogs.py +103 -106
  186. deltacat/tests/catalog/test_default_catalog_impl.py +12152 -72
  187. deltacat/tests/compute/compact_partition_test_cases.py +35 -8
  188. deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
  189. deltacat/tests/compute/compactor/utils/test_io.py +124 -120
  190. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  191. deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
  192. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
  193. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
  194. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
  195. deltacat/tests/compute/conftest.py +8 -44
  196. deltacat/tests/compute/converter/test_convert_session.py +697 -349
  197. deltacat/tests/compute/converter/utils.py +15 -6
  198. deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
  199. deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
  200. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
  201. deltacat/tests/compute/test_compact_partition_params.py +13 -8
  202. deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
  203. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
  204. deltacat/tests/compute/test_janitor.py +236 -0
  205. deltacat/tests/compute/test_util_common.py +716 -43
  206. deltacat/tests/compute/test_util_constant.py +0 -1
  207. deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
  208. deltacat/tests/daft/__init__.py +0 -0
  209. deltacat/tests/daft/test_model.py +97 -0
  210. deltacat/tests/experimental/__init__.py +1 -0
  211. deltacat/tests/experimental/catalog/__init__.py +0 -0
  212. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  213. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  214. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  215. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  216. deltacat/tests/experimental/daft/__init__.py +0 -0
  217. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  218. deltacat/tests/experimental/storage/__init__.py +0 -0
  219. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  220. deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
  221. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  222. deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -2
  223. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  224. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  225. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  226. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  227. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  228. deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
  229. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  230. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  231. deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +6 -4
  232. deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
  233. deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
  234. deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
  235. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  236. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
  237. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
  238. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
  239. deltacat/tests/storage/main/test_main_storage.py +6900 -95
  240. deltacat/tests/storage/model/test_expression.py +327 -0
  241. deltacat/tests/storage/model/test_manifest.py +129 -0
  242. deltacat/tests/storage/model/test_metafile_io.py +78 -173
  243. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  244. deltacat/tests/storage/model/test_schema.py +171 -0
  245. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  246. deltacat/tests/storage/model/test_shard.py +3 -1
  247. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  248. deltacat/tests/storage/model/test_transaction.py +393 -48
  249. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  250. deltacat/tests/test_deltacat_api.py +1036 -11
  251. deltacat/tests/test_exceptions.py +9 -5
  252. deltacat/tests/test_utils/pyarrow.py +52 -21
  253. deltacat/tests/test_utils/storage.py +23 -34
  254. deltacat/tests/types/__init__.py +0 -0
  255. deltacat/tests/types/test_tables.py +104 -0
  256. deltacat/tests/utils/exceptions.py +22 -0
  257. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  258. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  259. deltacat/tests/utils/test_daft.py +121 -31
  260. deltacat/tests/utils/test_numpy.py +1193 -0
  261. deltacat/tests/utils/test_pandas.py +1106 -0
  262. deltacat/tests/utils/test_polars.py +1040 -0
  263. deltacat/tests/utils/test_pyarrow.py +1370 -89
  264. deltacat/types/media.py +345 -37
  265. deltacat/types/tables.py +2344 -46
  266. deltacat/utils/arguments.py +33 -1
  267. deltacat/utils/daft.py +824 -40
  268. deltacat/utils/export.py +3 -1
  269. deltacat/utils/filesystem.py +139 -9
  270. deltacat/utils/metafile_locator.py +2 -1
  271. deltacat/utils/numpy.py +118 -26
  272. deltacat/utils/pandas.py +577 -48
  273. deltacat/utils/polars.py +759 -0
  274. deltacat/utils/pyarrow.py +1373 -192
  275. deltacat/utils/ray_utils/concurrency.py +1 -1
  276. deltacat/utils/ray_utils/dataset.py +101 -10
  277. deltacat/utils/ray_utils/runtime.py +56 -4
  278. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  279. deltacat/utils/url.py +1325 -0
  280. deltacat-2.0.0.dist-info/METADATA +1163 -0
  281. deltacat-2.0.0.dist-info/RECORD +439 -0
  282. {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
  283. deltacat/catalog/iceberg/__init__.py +0 -4
  284. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  285. deltacat/compute/merge_on_read/__init__.py +0 -4
  286. deltacat/compute/merge_on_read/daft.py +0 -40
  287. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  288. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  289. deltacat/examples/common/fixtures.py +0 -15
  290. deltacat/storage/iceberg/iceberg_scan_planner.py +0 -28
  291. deltacat/storage/rivulet/__init__.py +0 -11
  292. deltacat/storage/rivulet/feather/__init__.py +0 -5
  293. deltacat/storage/rivulet/parquet/__init__.py +0 -5
  294. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  295. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
  296. deltacat/tests/local_deltacat_storage/__init__.py +0 -1235
  297. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  298. deltacat/utils/s3fs.py +0 -21
  299. deltacat-2.0.dist-info/METADATA +0 -65
  300. deltacat-2.0.dist-info/RECORD +0 -347
  301. /deltacat/compute/{merge_on_read/model → jobs}/__init__.py +0 -0
  302. /deltacat/{compute/merge_on_read/utils → docs}/__init__.py +0 -0
  303. /deltacat/{examples/common → docs/autogen}/__init__.py +0 -0
  304. /deltacat/{examples/iceberg → docs/autogen/schema}/__init__.py +0 -0
  305. /deltacat/{storage/iceberg → docs/autogen/schema/inference}/__init__.py +0 -0
  306. /deltacat/{storage/rivulet/arrow → examples/compactor}/__init__.py +0 -0
  307. /deltacat/{storage/rivulet/fs → examples/experimental}/__init__.py +0 -0
  308. /deltacat/{storage/rivulet/metastore → examples/experimental/iceberg}/__init__.py +0 -0
  309. /deltacat/{storage/rivulet/reader → examples/experimental/iceberg/converter}/__init__.py +0 -0
  310. /deltacat/{storage/rivulet/schema → examples/experimental/iceberg/converter/beam}/__init__.py +0 -0
  311. /deltacat/{storage/rivulet/writer → examples/indexer}/__init__.py +0 -0
  312. /deltacat/{tests/storage/rivulet → examples/indexer/aws}/__init__.py +0 -0
  313. /deltacat/{tests/storage/rivulet/fs → examples/indexer/gcp}/__init__.py +0 -0
  314. /deltacat/{tests/storage/rivulet/schema → experimental}/__init__.py +0 -0
  315. /deltacat/{tests/storage/rivulet/writer → experimental/catalog}/__init__.py +0 -0
  316. /deltacat/{storage/rivulet/parquet/data_reader.py → experimental/compatibility/__init__.py} +0 -0
  317. /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
  318. /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
  319. /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
  320. /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
  321. /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
  322. /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
  323. {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
  324. {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,198 @@
1
+ import argparse
2
+ import pathlib
3
+
4
+ from deltacat.compute import (
5
+ job_client,
6
+ JobStatus,
7
+ )
8
+
9
+
10
+ def run_async(
11
+ source: str,
12
+ dest: str,
13
+ jobs_to_submit: int,
14
+ job_timeout: int,
15
+ cloud: str,
16
+ restart_ray: bool,
17
+ ):
18
+ # resolve this example's directory and the cloud-specific cluster config file
19
+ working_dir = pathlib.Path(__file__).parent
20
+ cluster_cfg_file_path = working_dir.joinpath(cloud).joinpath("deltacat.yaml")
21
+ job_number = 0
22
+ client = job_client(cluster_cfg_file_path, restart_ray=restart_ray)
23
+ job_ids = []
24
+ while jobs_to_submit > 0:
25
+ jobs_to_submit -= 1
26
+ job_dest = dest + f".{job_number}"
27
+ job_id = client.submit_job(
28
+ # Entrypoint shell command to execute
29
+ entrypoint=f"python3 indexer.py --source '{source}' --dest '{job_dest}'",
30
+ # Path to the local directory that contains the indexer.py file
31
+ # This entire directory will be zipped into a job package, so keep
32
+ # it small.
33
+ runtime_env={"working_dir": working_dir},
34
+ )
35
+ job_ids.append(job_id)
36
+ job_number += 1
37
+
38
+ print("Waiting for all jobs to complete...")
39
+ job_number = 0
40
+ all_job_logs = ""
41
+ for job_id in job_ids:
42
+ job_status = client.await_job(job_id, timeout_seconds=job_timeout)
43
+ if job_status != JobStatus.SUCCEEDED:
44
+ print(f"Job `{job_id}` logs: ")
45
+ print(client.get_job_logs(job_id))
46
+ raise RuntimeError(f"Job `{job_id}` terminated with status: {job_status}")
47
+ all_job_logs += f"\nJob #{job_number} logs: \n"
48
+ all_job_logs += client.get_job_logs(job_id)
49
+ job_number += 1
50
+ print("All jobs completed!")
51
+ print("Job Logs: ")
52
+ print(all_job_logs)
53
+
54
+
55
+ def run_sync(
56
+ source: str,
57
+ dest: str,
58
+ jobs_to_submit: int,
59
+ job_timeout: int,
60
+ cloud: str,
61
+ restart_ray: bool,
62
+ ):
63
+ working_dir = pathlib.Path(__file__).parent
64
+ cluster_cfg_file_path = working_dir.joinpath(cloud).joinpath("deltacat.yaml")
65
+ client = job_client(cluster_cfg_file_path, restart_ray=restart_ray)
66
+ job_number = 0
67
+ while job_number < jobs_to_submit:
68
+ job_dest = dest + f".{job_number}"
69
+ job_run_result = client.run_job(
70
+ # Entrypoint shell command to execute
71
+ entrypoint=f"python3 indexer.py --source '{source}' --dest '{job_dest}'",
72
+ # Path to the local directory that contains the indexer.py file
73
+ # This entire directory will be zipped into a job package, so keep
74
+ # it small.
75
+ runtime_env={"working_dir": working_dir},
76
+ timeout_seconds=job_timeout,
77
+ )
78
+ print(
79
+ f"Job ID {job_run_result.job_id} terminal state: {job_run_result.job_status}"
80
+ )
81
+ print(f"Job ID {job_run_result.job_id} logs: ")
82
+ print(job_run_result.job_logs)
83
+ job_number += 1
84
+
85
+
86
+ def run(
87
+ source: str,
88
+ dest: str,
89
+ restart_ray: bool,
90
+ jobs_to_submit: int,
91
+ job_timeout: int,
92
+ asynchronous: bool,
93
+ cloud_provider: str,
94
+ ):
95
+ run_func = run_async if asynchronous else run_sync
96
+ run_func(
97
+ source=source,
98
+ dest=dest,
99
+ jobs_to_submit=jobs_to_submit,
100
+ job_timeout=job_timeout,
101
+ cloud=cloud_provider,
102
+ restart_ray=restart_ray,
103
+ )
104
+
105
+
106
+ if __name__ == "__main__":
107
+ """
108
+ # Run this example through a command of the form:
109
+ $ python ./deltacat/examples/indexer/job_runner.py \
110
+ $ --source text+s3://openalex-mag-format/data_dump_v1/2022-07-08/nlp/PaperAbstractsInvertedIndex.txt_part31 \
111
+ $ --dest parquet+s3://deltacat-example-output/openalex/PaperAbstractsInvertedIndex.part31.parquet \
112
+ $ --asynchronous \
113
+ $ --jobs-to-submit 100 \
114
+ $ --job-timeout 90 \
115
+ $ --cloud-provider aws
116
+ """
117
+ script_args = [
118
+ (
119
+ [
120
+ "--source",
121
+ ],
122
+ {
123
+ "help": "Source DeltaCAT URL to index.",
124
+ "type": str,
125
+ "default": "text+s3://openalex-mag-format/data_dump_v1/2022-07-08/nlp/PaperAbstractsInvertedIndex.txt_part31",
126
+ },
127
+ ),
128
+ (
129
+ [
130
+ "--dest",
131
+ ],
132
+ {
133
+ "help": "Destination DeltaCAT URL to store the indexed file.",
134
+ "type": str,
135
+ "default": "parquet+s3://deltacat-example-output/openalex/PaperAbstractsInvertedIndex.part31.parquet",
136
+ },
137
+ ),
138
+ (
139
+ [
140
+ "--restart-ray",
141
+ ],
142
+ {
143
+ "help": "Restart Ray on an existing cluster.",
144
+ "action": "store_true",
145
+ "default": False,
146
+ },
147
+ ),
148
+ (
149
+ [
150
+ "--asynchronous",
151
+ ],
152
+ {
153
+ "help": "Run jobs asynchronously.",
154
+ "action": "store_true",
155
+ "default": False,
156
+ },
157
+ ),
158
+ (
159
+ [
160
+ "--jobs-to-submit",
161
+ ],
162
+ {
163
+ "help": "Number of indexer jobs to submit for execution.",
164
+ "type": int,
165
+ "default": 1,
166
+ },
167
+ ),
168
+ (
169
+ [
170
+ "--job-timeout",
171
+ ],
172
+ {
173
+ "help": "Job timeout in seconds.",
174
+ "type": int,
175
+ "default": 300,
176
+ },
177
+ ),
178
+ (
179
+ [
180
+ "--cloud-provider",
181
+ ],
182
+ {
183
+ "help": "Ray Cluster Cloud Provider ('aws' or 'gcp')",
184
+ "type": str,
185
+ "default": "aws",
186
+ },
187
+ ),
188
+ ]
189
+
190
+ # parse CLI input arguments
191
+ parser = argparse.ArgumentParser()
192
+ for args, kwargs in script_args:
193
+ parser.add_argument(*args, **kwargs)
194
+ args = parser.parse_args()
195
+ print(f"Command Line Arguments: {args}")
196
+
197
+ # run the example using the parsed CLI arguments as kwargs
198
+ run(**vars(args))
deltacat/exceptions.py CHANGED
@@ -1,6 +1,6 @@
1
1
  from __future__ import annotations
2
2
  from enum import Enum
3
- from typing import Callable
3
+ from typing import Callable, Optional, TYPE_CHECKING
4
4
  import logging
5
5
 
6
6
  import tenacity
@@ -28,6 +28,9 @@ from deltacat.utils.ray_utils.runtime import (
28
28
  get_current_ray_task_id,
29
29
  )
30
30
 
31
+ if TYPE_CHECKING:
32
+ from deltacat.storage.model.schema import FieldLocator
33
+
31
34
  logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
32
35
 
33
36
  DELTACAT_STORAGE_PARAM = "deltacat_storage"
@@ -74,9 +77,18 @@ class DeltaCatErrorNames(str, Enum):
74
77
  TABLE_NOT_FOUND_ERROR = "TableNotFoundError"
75
78
  TABLE_VERSION_NOT_FOUND_ERROR = "TableVersionNotFoundError"
76
79
  STREAM_NOT_FOUND_ERROR = "StreamNotFoundError"
80
+ PARTITION_NOT_FOUND_ERROR = "PartitionNotFoundError"
77
81
  DELTA_NOT_FOUND_ERROR = "DeltaNotFoundError"
78
82
  TABLE_ALREADY_EXISTS_ERROR = "TableAlreadyExistsError"
83
+ TABLE_VERSION_ALREADY_EXISTS_ERROR = "TableVersionAlreadyExistsError"
79
84
  NAMESPACE_ALREADY_EXISTS_ERROR = "NamespaceAlreadyExistsError"
85
+ SCHEMA_COMPATIBILITY_ERROR = "SchemaCompatibilityError"
86
+ SCHEMA_VALIDATION_ERROR = "SchemaValidationError"
87
+ TABLE_VALIDATION_ERROR = "TableValidationError"
88
+ CONCURRENT_MODIFICATION_ERROR = "ConcurrentModificationError"
89
+ OBJECT_NOT_FOUND_ERROR = "ObjectNotFoundError"
90
+ OBJECT_DELETED_ERROR = "ObjectDeletedError"
91
+ OBJECT_ALREADY_EXISTS_ERROR = "ObjectAlreadyExistsError"
80
92
 
81
93
 
82
94
  class DeltaCatError(Exception):
@@ -87,9 +99,12 @@ class DeltaCatError(Exception):
87
99
  super().__init__(*args, **kwargs)
88
100
 
89
101
  def _get_ray_task_id_and_node_ip(self):
90
- task_id = get_current_ray_task_id()
91
- node_ip = ray.util.get_node_ip_address()
92
- return task_id, node_ip
102
+ if ray.is_initialized():
103
+ task_id = get_current_ray_task_id()
104
+ node_ip = ray.util.get_node_ip_address()
105
+ return task_id, node_ip
106
+ else:
107
+ return None, None
93
108
 
94
109
 
95
110
  class NonRetryableError(DeltaCatError):
@@ -232,6 +247,10 @@ class TableVersionNotFoundError(NonRetryableError):
232
247
  error_name = DeltaCatErrorNames.TABLE_VERSION_NOT_FOUND_ERROR.value
233
248
 
234
249
 
250
+ class PartitionNotFoundError(NonRetryableError):
251
+ error_name = DeltaCatErrorNames.PARTITION_NOT_FOUND_ERROR.value
252
+
253
+
235
254
  class StreamNotFoundError(NonRetryableError):
236
255
  error_name = DeltaCatErrorNames.STREAM_NOT_FOUND_ERROR.value
237
256
 
@@ -244,10 +263,53 @@ class TableAlreadyExistsError(NonRetryableError):
244
263
  error_name = DeltaCatErrorNames.TABLE_ALREADY_EXISTS_ERROR.value
245
264
 
246
265
 
266
+ class TableVersionAlreadyExistsError(NonRetryableError):
267
+ error_name = DeltaCatErrorNames.TABLE_VERSION_ALREADY_EXISTS_ERROR.value
268
+
269
+
247
270
  class NamespaceAlreadyExistsError(NonRetryableError):
248
271
  error_name = DeltaCatErrorNames.TABLE_ALREADY_EXISTS_ERROR.value
249
272
 
250
273
 
274
+ class ObjectNotFoundError(NonRetryableError):
275
+ error_name = DeltaCatErrorNames.OBJECT_NOT_FOUND_ERROR.value
276
+
277
+
278
+ class ObjectDeletedError(NonRetryableError):
279
+ error_name = DeltaCatErrorNames.OBJECT_DELETED_ERROR.value
280
+
281
+
282
+ class ObjectAlreadyExistsError(NonRetryableError):
283
+ error_name = DeltaCatErrorNames.OBJECT_ALREADY_EXISTS_ERROR.value
284
+
285
+
286
+ class ConcurrentModificationError(NonRetryableError):
287
+ error_name = DeltaCatErrorNames.CONCURRENT_MODIFICATION_ERROR.value
288
+
289
+
290
+ class SchemaValidationError(NonRetryableError):
291
+ error_name = DeltaCatErrorNames.SCHEMA_VALIDATION_ERROR.value
292
+
293
+
294
+ class TableValidationError(NonRetryableError):
295
+ error_name = DeltaCatErrorNames.TABLE_VALIDATION_ERROR.value
296
+
297
+
298
+ class SchemaCompatibilityError(NonRetryableError):
299
+ error_name = DeltaCatErrorNames.SCHEMA_COMPATIBILITY_ERROR.value
300
+ """Raised when a schema update would break backward compatibility."""
301
+
302
+ def __init__(
303
+ self,
304
+ message: str,
305
+ field_locator: Optional[FieldLocator] = None,
306
+ *args,
307
+ **kwargs,
308
+ ):
309
+ super().__init__(message, *args, **kwargs)
310
+ self.field_locator = field_locator
311
+
312
+
251
313
  def categorize_errors(func: Callable):
252
314
  def wrapper(*args, **kwargs):
253
315
  try:
@@ -0,0 +1,6 @@
1
+ from deltacat.experimental.catalog.iceberg.iceberg_catalog_config import (
2
+ IcebergCatalogConfig,
3
+ )
4
+ import deltacat.experimental.catalog.iceberg.impl as IcebergCatalog
5
+
6
+ __all__ = ["IcebergCatalogConfig", "IcebergCatalog"]
@@ -15,7 +15,7 @@ class IcebergCatalogConfig:
15
15
 
16
16
  This configuration is passed through to PyIceberg by invoking load_catalog.
17
17
  The Properties provided must match properties accepted by PyIceberg for each catalog type
18
- See: :func:`deltacat.catalog.iceberg.initialize`
18
+ See: :func:`deltacat.experimental.catalog.iceberg.initialize`
19
19
 
20
20
  Attributes:
21
21
  type: The PyIceberg Catalog instance
@@ -1,16 +1,26 @@
1
1
  import logging
2
+ import sys
2
3
 
3
4
  from typing import Any, Dict, List, Optional, Union
4
5
 
5
- from daft import DataFrame
6
+ from daft import DataFrame, context
7
+ from daft.daft import ScanOperatorHandle, StorageConfig
8
+ from daft.logical.builder import LogicalPlanBuilder
6
9
 
7
10
  from deltacat import logs
11
+ from deltacat.catalog.model.catalog import Catalog
8
12
  from deltacat.catalog.model.table_definition import TableDefinition
13
+ from deltacat.utils.daft import DeltaCatScanOperator
9
14
  from deltacat.exceptions import TableAlreadyExistsError
10
- from deltacat.storage.iceberg.iceberg_scan_planner import IcebergScanPlanner
11
- from deltacat.storage.iceberg.model import PartitionSchemeMapper, SchemaMapper
15
+ from deltacat.experimental.storage.iceberg.iceberg_scan_planner import (
16
+ IcebergScanPlanner,
17
+ )
18
+ from deltacat.experimental.storage.iceberg.model import (
19
+ PartitionSchemeMapper,
20
+ SchemaMapper,
21
+ )
12
22
  from deltacat.storage.model.partition import PartitionScheme
13
- from deltacat.storage.iceberg.impl import _get_native_catalog
23
+ from deltacat.experimental.storage.iceberg.impl import _get_native_catalog
14
24
  from deltacat.storage.model.sort_key import SortScheme
15
25
  from deltacat.storage.model.list_result import ListResult
16
26
  from deltacat.storage.model.namespace import Namespace, NamespaceProperties
@@ -23,20 +33,31 @@ from deltacat.storage.model.types import (
23
33
  LocalTable,
24
34
  StreamFormat,
25
35
  )
26
- from deltacat.storage.iceberg import impl as IcebergStorage
36
+ from deltacat.experimental.storage.iceberg import impl as IcebergStorage
27
37
  from deltacat.types.media import ContentType
28
38
  from deltacat.types.tables import TableWriteMode
29
39
  from deltacat.constants import DEFAULT_NAMESPACE
30
- from deltacat.catalog.iceberg.iceberg_catalog_config import IcebergCatalogConfig
40
+ from deltacat.experimental.catalog.iceberg.iceberg_catalog_config import (
41
+ IcebergCatalogConfig,
42
+ )
31
43
 
32
- from pyiceberg.catalog import Catalog, load_catalog
44
+ from pyiceberg.catalog import Catalog as PyIcebergCatalog, load_catalog
33
45
  from pyiceberg.transforms import BucketTransform
34
46
 
35
47
  logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
36
48
 
49
+ IcebergCatalog = sys.modules[__name__]
50
+
51
+
52
def from_config(config: IcebergCatalogConfig, *args, **kwargs) -> Catalog:
    """
    Factory method to construct a catalog from Iceberg catalog configuration.

    Args:
        config: Iceberg catalog configuration, passed to ``Catalog`` as its
            first positional argument.
        *args: Additional positional arguments forwarded to ``Catalog``
            after ``config``.
        **kwargs: Additional keyword arguments forwarded to ``Catalog``.

    Returns:
        A ``Catalog`` constructed with ``impl`` set to ``IcebergCatalog``
        (the module-level alias for this module).
    """
    # Star-unpacking is written before the ``impl`` keyword. The call binds
    # exactly as before, but the source order now matches the binding order
    # (positional arguments first, keywords after).
    return Catalog(config, *args, impl=IcebergCatalog, **kwargs)
57
+
37
58
 
38
59
  # catalog functions
39
- def initialize(*args, config: IcebergCatalogConfig, **kwargs) -> Catalog:
60
+ def initialize(config: IcebergCatalogConfig, **kwargs) -> PyIcebergCatalog:
40
61
  """
41
62
  Initializes an Iceberg catalog with the given config.
42
63
 
@@ -120,7 +141,7 @@ def write_to_table(
120
141
  )
121
142
  # TODO(pdames): only append s3:// to output file paths when writing to S3!
122
143
  out_file_paths = [f"s3://{val}" for val in out_df.to_arrow()[0]]
123
- from deltacat.catalog.iceberg import overrides
144
+ from deltacat.experimental.catalog.iceberg import overrides
124
145
 
125
146
  overrides.append(
126
147
  table_definition.table.native_object,
def read_table(
    table: str, *args, namespace: Optional[str] = None, **kwargs
) -> DistributedDataset:
    """Return a Daft DataFrame that scans the given table.

    The table is resolved with ``get_table`` and wrapped in a
    ``DeltaCatScanOperator``, which is handed to Daft's logical plan builder.

    Args:
        table: Table name, passed to ``get_table`` as ``name``.
        *args: Accepted for interface compatibility; not used here.
        namespace: Optional namespace, passed to ``get_table``.
        **kwargs: Forwarded unchanged to ``get_table``.

    Returns:
        A ``DataFrame`` built from a tabular scan over the resolved table.
    """
    # TODO: more proper IO configuration
    default_io_config = context.get_context().daft_planning_config.default_io_config
    runner_name = context.get_context().get_or_create_runner().name
    # Multithreaded IO is requested for every runner except the one named "ray".
    scan_storage_config = StorageConfig(runner_name != "ray", default_io_config)

    resolved_table = get_table(name=table, namespace=namespace, **kwargs)
    scan_handle = ScanOperatorHandle.from_python_scan_operator(
        DeltaCatScanOperator(resolved_table, scan_storage_config)
    )
    return DataFrame(LogicalPlanBuilder.from_tabular_scan(scan_operator=scan_handle))
148
179
 
149
180
 
150
181
  def alter_table(
@@ -167,7 +198,7 @@ def create_table(
167
198
  name: str,
168
199
  *args,
169
200
  namespace: Optional[str] = None,
170
- version: Optional[str] = None,
201
+ table_version: Optional[str] = None,
171
202
  lifecycle_state: Optional[LifecycleState] = None,
172
203
  schema: Optional[Schema] = None,
173
204
  partition_scheme: Optional[PartitionScheme] = None,
@@ -211,7 +242,7 @@ def create_table(
211
242
  IcebergStorage.create_table_version(
212
243
  namespace=namespace,
213
244
  table_name=name,
214
- table_version=version,
245
+ table_version=table_version,
215
246
  schema=schema,
216
247
  partition_scheme=partition_scheme,
217
248
  sort_keys=sort_keys,
@@ -5,12 +5,11 @@ from typing import Iterator, List
5
5
  from pyarrow.fs import FileSystem
6
6
 
7
7
  from pyiceberg.io.pyarrow import (
8
- fill_parquet_file_metadata,
8
+ data_file_statistics_from_parquet_metadata,
9
9
  compute_statistics_plan,
10
10
  parquet_path_to_id_mapping,
11
11
  )
12
- from pyiceberg.table import Table, _MergingSnapshotProducer
13
- from pyiceberg.table.snapshots import Operation
12
+ from pyiceberg.table import Table
14
13
  from pyiceberg.manifest import DataFile, DataFileContent, FileFormat
15
14
  from pyiceberg.types import StructType, NestedField, IntegerType
16
15
  from pyiceberg.typedef import Record
@@ -24,11 +23,10 @@ def append(table: Table, paths: List[str]) -> None:
24
23
  # raise ValueError("Cannot write to tables with a sort-order")
25
24
 
26
25
  data_files = write_file(table, paths)
27
- merge = _MergingSnapshotProducer(operation=Operation.APPEND, table=table)
28
- for data_file in data_files:
29
- merge.append_data_file(data_file)
30
-
31
- merge.commit()
26
+ with table.transaction() as txn:
27
+ with txn.update_snapshot().fast_append() as snapshot_update:
28
+ for data_file in data_files:
29
+ snapshot_update.append_data_file(data_file)
32
30
 
33
31
 
34
32
  def write_file(table: Table, paths: Iterator[str]) -> Iterator[DataFile]:
@@ -41,6 +39,11 @@ def write_file(table: Table, paths: Iterator[str]) -> Iterator[DataFile]:
41
39
  fs_path = fs_tuple[1]
42
40
  with fs.open_input_file(fs_path) as native_file:
43
41
  parquet_metadata = pq.read_metadata(native_file)
42
+ statistics = data_file_statistics_from_parquet_metadata(
43
+ parquet_metadata=parquet_metadata,
44
+ stats_columns=compute_statistics_plan(table.schema(), table.properties),
45
+ parquet_column_mapping=parquet_path_to_id_mapping(table.schema()),
46
+ )
44
47
  data_file = DataFile(
45
48
  content=DataFileContent.DATA,
46
49
  file_path=file_path,
@@ -63,12 +66,7 @@ def write_file(table: Table, paths: Iterator[str]) -> Iterator[DataFile]:
63
66
  spec_id=table.spec().spec_id,
64
67
  equality_ids=None,
65
68
  key_metadata=None,
66
- )
67
- fill_parquet_file_metadata(
68
- data_file=data_file,
69
- parquet_metadata=parquet_metadata,
70
- stats_columns=compute_statistics_plan(table.schema(), table.properties),
71
- parquet_column_mapping=parquet_path_to_id_mapping(table.schema()),
69
+ **statistics.to_serialized_dict(),
72
70
  )
73
71
  data_files.append(data_file)
74
72
  return data_files