deltacat 2.0__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (324) hide show
  1. deltacat/__init__.py +117 -18
  2. deltacat/api.py +536 -126
  3. deltacat/aws/constants.py +0 -23
  4. deltacat/aws/s3u.py +4 -631
  5. deltacat/benchmarking/benchmark_engine.py +4 -2
  6. deltacat/benchmarking/conftest.py +1 -19
  7. deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
  8. deltacat/catalog/__init__.py +64 -5
  9. deltacat/catalog/delegate.py +445 -63
  10. deltacat/catalog/interface.py +188 -62
  11. deltacat/catalog/main/impl.py +2444 -282
  12. deltacat/catalog/model/catalog.py +208 -113
  13. deltacat/catalog/model/properties.py +63 -24
  14. deltacat/compute/__init__.py +14 -0
  15. deltacat/compute/compactor/compaction_session.py +97 -75
  16. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  17. deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
  18. deltacat/compute/compactor/model/round_completion_info.py +16 -6
  19. deltacat/compute/compactor/repartition_session.py +8 -21
  20. deltacat/compute/compactor/steps/hash_bucket.py +5 -5
  21. deltacat/compute/compactor/steps/materialize.py +9 -7
  22. deltacat/compute/compactor/steps/repartition.py +12 -11
  23. deltacat/compute/compactor/utils/io.py +6 -5
  24. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  25. deltacat/compute/compactor/utils/system_columns.py +3 -1
  26. deltacat/compute/compactor_v2/compaction_session.py +17 -14
  27. deltacat/compute/compactor_v2/constants.py +30 -1
  28. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  29. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  30. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  31. deltacat/compute/compactor_v2/model/merge_input.py +33 -8
  32. deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
  33. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  34. deltacat/compute/compactor_v2/steps/merge.py +267 -55
  35. deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
  36. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  37. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  38. deltacat/compute/compactor_v2/utils/io.py +11 -4
  39. deltacat/compute/compactor_v2/utils/merge.py +15 -2
  40. deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
  41. deltacat/compute/compactor_v2/utils/task_options.py +45 -33
  42. deltacat/compute/converter/constants.py +5 -0
  43. deltacat/compute/converter/converter_session.py +207 -52
  44. deltacat/compute/converter/model/convert_input.py +43 -16
  45. deltacat/compute/converter/model/convert_input_files.py +33 -16
  46. deltacat/compute/converter/model/convert_result.py +80 -0
  47. deltacat/compute/converter/model/converter_session_params.py +64 -19
  48. deltacat/compute/converter/pyiceberg/catalog.py +21 -18
  49. deltacat/compute/converter/pyiceberg/overrides.py +193 -65
  50. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
  51. deltacat/compute/converter/steps/convert.py +230 -75
  52. deltacat/compute/converter/steps/dedupe.py +46 -12
  53. deltacat/compute/converter/utils/convert_task_options.py +66 -22
  54. deltacat/compute/converter/utils/converter_session_utils.py +126 -60
  55. deltacat/compute/converter/utils/iceberg_columns.py +13 -8
  56. deltacat/compute/converter/utils/io.py +173 -13
  57. deltacat/compute/converter/utils/s3u.py +42 -27
  58. deltacat/compute/janitor.py +205 -0
  59. deltacat/compute/jobs/client.py +417 -0
  60. deltacat/compute/resource_estimation/delta.py +38 -6
  61. deltacat/compute/resource_estimation/model.py +8 -0
  62. deltacat/constants.py +49 -6
  63. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  64. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  65. deltacat/env.py +10 -0
  66. deltacat/examples/basic_logging.py +6 -6
  67. deltacat/examples/compactor/aws/__init__.py +1 -0
  68. deltacat/examples/compactor/bootstrap.py +863 -0
  69. deltacat/examples/compactor/compactor.py +373 -0
  70. deltacat/examples/compactor/explorer.py +473 -0
  71. deltacat/examples/compactor/gcp/__init__.py +1 -0
  72. deltacat/examples/compactor/job_runner.py +439 -0
  73. deltacat/examples/compactor/utils/__init__.py +1 -0
  74. deltacat/examples/compactor/utils/common.py +261 -0
  75. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  76. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  77. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  78. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  79. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  80. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  81. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +66 -21
  82. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
  83. deltacat/examples/hello_world.py +4 -2
  84. deltacat/examples/indexer/indexer.py +163 -0
  85. deltacat/examples/indexer/job_runner.py +198 -0
  86. deltacat/exceptions.py +66 -4
  87. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  88. deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
  89. deltacat/{catalog → experimental/catalog}/iceberg/impl.py +43 -12
  90. deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +12 -14
  91. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  92. deltacat/experimental/converter_agent/__init__.py +0 -0
  93. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  94. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  95. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  96. deltacat/experimental/daft/__init__.py +4 -0
  97. deltacat/experimental/daft/daft_catalog.py +229 -0
  98. deltacat/experimental/storage/__init__.py +0 -0
  99. deltacat/experimental/storage/iceberg/__init__.py +0 -0
  100. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  101. deltacat/{storage → experimental/storage}/iceberg/impl.py +6 -4
  102. deltacat/{storage → experimental/storage}/iceberg/model.py +7 -3
  103. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  104. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  105. deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
  106. deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
  107. deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -12
  108. deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
  109. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  110. deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
  111. deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
  112. deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
  113. deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
  114. deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
  115. deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
  116. deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
  117. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  118. deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -3
  119. deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
  120. deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
  121. deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
  122. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  123. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  124. deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
  125. deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
  126. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  127. deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
  128. deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
  129. deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
  130. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +7 -6
  131. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
  132. deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
  133. deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
  134. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  135. deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
  136. deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
  137. deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
  138. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  139. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  140. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  141. deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
  142. deltacat/io/__init__.py +13 -0
  143. deltacat/io/dataset/__init__.py +0 -0
  144. deltacat/io/dataset/deltacat_dataset.py +91 -0
  145. deltacat/io/datasink/__init__.py +0 -0
  146. deltacat/io/datasink/deltacat_datasink.py +207 -0
  147. deltacat/io/datasource/__init__.py +0 -0
  148. deltacat/io/datasource/deltacat_datasource.py +579 -0
  149. deltacat/io/reader/__init__.py +0 -0
  150. deltacat/io/reader/deltacat_read_api.py +172 -0
  151. deltacat/storage/__init__.py +22 -2
  152. deltacat/storage/interface.py +54 -32
  153. deltacat/storage/main/impl.py +1494 -541
  154. deltacat/storage/model/delta.py +27 -3
  155. deltacat/storage/model/expression/__init__.py +47 -0
  156. deltacat/storage/model/expression/expression.py +656 -0
  157. deltacat/storage/model/expression/visitor.py +248 -0
  158. deltacat/storage/model/locator.py +6 -12
  159. deltacat/storage/model/manifest.py +231 -6
  160. deltacat/storage/model/metafile.py +224 -119
  161. deltacat/storage/model/namespace.py +8 -1
  162. deltacat/storage/model/partition.py +117 -42
  163. deltacat/storage/model/scan/push_down.py +32 -5
  164. deltacat/storage/model/schema.py +2427 -159
  165. deltacat/storage/model/shard.py +6 -2
  166. deltacat/storage/model/sort_key.py +40 -0
  167. deltacat/storage/model/stream.py +9 -2
  168. deltacat/storage/model/table.py +12 -1
  169. deltacat/storage/model/table_version.py +11 -0
  170. deltacat/storage/model/transaction.py +1184 -208
  171. deltacat/storage/model/transform.py +81 -2
  172. deltacat/storage/model/types.py +53 -29
  173. deltacat/storage/util/__init__.py +0 -0
  174. deltacat/storage/util/scan_planner.py +26 -0
  175. deltacat/tests/_io/reader/__init__.py +0 -0
  176. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  177. deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
  178. deltacat/tests/aws/test_s3u.py +2 -31
  179. deltacat/tests/catalog/data/__init__.py +0 -0
  180. deltacat/tests/catalog/main/__init__.py +0 -0
  181. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  182. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  183. deltacat/tests/catalog/model/__init__.py +0 -0
  184. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  185. deltacat/tests/catalog/test_catalogs.py +103 -106
  186. deltacat/tests/catalog/test_default_catalog_impl.py +12152 -72
  187. deltacat/tests/compute/compact_partition_test_cases.py +35 -8
  188. deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
  189. deltacat/tests/compute/compactor/utils/test_io.py +124 -120
  190. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  191. deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
  192. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
  193. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
  194. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
  195. deltacat/tests/compute/conftest.py +8 -44
  196. deltacat/tests/compute/converter/test_convert_session.py +697 -349
  197. deltacat/tests/compute/converter/utils.py +15 -6
  198. deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
  199. deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
  200. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
  201. deltacat/tests/compute/test_compact_partition_params.py +13 -8
  202. deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
  203. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
  204. deltacat/tests/compute/test_janitor.py +236 -0
  205. deltacat/tests/compute/test_util_common.py +716 -43
  206. deltacat/tests/compute/test_util_constant.py +0 -1
  207. deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
  208. deltacat/tests/daft/__init__.py +0 -0
  209. deltacat/tests/daft/test_model.py +97 -0
  210. deltacat/tests/experimental/__init__.py +1 -0
  211. deltacat/tests/experimental/catalog/__init__.py +0 -0
  212. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  213. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  214. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  215. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  216. deltacat/tests/experimental/daft/__init__.py +0 -0
  217. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  218. deltacat/tests/experimental/storage/__init__.py +0 -0
  219. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  220. deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
  221. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  222. deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -2
  223. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  224. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  225. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  226. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  227. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  228. deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
  229. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  230. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  231. deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +6 -4
  232. deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
  233. deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
  234. deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
  235. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  236. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
  237. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
  238. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
  239. deltacat/tests/storage/main/test_main_storage.py +6900 -95
  240. deltacat/tests/storage/model/test_expression.py +327 -0
  241. deltacat/tests/storage/model/test_manifest.py +129 -0
  242. deltacat/tests/storage/model/test_metafile_io.py +78 -173
  243. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  244. deltacat/tests/storage/model/test_schema.py +171 -0
  245. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  246. deltacat/tests/storage/model/test_shard.py +3 -1
  247. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  248. deltacat/tests/storage/model/test_transaction.py +393 -48
  249. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  250. deltacat/tests/test_deltacat_api.py +1036 -11
  251. deltacat/tests/test_exceptions.py +9 -5
  252. deltacat/tests/test_utils/pyarrow.py +52 -21
  253. deltacat/tests/test_utils/storage.py +23 -34
  254. deltacat/tests/types/__init__.py +0 -0
  255. deltacat/tests/types/test_tables.py +104 -0
  256. deltacat/tests/utils/exceptions.py +22 -0
  257. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  258. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  259. deltacat/tests/utils/test_daft.py +121 -31
  260. deltacat/tests/utils/test_numpy.py +1193 -0
  261. deltacat/tests/utils/test_pandas.py +1106 -0
  262. deltacat/tests/utils/test_polars.py +1040 -0
  263. deltacat/tests/utils/test_pyarrow.py +1370 -89
  264. deltacat/types/media.py +345 -37
  265. deltacat/types/tables.py +2344 -46
  266. deltacat/utils/arguments.py +33 -1
  267. deltacat/utils/daft.py +824 -40
  268. deltacat/utils/export.py +3 -1
  269. deltacat/utils/filesystem.py +139 -9
  270. deltacat/utils/metafile_locator.py +2 -1
  271. deltacat/utils/numpy.py +118 -26
  272. deltacat/utils/pandas.py +577 -48
  273. deltacat/utils/polars.py +759 -0
  274. deltacat/utils/pyarrow.py +1373 -192
  275. deltacat/utils/ray_utils/concurrency.py +1 -1
  276. deltacat/utils/ray_utils/dataset.py +101 -10
  277. deltacat/utils/ray_utils/runtime.py +56 -4
  278. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  279. deltacat/utils/url.py +1325 -0
  280. deltacat-2.0.0.dist-info/METADATA +1163 -0
  281. deltacat-2.0.0.dist-info/RECORD +439 -0
  282. {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
  283. deltacat/catalog/iceberg/__init__.py +0 -4
  284. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  285. deltacat/compute/merge_on_read/__init__.py +0 -4
  286. deltacat/compute/merge_on_read/daft.py +0 -40
  287. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  288. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  289. deltacat/examples/common/fixtures.py +0 -15
  290. deltacat/storage/iceberg/iceberg_scan_planner.py +0 -28
  291. deltacat/storage/rivulet/__init__.py +0 -11
  292. deltacat/storage/rivulet/feather/__init__.py +0 -5
  293. deltacat/storage/rivulet/parquet/__init__.py +0 -5
  294. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  295. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
  296. deltacat/tests/local_deltacat_storage/__init__.py +0 -1235
  297. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  298. deltacat/utils/s3fs.py +0 -21
  299. deltacat-2.0.dist-info/METADATA +0 -65
  300. deltacat-2.0.dist-info/RECORD +0 -347
  301. /deltacat/compute/{merge_on_read/model → jobs}/__init__.py +0 -0
  302. /deltacat/{compute/merge_on_read/utils → docs}/__init__.py +0 -0
  303. /deltacat/{examples/common → docs/autogen}/__init__.py +0 -0
  304. /deltacat/{examples/iceberg → docs/autogen/schema}/__init__.py +0 -0
  305. /deltacat/{storage/iceberg → docs/autogen/schema/inference}/__init__.py +0 -0
  306. /deltacat/{storage/rivulet/arrow → examples/compactor}/__init__.py +0 -0
  307. /deltacat/{storage/rivulet/fs → examples/experimental}/__init__.py +0 -0
  308. /deltacat/{storage/rivulet/metastore → examples/experimental/iceberg}/__init__.py +0 -0
  309. /deltacat/{storage/rivulet/reader → examples/experimental/iceberg/converter}/__init__.py +0 -0
  310. /deltacat/{storage/rivulet/schema → examples/experimental/iceberg/converter/beam}/__init__.py +0 -0
  311. /deltacat/{storage/rivulet/writer → examples/indexer}/__init__.py +0 -0
  312. /deltacat/{tests/storage/rivulet → examples/indexer/aws}/__init__.py +0 -0
  313. /deltacat/{tests/storage/rivulet/fs → examples/indexer/gcp}/__init__.py +0 -0
  314. /deltacat/{tests/storage/rivulet/schema → experimental}/__init__.py +0 -0
  315. /deltacat/{tests/storage/rivulet/writer → experimental/catalog}/__init__.py +0 -0
  316. /deltacat/{storage/rivulet/parquet/data_reader.py → experimental/compatibility/__init__.py} +0 -0
  317. /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
  318. /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
  319. /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
  320. /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
  321. /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
  322. /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
  323. {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
  324. {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,11 @@
1
1
  from __future__ import annotations
2
- from typing import Optional, Dict
3
- from deltacat.compute.converter.constants import DEFAULT_CONVERTER_TASK_MAX_PARALLELISM
2
+ from typing import Optional, Dict, Any, List
3
+ from deltacat.compute.converter.constants import (
4
+ DEFAULT_CONVERTER_TASK_MAX_PARALLELISM,
5
+ )
6
+ from deltacat.constants import DEFAULT_NAMESPACE
7
+ from fsspec import AbstractFileSystem
8
+ from pyiceberg.catalog import Catalog
4
9
 
5
10
 
6
11
  class ConverterSessionParams(dict):
@@ -9,7 +14,7 @@ class ConverterSessionParams(dict):
9
14
  """
10
15
 
11
16
  @staticmethod
12
- def of(params: Optional[Dict]) -> ConverterSessionParams:
17
+ def of(params: Optional[Dict[str, Any]]) -> ConverterSessionParams:
13
18
  params = {} if params is None else params
14
19
  assert params.get("catalog") is not None, "catalog is a required arg"
15
20
  assert (
@@ -18,15 +23,15 @@ class ConverterSessionParams(dict):
18
23
  assert (
19
24
  params.get("iceberg_warehouse_bucket_name") is not None
20
25
  ), "iceberg_warehouse_bucket_name is a required arg"
21
- assert (
22
- params.get("iceberg_namespace") is not None
23
- ), "iceberg_namespace is a required arg"
24
26
  result = ConverterSessionParams(params)
25
27
 
28
+ result.iceberg_namespace = params.get("iceberg_namespace", DEFAULT_NAMESPACE)
26
29
  result.enforce_primary_key_uniqueness = params.get(
27
30
  "enforce_primary_key_uniqueness", False
28
31
  )
29
- result.compact_small_files = params.get("compact_small_files", False)
32
+ result.compact_previous_position_delete_files = params.get(
33
+ "compact_previous_position_delete_files", False
34
+ )
30
35
 
31
36
  # For Iceberg v3 spec, option to produce delete vector that can establish 1:1 mapping with data files.
32
37
  result.position_delete_for_multiple_data_files = params.get(
@@ -36,10 +41,14 @@ class ConverterSessionParams(dict):
36
41
  "task_max_parallelism", DEFAULT_CONVERTER_TASK_MAX_PARALLELISM
37
42
  )
38
43
  result.merge_keys = params.get("merge_keys", None)
44
+ result.s3_client_kwargs = params.get("s3_client_kwargs", {})
45
+ result.filesystem = params.get("filesystem", None)
46
+ result.s3_prefix_override = params.get("s3_prefix_override", None)
47
+
39
48
  return result
40
49
 
41
50
  @property
42
- def catalog(self):
51
+ def catalog(self) -> Catalog:
43
52
  return self["catalog"]
44
53
 
45
54
  @property
@@ -54,21 +63,31 @@ class ConverterSessionParams(dict):
54
63
  def iceberg_namespace(self) -> str:
55
64
  return self["iceberg_namespace"]
56
65
 
66
+ @iceberg_namespace.setter
67
+ def iceberg_namespace(self, iceberg_namespace: str) -> None:
68
+ self["iceberg_namespace"] = iceberg_namespace
69
+
57
70
  @property
58
71
  def enforce_primary_key_uniqueness(self) -> bool:
59
72
  return self["enforce_primary_key_uniqueness"]
60
73
 
61
74
  @enforce_primary_key_uniqueness.setter
62
- def enforce_primary_key_uniqueness(self, enforce_primary_key_uniqueness) -> None:
75
+ def enforce_primary_key_uniqueness(
76
+ self, enforce_primary_key_uniqueness: bool
77
+ ) -> None:
63
78
  self["enforce_primary_key_uniqueness"] = enforce_primary_key_uniqueness
64
79
 
65
80
  @property
66
- def compact_small_files(self) -> bool:
67
- return self["compact_small_files"]
81
+ def compact_previous_position_delete_files(self) -> bool:
82
+ return self["compact_previous_position_delete_files"]
68
83
 
69
- @compact_small_files.setter
70
- def compact_small_files(self, compact_small_files) -> None:
71
- self["compact_small_files"] = compact_small_files
84
+ @compact_previous_position_delete_files.setter
85
+ def compact_previous_position_delete_files(
86
+ self, compact_previous_position_delete_files: bool
87
+ ) -> None:
88
+ self[
89
+ "compact_previous_position_delete_files"
90
+ ] = compact_previous_position_delete_files
72
91
 
73
92
  @property
74
93
  def position_delete_for_multiple_data_files(self) -> bool:
@@ -76,24 +95,50 @@ class ConverterSessionParams(dict):
76
95
 
77
96
  @position_delete_for_multiple_data_files.setter
78
97
  def position_delete_for_multiple_data_files(
79
- self, position_delete_for_multiple_data_files
98
+ self, position_delete_for_multiple_data_files: bool
80
99
  ) -> None:
81
100
  self[
82
101
  "position_delete_for_multiple_data_files"
83
102
  ] = position_delete_for_multiple_data_files
84
103
 
85
104
  @property
86
- def task_max_parallelism(self) -> str:
105
+ def task_max_parallelism(self) -> int:
87
106
  return self["task_max_parallelism"]
88
107
 
89
108
  @task_max_parallelism.setter
90
- def task_max_parallelism(self, task_max_parallelism) -> None:
109
+ def task_max_parallelism(self, task_max_parallelism: int) -> None:
91
110
  self["task_max_parallelism"] = task_max_parallelism
92
111
 
93
112
  @property
94
- def merge_keys(self) -> str:
113
+ def merge_keys(self) -> Optional[List[str]]:
95
114
  return self["merge_keys"]
96
115
 
97
116
  @merge_keys.setter
98
- def merge_keys(self, merge_keys) -> None:
117
+ def merge_keys(self, merge_keys: Optional[List[str]]) -> None:
99
118
  self["merge_keys"] = merge_keys
119
+
120
+ @property
121
+ def s3_client_kwargs(self) -> Dict[str, Any]:
122
+ return self["s3_client_kwargs"]
123
+
124
+ @s3_client_kwargs.setter
125
+ def s3_client_kwargs(self, s3_client_kwargs: Dict[str, Any]) -> None:
126
+ self["s3_client_kwargs"] = s3_client_kwargs
127
+
128
+ @property
129
+ def filesystem(self) -> Optional[AbstractFileSystem]:
130
+ return self["filesystem"]
131
+
132
+ @filesystem.setter
133
+ def filesystem(self, filesystem: Optional[AbstractFileSystem]) -> None:
134
+ self["filesystem"] = filesystem
135
+
136
+ @property
137
+ def location_provider_prefix_override(self) -> Optional[str]:
138
+ return self["location_provider_prefix_override"]
139
+
140
+ @location_provider_prefix_override.setter
141
+ def location_provider_prefix_override(
142
+ self, location_provider_prefix_override: Optional[str]
143
+ ) -> None:
144
+ self["location_provider_prefix_override"] = location_provider_prefix_override
@@ -1,8 +1,15 @@
1
- from typing import Optional
2
-
3
-
4
- def load_catalog(iceberg_catalog_name, iceberg_catalog_properties):
5
- catalog = load_catalog(
1
+ from typing import Optional, Dict, Any
2
+ from pyiceberg.table import Table
3
+ from pyiceberg.catalog import Catalog, load_catalog as pyiceberg_load_catalog
4
+ from botocore.credentials import Credentials
5
+ import boto3
6
+ from boto3.session import Session
7
+
8
+
9
+ def load_catalog(
10
+ iceberg_catalog_name: str, iceberg_catalog_properties: Dict[str, Any]
11
+ ) -> Catalog:
12
+ catalog = pyiceberg_load_catalog(
6
13
  name=iceberg_catalog_name,
7
14
  **iceberg_catalog_properties,
8
15
  )
@@ -23,25 +30,21 @@ def get_s3_path(
23
30
  return result_path
24
31
 
25
32
 
26
- def get_bucket_name():
27
- return "metadata-py4j-zyiqin1"
33
+ def get_bucket_name() -> str:
34
+ return "test-bucket"
28
35
 
29
36
 
30
- def get_s3_prefix():
37
+ def get_s3_prefix() -> str:
31
38
  return get_s3_path(get_bucket_name())
32
39
 
33
40
 
34
- def get_credential():
35
- import boto3
36
-
37
- boto3_session = boto3.Session()
38
- credentials = boto3_session.get_credentials()
41
+ def get_credential() -> Credentials:
42
+ boto3_session: Session = boto3.Session()
43
+ credentials: Credentials = boto3_session.get_credentials()
39
44
  return credentials
40
45
 
41
46
 
42
- def get_glue_catalog():
43
- from pyiceberg.catalog import load_catalog
44
-
47
+ def get_glue_catalog() -> Catalog:
45
48
  credential = get_credential()
46
49
  # Credentials are refreshable, so accessing your access key / secret key
47
50
  # separately can lead to a race condition. Use this to get an actual matched
@@ -51,7 +54,7 @@ def get_glue_catalog():
51
54
  secret_access_key = credential.secret_key
52
55
  session_token = credential.token
53
56
  s3_path = get_s3_prefix()
54
- glue_catalog = load_catalog(
57
+ glue_catalog = pyiceberg_load_catalog(
55
58
  "glue",
56
59
  **{
57
60
  "warehouse": s3_path,
@@ -70,6 +73,6 @@ def get_glue_catalog():
70
73
  return glue_catalog
71
74
 
72
75
 
73
- def load_table(catalog, table_name):
76
+ def load_table(catalog: Catalog, table_name: str) -> Table:
74
77
  loaded_table = catalog.load_table(table_name)
75
78
  return loaded_table
@@ -1,69 +1,210 @@
1
1
  from collections import defaultdict
2
2
  import logging
3
3
  from deltacat import logs
4
+ import pyarrow
4
5
  import pyarrow.parquet as pq
6
+ from pyiceberg.io.pyarrow import (
7
+ parquet_path_to_id_mapping,
8
+ StatisticsCollector,
9
+ MetricModeTypes,
10
+ DataFileStatistics,
11
+ MetricsMode,
12
+ StatsAggregator,
13
+ )
14
+ from typing import Dict, List, Set, Any, Tuple
15
+ from deltacat.compute.converter.utils.iceberg_columns import (
16
+ ICEBERG_RESERVED_FIELD_ID_FOR_FILE_PATH_COLUMN,
17
+ ICEBERG_RESERVED_FIELD_ID_FOR_POS_COLUMN,
18
+ )
19
+ from pyiceberg.io.pyarrow import (
20
+ compute_statistics_plan,
21
+ )
22
+ from pyiceberg.manifest import (
23
+ DataFile,
24
+ DataFileContent,
25
+ FileFormat,
26
+ )
27
+ from pyiceberg.table import _min_sequence_number, _open_manifest, Table
28
+ from pyiceberg.utils.concurrent import ExecutorFactory
29
+ from itertools import chain
30
+ from pyiceberg.typedef import (
31
+ KeyDefaultDict,
32
+ )
33
+ from pyiceberg.schema import Schema
34
+ from pyiceberg.io import FileIO
35
+ from deltacat.compute.converter.model.convert_input_files import (
36
+ DataFileList,
37
+ )
38
+
5
39
 
6
40
  logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
7
41
 
8
42
 
9
- def parquet_files_dict_to_iceberg_data_files(io, table_metadata, files_dict_list):
10
- from pyiceberg.io.pyarrow import (
11
- _check_pyarrow_schema_compatible,
12
- data_file_statistics_from_parquet_metadata,
13
- compute_statistics_plan,
14
- parquet_path_to_id_mapping,
15
- )
16
- from pyiceberg.manifest import (
17
- DataFile,
18
- DataFileContent,
19
- FileFormat,
43
def parquet_path_to_id_mapping_override(schema: Schema) -> Dict[str, int]:
    """Return the parquet path -> field ID mapping for ``schema``, extended with
    the reserved position delete columns.

    The base mapping comes from ``parquet_path_to_id_mapping``; the entries for
    ``"file_path"`` and ``"pos"`` are then set (or overwritten) with their
    reserved Iceberg field IDs.
    """
    path_to_field_id = parquet_path_to_id_mapping(schema)
    # Override here to insert position delete reserved column field IDs
    path_to_field_id.update(
        {
            "file_path": ICEBERG_RESERVED_FIELD_ID_FOR_FILE_PATH_COLUMN,
            "pos": ICEBERG_RESERVED_FIELD_ID_FOR_POS_COLUMN,
        }
    )
    return path_to_field_id
49
+
50
+
51
def data_file_statistics_from_parquet_metadata(
    parquet_metadata: pq.FileMetaData,
    stats_columns: Dict[int, StatisticsCollector],
    parquet_column_mapping: Dict[str, int],
) -> DataFileStatistics:
    """
    Overrides original Pyiceberg function: Compute and return DataFileStatistics that includes the following.

    - record_count
    - column_sizes
    - value_counts
    - null_value_counts
    - nan_value_counts (never populated here; always returned empty)
    - column_aggregates
    - split_offsets

    Behavior specific to this override:

    - Columns whose field ID is not present in ``stats_columns`` are skipped.
    - Columns without PyArrow statistics only trigger a warning; they are not
      added to the set of invalidated columns.

    Args:
        parquet_metadata (pyarrow.parquet.FileMetaData): A pyarrow metadata object.
        stats_columns (Dict[int, StatisticsCollector]): The statistics gathering plan. It is required to
            set the mode for column metrics collection
        parquet_column_mapping (Dict[str, int]): The mapping of the parquet column path to the field ID

    Returns:
        DataFileStatistics: The statistics accumulated over every row group of the file.

    Raises:
        KeyError: If a column path found in the file is missing from ``parquet_column_mapping``.
    """
    column_sizes: Dict[int, int] = {}
    value_counts: Dict[int, int] = {}
    split_offsets: List[int] = []

    null_value_counts: Dict[int, int] = {}
    nan_value_counts: Dict[int, int] = {}

    col_aggs = {}

    invalidate_col: Set[int] = set()
    for r in range(parquet_metadata.num_row_groups):
        # References:
        # https://github.com/apache/iceberg/blob/fc381a81a1fdb8f51a0637ca27cd30673bd7aad3/parquet/src/main/java/org/apache/iceberg/parquet/ParquetUtil.java#L232
        # https://github.com/apache/parquet-mr/blob/ac29db4611f86a07cc6877b416aa4b183e09b353/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/metadata/ColumnChunkMetaData.java#L184

        row_group = parquet_metadata.row_group(r)

        # The split offset of a row group is the earliest page offset of its
        # first column: the dictionary page if one exists and precedes the data.
        first_column = row_group.column(0)
        data_offset = first_column.data_page_offset
        dictionary_offset = first_column.dictionary_page_offset

        if first_column.has_dictionary_page and dictionary_offset < data_offset:
            split_offsets.append(dictionary_offset)
        else:
            split_offsets.append(data_offset)

        for pos in range(parquet_metadata.num_columns):
            column = row_group.column(pos)
            field_id = parquet_column_mapping[column.path_in_schema]
            if field_id not in stats_columns:
                # No statistics plan entry for this field: nothing to collect.
                continue

            stats_col = stats_columns[field_id]

            column_sizes.setdefault(field_id, 0)
            column_sizes[field_id] += column.total_compressed_size

            if stats_col.mode == MetricsMode(MetricModeTypes.NONE):
                continue

            value_counts[field_id] = value_counts.get(field_id, 0) + column.num_values

            if not column.is_stats_set:
                # Note: Removed original adding columns without stats to invalid column logic here
                logger.warning(
                    "PyArrow statistics missing for column %d when writing file", pos
                )
                continue

            try:
                statistics = column.statistics

                if statistics.has_null_count:
                    null_value_counts[field_id] = (
                        null_value_counts.get(field_id, 0) + statistics.null_count
                    )

                if stats_col.mode == MetricsMode(MetricModeTypes.COUNTS):
                    continue

                if field_id not in col_aggs:
                    col_aggs[field_id] = StatsAggregator(
                        stats_col.iceberg_type,
                        statistics.physical_type,
                        stats_col.mode.length,
                    )

                col_aggs[field_id].update_min(statistics.min)
                col_aggs[field_id].update_max(statistics.max)

            except pyarrow.lib.ArrowNotImplementedError as e:
                invalidate_col.add(field_id)
                logger.warning(e)

    split_offsets.sort()

    # A column can be invalidated before it ever received an aggregate or a
    # null count (e.g. reading its statistics failed on the first row group),
    # so drop the entries tolerantly instead of using `del`, which would raise
    # KeyError for the missing key.
    for field_id in invalidate_col:
        col_aggs.pop(field_id, None)
        null_value_counts.pop(field_id, None)

    return DataFileStatistics(
        record_count=parquet_metadata.num_rows,
        column_sizes=column_sizes,
        value_counts=value_counts,
        null_value_counts=null_value_counts,
        nan_value_counts=nan_value_counts,
        column_aggregates=col_aggs,
        split_offsets=split_offsets,
    )
21
161
 
22
- data_file_content_type = DataFileContent.POSITION_DELETES
162
+
163
def parquet_files_dict_to_iceberg_data_files(
    io: FileIO,
    table_metadata: Any,
    files_dict: Dict[Any, List[str]],
    file_content_type: DataFileContent,
) -> List[DataFile]:
    """Build one Iceberg ``DataFile`` per parquet file path in ``files_dict``.

    For every path, the parquet footer metadata is read through ``io`` and
    turned into file statistics with ``data_file_statistics_from_parquet_metadata``.

    Args:
        io: File IO used to open each file path via ``new_input``.
        table_metadata: Object providing ``schema()``, ``properties`` and
            ``default_spec_id`` for the target table.
        files_dict: Mapping of partition value to the list of parquet file
            paths that belong to that partition.
        file_content_type: Content type assigned to every created ``DataFile``.

    Returns:
        The created ``DataFile`` objects, in iteration order of ``files_dict``.
    """
    iceberg_files = []
    schema = table_metadata.schema()

    # Both inputs depend only on the table schema/properties, not on the
    # individual file, so compute them once instead of once per file.
    stats_columns = compute_statistics_plan(schema, table_metadata.properties)
    parquet_column_mapping = parquet_path_to_id_mapping_override(schema)

    for partition_value, file_paths in files_dict.items():
        for file_path in file_paths:
            input_file = io.new_input(file_path)
            with input_file.open() as input_stream:
                parquet_metadata = pq.read_metadata(input_stream)

            # Removed _check_pyarrow_schema_compatible() here since reserved columns do not comply with all rules.

            statistics = data_file_statistics_from_parquet_metadata(
                parquet_metadata=parquet_metadata,
                stats_columns=stats_columns,
                parquet_column_mapping=parquet_column_mapping,
            )

            data_file = DataFile(
                content=file_content_type,
                file_path=file_path,
                file_format=FileFormat.PARQUET,
                partition=partition_value,
                file_size_in_bytes=len(input_file),
                sort_order_id=None,
                spec_id=table_metadata.default_spec_id,
                equality_ids=None,
                key_metadata=None,
                **statistics.to_serialized_dict(),
            )
            iceberg_files.append(data_file)
    return iceberg_files
58
201
 
59
202
 
60
- def fetch_all_bucket_files(table):
203
+ def fetch_all_bucket_files(
204
+ table: Table,
205
+ ) -> Tuple[Dict[Any, DataFileList], Dict[Any, DataFileList], Dict[Any, DataFileList]]:
61
206
  # step 1: filter manifests using partition summaries
62
207
  # the filter depends on the partition spec used to write the manifest file, so create a cache of filters for each spec id
63
- from pyiceberg.typedef import (
64
- KeyDefaultDict,
65
- )
66
-
67
208
  data_scan = table.scan()
68
209
  snapshot = data_scan.snapshot()
69
210
  if not snapshot:
@@ -78,23 +219,8 @@ def fetch_all_bucket_files(table):
78
219
 
79
220
  # step 2: filter the data files in each manifest
80
221
  # this filter depends on the partition spec used to write the manifest file
81
- from pyiceberg.expressions.visitors import _InclusiveMetricsEvaluator
82
- from pyiceberg.types import (
83
- strtobool,
84
- )
85
- from pyiceberg.table import _min_sequence_number, _open_manifest
86
- from pyiceberg.utils.concurrent import ExecutorFactory
87
- from itertools import chain
88
- from pyiceberg.manifest import DataFileContent
89
-
90
222
  partition_evaluators = KeyDefaultDict(data_scan._build_partition_evaluator)
91
- metrics_evaluator = _InclusiveMetricsEvaluator(
92
- data_scan.table_metadata.schema(),
93
- data_scan.row_filter,
94
- data_scan.case_sensitive,
95
- strtobool(data_scan.options.get("include_empty_files", "false")),
96
- ).eval
97
-
223
+ residual_evaluators = KeyDefaultDict(data_scan._build_residual_evaluator)
98
224
  min_sequence_number = _min_sequence_number(manifests)
99
225
 
100
226
  # {"bucket_index": List[DataFile]}
@@ -111,7 +237,8 @@ def fetch_all_bucket_files(table):
111
237
  data_scan.io,
112
238
  manifest,
113
239
  partition_evaluators[manifest.partition_spec_id],
114
- metrics_evaluator,
240
+ residual_evaluators[manifest.partition_spec_id],
241
+ data_scan._build_metrics_evaluator(),
115
242
  )
116
243
  for manifest in manifests
117
244
  if data_scan._check_sequence_number(min_sequence_number, manifest)
@@ -122,9 +249,10 @@ def fetch_all_bucket_files(table):
122
249
  file_sequence_number = manifest_entry.sequence_number
123
250
  data_file_tuple = (file_sequence_number, data_file)
124
251
  partition_value = data_file.partition
252
+
125
253
  if data_file.content == DataFileContent.DATA:
126
254
  data_entries[partition_value].append(data_file_tuple)
127
- if data_file.content == DataFileContent.POSITION_DELETES:
255
+ elif data_file.content == DataFileContent.POSITION_DELETES:
128
256
  positional_delete_entries[partition_value].append(data_file_tuple)
129
257
  elif data_file.content == DataFileContent.EQUALITY_DELETES:
130
258
  equality_data_entries[partition_value].append(data_file_tuple)