deltacat 2.0__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (324) hide show
  1. deltacat/__init__.py +117 -18
  2. deltacat/api.py +536 -126
  3. deltacat/aws/constants.py +0 -23
  4. deltacat/aws/s3u.py +4 -631
  5. deltacat/benchmarking/benchmark_engine.py +4 -2
  6. deltacat/benchmarking/conftest.py +1 -19
  7. deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
  8. deltacat/catalog/__init__.py +64 -5
  9. deltacat/catalog/delegate.py +445 -63
  10. deltacat/catalog/interface.py +188 -62
  11. deltacat/catalog/main/impl.py +2444 -282
  12. deltacat/catalog/model/catalog.py +208 -113
  13. deltacat/catalog/model/properties.py +63 -24
  14. deltacat/compute/__init__.py +14 -0
  15. deltacat/compute/compactor/compaction_session.py +97 -75
  16. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  17. deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
  18. deltacat/compute/compactor/model/round_completion_info.py +16 -6
  19. deltacat/compute/compactor/repartition_session.py +8 -21
  20. deltacat/compute/compactor/steps/hash_bucket.py +5 -5
  21. deltacat/compute/compactor/steps/materialize.py +9 -7
  22. deltacat/compute/compactor/steps/repartition.py +12 -11
  23. deltacat/compute/compactor/utils/io.py +6 -5
  24. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  25. deltacat/compute/compactor/utils/system_columns.py +3 -1
  26. deltacat/compute/compactor_v2/compaction_session.py +17 -14
  27. deltacat/compute/compactor_v2/constants.py +30 -1
  28. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  29. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  30. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  31. deltacat/compute/compactor_v2/model/merge_input.py +33 -8
  32. deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
  33. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  34. deltacat/compute/compactor_v2/steps/merge.py +267 -55
  35. deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
  36. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  37. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  38. deltacat/compute/compactor_v2/utils/io.py +11 -4
  39. deltacat/compute/compactor_v2/utils/merge.py +15 -2
  40. deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
  41. deltacat/compute/compactor_v2/utils/task_options.py +45 -33
  42. deltacat/compute/converter/constants.py +5 -0
  43. deltacat/compute/converter/converter_session.py +207 -52
  44. deltacat/compute/converter/model/convert_input.py +43 -16
  45. deltacat/compute/converter/model/convert_input_files.py +33 -16
  46. deltacat/compute/converter/model/convert_result.py +80 -0
  47. deltacat/compute/converter/model/converter_session_params.py +64 -19
  48. deltacat/compute/converter/pyiceberg/catalog.py +21 -18
  49. deltacat/compute/converter/pyiceberg/overrides.py +193 -65
  50. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
  51. deltacat/compute/converter/steps/convert.py +230 -75
  52. deltacat/compute/converter/steps/dedupe.py +46 -12
  53. deltacat/compute/converter/utils/convert_task_options.py +66 -22
  54. deltacat/compute/converter/utils/converter_session_utils.py +126 -60
  55. deltacat/compute/converter/utils/iceberg_columns.py +13 -8
  56. deltacat/compute/converter/utils/io.py +173 -13
  57. deltacat/compute/converter/utils/s3u.py +42 -27
  58. deltacat/compute/janitor.py +205 -0
  59. deltacat/compute/jobs/client.py +417 -0
  60. deltacat/compute/resource_estimation/delta.py +38 -6
  61. deltacat/compute/resource_estimation/model.py +8 -0
  62. deltacat/constants.py +49 -6
  63. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  64. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  65. deltacat/env.py +10 -0
  66. deltacat/examples/basic_logging.py +6 -6
  67. deltacat/examples/compactor/aws/__init__.py +1 -0
  68. deltacat/examples/compactor/bootstrap.py +863 -0
  69. deltacat/examples/compactor/compactor.py +373 -0
  70. deltacat/examples/compactor/explorer.py +473 -0
  71. deltacat/examples/compactor/gcp/__init__.py +1 -0
  72. deltacat/examples/compactor/job_runner.py +439 -0
  73. deltacat/examples/compactor/utils/__init__.py +1 -0
  74. deltacat/examples/compactor/utils/common.py +261 -0
  75. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  76. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  77. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  78. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  79. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  80. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  81. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +66 -21
  82. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
  83. deltacat/examples/hello_world.py +4 -2
  84. deltacat/examples/indexer/indexer.py +163 -0
  85. deltacat/examples/indexer/job_runner.py +198 -0
  86. deltacat/exceptions.py +66 -4
  87. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  88. deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
  89. deltacat/{catalog → experimental/catalog}/iceberg/impl.py +43 -12
  90. deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +12 -14
  91. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  92. deltacat/experimental/converter_agent/__init__.py +0 -0
  93. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  94. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  95. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  96. deltacat/experimental/daft/__init__.py +4 -0
  97. deltacat/experimental/daft/daft_catalog.py +229 -0
  98. deltacat/experimental/storage/__init__.py +0 -0
  99. deltacat/experimental/storage/iceberg/__init__.py +0 -0
  100. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  101. deltacat/{storage → experimental/storage}/iceberg/impl.py +6 -4
  102. deltacat/{storage → experimental/storage}/iceberg/model.py +7 -3
  103. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  104. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  105. deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
  106. deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
  107. deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -12
  108. deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
  109. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  110. deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
  111. deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
  112. deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
  113. deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
  114. deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
  115. deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
  116. deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
  117. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  118. deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -3
  119. deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
  120. deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
  121. deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
  122. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  123. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  124. deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
  125. deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
  126. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  127. deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
  128. deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
  129. deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
  130. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +7 -6
  131. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
  132. deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
  133. deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
  134. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  135. deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
  136. deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
  137. deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
  138. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  139. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  140. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  141. deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
  142. deltacat/io/__init__.py +13 -0
  143. deltacat/io/dataset/__init__.py +0 -0
  144. deltacat/io/dataset/deltacat_dataset.py +91 -0
  145. deltacat/io/datasink/__init__.py +0 -0
  146. deltacat/io/datasink/deltacat_datasink.py +207 -0
  147. deltacat/io/datasource/__init__.py +0 -0
  148. deltacat/io/datasource/deltacat_datasource.py +579 -0
  149. deltacat/io/reader/__init__.py +0 -0
  150. deltacat/io/reader/deltacat_read_api.py +172 -0
  151. deltacat/storage/__init__.py +22 -2
  152. deltacat/storage/interface.py +54 -32
  153. deltacat/storage/main/impl.py +1494 -541
  154. deltacat/storage/model/delta.py +27 -3
  155. deltacat/storage/model/expression/__init__.py +47 -0
  156. deltacat/storage/model/expression/expression.py +656 -0
  157. deltacat/storage/model/expression/visitor.py +248 -0
  158. deltacat/storage/model/locator.py +6 -12
  159. deltacat/storage/model/manifest.py +231 -6
  160. deltacat/storage/model/metafile.py +224 -119
  161. deltacat/storage/model/namespace.py +8 -1
  162. deltacat/storage/model/partition.py +117 -42
  163. deltacat/storage/model/scan/push_down.py +32 -5
  164. deltacat/storage/model/schema.py +2427 -159
  165. deltacat/storage/model/shard.py +6 -2
  166. deltacat/storage/model/sort_key.py +40 -0
  167. deltacat/storage/model/stream.py +9 -2
  168. deltacat/storage/model/table.py +12 -1
  169. deltacat/storage/model/table_version.py +11 -0
  170. deltacat/storage/model/transaction.py +1184 -208
  171. deltacat/storage/model/transform.py +81 -2
  172. deltacat/storage/model/types.py +53 -29
  173. deltacat/storage/util/__init__.py +0 -0
  174. deltacat/storage/util/scan_planner.py +26 -0
  175. deltacat/tests/_io/reader/__init__.py +0 -0
  176. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  177. deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
  178. deltacat/tests/aws/test_s3u.py +2 -31
  179. deltacat/tests/catalog/data/__init__.py +0 -0
  180. deltacat/tests/catalog/main/__init__.py +0 -0
  181. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  182. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  183. deltacat/tests/catalog/model/__init__.py +0 -0
  184. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  185. deltacat/tests/catalog/test_catalogs.py +103 -106
  186. deltacat/tests/catalog/test_default_catalog_impl.py +12152 -72
  187. deltacat/tests/compute/compact_partition_test_cases.py +35 -8
  188. deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
  189. deltacat/tests/compute/compactor/utils/test_io.py +124 -120
  190. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  191. deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
  192. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
  193. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
  194. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
  195. deltacat/tests/compute/conftest.py +8 -44
  196. deltacat/tests/compute/converter/test_convert_session.py +697 -349
  197. deltacat/tests/compute/converter/utils.py +15 -6
  198. deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
  199. deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
  200. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
  201. deltacat/tests/compute/test_compact_partition_params.py +13 -8
  202. deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
  203. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
  204. deltacat/tests/compute/test_janitor.py +236 -0
  205. deltacat/tests/compute/test_util_common.py +716 -43
  206. deltacat/tests/compute/test_util_constant.py +0 -1
  207. deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
  208. deltacat/tests/daft/__init__.py +0 -0
  209. deltacat/tests/daft/test_model.py +97 -0
  210. deltacat/tests/experimental/__init__.py +1 -0
  211. deltacat/tests/experimental/catalog/__init__.py +0 -0
  212. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  213. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  214. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  215. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  216. deltacat/tests/experimental/daft/__init__.py +0 -0
  217. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  218. deltacat/tests/experimental/storage/__init__.py +0 -0
  219. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  220. deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
  221. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  222. deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -2
  223. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  224. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  225. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  226. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  227. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  228. deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
  229. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  230. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  231. deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +6 -4
  232. deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
  233. deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
  234. deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
  235. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  236. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
  237. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
  238. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
  239. deltacat/tests/storage/main/test_main_storage.py +6900 -95
  240. deltacat/tests/storage/model/test_expression.py +327 -0
  241. deltacat/tests/storage/model/test_manifest.py +129 -0
  242. deltacat/tests/storage/model/test_metafile_io.py +78 -173
  243. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  244. deltacat/tests/storage/model/test_schema.py +171 -0
  245. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  246. deltacat/tests/storage/model/test_shard.py +3 -1
  247. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  248. deltacat/tests/storage/model/test_transaction.py +393 -48
  249. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  250. deltacat/tests/test_deltacat_api.py +1036 -11
  251. deltacat/tests/test_exceptions.py +9 -5
  252. deltacat/tests/test_utils/pyarrow.py +52 -21
  253. deltacat/tests/test_utils/storage.py +23 -34
  254. deltacat/tests/types/__init__.py +0 -0
  255. deltacat/tests/types/test_tables.py +104 -0
  256. deltacat/tests/utils/exceptions.py +22 -0
  257. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  258. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  259. deltacat/tests/utils/test_daft.py +121 -31
  260. deltacat/tests/utils/test_numpy.py +1193 -0
  261. deltacat/tests/utils/test_pandas.py +1106 -0
  262. deltacat/tests/utils/test_polars.py +1040 -0
  263. deltacat/tests/utils/test_pyarrow.py +1370 -89
  264. deltacat/types/media.py +345 -37
  265. deltacat/types/tables.py +2344 -46
  266. deltacat/utils/arguments.py +33 -1
  267. deltacat/utils/daft.py +824 -40
  268. deltacat/utils/export.py +3 -1
  269. deltacat/utils/filesystem.py +139 -9
  270. deltacat/utils/metafile_locator.py +2 -1
  271. deltacat/utils/numpy.py +118 -26
  272. deltacat/utils/pandas.py +577 -48
  273. deltacat/utils/polars.py +759 -0
  274. deltacat/utils/pyarrow.py +1373 -192
  275. deltacat/utils/ray_utils/concurrency.py +1 -1
  276. deltacat/utils/ray_utils/dataset.py +101 -10
  277. deltacat/utils/ray_utils/runtime.py +56 -4
  278. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  279. deltacat/utils/url.py +1325 -0
  280. deltacat-2.0.0.dist-info/METADATA +1163 -0
  281. deltacat-2.0.0.dist-info/RECORD +439 -0
  282. {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
  283. deltacat/catalog/iceberg/__init__.py +0 -4
  284. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  285. deltacat/compute/merge_on_read/__init__.py +0 -4
  286. deltacat/compute/merge_on_read/daft.py +0 -40
  287. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  288. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  289. deltacat/examples/common/fixtures.py +0 -15
  290. deltacat/storage/iceberg/iceberg_scan_planner.py +0 -28
  291. deltacat/storage/rivulet/__init__.py +0 -11
  292. deltacat/storage/rivulet/feather/__init__.py +0 -5
  293. deltacat/storage/rivulet/parquet/__init__.py +0 -5
  294. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  295. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
  296. deltacat/tests/local_deltacat_storage/__init__.py +0 -1235
  297. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  298. deltacat/utils/s3fs.py +0 -21
  299. deltacat-2.0.dist-info/METADATA +0 -65
  300. deltacat-2.0.dist-info/RECORD +0 -347
  301. /deltacat/compute/{merge_on_read/model → jobs}/__init__.py +0 -0
  302. /deltacat/{compute/merge_on_read/utils → docs}/__init__.py +0 -0
  303. /deltacat/{examples/common → docs/autogen}/__init__.py +0 -0
  304. /deltacat/{examples/iceberg → docs/autogen/schema}/__init__.py +0 -0
  305. /deltacat/{storage/iceberg → docs/autogen/schema/inference}/__init__.py +0 -0
  306. /deltacat/{storage/rivulet/arrow → examples/compactor}/__init__.py +0 -0
  307. /deltacat/{storage/rivulet/fs → examples/experimental}/__init__.py +0 -0
  308. /deltacat/{storage/rivulet/metastore → examples/experimental/iceberg}/__init__.py +0 -0
  309. /deltacat/{storage/rivulet/reader → examples/experimental/iceberg/converter}/__init__.py +0 -0
  310. /deltacat/{storage/rivulet/schema → examples/experimental/iceberg/converter/beam}/__init__.py +0 -0
  311. /deltacat/{storage/rivulet/writer → examples/indexer}/__init__.py +0 -0
  312. /deltacat/{tests/storage/rivulet → examples/indexer/aws}/__init__.py +0 -0
  313. /deltacat/{tests/storage/rivulet/fs → examples/indexer/gcp}/__init__.py +0 -0
  314. /deltacat/{tests/storage/rivulet/schema → experimental}/__init__.py +0 -0
  315. /deltacat/{tests/storage/rivulet/writer → experimental/catalog}/__init__.py +0 -0
  316. /deltacat/{storage/rivulet/parquet/data_reader.py → experimental/compatibility/__init__.py} +0 -0
  317. /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
  318. /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
  319. /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
  320. /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
  321. /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
  322. /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
  323. {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
  324. {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
@@ -1,5 +1,5 @@
1
1
  import logging
2
- from typing import Optional, Dict, Any
2
+ from typing import Optional, Dict, Any, List
3
3
  from deltacat import logs
4
4
  from deltacat.storage import (
5
5
  Delta,
@@ -61,6 +61,7 @@ def _estimate_resources_required_to_process_delta_using_previous_inflation(
61
61
  def _estimate_resources_required_to_process_delta_using_type_params(
62
62
  delta: Delta,
63
63
  operation_type: OperationType,
64
+ all_column_names: List[str],
64
65
  estimate_resources_params: EstimateResourcesParams,
65
66
  deltacat_storage: unimplemented_deltacat_storage,
66
67
  deltacat_storage_kwargs: Dict[str, Any],
@@ -93,11 +94,30 @@ def _estimate_resources_required_to_process_delta_using_type_params(
93
94
  on_disk_size_bytes=delta.meta.content_length,
94
95
  ),
95
96
  )
96
-
97
+ file_reader_kwargs_provider = kwargs.get(
98
+ "file_reader_kwargs_provider"
99
+ ) or deltacat_storage_kwargs.get("file_reader_kwargs_provider")
100
+
101
+ """
102
+ NOTE: The file_reader_kwargs_provider parameter can be passed in two ways:
103
+ 1. Nested within deltacat_storage_kwargs during resource estimation
104
+ 2. As a top-level attribute of CompactPartitionsParams during compaction
105
+
106
+ This creates an inconsistent parameter path between resource estimation and compaction flows.
107
+ As a long-term solution, this should be unified to use a single consistent path (either always
108
+ nested in deltacat_storage_kwargs or always as a top-level parameter).
109
+
110
+ For now, this implementation handles the resource estimation case by:
111
+ 1. First checking for file_reader_kwargs_provider as a direct kwarg
112
+ 2. Falling back to deltacat_storage_kwargs if not found
113
+ This approach maintains backward compatibility by not modifying the DELTA_RESOURCE_ESTIMATION_FUNCTIONS signatures.
114
+ """
97
115
  appended = append_content_type_params(
98
116
  delta=delta,
117
+ all_column_names=all_column_names,
99
118
  deltacat_storage=deltacat_storage,
100
119
  deltacat_storage_kwargs=deltacat_storage_kwargs,
120
+ file_reader_kwargs_provider=file_reader_kwargs_provider,
101
121
  )
102
122
 
103
123
  if not appended:
@@ -152,6 +172,10 @@ def _estimate_resources_required_to_process_delta_using_file_sampling(
152
172
  operation_type == OperationType.PYARROW_DOWNLOAD
153
173
  ), "Number of rows can only be estimated for PYARROW_DOWNLOAD operation"
154
174
 
175
+ if not estimate_resources_params.max_files_to_sample:
176
+ # we cannot calculate if we cannot sample
177
+ return None
178
+
155
179
  if not delta.manifest:
156
180
  delta.manifest = deltacat_storage.get_delta_manifest(
157
181
  delta.locator,
@@ -168,10 +192,6 @@ def _estimate_resources_required_to_process_delta_using_file_sampling(
168
192
  ),
169
193
  )
170
194
 
171
- if not estimate_resources_params.max_files_to_sample:
172
- # we cannot calculate if we cannot sample
173
- return None
174
-
175
195
  sampled_in_memory_size = 0.0
176
196
  sampled_on_disk_size = 0.0
177
197
  sampled_num_rows = 0
@@ -234,6 +254,10 @@ RESOURCE_ESTIMATION_METHOD_TO_DELTA_RESOURCE_ESTIMATION_FUNCTIONS = {
234
254
  _estimate_resources_required_to_process_delta_using_file_sampling,
235
255
  _estimate_resources_required_to_process_delta_using_previous_inflation,
236
256
  ],
257
+ ResourceEstimationMethod.FILE_SAMPLING_WITH_PREVIOUS_INFLATION: [
258
+ _estimate_resources_required_to_process_delta_using_file_sampling,
259
+ _estimate_resources_required_to_process_delta_using_previous_inflation,
260
+ ],
237
261
  }
238
262
 
239
263
 
@@ -267,10 +291,18 @@ def estimate_resources_required_to_process_delta(
267
291
  estimate_resources_params.resource_estimation_method
268
292
  )
269
293
 
294
+ all_column_names = deltacat_storage.get_table_version_column_names(
295
+ delta.locator.namespace,
296
+ delta.locator.table_name,
297
+ delta.locator.table_version,
298
+ **deltacat_storage_kwargs,
299
+ )
300
+
270
301
  for func in functions:
271
302
  resources = func(
272
303
  delta=delta,
273
304
  operation_type=operation_type,
305
+ all_column_names=all_column_names,
274
306
  estimate_resources_params=estimate_resources_params,
275
307
  deltacat_storage=deltacat_storage,
276
308
  deltacat_storage_kwargs=deltacat_storage_kwargs,
@@ -23,6 +23,14 @@ class ResourceEstimationMethod(str, Enum):
23
23
  """
24
24
  DEFAULT_V2 = "DEFAULT_V2"
25
25
 
26
+ """
27
+ This approach combines file sampling estimation and inflation based methods
28
+ and runs them in the order specified below:
29
+ 1. FILE_SAMPLING
30
+ 2. PREVIOUS_INFLATION
31
+ """
32
+ FILE_SAMPLING_WITH_PREVIOUS_INFLATION = "FILE_SAMPLING_WITH_PREVIOUS_INFLATION"
33
+
26
34
  """
27
35
  This approach strictly uses previous inflation and average record size to arrive
28
36
  at a resource estimate. It requires users to pass in previous inflation and average
deltacat/constants.py CHANGED
@@ -1,8 +1,10 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import botocore.exceptions
3
4
 
5
+ from daft.exceptions import DaftTransientError
4
6
  from deltacat.utils.common import env_string, env_bool
5
- import os
7
+ from deltacat.utils.common import env_integer
6
8
 
7
9
  # Environment variables
8
10
  DELTACAT_SYS_LOG_LEVEL = env_string("DELTACAT_SYS_LOG_LEVEL", "DEBUG")
@@ -40,14 +42,13 @@ DELTACAT_LOGGER_USE_SINGLE_HANDLER = env_bool(
40
42
  )
41
43
  DELTACAT_ROOT = env_string(
42
44
  "DELTACAT_ROOT",
43
- os.path.join(os.getcwd(), ".deltacat"),
45
+ "",
44
46
  )
45
47
 
46
48
  # CLI Args
47
- METAFILE_FORMAT_KEY = "METAFILE_FORMAT"
48
49
  METAFILE_FORMAT_JSON = "json"
49
50
  METAFILE_FORMAT_MSGPACK = "msgpack"
50
- METAFILE_FORMAT = env_string(METAFILE_FORMAT_KEY, METAFILE_FORMAT_MSGPACK)
51
+ METAFILE_FORMAT = env_string("METAFILE_FORMAT", METAFILE_FORMAT_MSGPACK)
51
52
  SUPPORTED_METAFILE_FORMATS = [METAFILE_FORMAT_JSON, METAFILE_FORMAT_MSGPACK]
52
53
  METAFILE_EXT = {
53
54
  "json": ".json",
@@ -93,14 +94,56 @@ REVISION_DIR_NAME: str = "rev"
93
94
  TXN_DIR_NAME: str = "txn"
94
95
  RUNNING_TXN_DIR_NAME: str = "running"
95
96
  FAILED_TXN_DIR_NAME: str = "failed"
97
+ PAUSED_TXN_DIR_NAME: str = "paused"
96
98
  SUCCESS_TXN_DIR_NAME: str = "success"
99
+ DATA_FILE_DIR_NAME: str = "data"
100
+ REV_DIR_NAME: str = "rev"
97
101
  TXN_PART_SEPARATOR = "_"
102
+
98
103
  # Storage interface defaults
99
104
  # These defaults should be applied in catalog interface implementations
100
105
  # Storage interface implementations should be agnostic to defaults and require full information
101
- DEFAULT_CATALOG = "DEFAULT"
102
- DEFAULT_NAMESPACE = "DEFAULT"
106
+ DEFAULT_CATALOG = "default"
107
+ DEFAULT_NAMESPACE = "default"
103
108
  DEFAULT_TABLE_VERSION = "1"
104
109
  DEFAULT_STREAM_ID = "stream"
105
110
  DEFAULT_PARTITION_ID = "partition"
106
111
  DEFAULT_PARTITION_VALUES = ["default"]
112
+
113
+ # Transaction Status constants
114
+ SUCCESSFULLY_CLEANED = "cleaned"
115
+ CURRENTLY_CLEANING = "cleaning"
116
+ TIMEOUT_TXN = "timedout"
117
+
118
+ # operation timeout constants
119
+ OPERATION_TIMEOUTS = {
120
+ "create": 5,
121
+ "update": 3,
122
+ "delete": 4,
123
+ "read_siblings": 2,
124
+ "read_children": 2,
125
+ "read_latest": 3,
126
+ "read_exists": 1,
127
+ }
128
+ # Upload/Download Retry Defaults
129
+ UPLOAD_DOWNLOAD_RETRY_STOP_AFTER_DELAY = env_integer(
130
+ "UPLOAD_DOWNLOAD_RETRY_STOP_AFTER_DELAY", 10 * 60
131
+ )
132
+ UPLOAD_SLICED_TABLE_RETRY_STOP_AFTER_DELAY = env_integer(
133
+ "UPLOAD_SLICED_TABLE_RETRY_STOP_AFTER_DELAY", 30 * 60
134
+ )
135
+ DOWNLOAD_MANIFEST_ENTRY_RETRY_STOP_AFTER_DELAY = env_integer(
136
+ "DOWNLOAD_MANIFEST_ENTRY_RETRY_STOP_AFTER_DELAY", 30 * 60
137
+ )
138
+ DEFAULT_FILE_READ_TIMEOUT_MS = env_integer(
139
+ "DEFAULT_FILE_READ_TIMEOUT_MS", 300_000
140
+ ) # 5 mins
141
+ RETRYABLE_TRANSIENT_ERRORS = (
142
+ OSError,
143
+ botocore.exceptions.ConnectionError,
144
+ botocore.exceptions.HTTPClientError,
145
+ botocore.exceptions.NoCredentialsError,
146
+ botocore.exceptions.ConnectTimeoutError,
147
+ botocore.exceptions.ReadTimeoutError,
148
+ DaftTransientError,
149
+ )