deltacat 2.0.0b10__py3-none-any.whl → 2.0.0b12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (298) hide show
  1. deltacat/__init__.py +96 -17
  2. deltacat/api.py +122 -67
  3. deltacat/aws/constants.py +0 -23
  4. deltacat/aws/s3u.py +4 -631
  5. deltacat/benchmarking/benchmark_engine.py +4 -2
  6. deltacat/benchmarking/conftest.py +0 -18
  7. deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
  8. deltacat/catalog/__init__.py +64 -5
  9. deltacat/catalog/delegate.py +445 -63
  10. deltacat/catalog/interface.py +188 -62
  11. deltacat/catalog/main/impl.py +2435 -279
  12. deltacat/catalog/model/catalog.py +154 -77
  13. deltacat/catalog/model/properties.py +63 -22
  14. deltacat/compute/compactor/compaction_session.py +97 -75
  15. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  16. deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
  17. deltacat/compute/compactor/model/round_completion_info.py +16 -6
  18. deltacat/compute/compactor/repartition_session.py +8 -21
  19. deltacat/compute/compactor/steps/hash_bucket.py +5 -5
  20. deltacat/compute/compactor/steps/materialize.py +9 -7
  21. deltacat/compute/compactor/steps/repartition.py +12 -11
  22. deltacat/compute/compactor/utils/io.py +6 -5
  23. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  24. deltacat/compute/compactor/utils/system_columns.py +3 -1
  25. deltacat/compute/compactor_v2/compaction_session.py +17 -14
  26. deltacat/compute/compactor_v2/constants.py +30 -1
  27. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  28. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  29. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  30. deltacat/compute/compactor_v2/model/merge_input.py +33 -8
  31. deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
  32. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  33. deltacat/compute/compactor_v2/steps/merge.py +267 -55
  34. deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
  35. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  36. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  37. deltacat/compute/compactor_v2/utils/io.py +11 -4
  38. deltacat/compute/compactor_v2/utils/merge.py +15 -2
  39. deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
  40. deltacat/compute/compactor_v2/utils/task_options.py +45 -33
  41. deltacat/compute/converter/converter_session.py +145 -32
  42. deltacat/compute/converter/model/convert_input.py +26 -19
  43. deltacat/compute/converter/model/convert_input_files.py +33 -16
  44. deltacat/compute/converter/model/convert_result.py +35 -16
  45. deltacat/compute/converter/model/converter_session_params.py +24 -21
  46. deltacat/compute/converter/pyiceberg/catalog.py +21 -18
  47. deltacat/compute/converter/pyiceberg/overrides.py +18 -9
  48. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
  49. deltacat/compute/converter/steps/convert.py +157 -50
  50. deltacat/compute/converter/steps/dedupe.py +24 -11
  51. deltacat/compute/converter/utils/convert_task_options.py +27 -12
  52. deltacat/compute/converter/utils/converter_session_utils.py +126 -60
  53. deltacat/compute/converter/utils/iceberg_columns.py +8 -8
  54. deltacat/compute/converter/utils/io.py +101 -12
  55. deltacat/compute/converter/utils/s3u.py +33 -27
  56. deltacat/compute/janitor.py +205 -0
  57. deltacat/compute/jobs/client.py +25 -12
  58. deltacat/compute/resource_estimation/delta.py +38 -6
  59. deltacat/compute/resource_estimation/model.py +8 -0
  60. deltacat/constants.py +45 -2
  61. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  62. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  63. deltacat/env.py +10 -0
  64. deltacat/examples/basic_logging.py +1 -3
  65. deltacat/examples/compactor/aws/__init__.py +1 -0
  66. deltacat/examples/compactor/bootstrap.py +863 -0
  67. deltacat/examples/compactor/compactor.py +373 -0
  68. deltacat/examples/compactor/explorer.py +473 -0
  69. deltacat/examples/compactor/gcp/__init__.py +1 -0
  70. deltacat/examples/compactor/job_runner.py +439 -0
  71. deltacat/examples/compactor/utils/__init__.py +1 -0
  72. deltacat/examples/compactor/utils/common.py +261 -0
  73. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  74. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  75. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  76. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  77. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  78. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  79. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +3 -5
  80. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
  81. deltacat/examples/indexer/indexer.py +2 -2
  82. deltacat/examples/indexer/job_runner.py +1 -2
  83. deltacat/exceptions.py +66 -4
  84. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  85. deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
  86. deltacat/{catalog → experimental/catalog}/iceberg/impl.py +29 -11
  87. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  88. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  89. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  90. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  91. deltacat/{storage → experimental/storage}/iceberg/impl.py +6 -4
  92. deltacat/{storage → experimental/storage}/iceberg/model.py +7 -3
  93. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  94. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  95. deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
  96. deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -12
  97. deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
  98. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  99. deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
  100. deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
  101. deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
  102. deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
  103. deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
  104. deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
  105. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  106. deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -3
  107. deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
  108. deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
  109. deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
  110. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  111. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  112. deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
  113. deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
  114. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  115. deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
  116. deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
  117. deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
  118. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +7 -6
  119. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
  120. deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
  121. deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
  122. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  123. deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
  124. deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
  125. deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
  126. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  127. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  128. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  129. deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
  130. deltacat/io/datasource/deltacat_datasource.py +0 -1
  131. deltacat/io/reader/deltacat_read_api.py +1 -1
  132. deltacat/storage/__init__.py +20 -2
  133. deltacat/storage/interface.py +54 -32
  134. deltacat/storage/main/impl.py +1494 -541
  135. deltacat/storage/model/delta.py +27 -3
  136. deltacat/storage/model/locator.py +6 -12
  137. deltacat/storage/model/manifest.py +182 -6
  138. deltacat/storage/model/metafile.py +151 -78
  139. deltacat/storage/model/namespace.py +8 -1
  140. deltacat/storage/model/partition.py +117 -42
  141. deltacat/storage/model/schema.py +2427 -159
  142. deltacat/storage/model/shard.py +6 -2
  143. deltacat/storage/model/sort_key.py +40 -0
  144. deltacat/storage/model/stream.py +9 -2
  145. deltacat/storage/model/table.py +12 -1
  146. deltacat/storage/model/table_version.py +11 -0
  147. deltacat/storage/model/transaction.py +1184 -208
  148. deltacat/storage/model/transform.py +81 -2
  149. deltacat/storage/model/types.py +48 -26
  150. deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
  151. deltacat/tests/aws/test_s3u.py +2 -31
  152. deltacat/tests/catalog/data/__init__.py +0 -0
  153. deltacat/tests/catalog/main/__init__.py +0 -0
  154. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  155. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  156. deltacat/tests/catalog/model/__init__.py +0 -0
  157. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  158. deltacat/tests/catalog/test_catalogs.py +103 -106
  159. deltacat/tests/catalog/test_default_catalog_impl.py +12152 -72
  160. deltacat/tests/compute/compact_partition_test_cases.py +35 -8
  161. deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
  162. deltacat/tests/compute/compactor/utils/test_io.py +124 -120
  163. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  164. deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
  165. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
  166. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
  167. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
  168. deltacat/tests/compute/conftest.py +8 -44
  169. deltacat/tests/compute/converter/test_convert_session.py +675 -490
  170. deltacat/tests/compute/converter/utils.py +15 -6
  171. deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
  172. deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
  173. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
  174. deltacat/tests/compute/test_compact_partition_params.py +13 -8
  175. deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
  176. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
  177. deltacat/tests/compute/test_janitor.py +236 -0
  178. deltacat/tests/compute/test_util_common.py +716 -43
  179. deltacat/tests/compute/test_util_constant.py +0 -1
  180. deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
  181. deltacat/tests/daft/__init__.py +0 -0
  182. deltacat/tests/daft/test_model.py +97 -0
  183. deltacat/tests/experimental/__init__.py +1 -0
  184. deltacat/tests/experimental/catalog/__init__.py +0 -0
  185. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  186. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  187. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  188. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  189. deltacat/tests/experimental/daft/__init__.py +0 -0
  190. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  191. deltacat/tests/experimental/storage/__init__.py +0 -0
  192. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  193. deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
  194. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  195. deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -3
  196. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  197. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  198. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  199. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  200. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  201. deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
  202. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  203. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  204. deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +5 -3
  205. deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
  206. deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
  207. deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
  208. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  209. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
  210. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
  211. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
  212. deltacat/tests/storage/main/test_main_storage.py +6900 -95
  213. deltacat/tests/storage/model/test_metafile_io.py +78 -173
  214. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  215. deltacat/tests/storage/model/test_schema.py +171 -0
  216. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  217. deltacat/tests/storage/model/test_shard.py +3 -1
  218. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  219. deltacat/tests/storage/model/test_transaction.py +393 -48
  220. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  221. deltacat/tests/test_deltacat_api.py +988 -4
  222. deltacat/tests/test_exceptions.py +9 -5
  223. deltacat/tests/test_utils/pyarrow.py +52 -21
  224. deltacat/tests/test_utils/storage.py +23 -34
  225. deltacat/tests/types/__init__.py +0 -0
  226. deltacat/tests/types/test_tables.py +104 -0
  227. deltacat/tests/utils/exceptions.py +22 -0
  228. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  229. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  230. deltacat/tests/utils/test_daft.py +121 -31
  231. deltacat/tests/utils/test_numpy.py +1193 -0
  232. deltacat/tests/utils/test_pandas.py +1106 -0
  233. deltacat/tests/utils/test_polars.py +1040 -0
  234. deltacat/tests/utils/test_pyarrow.py +1370 -89
  235. deltacat/types/media.py +224 -14
  236. deltacat/types/tables.py +2329 -59
  237. deltacat/utils/arguments.py +33 -1
  238. deltacat/utils/daft.py +823 -36
  239. deltacat/utils/export.py +3 -1
  240. deltacat/utils/filesystem.py +100 -0
  241. deltacat/utils/metafile_locator.py +2 -1
  242. deltacat/utils/numpy.py +118 -26
  243. deltacat/utils/pandas.py +577 -48
  244. deltacat/utils/polars.py +658 -27
  245. deltacat/utils/pyarrow.py +1258 -213
  246. deltacat/utils/ray_utils/dataset.py +101 -10
  247. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  248. deltacat/utils/url.py +57 -16
  249. deltacat-2.0.0b12.dist-info/METADATA +1163 -0
  250. deltacat-2.0.0b12.dist-info/RECORD +439 -0
  251. {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info}/WHEEL +1 -1
  252. deltacat/catalog/iceberg/__init__.py +0 -4
  253. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  254. deltacat/compute/merge_on_read/__init__.py +0 -4
  255. deltacat/compute/merge_on_read/daft.py +0 -40
  256. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  257. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  258. deltacat/daft/daft_scan.py +0 -115
  259. deltacat/daft/model.py +0 -258
  260. deltacat/daft/translator.py +0 -126
  261. deltacat/examples/common/fixtures.py +0 -15
  262. deltacat/storage/iceberg/iceberg_scan_planner.py +0 -28
  263. deltacat/storage/rivulet/__init__.py +0 -11
  264. deltacat/storage/rivulet/feather/__init__.py +0 -5
  265. deltacat/storage/rivulet/parquet/__init__.py +0 -5
  266. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  267. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
  268. deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
  269. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  270. deltacat/utils/s3fs.py +0 -21
  271. deltacat-2.0.0b10.dist-info/METADATA +0 -68
  272. deltacat-2.0.0b10.dist-info/RECORD +0 -381
  273. /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
  274. /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
  275. /deltacat/{daft → docs/autogen/schema}/__init__.py +0 -0
  276. /deltacat/{examples/common → docs/autogen/schema/inference}/__init__.py +0 -0
  277. /deltacat/examples/{iceberg → compactor}/__init__.py +0 -0
  278. /deltacat/{storage/iceberg → examples/experimental}/__init__.py +0 -0
  279. /deltacat/{storage/rivulet/arrow → examples/experimental/iceberg}/__init__.py +0 -0
  280. /deltacat/{storage/rivulet/fs → examples/experimental/iceberg/converter}/__init__.py +0 -0
  281. /deltacat/{storage/rivulet/metastore → examples/experimental/iceberg/converter/beam}/__init__.py +0 -0
  282. /deltacat/{storage/rivulet/reader → experimental/catalog}/__init__.py +0 -0
  283. /deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +0 -0
  284. /deltacat/{storage/rivulet/schema → experimental/compatibility}/__init__.py +0 -0
  285. /deltacat/{storage/rivulet/writer → experimental/converter_agent}/__init__.py +0 -0
  286. /deltacat/{tests/storage/rivulet → experimental/converter_agent/beam}/__init__.py +0 -0
  287. /deltacat/{tests/storage/rivulet/fs → experimental/storage}/__init__.py +0 -0
  288. /deltacat/{tests/storage/rivulet/schema → experimental/storage/iceberg}/__init__.py +0 -0
  289. /deltacat/{tests/storage/rivulet/writer → experimental/storage/rivulet/arrow}/__init__.py +0 -0
  290. /deltacat/{storage/rivulet/parquet/data_reader.py → experimental/storage/rivulet/fs/__init__.py} +0 -0
  291. /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
  292. /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
  293. /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
  294. /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
  295. /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
  296. /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
  297. {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info/licenses}/LICENSE +0 -0
  298. {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info}/top_level.txt +0 -0
deltacat/__init__.py CHANGED
@@ -8,7 +8,12 @@ from deltacat.api import (
8
8
  list,
9
9
  put,
10
10
  )
11
- from deltacat.catalog.delegate import (
11
+ from deltacat.storage.model.transaction import (
12
+ transaction,
13
+ transactions,
14
+ read_transaction,
15
+ )
16
+ from deltacat.catalog import ( # noqa: F401
12
17
  alter_namespace,
13
18
  alter_table,
14
19
  create_namespace,
@@ -27,22 +32,30 @@ from deltacat.catalog.delegate import (
27
32
  table_exists,
28
33
  truncate_table,
29
34
  write_to_table,
30
- )
31
- from deltacat.catalog.model.catalog import ( # noqa: F401
32
- Catalog,
33
- Catalogs,
34
- raise_if_not_initialized,
35
- is_initialized,
36
35
  init,
36
+ init_local,
37
+ is_initialized,
38
+ clear_catalogs,
37
39
  get_catalog,
40
+ get_catalog_properties,
41
+ pop_catalog,
38
42
  put_catalog,
43
+ raise_if_not_initialized,
44
+ Catalog,
45
+ CatalogProperties,
46
+ TableDefinition,
39
47
  )
40
- from deltacat.catalog.model.table_definition import TableDefinition
41
48
  from deltacat.compute import (
42
49
  job_client,
43
50
  local_job_client,
44
51
  )
45
52
  from deltacat.storage import (
53
+ BucketingStrategy,
54
+ BucketTransform,
55
+ BucketTransformParameters,
56
+ DayTransform,
57
+ HourTransform,
58
+ IdentityTransform,
46
59
  Dataset,
47
60
  DistributedDataset,
48
61
  Field,
@@ -58,22 +71,50 @@ from deltacat.storage import (
58
71
  SortKey,
59
72
  SortOrder,
60
73
  SortScheme,
74
+ TableProperties,
75
+ TransactionStatus,
76
+ Transform,
77
+ TransformName,
78
+ TransformParameters,
79
+ TruncateTransform,
80
+ TruncateTransformParameters,
81
+ TruncateStrategy,
82
+ UnknownTransform,
83
+ VoidTransform,
84
+ YearTransform,
61
85
  NullOrder,
62
86
  )
63
- from deltacat.storage.rivulet import Dataset as RivDataset, Datatype as RivDatatype
64
87
  from deltacat.types.media import (
65
88
  ContentEncoding,
66
89
  ContentType,
67
90
  DatasetType,
68
91
  DatastoreType,
69
92
  )
70
-
71
- from deltacat.types.tables import TableWriteMode
93
+ from deltacat.types.tables import (
94
+ TableWriteMode,
95
+ TableProperty,
96
+ TableReadOptimizationLevel,
97
+ SchemaEvolutionMode,
98
+ from_pandas,
99
+ from_pyarrow,
100
+ from_manifest_table,
101
+ to_pyarrow,
102
+ to_pandas,
103
+ dataset_length,
104
+ dataset_size,
105
+ dataset_column_names,
106
+ dataset_schema,
107
+ )
72
108
  from deltacat.utils.url import DeltaCatUrl
73
109
 
110
+ write = write_to_table
111
+ read = read_table
112
+
74
113
  __iceberg__ = []
75
114
  if importlib.util.find_spec("pyiceberg") is not None:
76
- from deltacat.catalog.iceberg import impl as IcebergCatalog # noqa: F401
115
+ from deltacat.experimental.catalog.iceberg import ( # noqa: F401
116
+ impl as IcebergCatalog,
117
+ )
77
118
 
78
119
  __iceberg__ = [
79
120
  "IcebergCatalog",
@@ -81,7 +122,7 @@ if importlib.util.find_spec("pyiceberg") is not None:
81
122
 
82
123
  deltacat.logs.configure_deltacat_logger(logging.getLogger(__name__))
83
124
 
84
- __version__ = "2.0.0b10"
125
+ __version__ = "2.0.0b12"
85
126
 
86
127
 
87
128
  __all__ = [
@@ -92,6 +133,8 @@ __all__ = [
92
133
  "get",
93
134
  "list",
94
135
  "put",
136
+ "transaction",
137
+ "transactions",
95
138
  "alter_table",
96
139
  "create_table",
97
140
  "drop_table",
@@ -108,28 +151,50 @@ __all__ = [
108
151
  "create_namespace",
109
152
  "drop_namespace",
110
153
  "default_namespace",
154
+ "write",
111
155
  "write_to_table",
156
+ "read",
112
157
  "read_table",
158
+ "read_transaction",
159
+ "init",
160
+ "init_local",
161
+ "is_initialized",
162
+ "clear_catalogs",
113
163
  "get_catalog",
164
+ "get_catalog_properties",
165
+ "pop_catalog",
114
166
  "put_catalog",
115
167
  "raise_if_not_initialized",
116
- "is_initialized",
117
- "init",
168
+ "dataset_length",
169
+ "dataset_size",
170
+ "dataset_column_names",
171
+ "dataset_schema",
172
+ "from_pandas",
173
+ "from_pyarrow",
174
+ "from_manifest_table",
175
+ "to_pandas",
176
+ "to_pyarrow",
177
+ "BucketingStrategy",
178
+ "BucketTransform",
179
+ "BucketTransformParameters",
118
180
  "Catalog",
181
+ "CatalogProperties",
119
182
  "ContentType",
120
183
  "ContentEncoding",
121
184
  "Dataset",
122
185
  "DatasetType",
123
186
  "DatastoreType",
187
+ "DayTransform",
124
188
  "DeltaCatUrl",
125
189
  "DistributedDataset",
126
- "RivDataset",
127
- "RivDatatype",
128
190
  "Field",
191
+ "HourTransform",
192
+ "IdentityTransform",
129
193
  "LifecycleState",
130
194
  "ListResult",
131
195
  "LocalDataset",
132
196
  "LocalTable",
197
+ "MonthTransform",
133
198
  "Namespace",
134
199
  "NullOrder",
135
200
  "PartitionKey",
@@ -141,6 +206,20 @@ __all__ = [
141
206
  "SortScheme",
142
207
  "TableDefinition",
143
208
  "TableWriteMode",
209
+ "TableProperties",
210
+ "TableProperty",
211
+ "TableReadOptimizationLevel",
212
+ "SchemaEvolutionMode",
213
+ "TransactionStatus",
214
+ "Transform",
215
+ "TransformName",
216
+ "TransformParameters",
217
+ "TruncateTransform",
218
+ "TruncateTransformParameters",
219
+ "TruncateStrategy",
220
+ "UnknownTransform",
221
+ "VoidTransform",
222
+ "YearTransform",
144
223
  ]
145
224
 
146
225
  __all__ += __iceberg__
deltacat/api.py CHANGED
@@ -1,6 +1,7 @@
1
1
  import time
2
2
  from dataclasses import dataclass
3
3
  from typing import Any, Union, List, Optional, Dict, Callable, Tuple
4
+ import logging
4
5
 
5
6
  import ray
6
7
  import deltacat as dc
@@ -15,6 +16,12 @@ from deltacat.io import (
15
16
  DeltacatReadType,
16
17
  )
17
18
  from deltacat.storage import (
19
+ Namespace,
20
+ Table,
21
+ TableVersion,
22
+ Stream,
23
+ Partition,
24
+ Delta,
18
25
  Dataset,
19
26
  DistributedDataset,
20
27
  ListResult,
@@ -44,6 +51,9 @@ from deltacat.utils.ray_utils.runtime import (
44
51
  other_live_node_resource_keys,
45
52
  find_max_single_node_resource_type,
46
53
  )
54
+ from deltacat import logs
55
+
56
+ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
47
57
 
48
58
  """
49
59
  # CLI Example of Copying from Source to Dest without file conversion
@@ -72,38 +82,6 @@ from deltacat.utils.ray_utils.runtime import (
72
82
  """
73
83
 
74
84
 
75
- def _copy_dc(
76
- source: DeltaCatUrl,
77
- destination: DeltaCatUrl,
78
- recursive: bool = False,
79
- ) -> Metafile:
80
- if recursive:
81
- src_obj = list(source, recursive=True)
82
- else:
83
- src_obj = get(source) if not source.url.endswith("/*") else list(source)
84
- """
85
- dc_dest_url = DeltacatUrl(destination)
86
- # TODO(pdames): Add writer with support for Ray Dataset DeltaCAT Sink &
87
- # Recursive DeltaCAT source object copies. Ideally, the Ray Dataset read
88
- # is lazy, and only indexes metadata about the objects at source instead
89
- # of eagerly converting them to PyArrow-based Blocks.
90
- dc_dest_url.writer(src_obj, recursive=recursive)
91
- """
92
-
93
- src_parts = source.url.split("/")
94
- src_parts = [part for part in src_parts if part]
95
- dst_parts = destination.url.split("/")
96
- dst_parts = [part for part in dst_parts if part]
97
- dc.raise_if_not_initialized()
98
- if len(src_parts) != len(dst_parts):
99
- # TODO(pdames): Better error message.
100
- raise ValueError(
101
- f"Cannot copy {source} to {destination}. "
102
- f"Source and destination must share the same type."
103
- )
104
- return put(destination, metafile=src_obj)
105
-
106
-
107
85
  def copy(
108
86
  src: DeltaCatUrl,
109
87
  dst: DeltaCatUrl,
@@ -123,6 +101,7 @@ def copy(
123
101
  "gz": 35,
124
102
  "bz2": 35,
125
103
  "zip": 35,
104
+ "zst": 35,
126
105
  "7z": 35,
127
106
  "*": 2.5,
128
107
  },
@@ -135,11 +114,10 @@ def copy(
135
114
  Copies data from the source datastore to the destination datastore. By
136
115
  default, this method launches one parallel Ray process to read/transform
137
116
  each input file found in the source followed by one parallel Ray process
138
- to write each output file to the destination. Files written to the
139
- destination are split or combined to contain uniform record counts. To
140
- ensure that adequate resources are available to complete the operation,
141
- you may optionally specify minimum cluster and/or worker CPUs to wait for
142
- before starting parallel processing.
117
+ to write each output file to the destination. To ensure that adequate
118
+ resources are available to complete the operation, you may optionally
119
+ specify minimum cluster and/or worker CPUs to wait for before starting
120
+ parallel processing.
143
121
 
144
122
  Args:
145
123
  src: DeltaCAT URL of the source datastore to read.
@@ -190,6 +168,73 @@ def copy(
190
168
  )
191
169
 
192
170
 
171
+ def _copy_objects_in_order(
172
+ src_objects: List[Metafile],
173
+ destination: DeltaCatUrl,
174
+ ) -> Union[Metafile, List[Metafile]]:
175
+ dc_dest_url = DeltaCatUrl(destination.url)
176
+ catalog_name = dc_dest_url.catalog_name
177
+
178
+ copied_results = []
179
+
180
+ # Group objects by type for hierarchical copying
181
+ # Copy objects in strict hierarchical order
182
+ # Namespace -> Table -> TableVersion -> Stream -> Partition -> Delta
183
+ ordered_objects_by_type = {
184
+ Namespace: [],
185
+ Table: [],
186
+ TableVersion: [],
187
+ Stream: [],
188
+ Partition: [],
189
+ Delta: [],
190
+ }
191
+
192
+ for obj in src_objects:
193
+ obj_class = Metafile.get_class(obj.to_serializable())
194
+ ordered_objects_by_type[obj_class].append(obj)
195
+
196
+ # TODO(pdames): Support copying uncommitted streams/partitions.
197
+ # TODO(pdames): Support parallel/distributed copies.
198
+ for obj_class, objects in ordered_objects_by_type.items():
199
+ if objects:
200
+ logger.info(f"Copying {len(objects)} {obj_class} objects...")
201
+ if obj_class == TableVersion:
202
+ # sort table versions by ascending table version
203
+ objects.sort(key=lambda x: x.current_version_number())
204
+ if obj_class == Delta:
205
+ # sort deltas by ascending stream position
206
+ objects.sort(key=lambda x: x.stream_position)
207
+ for i, obj in enumerate(objects):
208
+ logger.info(f"Copying object {i+1}/{len(objects)}: {obj.url}")
209
+ dest_url = DeltaCatUrl(obj.url(catalog_name=catalog_name))
210
+ logger.info(f"Destination URL for object {i+1}/{len(objects)}: {dest_url}")
211
+ result = put(dest_url, metafile=obj)
212
+ copied_results.append(result)
213
+ logger.info(f"Successfully copied object {i+1}/{len(objects)}")
214
+ return copied_results[0] if len(copied_results) == 1 else copied_results
215
+
216
+
217
+ def _copy_dc(
218
+ source: DeltaCatUrl,
219
+ destination: DeltaCatUrl,
220
+ recursive: bool = False,
221
+ ) -> Union[Metafile, List[Metafile]]:
222
+ dc.raise_if_not_initialized()
223
+ if len(source.url.split("/")) != len(destination.url.split("/")):
224
+ # TODO(pdames): Better error message.
225
+ raise ValueError(
226
+ f"Cannot copy {source} to {destination}. "
227
+ f"Source and destination must share the same type."
228
+ )
229
+ if recursive:
230
+ src_objects = list(DeltaCatUrl(source.url.rstrip("/**")), recursive=True)
231
+ elif source.url.endswith("/*"):
232
+ src_objects = list(DeltaCatUrl(source.url.rstrip("/*")))
233
+ else:
234
+ src_objects = [get(source)]
235
+ return _copy_objects_in_order(src_objects, destination)
236
+
237
+
193
238
  def concat(source, destination):
194
239
  raise NotImplementedError
195
240
 
@@ -214,9 +259,13 @@ def _list_all_metafiles(
214
259
  metafiles: ListResult[Metafile] = lister(**kwargs)
215
260
  list_results.append(metafiles)
216
261
  if recursive:
262
+ # Process each level of the hierarchy
263
+ current_level_metafiles = [mf for mf in metafiles.all_items()]
264
+
217
265
  for lister, kwarg_name, kwarg_val_resolver_fn in reader.listers:
266
+ next_level_metafiles = []
218
267
  # each subsequent lister needs to inject missing keyword args from the parent metafile
219
- for metafile in metafiles.all_items():
268
+ for metafile in current_level_metafiles:
220
269
  kwargs_update = (
221
270
  {kwarg_name: kwarg_val_resolver_fn(metafile)}
222
271
  if kwarg_name and kwarg_val_resolver_fn
@@ -226,8 +275,11 @@ def _list_all_metafiles(
226
275
  **kwargs,
227
276
  **kwargs_update,
228
277
  }
229
- metafiles = lister(**lister_kwargs)
230
- list_results.append(metafiles)
278
+ child_metafiles = lister(**lister_kwargs)
279
+ list_results.append(child_metafiles)
280
+ next_level_metafiles.extend(child_metafiles.all_items())
281
+ # Move to the next level for the next iteration
282
+ current_level_metafiles = next_level_metafiles
231
283
  return [
232
284
  metafile for list_result in list_results for metafile in list_result.all_items()
233
285
  ]
@@ -308,7 +360,7 @@ def put(
308
360
  *args,
309
361
  **kwargs,
310
362
  ) -> Union[Metafile, str]:
311
- writer = DeltaCatUrlWriter(url, metafile)
363
+ writer = DeltaCatUrlWriter(url, metafile=metafile)
312
364
  return writer.write(*args, **kwargs)
313
365
 
314
366
 
@@ -351,6 +403,7 @@ def _copy_external_ray(
351
403
  "gz": 35,
352
404
  "bz2": 35,
353
405
  "zip": 35,
406
+ "zst": 35,
354
407
  "7z": 35,
355
408
  "*": 2.5,
356
409
  },
@@ -359,7 +412,7 @@ def _copy_external_ray(
359
412
  writer_args: Dict[str, Any] = {},
360
413
  filesystem: pafs.FileSystem = None,
361
414
  ) -> str:
362
- print(f"DeltaCAT Copy Invocation Received at: {time.time_ns()}")
415
+ logger.info(f"DeltaCAT Copy Invocation Received at: {time.time_ns()}")
363
416
 
364
417
  if not isinstance(src, DeltaCatUrl):
365
418
  raise ValueError(f"Expected `src` to be a `DeltaCatUrl` but got `{src}`.")
@@ -367,30 +420,32 @@ def _copy_external_ray(
367
420
  # wait for required resources
368
421
  head_cpu_count = int(current_node_resources()["CPU"])
369
422
  if minimum_worker_cpus > 0:
370
- print(f"Waiting for {minimum_worker_cpus} worker CPUs...")
423
+ logger.info(f"Waiting for {minimum_worker_cpus} worker CPUs...")
371
424
  live_cpu_waiter(
372
425
  min_live_cpus=minimum_worker_cpus + head_cpu_count,
373
426
  )
374
- print(f"{minimum_worker_cpus} worker CPUs found!")
427
+ logger.info(f"{minimum_worker_cpus} worker CPUs found!")
375
428
  # start job execution
376
429
  cluster_resources = ray.cluster_resources()
377
- print(f"Cluster Resources: {cluster_resources}")
378
- print(f"Available Cluster Resources: {ray.available_resources()}")
430
+ logger.info(f"Cluster Resources: {cluster_resources}")
431
+ logger.info(f"Available Cluster Resources: {ray.available_resources()}")
379
432
  cluster_cpus = int(cluster_resources["CPU"])
380
- print(f"Cluster CPUs: {cluster_cpus}")
433
+ logger.info(f"Cluster CPUs: {cluster_cpus}")
381
434
  all_node_resource_keys = live_node_resource_keys()
382
- print(f"Found {len(all_node_resource_keys)} live nodes: {all_node_resource_keys}")
435
+ logger.info(
436
+ f"Found {len(all_node_resource_keys)} live nodes: {all_node_resource_keys}"
437
+ )
383
438
  worker_node_resource_keys = other_live_node_resource_keys()
384
- print(
439
+ logger.info(
385
440
  f"Found {len(worker_node_resource_keys)} live worker nodes: {worker_node_resource_keys}"
386
441
  )
387
442
  worker_cpu_count = cluster_cpus - head_cpu_count
388
- print(f"Total worker CPUs: {worker_cpu_count}")
443
+ logger.info(f"Total worker CPUs: {worker_cpu_count}")
389
444
 
390
445
  # estimate memory requirements based on file extension
391
446
  estimated_memory_bytes = 0
392
447
  if extension_to_memory_multiplier:
393
- print(f"Resolving stats collection filesystem for: {src.url_path}.")
448
+ logger.info(f"Resolving stats collection filesystem for: {src.url_path}.")
394
449
  path, filesystem = resolve_path_and_filesystem(src.url_path, filesystem)
395
450
  if isinstance(filesystem, pafs.GcsFileSystem):
396
451
  from datetime import timedelta
@@ -402,7 +457,7 @@ def _copy_external_ray(
402
457
  anonymous=True,
403
458
  retry_time_limit=timedelta(seconds=10),
404
459
  )
405
- print(f"Using filesystem {type(filesystem)} to get file size of: {path}")
460
+ logger.info(f"Using filesystem {type(filesystem)} to get file size of: {path}")
406
461
  file_info = get_file_info(path, filesystem)
407
462
  if file_info.type != FileType.File:
408
463
  raise ValueError(
@@ -413,11 +468,11 @@ def _copy_external_ray(
413
468
  if inflation_multiplier is None:
414
469
  inflation_multiplier = extension_to_memory_multiplier.get("*")
415
470
  estimated_memory_bytes = inflation_multiplier * file_info.size
416
- print(
471
+ logger.info(
417
472
  f"Estimated Memory Required for Copy: "
418
473
  f"{estimated_memory_bytes/BYTES_PER_GIBIBYTE} GiB"
419
474
  )
420
- print(f"Starting DeltaCAT Copy at: {time.time_ns()}")
475
+ logger.info(f"Starting DeltaCAT Copy at: {time.time_ns()}")
421
476
 
422
477
  index_result = None
423
478
  num_cpus = 1
@@ -436,31 +491,31 @@ def _copy_external_ray(
436
491
  reader_args=reader_args,
437
492
  writer_args=writer_args,
438
493
  )
439
- print(f"Time to Launch Copy Task: {latency} seconds")
494
+ logger.info(f"Time to Launch Copy Task: {latency} seconds")
440
495
  try:
441
496
  index_result, latency = timed_invocation(
442
497
  ray.get,
443
498
  copy_task_pending,
444
499
  )
445
500
  except OutOfMemoryError as e:
446
- print(f"Copy Task Ran Out of Memory: {e}")
501
+ logger.warning(f"Copy Task Ran Out of Memory: {e}")
447
502
  max_single_node_cpus = min(
448
503
  max_allowed_cpus, find_max_single_node_resource_type("CPU")
449
504
  )
450
505
  num_cpus += 1
451
506
  if num_cpus > max_single_node_cpus:
452
507
  raise e
453
- print(f"Retrying Failed Copy Task with {num_cpus} dedicated CPUs")
508
+ logger.info(f"Retrying Failed Copy Task with {num_cpus} dedicated CPUs")
454
509
 
455
- print(f"Time to Launch Copy Task: {latency} seconds")
456
- print(f"Time to Complete Copy Task: {latency} seconds")
510
+ logger.info(f"Time to Launch Copy Task: {latency} seconds")
511
+ logger.info(f"Time to Complete Copy Task: {latency} seconds")
457
512
 
458
513
  total_gib_indexed = index_result.table_size / BYTES_PER_GIBIBYTE
459
514
 
460
- print(f"Records Copied: {index_result.table_length}")
461
- print(f"Bytes Copied: {total_gib_indexed} GiB")
462
- print(f"Conversion Rate: {total_gib_indexed/latency} GiB/s")
463
- print(f"Finished Copy at: {time.time_ns()}")
515
+ logger.info(f"Records Copied: {index_result.table_length}")
516
+ logger.info(f"Bytes Copied: {total_gib_indexed} GiB")
517
+ logger.info(f"Conversion Rate: {total_gib_indexed/latency} GiB/s")
518
+ logger.info(f"Finished Copy at: {time.time_ns()}")
464
519
 
465
520
  return dst.url
466
521
 
@@ -484,13 +539,13 @@ def copy_task(
484
539
  transforms=transforms,
485
540
  reader_args=reader_args,
486
541
  )
487
- print(f"Time to read {src.url_path}: {latency} seconds")
542
+ logger.debug(f"Time to read {src.url_path}: {latency} seconds")
488
543
 
489
544
  table_size = get_table_size(table)
490
- print(f"Table Size: {table_size/BYTES_PER_GIBIBYTE} GiB")
545
+ logger.debug(f"Table Size: {table_size/BYTES_PER_GIBIBYTE} GiB")
491
546
 
492
547
  table_length = get_table_length(table)
493
- print(f"Table Records: {table_length}")
548
+ logger.debug(f"Table Records: {table_length}")
494
549
 
495
550
  writer = DeltaCatUrlWriter(dest, dataset_type)
496
551
  written_file_path, latency = timed_invocation(
@@ -499,7 +554,7 @@ def copy_task(
499
554
  table,
500
555
  **writer_args,
501
556
  )
502
- print(f"Time to write {written_file_path}: {latency}")
557
+ logger.debug(f"Time to write {written_file_path}: {latency}")
503
558
 
504
559
  return CopyResult(table_size, table_length)
505
560
 
deltacat/aws/constants.py CHANGED
@@ -1,32 +1,9 @@
1
- import botocore
2
1
  from typing import Set
3
- from daft.exceptions import DaftTransientError
4
2
  from deltacat.utils.common import env_integer, env_string
5
3
 
6
4
 
7
5
  DAFT_MAX_S3_CONNECTIONS_PER_FILE = env_integer("DAFT_MAX_S3_CONNECTIONS_PER_FILE", 8)
8
- DEFAULT_FILE_READ_TIMEOUT_MS = env_integer(
9
- "DEFAULT_FILE_READ_TIMEOUT_MS", 300_000
10
- ) # 5 mins
11
6
  BOTO_MAX_RETRIES = env_integer("BOTO_MAX_RETRIES", 5)
12
7
  BOTO_TIMEOUT_ERROR_CODES: Set[str] = {"ReadTimeoutError", "ConnectTimeoutError"}
13
8
  BOTO_THROTTLING_ERROR_CODES: Set[str] = {"Throttling", "SlowDown"}
14
- RETRYABLE_TRANSIENT_ERRORS = (
15
- OSError,
16
- botocore.exceptions.ConnectionError,
17
- botocore.exceptions.HTTPClientError,
18
- botocore.exceptions.NoCredentialsError,
19
- botocore.exceptions.ConnectTimeoutError,
20
- botocore.exceptions.ReadTimeoutError,
21
- DaftTransientError,
22
- )
23
9
  AWS_REGION = env_string("AWS_REGION", "us-east-1")
24
- UPLOAD_DOWNLOAD_RETRY_STOP_AFTER_DELAY = env_integer(
25
- "UPLOAD_DOWNLOAD_RETRY_STOP_AFTER_DELAY", 10 * 60
26
- )
27
- UPLOAD_SLICED_TABLE_RETRY_STOP_AFTER_DELAY = env_integer(
28
- "UPLOAD_SLICED_TABLE_RETRY_STOP_AFTER_DELAY", 30 * 60
29
- )
30
- DOWNLOAD_MANIFEST_ENTRY_RETRY_STOP_AFTER_DELAY = env_integer(
31
- "DOWNLOAD_MANIFEST_ENTRY_RETRY_STOP_AFTER_DELAY", 30 * 60
32
- )