deltacat 2.0__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (324)
  1. deltacat/__init__.py +117 -18
  2. deltacat/api.py +536 -126
  3. deltacat/aws/constants.py +0 -23
  4. deltacat/aws/s3u.py +4 -631
  5. deltacat/benchmarking/benchmark_engine.py +4 -2
  6. deltacat/benchmarking/conftest.py +1 -19
  7. deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
  8. deltacat/catalog/__init__.py +64 -5
  9. deltacat/catalog/delegate.py +445 -63
  10. deltacat/catalog/interface.py +188 -62
  11. deltacat/catalog/main/impl.py +2444 -282
  12. deltacat/catalog/model/catalog.py +208 -113
  13. deltacat/catalog/model/properties.py +63 -24
  14. deltacat/compute/__init__.py +14 -0
  15. deltacat/compute/compactor/compaction_session.py +97 -75
  16. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  17. deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
  18. deltacat/compute/compactor/model/round_completion_info.py +16 -6
  19. deltacat/compute/compactor/repartition_session.py +8 -21
  20. deltacat/compute/compactor/steps/hash_bucket.py +5 -5
  21. deltacat/compute/compactor/steps/materialize.py +9 -7
  22. deltacat/compute/compactor/steps/repartition.py +12 -11
  23. deltacat/compute/compactor/utils/io.py +6 -5
  24. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  25. deltacat/compute/compactor/utils/system_columns.py +3 -1
  26. deltacat/compute/compactor_v2/compaction_session.py +17 -14
  27. deltacat/compute/compactor_v2/constants.py +30 -1
  28. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  29. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  30. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  31. deltacat/compute/compactor_v2/model/merge_input.py +33 -8
  32. deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
  33. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  34. deltacat/compute/compactor_v2/steps/merge.py +267 -55
  35. deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
  36. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  37. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  38. deltacat/compute/compactor_v2/utils/io.py +11 -4
  39. deltacat/compute/compactor_v2/utils/merge.py +15 -2
  40. deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
  41. deltacat/compute/compactor_v2/utils/task_options.py +45 -33
  42. deltacat/compute/converter/constants.py +5 -0
  43. deltacat/compute/converter/converter_session.py +207 -52
  44. deltacat/compute/converter/model/convert_input.py +43 -16
  45. deltacat/compute/converter/model/convert_input_files.py +33 -16
  46. deltacat/compute/converter/model/convert_result.py +80 -0
  47. deltacat/compute/converter/model/converter_session_params.py +64 -19
  48. deltacat/compute/converter/pyiceberg/catalog.py +21 -18
  49. deltacat/compute/converter/pyiceberg/overrides.py +193 -65
  50. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
  51. deltacat/compute/converter/steps/convert.py +230 -75
  52. deltacat/compute/converter/steps/dedupe.py +46 -12
  53. deltacat/compute/converter/utils/convert_task_options.py +66 -22
  54. deltacat/compute/converter/utils/converter_session_utils.py +126 -60
  55. deltacat/compute/converter/utils/iceberg_columns.py +13 -8
  56. deltacat/compute/converter/utils/io.py +173 -13
  57. deltacat/compute/converter/utils/s3u.py +42 -27
  58. deltacat/compute/janitor.py +205 -0
  59. deltacat/compute/jobs/client.py +417 -0
  60. deltacat/compute/resource_estimation/delta.py +38 -6
  61. deltacat/compute/resource_estimation/model.py +8 -0
  62. deltacat/constants.py +49 -6
  63. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  64. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  65. deltacat/env.py +10 -0
  66. deltacat/examples/basic_logging.py +6 -6
  67. deltacat/examples/compactor/aws/__init__.py +1 -0
  68. deltacat/examples/compactor/bootstrap.py +863 -0
  69. deltacat/examples/compactor/compactor.py +373 -0
  70. deltacat/examples/compactor/explorer.py +473 -0
  71. deltacat/examples/compactor/gcp/__init__.py +1 -0
  72. deltacat/examples/compactor/job_runner.py +439 -0
  73. deltacat/examples/compactor/utils/__init__.py +1 -0
  74. deltacat/examples/compactor/utils/common.py +261 -0
  75. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  76. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  77. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  78. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  79. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  80. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  81. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +66 -21
  82. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
  83. deltacat/examples/hello_world.py +4 -2
  84. deltacat/examples/indexer/indexer.py +163 -0
  85. deltacat/examples/indexer/job_runner.py +198 -0
  86. deltacat/exceptions.py +66 -4
  87. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  88. deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
  89. deltacat/{catalog → experimental/catalog}/iceberg/impl.py +43 -12
  90. deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +12 -14
  91. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  92. deltacat/experimental/converter_agent/__init__.py +0 -0
  93. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  94. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  95. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  96. deltacat/experimental/daft/__init__.py +4 -0
  97. deltacat/experimental/daft/daft_catalog.py +229 -0
  98. deltacat/experimental/storage/__init__.py +0 -0
  99. deltacat/experimental/storage/iceberg/__init__.py +0 -0
  100. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  101. deltacat/{storage → experimental/storage}/iceberg/impl.py +6 -4
  102. deltacat/{storage → experimental/storage}/iceberg/model.py +7 -3
  103. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  104. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  105. deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
  106. deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
  107. deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -12
  108. deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
  109. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  110. deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
  111. deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
  112. deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
  113. deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
  114. deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
  115. deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
  116. deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
  117. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  118. deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -3
  119. deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
  120. deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
  121. deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
  122. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  123. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  124. deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
  125. deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
  126. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  127. deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
  128. deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
  129. deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
  130. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +7 -6
  131. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
  132. deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
  133. deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
  134. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  135. deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
  136. deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
  137. deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
  138. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  139. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  140. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  141. deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
  142. deltacat/io/__init__.py +13 -0
  143. deltacat/io/dataset/__init__.py +0 -0
  144. deltacat/io/dataset/deltacat_dataset.py +91 -0
  145. deltacat/io/datasink/__init__.py +0 -0
  146. deltacat/io/datasink/deltacat_datasink.py +207 -0
  147. deltacat/io/datasource/__init__.py +0 -0
  148. deltacat/io/datasource/deltacat_datasource.py +579 -0
  149. deltacat/io/reader/__init__.py +0 -0
  150. deltacat/io/reader/deltacat_read_api.py +172 -0
  151. deltacat/storage/__init__.py +22 -2
  152. deltacat/storage/interface.py +54 -32
  153. deltacat/storage/main/impl.py +1494 -541
  154. deltacat/storage/model/delta.py +27 -3
  155. deltacat/storage/model/expression/__init__.py +47 -0
  156. deltacat/storage/model/expression/expression.py +656 -0
  157. deltacat/storage/model/expression/visitor.py +248 -0
  158. deltacat/storage/model/locator.py +6 -12
  159. deltacat/storage/model/manifest.py +231 -6
  160. deltacat/storage/model/metafile.py +224 -119
  161. deltacat/storage/model/namespace.py +8 -1
  162. deltacat/storage/model/partition.py +117 -42
  163. deltacat/storage/model/scan/push_down.py +32 -5
  164. deltacat/storage/model/schema.py +2427 -159
  165. deltacat/storage/model/shard.py +6 -2
  166. deltacat/storage/model/sort_key.py +40 -0
  167. deltacat/storage/model/stream.py +9 -2
  168. deltacat/storage/model/table.py +12 -1
  169. deltacat/storage/model/table_version.py +11 -0
  170. deltacat/storage/model/transaction.py +1184 -208
  171. deltacat/storage/model/transform.py +81 -2
  172. deltacat/storage/model/types.py +53 -29
  173. deltacat/storage/util/__init__.py +0 -0
  174. deltacat/storage/util/scan_planner.py +26 -0
  175. deltacat/tests/_io/reader/__init__.py +0 -0
  176. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  177. deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
  178. deltacat/tests/aws/test_s3u.py +2 -31
  179. deltacat/tests/catalog/data/__init__.py +0 -0
  180. deltacat/tests/catalog/main/__init__.py +0 -0
  181. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  182. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  183. deltacat/tests/catalog/model/__init__.py +0 -0
  184. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  185. deltacat/tests/catalog/test_catalogs.py +103 -106
  186. deltacat/tests/catalog/test_default_catalog_impl.py +12152 -72
  187. deltacat/tests/compute/compact_partition_test_cases.py +35 -8
  188. deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
  189. deltacat/tests/compute/compactor/utils/test_io.py +124 -120
  190. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  191. deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
  192. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
  193. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
  194. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
  195. deltacat/tests/compute/conftest.py +8 -44
  196. deltacat/tests/compute/converter/test_convert_session.py +697 -349
  197. deltacat/tests/compute/converter/utils.py +15 -6
  198. deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
  199. deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
  200. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
  201. deltacat/tests/compute/test_compact_partition_params.py +13 -8
  202. deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
  203. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
  204. deltacat/tests/compute/test_janitor.py +236 -0
  205. deltacat/tests/compute/test_util_common.py +716 -43
  206. deltacat/tests/compute/test_util_constant.py +0 -1
  207. deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
  208. deltacat/tests/daft/__init__.py +0 -0
  209. deltacat/tests/daft/test_model.py +97 -0
  210. deltacat/tests/experimental/__init__.py +1 -0
  211. deltacat/tests/experimental/catalog/__init__.py +0 -0
  212. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  213. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  214. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  215. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  216. deltacat/tests/experimental/daft/__init__.py +0 -0
  217. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  218. deltacat/tests/experimental/storage/__init__.py +0 -0
  219. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  220. deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
  221. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  222. deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -2
  223. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  224. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  225. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  226. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  227. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  228. deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
  229. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  230. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  231. deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +6 -4
  232. deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
  233. deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
  234. deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
  235. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  236. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
  237. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
  238. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
  239. deltacat/tests/storage/main/test_main_storage.py +6900 -95
  240. deltacat/tests/storage/model/test_expression.py +327 -0
  241. deltacat/tests/storage/model/test_manifest.py +129 -0
  242. deltacat/tests/storage/model/test_metafile_io.py +78 -173
  243. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  244. deltacat/tests/storage/model/test_schema.py +171 -0
  245. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  246. deltacat/tests/storage/model/test_shard.py +3 -1
  247. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  248. deltacat/tests/storage/model/test_transaction.py +393 -48
  249. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  250. deltacat/tests/test_deltacat_api.py +1036 -11
  251. deltacat/tests/test_exceptions.py +9 -5
  252. deltacat/tests/test_utils/pyarrow.py +52 -21
  253. deltacat/tests/test_utils/storage.py +23 -34
  254. deltacat/tests/types/__init__.py +0 -0
  255. deltacat/tests/types/test_tables.py +104 -0
  256. deltacat/tests/utils/exceptions.py +22 -0
  257. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  258. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  259. deltacat/tests/utils/test_daft.py +121 -31
  260. deltacat/tests/utils/test_numpy.py +1193 -0
  261. deltacat/tests/utils/test_pandas.py +1106 -0
  262. deltacat/tests/utils/test_polars.py +1040 -0
  263. deltacat/tests/utils/test_pyarrow.py +1370 -89
  264. deltacat/types/media.py +345 -37
  265. deltacat/types/tables.py +2344 -46
  266. deltacat/utils/arguments.py +33 -1
  267. deltacat/utils/daft.py +824 -40
  268. deltacat/utils/export.py +3 -1
  269. deltacat/utils/filesystem.py +139 -9
  270. deltacat/utils/metafile_locator.py +2 -1
  271. deltacat/utils/numpy.py +118 -26
  272. deltacat/utils/pandas.py +577 -48
  273. deltacat/utils/polars.py +759 -0
  274. deltacat/utils/pyarrow.py +1373 -192
  275. deltacat/utils/ray_utils/concurrency.py +1 -1
  276. deltacat/utils/ray_utils/dataset.py +101 -10
  277. deltacat/utils/ray_utils/runtime.py +56 -4
  278. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  279. deltacat/utils/url.py +1325 -0
  280. deltacat-2.0.0.dist-info/METADATA +1163 -0
  281. deltacat-2.0.0.dist-info/RECORD +439 -0
  282. {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
  283. deltacat/catalog/iceberg/__init__.py +0 -4
  284. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  285. deltacat/compute/merge_on_read/__init__.py +0 -4
  286. deltacat/compute/merge_on_read/daft.py +0 -40
  287. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  288. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  289. deltacat/examples/common/fixtures.py +0 -15
  290. deltacat/storage/iceberg/iceberg_scan_planner.py +0 -28
  291. deltacat/storage/rivulet/__init__.py +0 -11
  292. deltacat/storage/rivulet/feather/__init__.py +0 -5
  293. deltacat/storage/rivulet/parquet/__init__.py +0 -5
  294. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  295. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
  296. deltacat/tests/local_deltacat_storage/__init__.py +0 -1235
  297. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  298. deltacat/utils/s3fs.py +0 -21
  299. deltacat-2.0.dist-info/METADATA +0 -65
  300. deltacat-2.0.dist-info/RECORD +0 -347
  301. /deltacat/compute/{merge_on_read/model → jobs}/__init__.py +0 -0
  302. /deltacat/{compute/merge_on_read/utils → docs}/__init__.py +0 -0
  303. /deltacat/{examples/common → docs/autogen}/__init__.py +0 -0
  304. /deltacat/{examples/iceberg → docs/autogen/schema}/__init__.py +0 -0
  305. /deltacat/{storage/iceberg → docs/autogen/schema/inference}/__init__.py +0 -0
  306. /deltacat/{storage/rivulet/arrow → examples/compactor}/__init__.py +0 -0
  307. /deltacat/{storage/rivulet/fs → examples/experimental}/__init__.py +0 -0
  308. /deltacat/{storage/rivulet/metastore → examples/experimental/iceberg}/__init__.py +0 -0
  309. /deltacat/{storage/rivulet/reader → examples/experimental/iceberg/converter}/__init__.py +0 -0
  310. /deltacat/{storage/rivulet/schema → examples/experimental/iceberg/converter/beam}/__init__.py +0 -0
  311. /deltacat/{storage/rivulet/writer → examples/indexer}/__init__.py +0 -0
  312. /deltacat/{tests/storage/rivulet → examples/indexer/aws}/__init__.py +0 -0
  313. /deltacat/{tests/storage/rivulet/fs → examples/indexer/gcp}/__init__.py +0 -0
  314. /deltacat/{tests/storage/rivulet/schema → experimental}/__init__.py +0 -0
  315. /deltacat/{tests/storage/rivulet/writer → experimental/catalog}/__init__.py +0 -0
  316. /deltacat/{storage/rivulet/parquet/data_reader.py → experimental/compatibility/__init__.py} +0 -0
  317. /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
  318. /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
  319. /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
  320. /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
  321. /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
  322. /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
  323. {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
  324. {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
@@ -1,66 +0,0 @@
1
- from __future__ import annotations
2
- from typing import Optional, Dict, List, Union, Any
3
- from deltacat.storage import (
4
- Delta,
5
- DeltaLocator,
6
- interface as unimplemented_deltacat_storage,
7
- )
8
-
9
-
10
class MergeOnReadParams(dict):
    """
    Dictionary-backed container for the parameters of a merge-on-read
    operation.

    Every property reads from / writes to the dictionary key of the same
    name, so an instance is always a plain ``dict`` of its own settings.
    """

    @staticmethod
    def of(params: Optional[Dict]) -> MergeOnReadParams:
        """
        Build a ``MergeOnReadParams`` from a plain dictionary.

        Args:
            params: Raw parameters. ``None`` is treated as an empty
                dictionary. The input dictionary is copied, not mutated.

        Returns:
            A new instance holding the given entries, with
            ``deltacat_storage``, ``reader_kwargs`` and
            ``deltacat_storage_kwargs`` filled in with defaults when absent.

        Raises:
            AssertionError: If ``"deltas"`` is missing or ``None``.
        """
        params = {} if params is None else params

        result = MergeOnReadParams(params)
        # Use .get() so a missing key fails with the descriptive message
        # instead of a bare KeyError, and raise explicitly so the check is
        # not stripped when Python runs with -O.
        if result.get("deltas") is None:
            raise AssertionError("deltas is a required arg")

        result.deltacat_storage = params.get(
            "deltacat_storage", unimplemented_deltacat_storage
        )
        result.reader_kwargs = params.get("reader_kwargs", {})
        result.deltacat_storage_kwargs = params.get("deltacat_storage_kwargs", {})

        return result

    @property
    def deltas(self) -> List[Union[Delta, DeltaLocator]]:
        """
        The list of deltas to compact in-memory.
        """
        return self["deltas"]

    @deltas.setter
    def deltas(self, to_set: List[Union[Delta, DeltaLocator]]) -> None:
        self["deltas"] = to_set

    @property
    def reader_kwargs(self) -> Dict[Any, Any]:
        """
        The key word arguments to be passed to the reader.
        """
        return self["reader_kwargs"]

    @reader_kwargs.setter
    def reader_kwargs(self, kwargs: Dict[Any, Any]) -> None:
        self["reader_kwargs"] = kwargs

    @property
    def deltacat_storage(self) -> unimplemented_deltacat_storage:
        """
        The storage implementation stored under ``"deltacat_storage"``.
        """
        return self["deltacat_storage"]

    @deltacat_storage.setter
    def deltacat_storage(self, storage: unimplemented_deltacat_storage) -> None:
        self["deltacat_storage"] = storage

    @property
    def deltacat_storage_kwargs(self) -> dict:
        """
        The key word arguments stored under ``"deltacat_storage_kwargs"``.
        """
        return self["deltacat_storage_kwargs"]

    @deltacat_storage_kwargs.setter
    def deltacat_storage_kwargs(self, kwargs: dict) -> None:
        self["deltacat_storage_kwargs"] = kwargs
@@ -1,42 +0,0 @@
1
- from typing import List, Dict, Any, Optional, Union
2
- from deltacat.storage.model.delta import Delta, DeltaLocator
3
- from deltacat.storage.model.types import DistributedDataset
4
- from deltacat.storage import (
5
- interface as unimplemented_deltacat_storage,
6
- )
7
- from deltacat.types.media import TableType, StorageType, DistributedDatasetType
8
-
9
-
10
def create_df_from_all_deltas(
    deltas: List[Union[Delta, DeltaLocator]],
    table_type: TableType,
    distributed_dataset_type: DistributedDatasetType,
    reader_kwargs: Optional[Dict[Any, Any]] = None,
    deltacat_storage=unimplemented_deltacat_storage,
    deltacat_storage_kwargs: Optional[Dict[Any, Any]] = None,
    *args,
    **kwargs
) -> List[DistributedDataset]:  # type: ignore
    """
    Download each delta as a distributed dataset and return the results.

    Args:
        deltas: Deltas (or delta locators) to download, in order.
        table_type: Forwarded to ``download_delta`` as ``table_type``.
        distributed_dataset_type: Forwarded to ``download_delta`` as
            ``distributed_dataset_type``.
        reader_kwargs: Extra keyword arguments for ``download_delta``;
            ``None`` means no extras.
        deltacat_storage: Object providing ``download_delta``.
        deltacat_storage_kwargs: Further keyword arguments for
            ``download_delta``; ``None`` means no extras.
        *args: Accepted but unused.
        **kwargs: Accepted but unused.

    Returns:
        One ``download_delta`` result per input delta, in input order.
    """
    reader_options = {} if reader_kwargs is None else reader_kwargs
    storage_options = (
        {} if deltacat_storage_kwargs is None else deltacat_storage_kwargs
    )

    # Every download is requested with distributed storage.
    return [
        deltacat_storage.download_delta(
            delta_like=current_delta,
            table_type=table_type,
            distributed_dataset_type=distributed_dataset_type,
            storage_type=StorageType.DISTRIBUTED,
            **reader_options,
            **storage_options
        )
        for current_delta in deltas
    ]
@@ -1,15 +0,0 @@
1
- import os
2
- import logging
3
- import argparse
4
- from deltacat import logs
5
-
6
- logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
7
-
8
-
9
def store_cli_args_in_os_environ(script_args_list=None):
    """
    Parse command-line arguments and export them as environment variables.

    Args:
        script_args_list: Iterable of ``(args, kwargs)`` pairs. Each pair is
            forwarded to ``ArgumentParser.add_argument(*args, **kwargs)``.
            ``None`` (the default) or an empty iterable registers no
            arguments.

    Side effects:
        Prints the parsed namespace and updates ``os.environ`` with one
        entry per parsed argument, keyed by the argument's destination name.
        Arguments whose parsed value is ``None`` are not exported; all other
        values are converted with ``str()``.
    """
    parser = argparse.ArgumentParser()
    for flag_args, flag_kwargs in script_args_list or ():
        parser.add_argument(*flag_args, **flag_kwargs)
    parsed = parser.parse_args()
    print(f"Command Line Arguments: {parsed}")
    # os.environ only accepts string values. argparse yields None for every
    # option that was not supplied and arbitrary objects for typed options,
    # so skip the former and stringify the latter instead of raising
    # TypeError.
    os.environ.update(
        {
            name: str(value)
            for name, value in vars(parsed).items()
            if value is not None
        }
    )
@@ -1,28 +0,0 @@
1
- from typing import Optional
2
-
3
- from pyiceberg.catalog import Catalog
4
- from deltacat.storage.model.scan.push_down import Pushdown
5
- from deltacat.storage.model.scan.scan_plan import ScanPlan
6
- from deltacat.storage.model.scan.scan_task import FileScanTask, DataFile
7
- from deltacat.storage.util.scan_planner import ScanPlanner
8
- from deltacat.storage.iceberg.impl import _try_load_iceberg_table
9
-
10
-
11
class IcebergScanPlanner(ScanPlanner):
    """
    Scan planner that derives scan plans from tables in an Iceberg catalog.
    """

    def __init__(self, catalog: Catalog):
        """
        Args:
            catalog: Catalog used to load the tables to be scanned.
        """
        self.catalog = catalog

    def create_scan_plan(
        self,
        table_name: str,
        namespace: Optional[str] = None,
        pushdown: Optional[Pushdown] = None,
    ) -> ScanPlan:
        """
        Build a scan plan with one single-file scan task per planned file.

        Args:
            table_name: Name of the table to load from the catalog.
            namespace: Namespace passed to the table loader.
            pushdown: Currently unused; see the TODO below.

        Returns:
            A ``ScanPlan`` wrapping one ``FileScanTask`` for each file
            returned by the table's ``scan().plan_files()``.
        """
        table = _try_load_iceberg_table(
            self.catalog, namespace=namespace, table_name=table_name
        )
        # TODO: implement predicate pushdown to Iceberg
        planned_files = table.scan().plan_files()
        tasks = [
            FileScanTask([DataFile(planned.file.file_path)])
            for planned in planned_files
        ]
        return ScanPlan(tasks)
@@ -1,11 +0,0 @@
1
- from .schema.schema import Schema
2
- from .schema.schema import Field
3
- from .dataset import Dataset
4
- from .schema.schema import Datatype
5
-
6
- __all__ = [
7
- "Schema",
8
- "Field",
9
- "Dataset",
10
- "Datatype",
11
- ]
@@ -1,5 +0,0 @@
1
- # TODO later on this will be moved to a dedicated package
2
- from deltacat.storage.rivulet.feather.file_reader import FeatherFileReader
3
- from deltacat.storage.rivulet.reader.reader_type_registrar import FileReaderRegistrar
4
-
5
- FileReaderRegistrar.register_reader("feather", FeatherFileReader)
@@ -1,5 +0,0 @@
1
- # TODO later on this will be moved to a dedicated package
2
- from deltacat.storage.rivulet.parquet.file_reader import ParquetFileReader
3
- from deltacat.storage.rivulet.reader.reader_type_registrar import FileReaderRegistrar
4
-
5
- FileReaderRegistrar.register_reader("parquet", ParquetFileReader)
@@ -1,231 +0,0 @@
1
- import pytest
2
- import os
3
- from moto import mock_s3
4
- import boto3
5
- from boto3.resources.base import ServiceResource
6
- from deltacat.compute.compactor.utils.round_completion_file import (
7
- read_round_completion_file,
8
- write_round_completion_file,
9
- )
10
- from deltacat.tests.compute.test_util_common import get_test_partition_locator
11
- from deltacat.compute.compactor import RoundCompletionInfo
12
-
13
- RCF_BUCKET_NAME = "rcf-bucket"
14
-
15
-
16
@pytest.fixture(autouse=True, scope="module")
def mock_aws_credential():
    """
    Export placeholder AWS credentials and a default region for the module.

    The values are the dummy string "testing" so that no real credentials
    are picked up while the tests run.
    """
    os.environ["AWS_ACCESS_KEY_ID"] = "testing"
    # The secret key variable is AWS_SECRET_ACCESS_KEY; the previous name
    # (AWS_SECRET_ACCESS_ID) is not a variable boto3 reads, which left any
    # real secret key from the environment in effect.
    os.environ["AWS_SECRET_ACCESS_KEY"] = "testing"
    os.environ["AWS_SECURITY_TOKEN"] = "testing"
    os.environ["AWS_SESSION_TOKEN"] = "testing"
    os.environ["AWS_DEFAULT_REGION"] = "us-east-1"
    yield
24
-
25
-
26
@pytest.fixture(autouse=True, scope="module")
def s3_resource(mock_aws_credential):
    """
    Yield a boto3 S3 resource created inside a ``mock_s3`` context.

    The mock stays active for as long as the fixture is in use.
    """
    with mock_s3():
        resource = boto3.resource("s3")
        yield resource
30
-
31
-
32
@pytest.fixture(autouse=True, scope="function")
def setup_compaction_artifacts_s3_bucket(s3_resource: ServiceResource):
    """
    Create the round-completion-file bucket before each test and delete all
    of its objects afterwards.
    """
    s3_resource.create_bucket(Bucket=RCF_BUCKET_NAME, ACL="authenticated-read")
    yield
    # Teardown: remove every object written by the test.
    bucket = s3_resource.Bucket(RCF_BUCKET_NAME)
    bucket.objects.all().delete()
40
-
41
-
42
class TestReadWriteRoundCompletionFile:
    """
    Round-trip tests for ``write_round_completion_file`` and
    ``read_round_completion_file`` against the mocked RCF bucket.
    """

    @staticmethod
    def _make_rcf(high_watermark=122, sort_keys_bit_width=12):
        """Build the RoundCompletionInfo fixture shared by these tests."""
        return RoundCompletionInfo.of(
            high_watermark=high_watermark,
            compacted_delta_locator={},
            compacted_pyarrow_write_result={},
            sort_keys_bit_width=sort_keys_bit_width,
        )

    def test_read_when_rcf_written_without_destination(self):
        """
        Backward compatibility: an RCF written without a destination locator
        is still returned when read with a destination locator.
        """
        source_locator = get_test_partition_locator("source")
        destination_locator = get_test_partition_locator("destination")
        expected_rcf = self._make_rcf()

        rcf_url = write_round_completion_file(
            RCF_BUCKET_NAME, source_locator, None, expected_rcf
        )

        rcf = read_round_completion_file(
            RCF_BUCKET_NAME, source_locator, destination_locator
        )

        assert (
            rcf_url == "s3://rcf-bucket/f9829af39770d904dbb811bd8f4e886dd307f507.json"
        )
        assert rcf == expected_rcf

    def test_read_when_rcf_written_with_destination(self):
        """
        An RCF written with a destination locator is stored under a
        source/destination URL and read back with the same locators.
        """
        source_locator = get_test_partition_locator("source")
        destination_locator = get_test_partition_locator("destination")
        expected_rcf = self._make_rcf()

        rcf_url = write_round_completion_file(
            RCF_BUCKET_NAME, source_locator, destination_locator, expected_rcf
        )

        rcf = read_round_completion_file(
            RCF_BUCKET_NAME, source_locator, destination_locator
        )

        assert (
            rcf_url
            == "s3://rcf-bucket/f9829af39770d904dbb811bd8f4e886dd307f507/e9939deadc091b3289a2eb0ca56b1ba86b9892f4.json"
        )
        assert rcf == expected_rcf

    def test_read_without_destination_when_rcf_written_with_destination(self):
        """
        Reading without a destination locator returns None when the RCF was
        written with one.
        """
        source_locator = get_test_partition_locator("source")
        destination_locator = get_test_partition_locator("destination")
        expected_rcf = self._make_rcf()

        write_round_completion_file(
            RCF_BUCKET_NAME, source_locator, destination_locator, expected_rcf
        )

        rcf = read_round_completion_file(RCF_BUCKET_NAME, source_locator, None)

        assert rcf is None

    def test_read_without_destination_when_rcf_written_without_destination(self):
        """
        An RCF written without a destination locator is read back when no
        destination locator is given.
        """
        source_locator = get_test_partition_locator("source")
        expected_rcf = self._make_rcf()

        write_round_completion_file(RCF_BUCKET_NAME, source_locator, None, expected_rcf)

        rcf = read_round_completion_file(RCF_BUCKET_NAME, source_locator, None)

        assert rcf == expected_rcf

    def test_read_when_rcf_written_both_with_and_without_destination(self):
        """
        When RCFs exist both with and without a destination locator, reading
        with the destination locator returns the one written with it.
        """
        source_locator = get_test_partition_locator("source")
        destination_locator = get_test_partition_locator("destination")
        expected_rcf = self._make_rcf()
        expected_rcf_2 = self._make_rcf(high_watermark=1223, sort_keys_bit_width=1233)

        write_round_completion_file(RCF_BUCKET_NAME, source_locator, None, expected_rcf)

        write_round_completion_file(
            RCF_BUCKET_NAME, source_locator, destination_locator, expected_rcf_2
        )

        rcf = read_round_completion_file(
            RCF_BUCKET_NAME, source_locator, destination_locator
        )

        assert rcf == expected_rcf_2

    def test_read_when_none_destination_partition_id(self):
        """
        Write and read round-trip when the destination locator is built with
        a None partition ID.
        """
        source_locator = get_test_partition_locator("source")
        destination_locator = get_test_partition_locator(None)
        expected_rcf = self._make_rcf()

        write_round_completion_file(
            RCF_BUCKET_NAME, source_locator, destination_locator, expected_rcf
        )

        rcf = read_round_completion_file(
            RCF_BUCKET_NAME, source_locator, destination_locator
        )

        assert rcf == expected_rcf

    def test_write_when_custom_url_is_passed(self):
        """
        A custom completion file URL is used verbatim for the write, and a
        read by source locator alone then returns None.
        """
        source_locator = get_test_partition_locator("source")
        expected_rcf = self._make_rcf()

        completion_file_s3_url = f"s3://{RCF_BUCKET_NAME}/test.json"
        rcf_url = write_round_completion_file(
            RCF_BUCKET_NAME,
            source_locator,
            None,
            expected_rcf,
            completion_file_s3_url=completion_file_s3_url,
        )

        rcf = read_round_completion_file(RCF_BUCKET_NAME, source_locator, None)

        assert rcf_url == completion_file_s3_url
        assert rcf is None