deltacat 2.0__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (324) hide show
  1. deltacat/__init__.py +117 -18
  2. deltacat/api.py +536 -126
  3. deltacat/aws/constants.py +0 -23
  4. deltacat/aws/s3u.py +4 -631
  5. deltacat/benchmarking/benchmark_engine.py +4 -2
  6. deltacat/benchmarking/conftest.py +1 -19
  7. deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
  8. deltacat/catalog/__init__.py +64 -5
  9. deltacat/catalog/delegate.py +445 -63
  10. deltacat/catalog/interface.py +188 -62
  11. deltacat/catalog/main/impl.py +2444 -282
  12. deltacat/catalog/model/catalog.py +208 -113
  13. deltacat/catalog/model/properties.py +63 -24
  14. deltacat/compute/__init__.py +14 -0
  15. deltacat/compute/compactor/compaction_session.py +97 -75
  16. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  17. deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
  18. deltacat/compute/compactor/model/round_completion_info.py +16 -6
  19. deltacat/compute/compactor/repartition_session.py +8 -21
  20. deltacat/compute/compactor/steps/hash_bucket.py +5 -5
  21. deltacat/compute/compactor/steps/materialize.py +9 -7
  22. deltacat/compute/compactor/steps/repartition.py +12 -11
  23. deltacat/compute/compactor/utils/io.py +6 -5
  24. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  25. deltacat/compute/compactor/utils/system_columns.py +3 -1
  26. deltacat/compute/compactor_v2/compaction_session.py +17 -14
  27. deltacat/compute/compactor_v2/constants.py +30 -1
  28. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  29. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  30. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  31. deltacat/compute/compactor_v2/model/merge_input.py +33 -8
  32. deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
  33. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  34. deltacat/compute/compactor_v2/steps/merge.py +267 -55
  35. deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
  36. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  37. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  38. deltacat/compute/compactor_v2/utils/io.py +11 -4
  39. deltacat/compute/compactor_v2/utils/merge.py +15 -2
  40. deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
  41. deltacat/compute/compactor_v2/utils/task_options.py +45 -33
  42. deltacat/compute/converter/constants.py +5 -0
  43. deltacat/compute/converter/converter_session.py +207 -52
  44. deltacat/compute/converter/model/convert_input.py +43 -16
  45. deltacat/compute/converter/model/convert_input_files.py +33 -16
  46. deltacat/compute/converter/model/convert_result.py +80 -0
  47. deltacat/compute/converter/model/converter_session_params.py +64 -19
  48. deltacat/compute/converter/pyiceberg/catalog.py +21 -18
  49. deltacat/compute/converter/pyiceberg/overrides.py +193 -65
  50. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
  51. deltacat/compute/converter/steps/convert.py +230 -75
  52. deltacat/compute/converter/steps/dedupe.py +46 -12
  53. deltacat/compute/converter/utils/convert_task_options.py +66 -22
  54. deltacat/compute/converter/utils/converter_session_utils.py +126 -60
  55. deltacat/compute/converter/utils/iceberg_columns.py +13 -8
  56. deltacat/compute/converter/utils/io.py +173 -13
  57. deltacat/compute/converter/utils/s3u.py +42 -27
  58. deltacat/compute/janitor.py +205 -0
  59. deltacat/compute/jobs/client.py +417 -0
  60. deltacat/compute/resource_estimation/delta.py +38 -6
  61. deltacat/compute/resource_estimation/model.py +8 -0
  62. deltacat/constants.py +49 -6
  63. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  64. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  65. deltacat/env.py +10 -0
  66. deltacat/examples/basic_logging.py +6 -6
  67. deltacat/examples/compactor/aws/__init__.py +1 -0
  68. deltacat/examples/compactor/bootstrap.py +863 -0
  69. deltacat/examples/compactor/compactor.py +373 -0
  70. deltacat/examples/compactor/explorer.py +473 -0
  71. deltacat/examples/compactor/gcp/__init__.py +1 -0
  72. deltacat/examples/compactor/job_runner.py +439 -0
  73. deltacat/examples/compactor/utils/__init__.py +1 -0
  74. deltacat/examples/compactor/utils/common.py +261 -0
  75. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  76. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  77. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  78. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  79. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  80. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  81. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +66 -21
  82. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
  83. deltacat/examples/hello_world.py +4 -2
  84. deltacat/examples/indexer/indexer.py +163 -0
  85. deltacat/examples/indexer/job_runner.py +198 -0
  86. deltacat/exceptions.py +66 -4
  87. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  88. deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
  89. deltacat/{catalog → experimental/catalog}/iceberg/impl.py +43 -12
  90. deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +12 -14
  91. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  92. deltacat/experimental/converter_agent/__init__.py +0 -0
  93. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  94. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  95. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  96. deltacat/experimental/daft/__init__.py +4 -0
  97. deltacat/experimental/daft/daft_catalog.py +229 -0
  98. deltacat/experimental/storage/__init__.py +0 -0
  99. deltacat/experimental/storage/iceberg/__init__.py +0 -0
  100. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  101. deltacat/{storage → experimental/storage}/iceberg/impl.py +6 -4
  102. deltacat/{storage → experimental/storage}/iceberg/model.py +7 -3
  103. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  104. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  105. deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
  106. deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
  107. deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -12
  108. deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
  109. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  110. deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
  111. deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
  112. deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
  113. deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
  114. deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
  115. deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
  116. deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
  117. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  118. deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -3
  119. deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
  120. deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
  121. deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
  122. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  123. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  124. deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
  125. deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
  126. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  127. deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
  128. deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
  129. deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
  130. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +7 -6
  131. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
  132. deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
  133. deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
  134. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  135. deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
  136. deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
  137. deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
  138. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  139. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  140. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  141. deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
  142. deltacat/io/__init__.py +13 -0
  143. deltacat/io/dataset/__init__.py +0 -0
  144. deltacat/io/dataset/deltacat_dataset.py +91 -0
  145. deltacat/io/datasink/__init__.py +0 -0
  146. deltacat/io/datasink/deltacat_datasink.py +207 -0
  147. deltacat/io/datasource/__init__.py +0 -0
  148. deltacat/io/datasource/deltacat_datasource.py +579 -0
  149. deltacat/io/reader/__init__.py +0 -0
  150. deltacat/io/reader/deltacat_read_api.py +172 -0
  151. deltacat/storage/__init__.py +22 -2
  152. deltacat/storage/interface.py +54 -32
  153. deltacat/storage/main/impl.py +1494 -541
  154. deltacat/storage/model/delta.py +27 -3
  155. deltacat/storage/model/expression/__init__.py +47 -0
  156. deltacat/storage/model/expression/expression.py +656 -0
  157. deltacat/storage/model/expression/visitor.py +248 -0
  158. deltacat/storage/model/locator.py +6 -12
  159. deltacat/storage/model/manifest.py +231 -6
  160. deltacat/storage/model/metafile.py +224 -119
  161. deltacat/storage/model/namespace.py +8 -1
  162. deltacat/storage/model/partition.py +117 -42
  163. deltacat/storage/model/scan/push_down.py +32 -5
  164. deltacat/storage/model/schema.py +2427 -159
  165. deltacat/storage/model/shard.py +6 -2
  166. deltacat/storage/model/sort_key.py +40 -0
  167. deltacat/storage/model/stream.py +9 -2
  168. deltacat/storage/model/table.py +12 -1
  169. deltacat/storage/model/table_version.py +11 -0
  170. deltacat/storage/model/transaction.py +1184 -208
  171. deltacat/storage/model/transform.py +81 -2
  172. deltacat/storage/model/types.py +53 -29
  173. deltacat/storage/util/__init__.py +0 -0
  174. deltacat/storage/util/scan_planner.py +26 -0
  175. deltacat/tests/_io/reader/__init__.py +0 -0
  176. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  177. deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
  178. deltacat/tests/aws/test_s3u.py +2 -31
  179. deltacat/tests/catalog/data/__init__.py +0 -0
  180. deltacat/tests/catalog/main/__init__.py +0 -0
  181. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  182. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  183. deltacat/tests/catalog/model/__init__.py +0 -0
  184. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  185. deltacat/tests/catalog/test_catalogs.py +103 -106
  186. deltacat/tests/catalog/test_default_catalog_impl.py +12152 -72
  187. deltacat/tests/compute/compact_partition_test_cases.py +35 -8
  188. deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
  189. deltacat/tests/compute/compactor/utils/test_io.py +124 -120
  190. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  191. deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
  192. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
  193. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
  194. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
  195. deltacat/tests/compute/conftest.py +8 -44
  196. deltacat/tests/compute/converter/test_convert_session.py +697 -349
  197. deltacat/tests/compute/converter/utils.py +15 -6
  198. deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
  199. deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
  200. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
  201. deltacat/tests/compute/test_compact_partition_params.py +13 -8
  202. deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
  203. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
  204. deltacat/tests/compute/test_janitor.py +236 -0
  205. deltacat/tests/compute/test_util_common.py +716 -43
  206. deltacat/tests/compute/test_util_constant.py +0 -1
  207. deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
  208. deltacat/tests/daft/__init__.py +0 -0
  209. deltacat/tests/daft/test_model.py +97 -0
  210. deltacat/tests/experimental/__init__.py +1 -0
  211. deltacat/tests/experimental/catalog/__init__.py +0 -0
  212. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  213. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  214. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  215. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  216. deltacat/tests/experimental/daft/__init__.py +0 -0
  217. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  218. deltacat/tests/experimental/storage/__init__.py +0 -0
  219. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  220. deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
  221. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  222. deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -2
  223. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  224. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  225. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  226. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  227. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  228. deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
  229. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  230. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  231. deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +6 -4
  232. deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
  233. deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
  234. deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
  235. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  236. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
  237. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
  238. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
  239. deltacat/tests/storage/main/test_main_storage.py +6900 -95
  240. deltacat/tests/storage/model/test_expression.py +327 -0
  241. deltacat/tests/storage/model/test_manifest.py +129 -0
  242. deltacat/tests/storage/model/test_metafile_io.py +78 -173
  243. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  244. deltacat/tests/storage/model/test_schema.py +171 -0
  245. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  246. deltacat/tests/storage/model/test_shard.py +3 -1
  247. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  248. deltacat/tests/storage/model/test_transaction.py +393 -48
  249. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  250. deltacat/tests/test_deltacat_api.py +1036 -11
  251. deltacat/tests/test_exceptions.py +9 -5
  252. deltacat/tests/test_utils/pyarrow.py +52 -21
  253. deltacat/tests/test_utils/storage.py +23 -34
  254. deltacat/tests/types/__init__.py +0 -0
  255. deltacat/tests/types/test_tables.py +104 -0
  256. deltacat/tests/utils/exceptions.py +22 -0
  257. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  258. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  259. deltacat/tests/utils/test_daft.py +121 -31
  260. deltacat/tests/utils/test_numpy.py +1193 -0
  261. deltacat/tests/utils/test_pandas.py +1106 -0
  262. deltacat/tests/utils/test_polars.py +1040 -0
  263. deltacat/tests/utils/test_pyarrow.py +1370 -89
  264. deltacat/types/media.py +345 -37
  265. deltacat/types/tables.py +2344 -46
  266. deltacat/utils/arguments.py +33 -1
  267. deltacat/utils/daft.py +824 -40
  268. deltacat/utils/export.py +3 -1
  269. deltacat/utils/filesystem.py +139 -9
  270. deltacat/utils/metafile_locator.py +2 -1
  271. deltacat/utils/numpy.py +118 -26
  272. deltacat/utils/pandas.py +577 -48
  273. deltacat/utils/polars.py +759 -0
  274. deltacat/utils/pyarrow.py +1373 -192
  275. deltacat/utils/ray_utils/concurrency.py +1 -1
  276. deltacat/utils/ray_utils/dataset.py +101 -10
  277. deltacat/utils/ray_utils/runtime.py +56 -4
  278. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  279. deltacat/utils/url.py +1325 -0
  280. deltacat-2.0.0.dist-info/METADATA +1163 -0
  281. deltacat-2.0.0.dist-info/RECORD +439 -0
  282. {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
  283. deltacat/catalog/iceberg/__init__.py +0 -4
  284. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  285. deltacat/compute/merge_on_read/__init__.py +0 -4
  286. deltacat/compute/merge_on_read/daft.py +0 -40
  287. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  288. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  289. deltacat/examples/common/fixtures.py +0 -15
  290. deltacat/storage/iceberg/iceberg_scan_planner.py +0 -28
  291. deltacat/storage/rivulet/__init__.py +0 -11
  292. deltacat/storage/rivulet/feather/__init__.py +0 -5
  293. deltacat/storage/rivulet/parquet/__init__.py +0 -5
  294. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  295. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
  296. deltacat/tests/local_deltacat_storage/__init__.py +0 -1235
  297. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  298. deltacat/utils/s3fs.py +0 -21
  299. deltacat-2.0.dist-info/METADATA +0 -65
  300. deltacat-2.0.dist-info/RECORD +0 -347
  301. /deltacat/compute/{merge_on_read/model → jobs}/__init__.py +0 -0
  302. /deltacat/{compute/merge_on_read/utils → docs}/__init__.py +0 -0
  303. /deltacat/{examples/common → docs/autogen}/__init__.py +0 -0
  304. /deltacat/{examples/iceberg → docs/autogen/schema}/__init__.py +0 -0
  305. /deltacat/{storage/iceberg → docs/autogen/schema/inference}/__init__.py +0 -0
  306. /deltacat/{storage/rivulet/arrow → examples/compactor}/__init__.py +0 -0
  307. /deltacat/{storage/rivulet/fs → examples/experimental}/__init__.py +0 -0
  308. /deltacat/{storage/rivulet/metastore → examples/experimental/iceberg}/__init__.py +0 -0
  309. /deltacat/{storage/rivulet/reader → examples/experimental/iceberg/converter}/__init__.py +0 -0
  310. /deltacat/{storage/rivulet/schema → examples/experimental/iceberg/converter/beam}/__init__.py +0 -0
  311. /deltacat/{storage/rivulet/writer → examples/indexer}/__init__.py +0 -0
  312. /deltacat/{tests/storage/rivulet → examples/indexer/aws}/__init__.py +0 -0
  313. /deltacat/{tests/storage/rivulet/fs → examples/indexer/gcp}/__init__.py +0 -0
  314. /deltacat/{tests/storage/rivulet/schema → experimental}/__init__.py +0 -0
  315. /deltacat/{tests/storage/rivulet/writer → experimental/catalog}/__init__.py +0 -0
  316. /deltacat/{storage/rivulet/parquet/data_reader.py → experimental/compatibility/__init__.py} +0 -0
  317. /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
  318. /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
  319. /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
  320. /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
  321. /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
  322. /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
  323. {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
  324. {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
@@ -1,1235 +0,0 @@
1
- from typing import Any, Callable, Dict, List, Optional, Union, Tuple
2
-
3
- import pyarrow as pa
4
- import daft
5
- import json
6
- import sqlite3
7
- from sqlite3 import Cursor, Connection
8
- import uuid
9
- import ray
10
- import io
11
-
12
- from deltacat.tests.test_utils.storage import create_empty_delta
13
- from deltacat.utils.common import current_time_ms
14
-
15
-
16
- from deltacat.storage import (
17
- Delta,
18
- DeltaLocator,
19
- DeltaProperties,
20
- DeltaType,
21
- DistributedDataset,
22
- LifecycleState,
23
- ListResult,
24
- LocalDataset,
25
- LocalTable,
26
- ManifestAuthor,
27
- Namespace,
28
- NamespaceLocator,
29
- NamespaceProperties,
30
- Partition,
31
- PartitionScheme,
32
- Schema,
33
- Stream,
34
- StreamLocator,
35
- Table,
36
- TableVersion,
37
- TableVersionLocator,
38
- TableVersionProperties,
39
- TableLocator,
40
- TableProperties,
41
- CommitState,
42
- SortScheme,
43
- PartitionLocator,
44
- ManifestEntry,
45
- ManifestEntryList,
46
- EntryParams,
47
- PartitionValues,
48
- TransformName,
49
- StreamFormat,
50
- )
51
- from deltacat.storage.model.manifest import Manifest, ManifestMeta, EntryType
52
- from deltacat.types.media import (
53
- ContentType,
54
- StorageType,
55
- TableType,
56
- ContentEncoding,
57
- DistributedDatasetType,
58
- )
59
- from deltacat.utils.common import ReadKwargsProvider
60
- from deltacat.tests.local_deltacat_storage.exceptions import (
61
- InvalidNamespaceError,
62
- LocalStorageValidationError,
63
- )
64
-
65
- SQLITE_CUR_ARG = "sqlite3_cur"
66
- SQLITE_CON_ARG = "sqlite3_con"
67
- DB_FILE_PATH_ARG = "db_file_path"
68
-
69
- STREAM_FORMAT = StreamFormat.SQLITE3
70
- STREAM_ID_PROPERTY = "stream_id"
71
- CREATE_NAMESPACES_TABLE = (
72
- "CREATE TABLE IF NOT EXISTS namespaces(locator, value, PRIMARY KEY (locator))"
73
- )
74
- CREATE_TABLES_TABLE = (
75
- "CREATE TABLE IF NOT EXISTS tables(locator, namespace_locator, value, PRIMARY KEY (locator), "
76
- "FOREIGN KEY (namespace_locator) REFERENCES namespaces(locator))"
77
- )
78
- CREATE_TABLE_VERSIONS_TABLE = (
79
- "CREATE TABLE IF NOT EXISTS table_versions(locator, table_locator, value, PRIMARY KEY (locator), "
80
- "FOREIGN KEY (table_locator) REFERENCES tables(locator))"
81
- )
82
- CREATE_STREAMS_TABLE = (
83
- "CREATE TABLE IF NOT EXISTS streams(locator, table_version_locator, value, PRIMARY KEY(locator), "
84
- "FOREIGN KEY (table_version_locator) REFERENCES table_versions(locator))"
85
- )
86
- CREATE_PARTITIONS_TABLE = (
87
- "CREATE TABLE IF NOT EXISTS partitions(locator, stream_locator, value, PRIMARY KEY(locator), "
88
- "FOREIGN KEY (stream_locator) REFERENCES streams(locator))"
89
- )
90
- CREATE_DELTAS_TABLE = (
91
- "CREATE TABLE IF NOT EXISTS deltas(locator, partition_locator, value, PRIMARY KEY(locator), "
92
- "FOREIGN KEY (partition_locator) REFERENCES partitions(locator))"
93
- )
94
- CREATE_DATA_TABLE = "CREATE TABLE IF NOT EXISTS data(uri, value, PRIMARY KEY(uri))"
95
-
96
-
97
- def _get_sqlite3_cursor_con(kwargs) -> Tuple[Cursor, Connection]:
98
- if SQLITE_CUR_ARG in kwargs and SQLITE_CON_ARG in kwargs:
99
- return kwargs[SQLITE_CUR_ARG], kwargs[SQLITE_CON_ARG]
100
- elif DB_FILE_PATH_ARG in kwargs:
101
- con = sqlite3.connect(kwargs[DB_FILE_PATH_ARG])
102
- cur = con.cursor()
103
- return cur, con
104
-
105
- raise ValueError(f"Invalid local db connection kwargs: {kwargs}")
106
-
107
-
108
- def _get_manifest_entry_uri(manifest_entry_id: str) -> str:
109
- return f"cloudpickle://{manifest_entry_id}"
110
-
111
-
112
- def _merge_and_promote(
113
- partition_deltas: List[Delta], previous_partition_deltas: List[Delta]
114
- ):
115
- previous_partition_deltas_spos_gt: List[Delta] = [
116
- delta
117
- for delta in previous_partition_deltas
118
- if delta.stream_position > partition_deltas[0].stream_position
119
- ]
120
- # handle the case if the previous partition deltas have a greater stream position than the partition_delta
121
- partition_deltas = previous_partition_deltas_spos_gt + partition_deltas
122
- return partition_deltas
123
-
124
-
125
- def list_namespaces(*args, **kwargs) -> ListResult[Namespace]:
126
- cur, con = _get_sqlite3_cursor_con(kwargs)
127
- res = cur.execute("SELECT * FROM namespaces")
128
- fetched = res.fetchall()
129
- result = []
130
-
131
- for item in fetched:
132
- result.append(Namespace(json.loads(item[1])))
133
-
134
- return ListResult.of(result, None, None)
135
-
136
-
137
- def list_tables(namespace: str, *args, **kwargs) -> ListResult[Table]:
138
- cur, con = _get_sqlite3_cursor_con(kwargs)
139
- params = (NamespaceLocator.of(namespace).canonical_string(),)
140
- res = cur.execute("SELECT * FROM tables WHERE namespace_locator = ?", params)
141
- fetched = res.fetchall()
142
- result = []
143
-
144
- for item in fetched:
145
- result.append(Table(json.loads(item[2])))
146
-
147
- return ListResult.of(result, None, None)
148
-
149
-
150
- def list_table_versions(
151
- namespace: str, table_name: str, *args, **kwargs
152
- ) -> ListResult[TableVersion]:
153
- cur, con = _get_sqlite3_cursor_con(kwargs)
154
- table_locator = TableLocator.of(NamespaceLocator.of(namespace), table_name)
155
-
156
- res = cur.execute(
157
- "SELECT * FROM table_versions WHERE table_locator = ?",
158
- (table_locator.canonical_string(),),
159
- )
160
- fetched = res.fetchall()
161
- result = []
162
-
163
- for item in fetched:
164
- result.append(TableVersion(json.loads(item[2])))
165
-
166
- return ListResult.of(result, None, None)
167
-
168
-
169
- def list_partitions(
170
- namespace: str,
171
- table_name: str,
172
- table_version: Optional[str] = None,
173
- *args,
174
- **kwargs,
175
- ) -> ListResult[Partition]:
176
- cur, con = _get_sqlite3_cursor_con(kwargs)
177
-
178
- stream = get_stream(namespace, table_name, table_version, *args, **kwargs)
179
-
180
- res = cur.execute(
181
- "SELECT * FROM partitions WHERE stream_locator = ?",
182
- (stream.locator.canonical_string(),),
183
- )
184
-
185
- fetched = res.fetchall()
186
- result = []
187
- for item in fetched:
188
- partition = Partition(json.loads(item[2]))
189
- if partition.state == CommitState.COMMITTED:
190
- result.append(partition)
191
-
192
- return ListResult.of(result, None, None)
193
-
194
-
195
- def list_stream_partitions(stream: Stream, *args, **kwargs) -> ListResult[Partition]:
196
- return list_partitions(
197
- stream.namespace, stream.table_name, stream.table_version, *args, **kwargs
198
- )
199
-
200
-
201
- def list_deltas(
202
- namespace: str,
203
- table_name: str,
204
- partition_values: Optional[PartitionValues] = None,
205
- table_version: Optional[str] = None,
206
- first_stream_position: Optional[int] = None,
207
- last_stream_position: Optional[int] = None,
208
- ascending_order: Optional[bool] = None,
209
- include_manifest: bool = False,
210
- partition_scheme_id: Optional[str] = None,
211
- *args,
212
- **kwargs,
213
- ) -> ListResult[Delta]:
214
- stream = get_stream(namespace, table_name, table_version, *args, **kwargs)
215
- if stream is None:
216
- return ListResult.of([], None, None)
217
-
218
- partition = get_partition(stream.locator, partition_values, *args, **kwargs)
219
-
220
- all_deltas = list_partition_deltas(
221
- partition,
222
- first_stream_position=first_stream_position,
223
- last_stream_position=last_stream_position,
224
- ascending_order=ascending_order,
225
- include_manifest=include_manifest,
226
- *args,
227
- **kwargs,
228
- ).all_items()
229
-
230
- result = []
231
-
232
- for delta in all_deltas:
233
- if (
234
- not first_stream_position or first_stream_position < delta.stream_position
235
- ) and (
236
- not last_stream_position or delta.stream_position <= last_stream_position
237
- ):
238
- result.append(delta)
239
-
240
- if not include_manifest:
241
- delta.manifest = None
242
-
243
- result.sort(reverse=(not ascending_order), key=lambda d: d.stream_position)
244
- return ListResult.of(result, None, None)
245
-
246
-
247
- def list_partition_deltas(
248
- partition_like: Union[Partition, PartitionLocator],
249
- first_stream_position: Optional[int] = None,
250
- last_stream_position: Optional[int] = None,
251
- ascending_order: bool = False,
252
- include_manifest: bool = False,
253
- *args,
254
- **kwargs,
255
- ) -> ListResult[Delta]:
256
- cur, con = _get_sqlite3_cursor_con(kwargs)
257
-
258
- if partition_like is None:
259
- return ListResult.of([], None, None)
260
-
261
- if first_stream_position is None:
262
- first_stream_position = 0
263
-
264
- if last_stream_position is None:
265
- last_stream_position = float("inf")
266
-
267
- assert isinstance(partition_like, Partition) or isinstance(
268
- partition_like, PartitionLocator
269
- ), f"Expected a Partition or PartitionLocator as an input argument but found {partition_like}"
270
-
271
- partition_locator = None
272
- if isinstance(partition_like, Partition):
273
- partition_locator = partition_like.locator
274
- else:
275
- partition_locator = partition_like
276
-
277
- res = cur.execute(
278
- "SELECT * FROM deltas WHERE partition_locator = ?",
279
- (partition_locator.canonical_string(),),
280
- )
281
-
282
- serialized_items = res.fetchall()
283
-
284
- if not serialized_items:
285
- return ListResult.of([], None, None)
286
-
287
- result = []
288
- for item in serialized_items:
289
- current_delta = Delta(json.loads(item[2]))
290
- if (
291
- first_stream_position
292
- <= current_delta.stream_position
293
- <= last_stream_position
294
- ):
295
- result.append(current_delta)
296
-
297
- if not include_manifest:
298
- current_delta.manifest = None
299
-
300
- result.sort(reverse=(not ascending_order), key=lambda d: d.stream_position)
301
- return ListResult.of(result, None, None)
302
-
303
-
304
def get_delta(
    namespace: str,
    table_name: str,
    stream_position: int,
    partition_values: Optional[PartitionValues] = None,
    table_version: Optional[str] = None,
    include_manifest: bool = False,
    partition_scheme_id: Optional[str] = None,
    *args,
    **kwargs,
) -> Optional[Delta]:
    """Look up a single delta by its stream position.

    Args:
        namespace: Namespace of the table.
        table_name: Name of the table.
        stream_position: Stream position identifying the delta.
        partition_values: Values identifying the partition to search.
        table_version: Table version whose stream is searched.
        include_manifest: When False, the manifest is cleared on the
            returned delta.
        partition_scheme_id: Accepted but not used by this implementation.

    Returns:
        The matching delta, or None when the stream, the committed
        partition, or the delta itself cannot be found.
    """
    cur, _ = _get_sqlite3_cursor_con(kwargs)

    stream = get_stream(namespace, table_name, table_version, *args, **kwargs)
    if stream is None:
        # No stream means there is nothing to search; honour the Optional
        # return type rather than failing on `stream.locator`.
        return None

    partition = get_partition(stream.locator, partition_values, *args, **kwargs)
    if partition is None:
        # Same reasoning: no committed partition, hence no delta.
        return None

    delta_locator = DeltaLocator.of(partition.locator, stream_position)
    row = cur.execute(
        "SELECT * FROM deltas WHERE locator = ?", (delta_locator.canonical_string(),)
    ).fetchone()
    if row is None:
        return None

    # The serialized delta is read from the third column of the row.
    delta = Delta(json.loads(row[2]))
    if not include_manifest:
        delta.manifest = None

    return delta
336
-
337
-
338
def get_latest_delta(
    namespace: str,
    table_name: str,
    partition_values: Optional[PartitionValues] = None,
    table_version: Optional[str] = None,
    include_manifest: bool = False,
    partition_scheme_id: Optional[str] = None,
    *args,
    **kwargs,
) -> Optional[Delta]:
    """Return the first delta of a descending `list_deltas` call.

    The listing is requested without stream position bounds and in
    descending order, so the first item is the most recent delta.

    Returns:
        The first listed delta, or None when the listing is empty.
    """
    newest_first = list_deltas(
        namespace=namespace,
        table_name=table_name,
        partition_values=partition_values,
        table_version=table_version,
        first_stream_position=None,
        last_stream_position=None,
        ascending_order=False,
        include_manifest=include_manifest,
        partition_scheme_id=partition_scheme_id,
        *args,
        **kwargs,
    ).all_items()

    return newest_first[0] if newest_first else None
367
-
368
-
369
def download_delta(
    delta_like: Union[Delta, DeltaLocator],
    table_type: TableType = TableType.PYARROW,
    storage_type: StorageType = StorageType.DISTRIBUTED,
    max_parallelism: Optional[int] = None,
    columns: Optional[List[str]] = None,
    file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
    ray_options_provider: Callable[[int, Any], Dict[str, Any]] = None,
    distributed_dataset_type: DistributedDatasetType = DistributedDatasetType.RAY_DATASET,
    *args,
    **kwargs,
) -> Union[LocalDataset, DistributedDataset]:  # type: ignore
    """Download every manifest entry of a delta.

    Each entry is fetched through `download_delta_manifest_entry`. For
    distributed storage the resulting tables are wrapped in a Daft or Ray
    dataset; otherwise the plain list of tables is returned.

    `max_parallelism` and `ray_options_provider` are accepted but not used.

    Raises:
        ValueError: If `storage_type` is distributed and
            `distributed_dataset_type` is neither DAFT nor RAY_DATASET.
    """
    # Reuse the manifest carried by the delta when present; otherwise look
    # it up through get_delta_manifest.
    carries_manifest = isinstance(delta_like, Delta) and delta_like.manifest is not None
    manifest = (
        Delta(delta_like).manifest
        if carries_manifest
        else get_delta_manifest(delta_like, *args, **kwargs)
    )

    tables = [
        download_delta_manifest_entry(
            delta_like=delta_like,
            entry_index=index,
            table_type=table_type,
            columns=columns,
            file_reader_kwargs_provider=file_reader_kwargs_provider,
            *args,
            **kwargs,
        )
        for index in range(len(manifest.entries))
    ]

    if not (storage_type == StorageType.DISTRIBUTED):
        return tables

    if distributed_dataset_type is DistributedDatasetType.DAFT:
        return daft.from_arrow(tables)
    if distributed_dataset_type is DistributedDatasetType.RAY_DATASET:
        return ray.data.from_arrow(tables)
    raise ValueError(f"Dataset type {distributed_dataset_type} not supported!")
408
-
409
-
410
def download_delta_manifest_entry(
    delta_like: Union[Delta, DeltaLocator],
    entry_index: int,
    table_type: TableType = TableType.PYARROW,
    columns: Optional[List[str]] = None,
    file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
    *args,
    **kwargs,
) -> LocalTable:
    """Read one manifest entry of a delta from the `data` table.

    Args:
        delta_like: Delta (optionally carrying its manifest) or delta locator.
        entry_index: Index of the manifest entry to read.
        table_type: Requested in-memory representation of the entry.
        columns: Optional subset of columns to read.
        file_reader_kwargs_provider: Accepted but not used by this
            implementation.

    Returns:
        The entry as a PyArrow table, a `ParquetFile` (for
        `TableType.PYARROW_PARQUET`), or a pandas DataFrame (for
        `TableType.PANDAS`).

    Raises:
        IndexError: If `entry_index` is at or beyond the number of entries.
        ValueError: If no data row exists for the entry URI, or the entry's
            content type is neither Parquet nor unescaped TSV.
        NotImplementedError: If `table_type` is `TableType.NUMPY`.
    """
    cur, _ = _get_sqlite3_cursor_con(kwargs)

    if isinstance(delta_like, Delta) and delta_like.manifest is not None:
        manifest = Delta(delta_like).manifest
    else:
        manifest = get_delta_manifest(delta_like, *args, **kwargs)

    entry_count = len(manifest.entries)
    if entry_index >= entry_count:
        # The valid range is half-open: entry_count itself is out of range.
        raise IndexError(
            f"Manifest entry index {entry_index} does not exist. "
            f"Valid values: [0, {entry_count})"
        )

    entry = manifest.entries[entry_index]

    row = cur.execute("SELECT value FROM data WHERE uri = ?", (entry.uri,)).fetchone()
    if row is None:
        raise ValueError(
            f"Invalid value of delta locator: {delta_like.canonical_string()}"
        )

    serialized_data = row[0]
    content_type = entry.meta.content_type
    if content_type == ContentType.PARQUET:
        if table_type == TableType.PYARROW_PARQUET:
            # Hand back the Parquet file object instead of reading it into
            # a table.
            table = pa.parquet.ParquetFile(io.BytesIO(serialized_data))
        else:
            table = pa.parquet.read_table(io.BytesIO(serialized_data), columns=columns)
    elif content_type == ContentType.UNESCAPED_TSV:
        assert (
            table_type != TableType.PYARROW_PARQUET
        ), f"uTSV table cannot be read as {table_type}"
        parse_options = pa.csv.ParseOptions(delimiter="\t")
        convert_options = pa.csv.ConvertOptions(
            null_values=[""], strings_can_be_null=True, include_columns=columns
        )
        table = pa.csv.read_csv(
            io.BytesIO(serialized_data),
            parse_options=parse_options,
            convert_options=convert_options,
        )
    else:
        raise ValueError(f"Content type: {content_type} not supported.")

    if table_type == TableType.NUMPY:
        raise NotImplementedError(f"Table type={table_type} not supported")
    if table_type == TableType.PANDAS:
        return table.to_pandas()

    # PYARROW, PYARROW_PARQUET and any other table type return the object
    # that was read above.
    return table
472
-
473
-
474
def get_delta_manifest(
    delta_like: Union[Delta, DeltaLocator], *args, **kwargs
) -> Optional[Manifest]:
    """Fetch the manifest of the delta identified by `delta_like`.

    The delta is re-read through `get_delta` with `include_manifest=True`.

    Returns:
        The delta's manifest, or None when `get_delta` yields nothing.
    """
    found = get_delta(
        namespace=delta_like.namespace,
        table_name=delta_like.table_name,
        stream_position=delta_like.stream_position,
        partition_values=delta_like.partition_values,
        table_version=delta_like.table_version,
        include_manifest=True,
        *args,
        **kwargs,
    )
    return found.manifest if found else None
491
-
492
-
493
def create_namespace(
    namespace: str, properties: NamespaceProperties, *args, **kwargs
) -> Namespace:
    """Create a namespace row, executing all storage DDL statements first.

    The insert uses `INSERT OR IGNORE`.

    Returns:
        The namespace object built from `namespace` and `properties`.
    """
    cur, con = _get_sqlite3_cursor_con(kwargs)

    namespace_locator = NamespaceLocator.of(namespace)
    created = Namespace.of(namespace_locator, properties)
    row = (namespace_locator.canonical_string(), json.dumps(created))

    # Run the DDL for every storage table before inserting the namespace.
    ddl_statements = (
        CREATE_NAMESPACES_TABLE,
        CREATE_TABLES_TABLE,
        CREATE_TABLE_VERSIONS_TABLE,
        CREATE_STREAMS_TABLE,
        CREATE_PARTITIONS_TABLE,
        CREATE_DELTAS_TABLE,
        CREATE_DATA_TABLE,
    )
    for statement in ddl_statements:
        cur.execute(statement)

    cur.execute("INSERT OR IGNORE INTO namespaces VALUES(?, ?)", row)
    con.commit()
    return created
510
-
511
-
512
def update_namespace(
    namespace: str,
    properties: NamespaceProperties = None,
    new_namespace: Optional[str] = None,
    *args,
    **kwargs,
) -> None:
    """Overwrite the stored value of a namespace with new properties.

    Renaming is not supported: passing `new_namespace` fails the assertion.
    """
    assert new_namespace is None, "namespace name cannot be changed"
    cur, con = _get_sqlite3_cursor_con(kwargs)

    namespace_locator = NamespaceLocator.of(namespace)
    replacement = Namespace.of(namespace_locator, properties)

    cur.execute(
        "UPDATE namespaces SET value = ? WHERE locator = ?",
        (json.dumps(replacement), namespace_locator.canonical_string()),
    )
    con.commit()
526
-
527
-
528
def create_table_version(
    namespace: str,
    table_name: str,
    table_version: Optional[str] = None,
    schema: Optional[Union[pa.Schema, Any]] = None,
    partition_scheme: Optional[PartitionScheme] = None,
    sort_keys: Optional[SortScheme] = None,
    table_version_description: Optional[str] = None,
    table_version_properties: Optional[TableVersionProperties] = None,
    table_description: Optional[str] = None,
    table_properties: Optional[TableProperties] = None,
    supported_content_types: Optional[List[ContentType]] = None,
    *args,
    **kwargs,
) -> Stream:
    """Create a table version together with its table row and a committed stream.

    Args:
        namespace: Namespace of the table.
        table_name: Name of the table.
        table_version: Version to create. When omitted, the latest version
            plus one is used, or "1" if the table has no versions yet.
        schema: Optional schema, wrapped with `Schema.of`.
        partition_scheme: Optional partition scheme; only identity
            transforms are accepted.
        sort_keys: Optional sort scheme; only identity transforms are
            accepted.
        table_version_description: Description stored on the table version.
        table_version_properties: Properties stored on the table version;
            the new stream id is added under `STREAM_ID_PROPERTY`.
        table_description: Description stored on the table.
        table_properties: Properties stored on the table.
        supported_content_types: Content types stored on the table version.

    Returns:
        The committed stream created for the new table version.

    Raises:
        AssertionError: If a scheme has no keys, a key uses a non-identity
            transform, or an explicit `table_version` is not exactly the
            latest version plus one.
    """
    cur, con = _get_sqlite3_cursor_con(kwargs)

    if partition_scheme is not None:
        assert (
            partition_scheme.keys is not None
        ), "Partition keys must be specified with partition scheme"
        for key in partition_scheme.keys:
            assert (
                key.transform is None or key.transform.name == TransformName.IDENTITY
            ), (
                "Local DeltaCAT storage does not support creating table versions "
                "with non identity transform partition spec"
            )
    if sort_keys is not None:
        assert (
            sort_keys.keys is not None
        ), "Sort keys must be specified with sort scheme"
        for key in sort_keys.keys:
            assert (
                key.transform is None or key.transform.name == TransformName.IDENTITY
            ), (
                "Local DeltaCAT storage does not support creating table versions "
                "with non identity transform sort spec"
            )

    latest_version = get_latest_table_version(namespace, table_name, *args, **kwargs)
    if (
        table_version is not None
        and latest_version
        and int(latest_version.table_version) + 1 != int(table_version)
    ):
        raise AssertionError(
            f"Table version can only be incremented. Last version={latest_version.table_version}"
        )
    elif table_version is None:
        # Always produce a string: get_stream asserts that the table version
        # is not an int, so an int here would break later lookups.
        table_version = (
            str(int(latest_version.table_version) + 1) if latest_version else "1"
        )

    table_locator = TableLocator.of(NamespaceLocator.of(namespace), table_name)
    table_obj = Table.of(table_locator, table_description, table_properties)
    table_version_locator = TableVersionLocator.of(
        table_locator=table_locator, table_version=table_version
    )

    stream_id = str(uuid.uuid4())

    if table_version_properties is None:
        table_version_properties = {}

    # Record the stream id on the table version so that get_stream can
    # find the stream that belongs to it.
    properties = {**table_version_properties, STREAM_ID_PROPERTY: stream_id}
    table_version_obj = TableVersion.of(
        table_version_locator,
        schema=Schema.of(schema) if schema else None,
        partition_scheme=partition_scheme,
        description=table_version_description,
        properties=properties,
        sort_scheme=sort_keys,
        content_types=supported_content_types,
    )
    stream_locator = StreamLocator.of(
        table_version_obj.locator, stream_id=stream_id, stream_format=STREAM_FORMAT
    )
    result_stream = Stream.of(
        stream_locator, partition_scheme=partition_scheme, state=CommitState.COMMITTED
    )

    params = (
        table_locator.canonical_string(),
        table_locator.namespace_locator.canonical_string(),
        json.dumps(table_obj),
    )
    cur.execute("INSERT OR IGNORE INTO tables VALUES (?, ?, ?)", params)

    params = (
        table_version_locator.canonical_string(),
        table_locator.canonical_string(),
        json.dumps(table_version_obj),
    )
    cur.execute("INSERT OR IGNORE INTO table_versions VALUES (?, ?, ?)", params)

    params = (
        stream_locator.canonical_string(),
        table_version_locator.canonical_string(),
        json.dumps(result_stream),
    )
    cur.execute("INSERT OR IGNORE INTO streams VALUES (?, ?, ?)", params)
    con.commit()
    return result_stream
631
-
632
-
633
def update_table(
    namespace: str,
    table_name: str,
    description: Optional[str] = None,
    properties: Optional[TableProperties] = None,
    new_table_name: Optional[str] = None,
    *args,
    **kwargs,
) -> None:
    """Replace a table's stored row with a new description and properties.

    The existing row is deleted and a fresh one inserted under the same
    locator. `new_table_name` is accepted but not used.
    """
    cur, con = _get_sqlite3_cursor_con(kwargs)

    locator = TableLocator.of(NamespaceLocator.of(namespace), table_name)
    replacement = Table.of(locator, description, properties)

    cur.execute("DELETE FROM tables WHERE locator = ?", (locator.canonical_string(),))
    cur.execute(
        "INSERT INTO tables VALUES (?, ?, ?)",
        (
            locator.canonical_string(),
            locator.namespace_locator.canonical_string(),
            json.dumps(replacement),
        ),
    )
    con.commit()
655
-
656
-
657
def update_table_version(
    namespace: str,
    table_name: str,
    table_version: str,
    lifecycle_state: Optional[LifecycleState] = None,
    schema: Optional[Union[pa.Schema, Any]] = None,
    description: Optional[str] = None,
    properties: Optional[TableVersionProperties] = None,
    *args,
    **kwargs,
) -> None:
    """Rewrite a stored table version with a new schema, description and properties.

    Partition scheme, sort scheme and content types are carried over from
    the stored table version. `lifecycle_state` is accepted but not used.

    Raises:
        AssertionError: If no table version exists for the given locator.
    """
    cur, con = _get_sqlite3_cursor_con(kwargs)

    table_locator = TableLocator.of(NamespaceLocator.of(namespace), table_name)
    version_locator = TableVersionLocator.of(
        table_locator=table_locator, table_version=table_version
    )

    row = cur.execute(
        "SELECT * from table_versions WHERE locator = ?",
        (version_locator.canonical_string(),),
    ).fetchone()
    assert (
        row is not None
    ), f"Table version not found with locator={version_locator.canonical_string()}"
    stored = TableVersion(json.loads(row[2]))

    # NOTE(review): stored properties are applied last, so they override
    # caller-supplied values for the same key -- confirm this is intended.
    merged_properties = dict(properties if properties is not None else {})
    merged_properties.update(stored.properties if stored.properties else {})

    # NOTE(review): schema and description are written as given, so omitting
    # them replaces the stored values with None -- confirm this is intended.
    updated = TableVersion.of(
        version_locator,
        schema=Schema.of(schema) if schema else None,
        partition_scheme=stored.partition_scheme,
        description=description,
        properties=merged_properties,
        sort_scheme=stored.sort_scheme,
        content_types=stored.content_types,
    )

    cur.execute(
        "UPDATE table_versions SET table_locator = ?, value = ? WHERE locator = ?",
        (
            table_locator.canonical_string(),
            json.dumps(updated),
            version_locator.canonical_string(),
        ),
    )
    con.commit()
714
-
715
-
716
def stage_stream(
    namespace: str,
    table_name: str,
    table_version: Optional[str] = None,
    *args,
    **kwargs,
) -> Stream:
    """Insert a new STAGED stream for a table version and return it.

    The new stream gets a fresh UUID, reuses the partition scheme of the
    table version's current stream, and records that stream's canonical
    locator string as its predecessor.
    """
    cur, con = _get_sqlite3_cursor_con(kwargs)

    table_version_obj = get_table_version(
        namespace, table_name, table_version, *args, **kwargs
    )
    current_stream = get_stream(namespace, table_name, table_version, *args, **kwargs)

    staged_locator = StreamLocator.of(
        table_version_obj.locator, str(uuid.uuid4()), STREAM_FORMAT
    )
    staged_stream = Stream.of(
        staged_locator,
        current_stream.partition_scheme,
        CommitState.STAGED,
        current_stream.locator.canonical_string(),
    )

    cur.execute(
        "INSERT INTO streams VALUES (?, ?, ?)",
        (
            staged_locator.canonical_string(),
            table_version_obj.locator.canonical_string(),
            json.dumps(staged_stream),
        ),
    )
    con.commit()

    return staged_stream
750
-
751
-
752
def commit_stream(stream: Stream, *args, **kwargs) -> Stream:
    """Commit a stream and record its id on the owning table version.

    A COMMITTED copy of `stream` is written to the streams table, and the
    table version's `STREAM_ID_PROPERTY` is set to the stream's id.

    Returns:
        The committed copy of the stream.
    """
    cur, con = _get_sqlite3_cursor_con(kwargs)

    table_version_obj = get_table_version(
        stream.namespace, stream.table_name, stream.table_version, *args, **kwargs
    )
    committed = Stream.of(
        stream.locator,
        stream.partition_scheme,
        CommitState.COMMITTED,
        stream.previous_stream_id,
    )

    table_version_obj.properties[STREAM_ID_PROPERTY] = committed.locator.stream_id

    cur.execute(
        "UPDATE table_versions SET value = ? WHERE locator = ?",
        (json.dumps(table_version_obj), table_version_obj.locator.canonical_string()),
    )
    cur.execute(
        "UPDATE streams SET value = ? WHERE locator = ?",
        (json.dumps(committed), committed.locator.canonical_string()),
    )
    con.commit()

    return committed
779
-
780
-
781
def delete_stream(
    namespace: str,
    table_name: str,
    table_version: Optional[str] = None,
    *args,
    **kwargs,
) -> None:
    """Delete every stream of a table version, and the table version row itself."""
    cur, con = _get_sqlite3_cursor_con(kwargs)

    table_locator = TableLocator.of(NamespaceLocator.of(namespace), table_name)
    version_key = TableVersionLocator.of(table_locator, table_version).canonical_string()

    # fetchall() yields one-element rows, which executemany can consume
    # directly as parameter tuples.
    stream_rows = cur.execute(
        "SELECT locator FROM streams WHERE table_version_locator = ?",
        (version_key,),
    ).fetchall()
    cur.executemany("DELETE FROM streams WHERE locator = ?", stream_rows)
    cur.execute("DELETE FROM table_versions WHERE locator = ?", (version_key,))

    con.commit()
806
-
807
-
808
def stage_partition(
    stream: Stream, partition_values: Optional[PartitionValues] = None, *args, **kwargs
) -> Partition:
    """Insert a new STAGED partition in `stream` and return it.

    The partition takes its schema and content types from the stream's
    table version. If a committed partition with the same values exists,
    its id and stream position are recorded as the predecessor.
    """
    cur, con = _get_sqlite3_cursor_con(kwargs)

    staged_locator = PartitionLocator.of(
        stream.locator,
        partition_values=partition_values,
        partition_id=str(uuid.uuid4()),
    )

    table_version_obj = get_table_version(
        stream.namespace, stream.table_name, stream.table_version, *args, **kwargs
    )
    committed_predecessor = get_partition(
        stream.locator, partition_values=partition_values, *args, **kwargs
    )

    if committed_predecessor:
        prior_position = committed_predecessor.stream_position
        prior_id = committed_predecessor.partition_id
    else:
        prior_position = None
        prior_id = None

    staged = Partition.of(
        staged_locator,
        schema=table_version_obj.schema,
        content_types=table_version_obj.content_types,
        state=CommitState.STAGED,
        previous_stream_position=prior_position,
        previous_partition_id=prior_id,
        stream_position=current_time_ms(),
    )

    cur.execute(
        "INSERT INTO partitions VALUES (?, ?, ?)",
        (
            staged.locator.canonical_string(),
            staged.stream_locator.canonical_string(),
            json.dumps(staged),
        ),
    )
    con.commit()

    return staged
845
-
846
-
847
def commit_partition(
    partition: Partition,
    previous_partition: Optional[Partition] = None,
    *args,
    **kwargs,
) -> Partition:
    """Commit `partition`, deprecating the partition it replaces.

    The replaced partition is `previous_partition` when given, otherwise the
    committed partition with the same values in the same stream. When
    `previous_partition` is given, both partitions' deltas are passed
    through `_merge_and_promote` before the commit.

    Returns:
        The same `partition` object, now marked COMMITTED.
    """
    cur, con = _get_sqlite3_cursor_con(kwargs)

    replaced: Optional[Partition] = previous_partition or get_partition(
        partition.stream_locator,
        partition_values=partition.partition_values,
        *args,
        **kwargs,
    )

    # Deprecate the old partition before committing the new one.
    if replaced:
        replaced.state = CommitState.DEPRECATED
        cur.execute(
            "UPDATE partitions SET value = ? WHERE locator = ?",
            (json.dumps(replaced), replaced.locator.canonical_string()),
        )
        replaced_deltas = (
            list_partition_deltas(
                replaced, ascending_order=False, *args, **kwargs
            ).all_items()
            or []
        )

    deltas: Optional[List[Delta]] = (
        list_partition_deltas(
            partition, ascending_order=False, *args, **kwargs
        ).all_items()
        or []
    )

    # An explicit previous_partition means the table is in-place compacted,
    # which requires merge-and-promote. `replaced_deltas` is always bound
    # here because previous_partition being truthy makes `replaced` truthy.
    if previous_partition:
        deltas = _merge_and_promote(deltas, replaced_deltas)

    # Deltas are listed in descending order, so the first one is the newest.
    if deltas:
        partition.stream_position = deltas[0].stream_position
        partition.locator = deltas[0].partition_locator
    else:
        partition.stream_position = partition.stream_position

    partition.state = CommitState.COMMITTED
    partition.previous_stream_position = replaced.stream_position if replaced else None

    cur.execute(
        "UPDATE partitions SET value = ? WHERE locator = ?",
        (json.dumps(partition), partition.locator.canonical_string()),
    )
    con.commit()

    return partition
904
-
905
-
906
def delete_partition(
    namespace: str,
    table_name: str,
    table_version: Optional[str] = None,
    partition_values: Optional[PartitionValues] = None,
    *args,
    **kwargs,
) -> None:
    """Mark the committed partition with the given values as DEPRECATED.

    The partition row is kept; only its stored state changes.
    """
    cur, con = _get_sqlite3_cursor_con(kwargs)

    owning_stream = get_stream(namespace, table_name, table_version, *args, **kwargs)
    target = get_partition(owning_stream.locator, partition_values, *args, **kwargs)
    target.state = CommitState.DEPRECATED

    cur.execute(
        "UPDATE partitions SET value = ? WHERE locator = ?",
        (json.dumps(target), target.locator.canonical_string()),
    )
    con.commit()
923
-
924
-
925
def get_partition(
    stream_locator: StreamLocator,
    partition_values: Optional[PartitionValues] = None,
    *args,
    **kwargs,
) -> Optional[Partition]:
    """Find the committed partition of a stream with the given partition values.

    Args:
        stream_locator: Locator of the stream to search.
        partition_values: Values identifying the partition. None is treated
            as an empty list.

    Returns:
        The first COMMITTED partition whose values equal `partition_values`,
        or None when there is no such partition.
    """
    cur, _ = _get_sqlite3_cursor_con(kwargs)

    rows = cur.execute(
        "SELECT * FROM partitions WHERE stream_locator = ?",
        (stream_locator.canonical_string(),),
    ).fetchall()

    # Compare value lists element-wise rather than as comma-joined strings:
    # joining raises TypeError for non-string values and cannot distinguish
    # ["a,b"] from ["a", "b"].
    wanted_values = list(partition_values) if partition_values else []

    for row in rows:
        partition = Partition(json.loads(row[2]))
        stored_values = (
            list(partition.partition_values) if partition.partition_values else []
        )
        if stored_values == wanted_values and partition.state == CommitState.COMMITTED:
            return partition

    return None
956
-
957
-
958
def stage_delta(
    data: Union[LocalTable, LocalDataset, DistributedDataset, Manifest],
    partition: Partition,
    delta_type: DeltaType = DeltaType.UPSERT,
    max_records_per_entry: Optional[int] = None,
    author: Optional[ManifestAuthor] = None,
    properties: Optional[DeltaProperties] = None,
    s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
    content_type: ContentType = ContentType.PARQUET,
    entry_params: Optional[EntryParams] = None,
    *args,
    **kwargs,
) -> Delta:
    """Serialize `data` and store it as a staged, single-entry delta.

    The payload is written to the `data` table under a freshly generated
    manifest entry URI, and the delta is written to the `deltas` table with
    the placeholder partition locator "staged_delta". When `data` is None
    an empty delta is staged instead.

    `max_records_per_entry` and `s3_table_writer_kwargs` are accepted but
    not used.

    Raises:
        ValueError: If `content_type` is neither Parquet nor unescaped TSV.
    """
    cur, con = _get_sqlite3_cursor_con(kwargs)
    manifest_id = str(uuid.uuid4())
    uri = _get_manifest_entry_uri(manifest_id)

    if data is None:
        empty_delta = create_empty_delta(
            partition,
            delta_type,
            author,
            properties=properties,
            manifest_entry_id=manifest_id,
        )
        cur.execute("INSERT OR IGNORE INTO data VALUES (?, ?)", (uri, None))
        cur.execute(
            "INSERT OR IGNORE INTO deltas VALUES (?, ?, ?)",
            (
                empty_delta.locator.canonical_string(),
                "staged_delta",
                json.dumps(empty_delta),
            ),
        )
        con.commit()
        return empty_delta

    # Serialize the payload into an in-memory buffer in the requested format.
    buffer = io.BytesIO()
    if content_type == ContentType.PARQUET:
        pa.parquet.write_table(data, buffer)
    elif content_type == ContentType.UNESCAPED_TSV:
        tsv_options = pa.csv.WriteOptions(
            include_header=True, delimiter="\t", quoting_style="none"
        )
        pa.csv.write_csv(data, buffer, write_options=tsv_options)
    else:
        raise ValueError(f"Unsupported content type: {content_type}")
    serialized_data = buffer.getvalue()

    # The delta's stream position is the current time in milliseconds.
    delta_locator = DeltaLocator.of(
        partition.locator, stream_position=current_time_ms()
    )

    if delta_type is DeltaType.DELETE:
        entry_type = EntryType.EQUALITY_DELETE
    else:
        entry_type = EntryType.DATA

    meta = ManifestMeta.of(
        len(data),
        len(serialized_data),
        content_type=content_type,
        content_encoding=ContentEncoding.IDENTITY,
        source_content_length=data.nbytes,
        entry_type=entry_type,
        entry_params=entry_params,
    )

    sole_entry = ManifestEntry.of(
        uri=uri,
        url=uri,
        meta=meta,
        mandatory=True,
        uuid=manifest_id,
    )
    manifest = Manifest.of(
        entries=ManifestEntryList.of([sole_entry]),
        author=author,
        uuid=manifest_id,
        entry_type=entry_type,
        entry_params=entry_params,
    )

    staged_delta = Delta.of(
        delta_locator,
        delta_type=delta_type,
        meta=meta,
        properties=properties,
        manifest=manifest,
        previous_stream_position=partition.stream_position,
    )

    cur.execute("INSERT OR IGNORE INTO data VALUES (?, ?)", (uri, serialized_data))
    cur.execute(
        "INSERT OR IGNORE INTO deltas VALUES (?, ?, ?)",
        (delta_locator.canonical_string(), "staged_delta", json.dumps(staged_delta)),
    )

    con.commit()
    return staged_delta
1055
-
1056
-
1057
def commit_delta(delta: Delta, *args, **kwargs) -> Delta:
    """Persist `delta` under its real partition locator.

    If the delta has no stream position yet, the current time in
    milliseconds is assigned. The row is inserted if absent and then
    updated, so an existing row ends up with the delta's partition locator
    and current value.

    Returns:
        The same `delta` object.
    """
    cur, con = _get_sqlite3_cursor_con(kwargs)
    delta.locator.stream_position = delta.stream_position or current_time_ms()

    delta_key = delta.locator.canonical_string()
    partition_key = delta.partition_locator.canonical_string()
    payload = json.dumps(delta)

    # Insert-if-absent followed by update leaves the row in the same state
    # whether or not it existed beforehand.
    cur.execute(
        "INSERT OR IGNORE INTO deltas VALUES (?, ?, ?)",
        (delta_key, partition_key, payload),
    )
    cur.execute(
        "UPDATE deltas SET partition_locator = ?, value = ? WHERE locator = ?",
        (partition_key, payload, delta_key),
    )
    con.commit()
    return delta
1080
-
1081
-
1082
def get_namespace(namespace: str, *args, **kwargs) -> Optional[Namespace]:
    """Load a namespace by name, or return None when no row exists for it."""
    cur, _ = _get_sqlite3_cursor_con(kwargs)
    key = NamespaceLocator.of(namespace).canonical_string()

    row = cur.execute("SELECT * FROM namespaces WHERE locator = ?", (key,)).fetchone()
    if row is None:
        return None

    # The serialized namespace is read from the second column.
    return Namespace(json.loads(row[1]))
1095
-
1096
-
1097
def namespace_exists(namespace: str, *args, **kwargs) -> bool:
    """Return True when `get_namespace` finds the namespace."""
    return get_namespace(namespace, *args, **kwargs) is not None
1101
-
1102
-
1103
def get_table(namespace: str, table_name: str, *args, **kwargs) -> Optional[Table]:
    """Load a table by namespace and name, or return None when no row exists."""
    cur, _ = _get_sqlite3_cursor_con(kwargs)
    key = TableLocator.of(NamespaceLocator.of(namespace), table_name).canonical_string()

    row = cur.execute("SELECT * FROM tables WHERE locator = ?", (key,)).fetchone()
    if row is None:
        return None

    # The serialized table is read from the third column.
    return Table(json.loads(row[2]))
1116
-
1117
-
1118
def table_exists(namespace: str, table_name: str, *args, **kwargs) -> bool:
    """Return True when `get_table` finds the table."""
    return get_table(namespace, table_name, *args, **kwargs) is not None
1122
-
1123
-
1124
def get_table_version(
    namespace: str, table_name: str, table_version: str, *args, **kwargs
) -> Optional[TableVersion]:
    """Load a table version, or return None when no row exists for it."""
    cur, _ = _get_sqlite3_cursor_con(kwargs)

    table_locator = TableLocator.of(NamespaceLocator.of(namespace), table_name)
    key = TableVersionLocator.of(table_locator, table_version).canonical_string()

    row = cur.execute(
        "SELECT * FROM table_versions WHERE locator = ?", (key,)
    ).fetchone()
    if row is None:
        return None

    # The serialized table version is read from the third column.
    return TableVersion(json.loads(row[2]))
1141
-
1142
-
1143
def get_latest_table_version(
    namespace: str, table_name: str, *args, **kwargs
) -> Optional[TableVersion]:
    """Return the table version with the numerically highest version.

    Version strings are compared as integers. Returns None when the table
    has no versions.
    """
    candidates = list_table_versions(namespace, table_name, *args, **kwargs).all_items()
    if not candidates:
        return None

    # Order newest-first by integer version and take the head.
    newest_first = sorted(
        candidates, key=lambda version: int(version.table_version), reverse=True
    )
    return newest_first[0]
1154
-
1155
-
1156
def get_latest_active_table_version(
    namespace: str, table_name: str, *args, **kwargs
) -> Optional[TableVersion]:
    """Return the latest table version.

    This module does not support table version lifecycle state, so the
    call is delegated to `get_latest_table_version`.
    """
    latest = get_latest_table_version(namespace, table_name, *args, **kwargs)
    return latest
1162
-
1163
-
1164
def get_table_version_schema(
    namespace: str,
    table_name: str,
    table_version: Optional[str] = None,
    *args,
    **kwargs,
) -> Optional[Union[pa.Schema, Any]]:
    """Return the schema stored on a table version.

    Returns:
        The table version's schema, or None when the table version does not
        exist.
    """
    table_version_obj = get_table_version(
        namespace, table_name, table_version, *args, **kwargs
    )
    if table_version_obj is None:
        # A missing table version has no schema; honour the Optional return
        # type rather than failing on `None.schema`.
        return None

    return table_version_obj.schema
1174
-
1175
-
1176
def table_version_exists(
    namespace: str, table_name: str, table_version: str, *args, **kwargs
) -> bool:
    """Return True when `get_table_version` finds the table version."""
    return (
        get_table_version(namespace, table_name, table_version, *args, **kwargs)
        is not None
    )
1182
-
1183
-
1184
def get_stream(
    namespace: str,
    table_name: str,
    table_version: Optional[str] = None,
    *args,
    **kwargs,
) -> Optional[Stream]:
    """Load the stream referenced by a table version.

    The stream id is read from the table version's `STREAM_ID_PROPERTY`.

    Returns:
        The stream, or None when the table version, its stream id property,
        or the stream row is missing.

    Raises:
        AssertionError: If `table_version` is passed as an int.
    """
    assert not isinstance(table_version, int), f"Passed an integer as the table version"

    table_version_obj = get_table_version(
        namespace, table_name, table_version, *args, **kwargs
    )
    if table_version_obj is None:
        return None

    stream_id = table_version_obj.properties.get(STREAM_ID_PROPERTY)
    if stream_id is None:
        return None

    cur, _ = _get_sqlite3_cursor_con(kwargs)
    key = StreamLocator.of(
        table_version_obj.locator, stream_id=stream_id, stream_format=STREAM_FORMAT
    ).canonical_string()

    row = cur.execute("SELECT * FROM streams WHERE locator = ?", (key,)).fetchone()
    if row is None:
        return None

    # The serialized stream is read from the third column.
    return Stream(json.loads(row[2]))
1214
-
1215
-
1216
def get_table_version_column_names(
    namespace: str,
    table_name: str,
    table_version: Optional[str] = None,
    *args,
    **kwargs,
) -> Optional[List[str]]:
    """Not supported by this implementation.

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError("Fetching column names is not supported")
1224
-
1225
-
1226
def can_categorize(e: BaseException, **kwargs) -> bool:
    """Return True only when `e` is an `InvalidNamespaceError`."""
    return isinstance(e, InvalidNamespaceError)
1231
-
1232
-
1233
- def raise_categorized_error(e: BaseException, **kwargs):
1234
- if isinstance(e, InvalidNamespaceError):
1235
- raise LocalStorageValidationError("Namespace provided is invalid!")