deltacat 1.1.38__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (367)
  1. deltacat/__init__.py +150 -12
  2. deltacat/annotations.py +36 -0
  3. deltacat/api.py +578 -0
  4. deltacat/aws/constants.py +0 -23
  5. deltacat/aws/s3u.py +4 -631
  6. deltacat/benchmarking/benchmark_engine.py +84 -0
  7. deltacat/benchmarking/benchmark_report.py +86 -0
  8. deltacat/benchmarking/benchmark_suite.py +11 -0
  9. deltacat/benchmarking/conftest.py +22 -19
  10. deltacat/benchmarking/data/random_row_generator.py +94 -0
  11. deltacat/benchmarking/data/row_generator.py +10 -0
  12. deltacat/benchmarking/test_benchmark_pipeline.py +108 -0
  13. deltacat/catalog/__init__.py +73 -0
  14. deltacat/catalog/delegate.py +615 -140
  15. deltacat/catalog/interface.py +404 -81
  16. deltacat/catalog/main/impl.py +2882 -0
  17. deltacat/catalog/model/catalog.py +348 -46
  18. deltacat/catalog/model/properties.py +155 -0
  19. deltacat/catalog/model/table_definition.py +32 -1
  20. deltacat/compute/__init__.py +14 -0
  21. deltacat/compute/compactor/compaction_session.py +97 -75
  22. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  23. deltacat/compute/compactor/model/compaction_session_audit_info.py +23 -30
  24. deltacat/compute/compactor/model/delta_annotated.py +3 -3
  25. deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
  26. deltacat/compute/compactor/model/delta_file_locator.py +3 -1
  27. deltacat/compute/compactor/model/round_completion_info.py +19 -9
  28. deltacat/compute/compactor/model/table_object_store.py +3 -2
  29. deltacat/compute/compactor/repartition_session.py +9 -22
  30. deltacat/compute/compactor/steps/dedupe.py +11 -4
  31. deltacat/compute/compactor/steps/hash_bucket.py +6 -6
  32. deltacat/compute/compactor/steps/materialize.py +15 -9
  33. deltacat/compute/compactor/steps/repartition.py +12 -11
  34. deltacat/compute/compactor/utils/io.py +7 -6
  35. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  36. deltacat/compute/compactor/utils/sort_key.py +9 -2
  37. deltacat/compute/compactor/utils/system_columns.py +3 -1
  38. deltacat/compute/compactor_v2/compaction_session.py +13 -14
  39. deltacat/compute/compactor_v2/deletes/utils.py +3 -3
  40. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  41. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  42. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  43. deltacat/compute/compactor_v2/model/merge_input.py +28 -9
  44. deltacat/compute/compactor_v2/private/compaction_utils.py +171 -73
  45. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  46. deltacat/compute/compactor_v2/steps/merge.py +156 -53
  47. deltacat/compute/compactor_v2/utils/content_type_params.py +17 -6
  48. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  49. deltacat/compute/compactor_v2/utils/io.py +10 -3
  50. deltacat/compute/compactor_v2/utils/merge.py +14 -2
  51. deltacat/compute/compactor_v2/utils/task_options.py +2 -10
  52. deltacat/compute/converter/constants.py +9 -0
  53. deltacat/compute/converter/converter_session.py +298 -0
  54. deltacat/compute/converter/model/convert_input.py +96 -0
  55. deltacat/compute/converter/model/convert_input_files.py +78 -0
  56. deltacat/compute/converter/model/convert_result.py +80 -0
  57. deltacat/compute/converter/model/converter_session_params.py +144 -0
  58. deltacat/compute/converter/pyiceberg/catalog.py +78 -0
  59. deltacat/compute/converter/pyiceberg/overrides.py +263 -0
  60. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +299 -0
  61. deltacat/compute/converter/steps/convert.py +366 -0
  62. deltacat/compute/converter/steps/dedupe.py +94 -0
  63. deltacat/compute/converter/utils/__init__.py +0 -0
  64. deltacat/compute/converter/utils/convert_task_options.py +132 -0
  65. deltacat/compute/converter/utils/converter_session_utils.py +175 -0
  66. deltacat/compute/converter/utils/iceberg_columns.py +87 -0
  67. deltacat/compute/converter/utils/io.py +203 -0
  68. deltacat/compute/converter/utils/s3u.py +148 -0
  69. deltacat/compute/janitor.py +205 -0
  70. deltacat/compute/jobs/__init__.py +0 -0
  71. deltacat/compute/jobs/client.py +417 -0
  72. deltacat/compute/resource_estimation/delta.py +11 -1
  73. deltacat/constants.py +90 -1
  74. deltacat/docs/__init__.py +0 -0
  75. deltacat/docs/autogen/__init__.py +0 -0
  76. deltacat/docs/autogen/schema/__init__.py +0 -0
  77. deltacat/docs/autogen/schema/inference/__init__.py +0 -0
  78. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  79. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  80. deltacat/env.py +61 -0
  81. deltacat/examples/__init__.py +0 -0
  82. deltacat/examples/basic_logging.py +101 -0
  83. deltacat/examples/compactor/__init__.py +0 -0
  84. deltacat/examples/compactor/aws/__init__.py +1 -0
  85. deltacat/examples/compactor/bootstrap.py +863 -0
  86. deltacat/examples/compactor/compactor.py +373 -0
  87. deltacat/examples/compactor/explorer.py +473 -0
  88. deltacat/examples/compactor/gcp/__init__.py +1 -0
  89. deltacat/examples/compactor/job_runner.py +439 -0
  90. deltacat/examples/compactor/utils/__init__.py +1 -0
  91. deltacat/examples/compactor/utils/common.py +261 -0
  92. deltacat/examples/experimental/__init__.py +0 -0
  93. deltacat/examples/experimental/iceberg/__init__.py +0 -0
  94. deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
  95. deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
  96. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  97. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  98. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  99. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  100. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  101. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  102. deltacat/examples/experimental/iceberg/iceberg_bucket_writer.py +184 -0
  103. deltacat/examples/experimental/iceberg/iceberg_reader.py +147 -0
  104. deltacat/examples/hello_world.py +29 -0
  105. deltacat/examples/indexer/__init__.py +0 -0
  106. deltacat/examples/indexer/aws/__init__.py +0 -0
  107. deltacat/examples/indexer/gcp/__init__.py +0 -0
  108. deltacat/examples/indexer/indexer.py +163 -0
  109. deltacat/examples/indexer/job_runner.py +198 -0
  110. deltacat/exceptions.py +116 -12
  111. deltacat/experimental/__init__.py +0 -0
  112. deltacat/experimental/catalog/__init__.py +0 -0
  113. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  114. deltacat/experimental/catalog/iceberg/iceberg_catalog_config.py +26 -0
  115. deltacat/experimental/catalog/iceberg/impl.py +399 -0
  116. deltacat/experimental/catalog/iceberg/overrides.py +72 -0
  117. deltacat/experimental/compatibility/__init__.py +0 -0
  118. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  119. deltacat/experimental/converter_agent/__init__.py +0 -0
  120. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  121. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  122. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  123. deltacat/experimental/daft/__init__.py +4 -0
  124. deltacat/experimental/daft/daft_catalog.py +229 -0
  125. deltacat/experimental/storage/__init__.py +0 -0
  126. deltacat/experimental/storage/iceberg/__init__.py +0 -0
  127. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  128. deltacat/experimental/storage/iceberg/impl.py +739 -0
  129. deltacat/experimental/storage/iceberg/model.py +713 -0
  130. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  131. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  132. deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
  133. deltacat/experimental/storage/rivulet/arrow/serializer.py +78 -0
  134. deltacat/experimental/storage/rivulet/dataset.py +745 -0
  135. deltacat/experimental/storage/rivulet/dataset_executor.py +79 -0
  136. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  137. deltacat/experimental/storage/rivulet/feather/file_reader.py +138 -0
  138. deltacat/experimental/storage/rivulet/feather/serializer.py +35 -0
  139. deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
  140. deltacat/experimental/storage/rivulet/fs/file_provider.py +105 -0
  141. deltacat/experimental/storage/rivulet/fs/file_store.py +130 -0
  142. deltacat/experimental/storage/rivulet/fs/input_file.py +76 -0
  143. deltacat/experimental/storage/rivulet/fs/output_file.py +86 -0
  144. deltacat/experimental/storage/rivulet/logical_plan.py +105 -0
  145. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  146. deltacat/experimental/storage/rivulet/metastore/delta.py +188 -0
  147. deltacat/experimental/storage/rivulet/metastore/json_sst.py +105 -0
  148. deltacat/experimental/storage/rivulet/metastore/sst.py +82 -0
  149. deltacat/experimental/storage/rivulet/metastore/sst_interval_tree.py +260 -0
  150. deltacat/experimental/storage/rivulet/mvp/Table.py +101 -0
  151. deltacat/experimental/storage/rivulet/mvp/__init__.py +5 -0
  152. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  153. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  154. deltacat/experimental/storage/rivulet/parquet/file_reader.py +129 -0
  155. deltacat/experimental/storage/rivulet/parquet/serializer.py +37 -0
  156. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  157. deltacat/experimental/storage/rivulet/reader/block_scanner.py +389 -0
  158. deltacat/experimental/storage/rivulet/reader/data_reader.py +136 -0
  159. deltacat/experimental/storage/rivulet/reader/data_scan.py +65 -0
  160. deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +179 -0
  161. deltacat/experimental/storage/rivulet/reader/dataset_reader.py +158 -0
  162. deltacat/experimental/storage/rivulet/reader/pyarrow_data_reader.py +124 -0
  163. deltacat/experimental/storage/rivulet/reader/query_expression.py +99 -0
  164. deltacat/experimental/storage/rivulet/reader/reader_type_registrar.py +84 -0
  165. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  166. deltacat/experimental/storage/rivulet/schema/datatype.py +128 -0
  167. deltacat/experimental/storage/rivulet/schema/schema.py +251 -0
  168. deltacat/experimental/storage/rivulet/serializer.py +40 -0
  169. deltacat/experimental/storage/rivulet/serializer_factory.py +46 -0
  170. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  171. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  172. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  173. deltacat/experimental/storage/rivulet/writer/dataset_writer.py +29 -0
  174. deltacat/experimental/storage/rivulet/writer/memtable_dataset_writer.py +305 -0
  175. deltacat/io/__init__.py +13 -0
  176. deltacat/io/dataset/__init__.py +0 -0
  177. deltacat/io/dataset/deltacat_dataset.py +91 -0
  178. deltacat/io/datasink/__init__.py +0 -0
  179. deltacat/io/datasink/deltacat_datasink.py +207 -0
  180. deltacat/io/datasource/__init__.py +0 -0
  181. deltacat/io/datasource/deltacat_datasource.py +579 -0
  182. deltacat/io/reader/__init__.py +0 -0
  183. deltacat/io/reader/deltacat_read_api.py +172 -0
  184. deltacat/logs.py +4 -1
  185. deltacat/storage/__init__.py +138 -28
  186. deltacat/storage/interface.py +260 -155
  187. deltacat/storage/main/__init__.py +0 -0
  188. deltacat/storage/main/impl.py +3030 -0
  189. deltacat/storage/model/delta.py +142 -71
  190. deltacat/storage/model/expression/__init__.py +47 -0
  191. deltacat/storage/model/expression/expression.py +656 -0
  192. deltacat/storage/model/expression/visitor.py +248 -0
  193. deltacat/storage/model/interop.py +24 -0
  194. deltacat/storage/model/list_result.py +8 -0
  195. deltacat/storage/model/locator.py +93 -9
  196. deltacat/storage/model/manifest.py +643 -0
  197. deltacat/storage/model/metafile.py +1421 -0
  198. deltacat/storage/model/namespace.py +41 -18
  199. deltacat/storage/model/partition.py +443 -43
  200. deltacat/storage/model/scan/__init__.py +0 -0
  201. deltacat/storage/model/scan/push_down.py +46 -0
  202. deltacat/storage/model/scan/scan_plan.py +10 -0
  203. deltacat/storage/model/scan/scan_task.py +34 -0
  204. deltacat/storage/model/schema.py +3160 -0
  205. deltacat/storage/model/shard.py +51 -0
  206. deltacat/storage/model/sort_key.py +210 -13
  207. deltacat/storage/model/stream.py +215 -80
  208. deltacat/storage/model/table.py +134 -29
  209. deltacat/storage/model/table_version.py +333 -46
  210. deltacat/storage/model/transaction.py +1733 -0
  211. deltacat/storage/model/transform.py +274 -58
  212. deltacat/storage/model/types.py +138 -16
  213. deltacat/storage/util/__init__.py +0 -0
  214. deltacat/storage/util/scan_planner.py +26 -0
  215. deltacat/tests/_io/__init__.py +1 -0
  216. deltacat/tests/_io/reader/__init__.py +0 -0
  217. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  218. deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +8 -4
  219. deltacat/tests/aws/test_s3u.py +2 -31
  220. deltacat/tests/catalog/data/__init__.py +0 -0
  221. deltacat/tests/catalog/main/__init__.py +0 -0
  222. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  223. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  224. deltacat/tests/catalog/model/__init__.py +0 -0
  225. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  226. deltacat/tests/catalog/test_catalogs.py +321 -0
  227. deltacat/tests/catalog/test_default_catalog_impl.py +12154 -66
  228. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
  229. deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
  230. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
  231. deltacat/tests/compute/compact_partition_test_cases.py +23 -30
  232. deltacat/tests/compute/compactor/steps/test_repartition.py +14 -14
  233. deltacat/tests/compute/compactor/utils/test_io.py +125 -123
  234. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  235. deltacat/tests/compute/compactor_v2/test_compaction_session.py +387 -830
  236. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +70 -57
  237. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +0 -3
  238. deltacat/tests/compute/conftest.py +39 -0
  239. deltacat/tests/compute/converter/__init__.py +0 -0
  240. deltacat/tests/compute/converter/conftest.py +80 -0
  241. deltacat/tests/compute/converter/test_convert_session.py +826 -0
  242. deltacat/tests/compute/converter/utils.py +132 -0
  243. deltacat/tests/compute/resource_estimation/test_delta.py +88 -104
  244. deltacat/tests/compute/test_compact_partition_incremental.py +91 -98
  245. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +79 -97
  246. deltacat/tests/compute/test_compact_partition_params.py +16 -11
  247. deltacat/tests/compute/test_compact_partition_rebase.py +63 -93
  248. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +249 -220
  249. deltacat/tests/compute/test_janitor.py +236 -0
  250. deltacat/tests/compute/test_util_common.py +726 -46
  251. deltacat/tests/compute/test_util_constant.py +0 -1
  252. deltacat/tests/conftest.py +25 -0
  253. deltacat/tests/daft/__init__.py +0 -0
  254. deltacat/tests/daft/test_model.py +97 -0
  255. deltacat/tests/experimental/__init__.py +1 -0
  256. deltacat/tests/experimental/catalog/__init__.py +0 -0
  257. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  258. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  259. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  260. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  261. deltacat/tests/experimental/daft/__init__.py +0 -0
  262. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  263. deltacat/tests/experimental/storage/__init__.py +0 -0
  264. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  265. deltacat/tests/experimental/storage/rivulet/conftest.py +149 -0
  266. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  267. deltacat/tests/experimental/storage/rivulet/fs/test_file_location_provider.py +94 -0
  268. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  269. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  270. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  271. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  272. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  273. deltacat/tests/experimental/storage/rivulet/schema/test_schema.py +241 -0
  274. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  275. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  276. deltacat/tests/experimental/storage/rivulet/test_dataset.py +408 -0
  277. deltacat/tests/experimental/storage/rivulet/test_manifest.py +67 -0
  278. deltacat/tests/experimental/storage/rivulet/test_sst_interval_tree.py +232 -0
  279. deltacat/tests/experimental/storage/rivulet/test_utils.py +124 -0
  280. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  281. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_write_then_read.py +343 -0
  282. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_writer.py +79 -0
  283. deltacat/tests/experimental/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
  284. deltacat/tests/storage/__init__.py +0 -0
  285. deltacat/tests/storage/main/__init__.py +0 -0
  286. deltacat/tests/storage/main/test_main_storage.py +8204 -0
  287. deltacat/tests/storage/model/__init__.py +0 -0
  288. deltacat/tests/storage/model/test_delete_parameters.py +21 -0
  289. deltacat/tests/storage/model/test_expression.py +327 -0
  290. deltacat/tests/storage/model/test_manifest.py +129 -0
  291. deltacat/tests/storage/model/test_metafile_io.py +2440 -0
  292. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  293. deltacat/tests/storage/model/test_schema.py +479 -0
  294. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  295. deltacat/tests/storage/model/test_shard.py +24 -0
  296. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  297. deltacat/tests/storage/model/test_table_version.py +110 -0
  298. deltacat/tests/storage/model/test_transaction.py +653 -0
  299. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  300. deltacat/tests/test_deltacat_api.py +1064 -0
  301. deltacat/tests/test_exceptions.py +9 -5
  302. deltacat/tests/test_utils/filesystem.py +14 -0
  303. deltacat/tests/test_utils/message_pack_utils.py +54 -0
  304. deltacat/tests/test_utils/pyarrow.py +50 -26
  305. deltacat/tests/test_utils/storage.py +256 -4
  306. deltacat/tests/types/__init__.py +0 -0
  307. deltacat/tests/types/test_tables.py +104 -0
  308. deltacat/tests/utils/exceptions.py +22 -0
  309. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  310. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  311. deltacat/tests/utils/test_daft.py +124 -34
  312. deltacat/tests/utils/test_numpy.py +1193 -0
  313. deltacat/tests/utils/test_pandas.py +1106 -0
  314. deltacat/tests/utils/test_polars.py +1040 -0
  315. deltacat/tests/utils/test_pyarrow.py +1107 -258
  316. deltacat/types/media.py +345 -37
  317. deltacat/types/partial_download.py +1 -1
  318. deltacat/types/tables.py +2345 -47
  319. deltacat/utils/arguments.py +33 -1
  320. deltacat/utils/daft.py +824 -40
  321. deltacat/utils/export.py +61 -0
  322. deltacat/utils/filesystem.py +450 -0
  323. deltacat/utils/metafile_locator.py +74 -0
  324. deltacat/utils/numpy.py +118 -26
  325. deltacat/utils/pandas.py +577 -48
  326. deltacat/utils/polars.py +759 -0
  327. deltacat/utils/pyarrow.py +1212 -178
  328. deltacat/utils/ray_utils/concurrency.py +1 -1
  329. deltacat/utils/ray_utils/dataset.py +101 -10
  330. deltacat/utils/ray_utils/runtime.py +56 -4
  331. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  332. deltacat/utils/url.py +1325 -0
  333. deltacat-2.0.0.dist-info/METADATA +1163 -0
  334. deltacat-2.0.0.dist-info/RECORD +439 -0
  335. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
  336. deltacat/aws/redshift/__init__.py +0 -19
  337. deltacat/aws/redshift/model/manifest.py +0 -394
  338. deltacat/catalog/default_catalog_impl/__init__.py +0 -369
  339. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  340. deltacat/compute/merge_on_read/__init__.py +0 -4
  341. deltacat/compute/merge_on_read/daft.py +0 -40
  342. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  343. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  344. deltacat/io/dataset.py +0 -73
  345. deltacat/io/read_api.py +0 -143
  346. deltacat/storage/model/delete_parameters.py +0 -40
  347. deltacat/storage/model/partition_spec.py +0 -71
  348. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  349. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -397
  350. deltacat/tests/local_deltacat_storage/__init__.py +0 -1262
  351. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  352. deltacat/utils/s3fs.py +0 -21
  353. deltacat-1.1.38.dist-info/METADATA +0 -64
  354. deltacat-1.1.38.dist-info/RECORD +0 -219
  355. /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
  356. /deltacat/{compute/merge_on_read/model → catalog/main}/__init__.py +0 -0
  357. /deltacat/compute/{merge_on_read/utils → converter}/__init__.py +0 -0
  358. /deltacat/{io/aws → compute/converter/model}/__init__.py +0 -0
  359. /deltacat/{io/aws/redshift → compute/converter/pyiceberg}/__init__.py +0 -0
  360. /deltacat/{tests/io → compute/converter/steps}/__init__.py +0 -0
  361. /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
  362. /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
  363. /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
  364. /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
  365. /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
  366. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
  367. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
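The hunk below shows the full content of one of the new files; from its +745 line count and its rivulet imports it appears to correspond to deltacat/experimental/storage/rivulet/dataset.py (item 134 above, +745 -0).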
@@ -0,0 +1,745 @@
+ from __future__ import annotations
+
+ import logging
+ import itertools
+ import posixpath
+ from typing import Dict, List, Optional, Tuple, Iterable, Iterator
+
+ import pyarrow.fs
+ import pyarrow as pa
+ import pyarrow.dataset
+ import pyarrow.json
+ import pyarrow.csv
+ import pyarrow.parquet
+
+ from deltacat.constants import (
+     DEFAULT_NAMESPACE,
+     DEFAULT_PARTITION_ID,
+     DEFAULT_PARTITION_VALUES,
+     DEFAULT_STREAM_ID,
+     DEFAULT_TABLE_VERSION,
+ )
+ from deltacat.storage.model.partition import Partition, PartitionLocator
+ from deltacat.storage.model.shard import Shard, ShardingStrategy
+ from deltacat.storage.model.stream import Stream, StreamLocator
+ from deltacat.storage.model.transaction import TransactionOperationList
+ from deltacat.storage.model.types import CommitState, StreamFormat
+ from deltacat.experimental.storage.rivulet.fs.file_store import FileStore
+ from deltacat.experimental.storage.rivulet.fs.file_provider import FileProvider
+ from deltacat.experimental.storage.rivulet.reader.dataset_metastore import (
+     DatasetMetastore,
+ )
+ from deltacat.experimental.storage.rivulet import Schema, Field
+ from deltacat.utils.export import export_dataset
+ from .schema.schema import Datatype
+
+ from deltacat.experimental.storage.rivulet.reader.data_scan import DataScan
+ from deltacat.experimental.storage.rivulet.reader.dataset_reader import DatasetReader
+ from deltacat.experimental.storage.rivulet.reader.query_expression import (
+     QueryExpression,
+ )
+
+ from deltacat.experimental.storage.rivulet.writer.dataset_writer import DatasetWriter
+ from deltacat.experimental.storage.rivulet.writer.memtable_dataset_writer import (
+     MemtableDatasetWriter,
+ )
+
+ from deltacat.storage import (
+     Namespace,
+     NamespaceLocator,
+     Table,
+     TableLocator,
+     TableVersion,
+     TableVersionLocator,
+     Transaction,
+     TransactionOperation,
+     TransactionOperationType,
+ )
+ from deltacat import logs
+
+ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+
+ # These are the hardcoded default schema names.
+ ALL = "all"
+ DEFAULT = "default"
+
+
+ class FieldsAccessor:
+     """Accessor providing dict-style access to the fields of a Dataset, e.g. dataset.fields['name'].
+
+     All field mutation and access should go through this class or through the public helper
+     functions on the Dataset class, e.g. add_fields().
+     """
+
+     def __init__(self, dataset: Dataset):
+         self.dataset = dataset
+
+     def __getitem__(self, field_name: str) -> Field:
+         if field_name not in self.dataset.schemas[ALL]:
+             raise KeyError(f"Field '{field_name}' not found in dataset.")
+         return self.dataset.schemas[ALL][field_name]
+
+     def __setitem__(self, field_name: str, field: Field):
+         if not isinstance(field, Field):
+             raise TypeError("Value must be a Field object")
+         self.dataset.schemas[ALL][field_name] = field
+
+     def __delitem__(self, field_name: str):
+         if field_name not in self.dataset.schemas[ALL]:
+             raise ValueError(f"Field '{field_name}' does not exist.")
+         del self.dataset.schemas[ALL][field_name]
+         for schema in self.dataset._schemas.values():
+             if field_name in schema:
+                 del schema[field_name]
+
+     def __contains__(self, field_name: str) -> bool:
+         """Allows 'field_name in dataset.fields' checks."""
+         return field_name in self.dataset.schemas[ALL]
+
+     def __iter__(self):
+         return iter(self.dataset.schemas[ALL].items())
+
+     def __len__(self):
+         return len(self.dataset.schemas[ALL])
+
+     def __repr__(self):
+         return f"Fields({list(self.dataset.schemas[ALL].keys())})"
+
+     def add(
+         self,
+         name: str,
+         datatype: Datatype,
+         *,
+         schema_name: str = DEFAULT,
+         is_merge_key: bool = False,
+     ):
+         """Simple helper to add a field when you don't have a Field object."""
+         self.dataset.add_fields(
+             fields=[(name, datatype)],
+             schema_name=schema_name,
+             merge_keys=[name] if is_merge_key else None,
+         )
+
+
+ class SchemasAccessor:
+     """Accessor providing dict-style access to the schemas of a Dataset, e.g. dataset.schemas['all'].
+
+     All schema mutation and access should go through this class or through the public helper
+     functions on the Dataset class, e.g. add_fields().
+     """
+
+     def __init__(self, dataset: Dataset):
+         self.dataset = dataset
+
+     def __getitem__(self, name: str) -> Schema:
+         if name not in self.dataset._schemas:
+             raise KeyError(f"Schema '{name}' not found.")
+         return self.dataset._schemas[name]
+
+     def __setitem__(self, schema_name: str, field_names: List[str]) -> None:
+         self.dataset._add_fields_to_schema(
+             field_names=field_names, schema_name=schema_name
+         )
+
+     def __delitem__(self, schema_name: str) -> None:
+         if schema_name not in self.dataset._schemas:
+             raise ValueError(f"Schema '{schema_name}' does not exist.")
+         if schema_name == ALL:
+             raise ValueError("Cannot remove the 'all' schema.")
+         del self.dataset._schemas[schema_name]
+
+     def __contains__(self, schema_name: str) -> bool:
+         return schema_name in self.dataset._schemas
+
+     def __iter__(self) -> Iterator[str]:
+         return iter(self.dataset._schemas.keys())
+
+     def __len__(self) -> int:
+         return len(self.dataset._schemas)
+
+     def __repr__(self) -> str:
+         return f"SchemasAccessor({list(self.dataset._schemas.keys())})"
+
+
+ class Dataset:
+     def __init__(
+         self,
+         *,
+         dataset_name: str,
+         metadata_uri: Optional[str] = None,
+         schema: Optional[Schema] = None,
+         schema_name: Optional[str] = None,
+         filesystem: Optional[pyarrow.fs.FileSystem] = None,
+         namespace: Optional[str] = DEFAULT_NAMESPACE,
+     ):
+         """
+         Create an empty Dataset with an optional schema. This constructor is typically only
+         used for small, manually created datasets. Use the Dataset.from_*() methods to create
+         a dataset from existing data.
+
+         Args:
+             dataset_name: Unique identifier for the dataset.
+             metadata_uri: The directory in which to store the metadata folder
+                 ('.riv-meta-{dataset_name}') containing dataset metadata. If not provided,
+                 the current working directory is used.
+
+         Private Attributes:
+             _metadata_folder (str):
+                 The folder in which metadata for the dataset is kept. It is always named
+                 '.riv-meta-{dataset_name}' and stored under `metadata_uri`.
+             _schemas (dict[str, Schema]):
+                 Maps schemas by name (e.g., "default", "analytics"). This is how fields in
+                 the dataset are grouped and accessed.
+             _file_store (FileStore):
+                 The FileStore used by the Dataset class for reading and writing metadata files.
+             _file_provider (FileProvider):
+                 Used to resolve file URIs within the `_file_store`.
+             _metastore (DatasetMetastore):
+                 Uses the _file_store and _file_provider to manage metadata (schema, stats,
+                 file locations, manifests, etc.) for this Dataset.
+         """
+         if not dataset_name or not isinstance(dataset_name, str):
+             raise ValueError("Name must be a non-empty string")
+
+         self.dataset_name = dataset_name
+         self._schemas: Dict[str, Schema] = {ALL: Schema()}
+
+         self._metadata_folder = f".riv-meta-{dataset_name}"
+         path, filesystem = FileStore.filesystem(
+             metadata_uri or self._metadata_folder, filesystem
+         )
+         self._metadata_path = posixpath.join(path, self._metadata_folder)
+
+         self._table_name = dataset_name
+         self._table_version = DEFAULT_TABLE_VERSION
+         self._namespace = namespace
+         self._partition_id = DEFAULT_PARTITION_ID
+
+         self._create_metadata_directories()
+
+         # TODO: remove locator state here. The deltacat catalog and
+         # storage interface should remove the need to pass around locator state.
+         self._locator = PartitionLocator.at(
+             namespace=self._namespace,
+             table_name=self.dataset_name,
+             table_version=self._table_version,
+             stream_id=DEFAULT_STREAM_ID,
+             stream_format=StreamFormat.DELTACAT,
+             partition_values=DEFAULT_PARTITION_VALUES,
+             partition_id=self._partition_id,
+         )
+
+         self._file_store = FileStore(self._metadata_path, filesystem)
+         self._file_provider = FileProvider(
+             self._metadata_path, self._locator, self._file_store
+         )
+
+         self._metastore = DatasetMetastore(
+             self._metadata_path, self._file_provider, self._locator
+         )
+
+         self.fields = FieldsAccessor(self)
+         self.schemas = SchemasAccessor(self)
+
+         if schema:
+             self.add_schema(schema, schema_name=schema_name)
+
+     def _create_metadata_directories(self) -> List[str]:
+         """
+         Creates rivulet metadata files using deltacat transactions.
+         This is a temporary solution until deltacat storage is integrated.
+
+         {CATALOG_ROOT}/
+         ├── {NAMESPACE_ID}/
+         │   ├── {TABLE_ID}/
+         │   │   ├── {TABLE_VERSION}/
+         │   │   │   ├── {STREAM}/
+         │   │   │   │   ├── {PARTITION}/
+         │   │   │   │   │   ├── {DELTA}/
+         │   │   │   │   │   │   ├── rev/
+         │   │   │   │   │   │   │   ├── 00000000000000000001_create_<txn_id>.mpk  # Delta Metafile
+         │   │   │   │   │   └── ...
+
+         Currently, we assume **fixed** values for:
+         - Table Version → "table_version"
+         - Stream → "stream"
+         - Partition → "partition"
+
+         TODO: this will be replaced with the deltacat storage interface -
+         https://github.com/ray-project/deltacat/issues/477
+         TODO: Consider how to support **dynamic values** for these entities.
+         """
+         metafiles = [
+             Namespace.of(locator=NamespaceLocator.of(namespace=self._namespace)),
+             Table.of(
+                 locator=TableLocator.at(self._namespace, self.dataset_name),
+                 description=f"Table for {self.dataset_name}",
+             ),
+             TableVersion.of(
+                 locator=TableVersionLocator.at(
+                     self._namespace, self.dataset_name, self._table_version
+                 ),
+                 schema=None,
+             ),
+             Stream.of(
+                 locator=StreamLocator.at(
+                     namespace=self._namespace,
+                     table_name=self.dataset_name,
+                     table_version=self._table_version,
+                     stream_id=DEFAULT_STREAM_ID,
+                     stream_format=StreamFormat.DELTACAT,
+                 ),
+                 partition_scheme=None,
+                 state=CommitState.STAGED,
+                 previous_stream_id=None,
+                 watermark=None,
+             ),
+             Partition.of(
+                 locator=PartitionLocator.at(
+                     namespace=self._namespace,
+                     table_name=self.dataset_name,
+                     table_version=self._table_version,
+                     stream_id=DEFAULT_STREAM_ID,
+                     stream_format=StreamFormat.DELTACAT,
+                     partition_values=DEFAULT_PARTITION_VALUES,
+                     partition_id=self._partition_id,
+                 ),
+                 content_types=None,
+             ),
+         ]
+
+         txn_operations = [
+             TransactionOperation.of(
+                 operation_type=TransactionOperationType.CREATE, dest_metafile=meta
+             )
+             for meta in metafiles
+         ]
+
+         transaction = Transaction.of(
+             txn_operations=TransactionOperationList.of(txn_operations),
+         )
+
+         try:
+             paths = transaction.commit(self._metadata_path)[0]
+             return paths
+         except Exception as e:
+             # TODO: Have the deltacat storage interface handle transaction errors.
+             error_message = str(e).lower()
+             if "already exists" in error_message:
+                 logger.debug(f"Skipping creation: {e}")
+                 return []
+             else:
+                 raise
+
+     @classmethod
+     def from_parquet(
+         cls,
+         name: str,
+         file_uri: str,
+         merge_keys: str | Iterable[str],
+         metadata_uri: Optional[str] = None,
+         schema_mode: str = "union",
+         filesystem: Optional[pyarrow.fs.FileSystem] = None,
+         namespace: str = DEFAULT_NAMESPACE,
+     ) -> Dataset:
+         """
+         Create a Dataset from parquet files.
+
+         TODO: Make pluggable (from_x) with other file formats.
+
+         Args:
+             name: Unique identifier for the dataset.
+             metadata_uri: Base URI for the dataset, where dataset metadata is stored. If not
+                 specified, a 'riv-meta' directory is created alongside file_uri.
+             file_uri: Path to parquet file(s).
+             merge_keys: Fields to specify as merge keys for future 'zipper merge' operations
+                 on the dataset.
+             schema_mode: Schema combination mode. Options:
+                 - 'union': Use the unified schema with all columns.
+                 - 'intersect': Use only the columns common to all files.
+
+         Returns:
+             Dataset: New dataset instance with the schema automatically inferred from the
+                 source parquet files.
+         """
+         # TODO: integrate this with the filesystem from the deltacat catalog.
+         file_uri, file_fs = FileStore.filesystem(file_uri, filesystem=filesystem)
+         if metadata_uri is None:
+             metadata_uri = posixpath.join(posixpath.dirname(file_uri), "riv-meta")
+             # The default metadata location lives next to the data, on the same filesystem.
+             metadata_fs = file_fs
+         else:
+             metadata_uri, metadata_fs = FileStore.filesystem(
+                 metadata_uri, filesystem=filesystem
+             )
+
+         # TODO: when integrating deltacat, consider if we can support multiple filesystems.
+         if file_fs.type_name != metadata_fs.type_name:
+             raise ValueError(
+                 "File URI and metadata URI must be on the same filesystem."
+             )
+         pyarrow_dataset = pyarrow.dataset.dataset(file_uri, filesystem=file_fs)
+
+         schemas = []
+         for file in pyarrow_dataset.files:
+             with file_fs.open_input_file(file) as f:
+                 schemas.append(pyarrow.parquet.read_schema(f))
+
+         if schema_mode == "intersect":
+             common_columns = set(schemas[0].names)
+             for schema in schemas[1:]:
+                 common_columns.intersection_update(schema.names)
+             pyarrow_schema = pa.schema(
+                 [(name, schemas[0].field(name).type) for name in common_columns]
+             )
+         else:
+             pyarrow_schema = pa.unify_schemas(schemas)
+
+         dataset_schema = Schema.from_pyarrow(pyarrow_schema, merge_keys)
+
+         # TODO: the file URI never gets stored/saved; do we need to do so?
+         dataset = cls(
+             dataset_name=name,
+             metadata_uri=metadata_uri,
+             schema=dataset_schema,
+             filesystem=file_fs,
+             namespace=namespace,
+         )
+
+         # TODO: avoid the write! Associate fields with their source data instead.
+         writer = dataset.writer()
+
+         for batch in pyarrow_dataset.scanner().to_batches():
+             writer.write(batch)
+         writer.flush()
+
+         return dataset
+
+     @classmethod
+     def from_json(
+         cls,
+         name: str,
+         file_uri: str,
+         merge_keys: str | Iterable[str],
+         metadata_uri: Optional[str] = None,
+         schema_mode: str = "union",
+         filesystem: Optional[pyarrow.fs.FileSystem] = None,
+         namespace: str = DEFAULT_NAMESPACE,
+     ) -> "Dataset":
+         """
+         Create a Dataset from a single JSON file.
+
+         TODO: Add support for reading directories with multiple JSON files.
+
+         Args:
+             name: Unique identifier for the dataset.
+             metadata_uri: Base URI for the dataset, where dataset metadata is stored. If not
+                 specified, a 'riv-meta' directory is created alongside file_uri.
+             file_uri: Path to a single JSON file.
+             merge_keys: Fields to specify as merge keys for future 'zipper merge' operations
+                 on the dataset.
+             schema_mode: Currently ignored, as this reads a single file.
+
+         Returns:
+             Dataset: New dataset instance with the schema automatically inferred
+                 from the JSON file.
+         """
+         # TODO: integrate this with the filesystem from the deltacat catalog.
+         file_uri, file_fs = FileStore.filesystem(file_uri, filesystem=filesystem)
+         if metadata_uri is None:
+             metadata_uri = posixpath.join(posixpath.dirname(file_uri), "riv-meta")
+             # The default metadata location lives next to the data, on the same filesystem.
+             metadata_fs = file_fs
+         else:
+             metadata_uri, metadata_fs = FileStore.filesystem(
+                 metadata_uri, filesystem=filesystem
+             )
+
+         # TODO: when integrating deltacat, consider if we can support multiple filesystems.
+         if file_fs.type_name != metadata_fs.type_name:
+             raise ValueError(
+                 "File URI and metadata URI must be on the same filesystem."
+             )
+
+         # Read the JSON file into a PyArrow Table. pyarrow.json.read_json takes a path or
+         # file object (not a filesystem), so open the file through the resolved filesystem.
+         with file_fs.open_input_stream(file_uri) as f:
+             pyarrow_table = pyarrow.json.read_json(f)
+         pyarrow_schema = pyarrow_table.schema
+
+         # Create the dataset schema
+         dataset_schema = Schema.from_pyarrow(pyarrow_schema, merge_keys)
+
+         # Create the Dataset instance
+         dataset = cls(
+             dataset_name=name,
+             metadata_uri=metadata_uri,
+             schema=dataset_schema,
+             filesystem=file_fs,
+             namespace=namespace,
+         )
+
+         writer = dataset.writer()
+         writer.write(pyarrow_table.to_batches())
+         writer.flush()
+
+         return dataset
+
+     @classmethod
+     def from_csv(
+         cls,
+         name: str,
+         file_uri: str,
+         merge_keys: str | Iterable[str],
+         metadata_uri: Optional[str] = None,
+         schema_mode: str = "union",
+         filesystem: Optional[pyarrow.fs.FileSystem] = None,
+         namespace: str = DEFAULT_NAMESPACE,
+     ) -> "Dataset":
+         """
+         Create a Dataset from a single CSV file.
+
+         TODO: Add support for reading directories with multiple CSV files.
+
+         Args:
+             name: Unique identifier for the dataset.
+             metadata_uri: Base URI for the dataset, where dataset metadata is stored. If not
+                 specified, a 'riv-meta' directory is created alongside file_uri.
+             file_uri: Path to a single CSV file.
+             merge_keys: Fields to specify as merge keys for future 'zipper merge' operations
+                 on the dataset.
+             schema_mode: Currently ignored, as this reads a single file.
+
+         Returns:
+             Dataset: New dataset instance with the schema automatically inferred
+                 from the CSV file.
+         """
+         # TODO: integrate this with the filesystem from the deltacat catalog.
+         file_uri, file_fs = FileStore.filesystem(file_uri, filesystem=filesystem)
+         if metadata_uri is None:
+             metadata_uri = posixpath.join(posixpath.dirname(file_uri), "riv-meta")
+             # The default metadata location lives next to the data, on the same filesystem.
+             metadata_fs = file_fs
+         else:
+             metadata_uri, metadata_fs = FileStore.filesystem(
+                 metadata_uri, filesystem=filesystem
+             )
+
+         # TODO: when integrating deltacat, consider if we can support multiple filesystems.
+         if file_fs.type_name != metadata_fs.type_name:
+             raise ValueError(
+                 "File URI and metadata URI must be on the same filesystem."
+             )
+
+         # Read the CSV file into a PyArrow Table. pyarrow.csv.read_csv takes a path or
+         # file object (not a filesystem), so open the file through the resolved filesystem.
+         with file_fs.open_input_stream(file_uri) as f:
+             table = pyarrow.csv.read_csv(f)
+         pyarrow_schema = table.schema
+
+         # Create the dataset schema
+         dataset_schema = Schema.from_pyarrow(pyarrow_schema, merge_keys)
+
+         # Create the Dataset instance
+         dataset = cls(
+             dataset_name=name,
+             metadata_uri=metadata_uri,
+             schema=dataset_schema,
+             filesystem=file_fs,
+             namespace=namespace,
+         )
+
+         writer = dataset.writer()
+         writer.write(table.to_batches())
+         writer.flush()
+
+         return dataset
+
+     def print(self, num_records: int = 10) -> None:
+         """Prints the first `num_records` records in the dataset."""
+         records = self.scan().to_pydict()
+         for record in itertools.islice(records, num_records):
+             print(record)
+
+     def export(
+         self,
+         file_uri: str,
+         format: str = "parquet",
+         query: QueryExpression = QueryExpression(),
+     ) -> None:
+         """Export the dataset to a file.
+
+         Args:
+             file_uri: The URI to write the dataset to.
+             format: The format to write the dataset in. Options are [parquet, feather].
+             query: Optional query expression used to filter the exported records.
+         """
+         export_dataset(self, file_uri, format, query)
+
+     def _add_fields_to_schema(
+         self,
+         field_names: Iterable[str],
+         schema_name: str,
+     ) -> None:
+         """
+         Internal helper to add existing fields to a new or existing schema (creating the
+         schema if it doesn't exist). Note: this function errors if any field does not
+         already exist (rather than adding it).
+
+         Args:
+             field_names: List of field names to add to the schema.
+             schema_name: Name of the schema.
+
+         Raises:
+             ValueError: If any field does not exist in the dataset.
+         """
+         # Input validation: ensure all fields exist.
+         for name in field_names:
+             if name not in self.schemas[ALL]:
+                 raise ValueError(f"Field '{name}' does not exist in the dataset.")
+
+         # Begin adding schema/fields to the schema map. This must complete as a transaction
+         # without error, or the schemas will be left in an undefined state.
+         # TODO: This is not threadsafe.
+
+         # Create the empty schema if it doesn't exist.
+         if schema_name not in self._schemas:
+             self._schemas[schema_name] = Schema()
+
+         # Add the (existing) fields from the 'all' schema to the target schema.
+         for name in field_names:
+             self._schemas[schema_name].add_field(self.schemas[ALL][name])
+
+     def add_fields(
+         self,
+         fields: Iterable[Tuple[str, Datatype] | Field],
+         schema_name: str = DEFAULT,
+         merge_keys: Optional[Iterable[str]] = None,
+     ) -> None:
+         """
+         Helper to add a set of new fields, place them under a new or existing schema,
+         and designate merge keys, all in a single call.
+
+         This can also be done field by field using:
+         * dataset.fields.add(name=.., datatype=.., ...)
+
+         Or it can be done using add_schema().
+
+         Args:
+             fields: List of tuples (name, datatype) or Field objects.
+             schema_name: User-defined name for the group of fields.
+             merge_keys: Optional list of field names to set as merge keys.
+
+         Raises:
+             ValueError: If any field has the same name as an existing field.
+         """
+         if not fields:
+             raise ValueError("No fields provided.")
+         merge_keys = merge_keys or {}
+
+         # Convert all input tuples to Field objects.
+         processed_fields = []
+         field_names = set()
+
+         for field in fields:
+             if isinstance(field, tuple):
+                 name, datatype = field
+                 processed_field = Field(
+                     name=name, datatype=datatype, is_merge_key=(name in merge_keys)
+                 )
+             elif isinstance(field, Field):
+                 processed_field = field
+                 name = field.name
+                 # Check whether the merge key status on the field conflicts with the
+                 # status implied by the merge_keys list.
+                 if name in merge_keys:
+                     if processed_field.is_merge_key is not True:
+                         raise TypeError(
+                             f"Merge key status conflict for field '{name}'. "
+                             f"Field({name}).is_merge_key is set to 'false', but '{name}' was provided in the merge_keys list. "
+                             f"Remove {name} from merge_keys or change Field({name}).is_merge_key to true."
+                         )
+             else:
+                 raise TypeError(f"Unexpected field type: {type(field)}")
+
+             processed_fields.append(processed_field)
+             field_names.add(name)
+
+         # Input validation: check that all defined merge_keys are present in the fields
+         # being added.
+         if merge_keys:
+             missing_keys = set(merge_keys) - field_names
+             if missing_keys:
+                 raise ValueError(
+                     f"The following merge keys were not found in the provided fields: {', '.join(missing_keys)}"
+                 )
+
+         # Add/update the schema.
+         self.add_schema(Schema(processed_fields), schema_name=schema_name)
+
+     def add_schema(self, schema: Schema, schema_name: str = DEFAULT) -> None:
+         """
+         Merges the provided schema into the existing named schema, or creates a new schema
+         if it doesn't exist. Also adds all fields to the 'all' schema.
+
+         Args:
+             schema: The Schema to add or merge into the named dataset schema.
+             schema_name: The name of the schema to update or create. Defaults to "default".
+
+         Raises:
+             ValueError: If fields in the provided schema conflict with existing fields in the dataset.
+         """
+         schema_name = schema_name or DEFAULT
+
+         # Check for any fields that already exist with a different definition.
+         for field in schema.values():
+             if field.name in self.schemas[ALL]:
+                 existing_field = self.schemas[ALL][field.name]
+                 if existing_field is not None and field != existing_field:
+                     raise ValueError(
+                         f"Field '{field.name}' already exists and is of a different type: New({field}) Existing({existing_field})."
+                     )
+
+         # Begin adding fields. This must complete as a transaction without error, or the
+         # field maps will be left in an undefined state.
+         # TODO: This is not threadsafe.
+
+         # Create the schema if it doesn't exist.
+         if schema_name not in self._schemas:
+             self._schemas[schema_name] = Schema()
+
+         # Merge the new schema into 'all' and the provided schema_name.
+         self._schemas[schema_name].merge(schema)
+         self._schemas[ALL].merge(schema)
+
+     def get_merge_keys(self) -> Iterable[str]:
+         """Return a list of all merge keys."""
+         return self.schemas[ALL].get_merge_keys()
+
+     def writer(
+         self,
+         schema_name: Optional[str] = None,
+         file_format: str | None = None,
+     ) -> DatasetWriter:
+         """Create a new (stateful) writer bound to the named schema.
+
+         Invoking this will register any unregistered schemas.
+
+         :param schema_name: The schema to use for the write; if None, uses the 'all' schema.
+         :param file_format: Write data in this format. Options are [parquet, feather]. If not
+             specified, the library will choose based on the schema.
+         :return: A new dataset writer bound to the resolved schema.
+         """
+         schema_name = schema_name or ALL
+
+         return MemtableDatasetWriter(
+             self._file_provider, self.schemas[schema_name], self._locator, file_format
+         )
+
+     def shards(
+         self,
+         num_shards: int,
+         strategy: str = "range",
+     ) -> Iterable[Shard]:
+         """Create a set of shards for this dataset.
+
+         :param num_shards: The number of shards to create.
+         :param strategy: Sharding strategy used to create the shards.
+         :return Iterable[Shard]: A set of shards for this dataset.
+         """
+         return ShardingStrategy.from_string(strategy).shards(
+             num_shards, self._metastore
+         )
+
+     def scan(
+         self,
+         query: QueryExpression = QueryExpression(),
+         schema_name: str = ALL,
+         shard: Optional[Shard] = None,
+     ) -> DataScan:
+         dataset_reader = DatasetReader(self._metastore)
+         return DataScan(self.schemas[schema_name], query, dataset_reader, shard=shard)
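
For orientation, here is a minimal usage sketch of the Dataset API added above. It is an editor's illustration, not code from the package: it exercises only methods defined in this file (from_parquet, scan, export), and the paths and the user_id column name are hypothetical.

from deltacat.experimental.storage.rivulet.dataset import Dataset

# Infer a schema from existing parquet files; 'user_id' becomes a merge key.
ds = Dataset.from_parquet(
    name="users",
    file_uri="/data/users",        # hypothetical parquet file or directory
    merge_keys="user_id",          # hypothetical column name
    schema_mode="union",           # unified schema across all source files
)

# scan() reads records back against the 'all' schema by default.
for record in ds.scan().to_pydict():
    print(record)

# Export a copy of the dataset; format options per export(): parquet, feather.
ds.export("/tmp/users.feather", format="feather")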
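
Similarly, shards() together with the shard parameter of scan() suggests the intended pattern for partitioned reads; continuing the sketch above under the same assumptions:

# Split the dataset into 4 range-based shards and read each independently
# (e.g., one shard per worker). Uses only shards() and scan() from above.
for shard in ds.shards(num_shards=4, strategy="range"):
    for record in ds.scan(shard=shard).to_pydict():
        process(record)  # hypothetical per-record handler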