deltacat 1.1.38__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (367)
  1. deltacat/__init__.py +150 -12
  2. deltacat/annotations.py +36 -0
  3. deltacat/api.py +578 -0
  4. deltacat/aws/constants.py +0 -23
  5. deltacat/aws/s3u.py +4 -631
  6. deltacat/benchmarking/benchmark_engine.py +84 -0
  7. deltacat/benchmarking/benchmark_report.py +86 -0
  8. deltacat/benchmarking/benchmark_suite.py +11 -0
  9. deltacat/benchmarking/conftest.py +22 -19
  10. deltacat/benchmarking/data/random_row_generator.py +94 -0
  11. deltacat/benchmarking/data/row_generator.py +10 -0
  12. deltacat/benchmarking/test_benchmark_pipeline.py +108 -0
  13. deltacat/catalog/__init__.py +73 -0
  14. deltacat/catalog/delegate.py +615 -140
  15. deltacat/catalog/interface.py +404 -81
  16. deltacat/catalog/main/impl.py +2882 -0
  17. deltacat/catalog/model/catalog.py +348 -46
  18. deltacat/catalog/model/properties.py +155 -0
  19. deltacat/catalog/model/table_definition.py +32 -1
  20. deltacat/compute/__init__.py +14 -0
  21. deltacat/compute/compactor/compaction_session.py +97 -75
  22. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  23. deltacat/compute/compactor/model/compaction_session_audit_info.py +23 -30
  24. deltacat/compute/compactor/model/delta_annotated.py +3 -3
  25. deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
  26. deltacat/compute/compactor/model/delta_file_locator.py +3 -1
  27. deltacat/compute/compactor/model/round_completion_info.py +19 -9
  28. deltacat/compute/compactor/model/table_object_store.py +3 -2
  29. deltacat/compute/compactor/repartition_session.py +9 -22
  30. deltacat/compute/compactor/steps/dedupe.py +11 -4
  31. deltacat/compute/compactor/steps/hash_bucket.py +6 -6
  32. deltacat/compute/compactor/steps/materialize.py +15 -9
  33. deltacat/compute/compactor/steps/repartition.py +12 -11
  34. deltacat/compute/compactor/utils/io.py +7 -6
  35. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  36. deltacat/compute/compactor/utils/sort_key.py +9 -2
  37. deltacat/compute/compactor/utils/system_columns.py +3 -1
  38. deltacat/compute/compactor_v2/compaction_session.py +13 -14
  39. deltacat/compute/compactor_v2/deletes/utils.py +3 -3
  40. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  41. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  42. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  43. deltacat/compute/compactor_v2/model/merge_input.py +28 -9
  44. deltacat/compute/compactor_v2/private/compaction_utils.py +171 -73
  45. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  46. deltacat/compute/compactor_v2/steps/merge.py +156 -53
  47. deltacat/compute/compactor_v2/utils/content_type_params.py +17 -6
  48. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  49. deltacat/compute/compactor_v2/utils/io.py +10 -3
  50. deltacat/compute/compactor_v2/utils/merge.py +14 -2
  51. deltacat/compute/compactor_v2/utils/task_options.py +2 -10
  52. deltacat/compute/converter/constants.py +9 -0
  53. deltacat/compute/converter/converter_session.py +298 -0
  54. deltacat/compute/converter/model/convert_input.py +96 -0
  55. deltacat/compute/converter/model/convert_input_files.py +78 -0
  56. deltacat/compute/converter/model/convert_result.py +80 -0
  57. deltacat/compute/converter/model/converter_session_params.py +144 -0
  58. deltacat/compute/converter/pyiceberg/catalog.py +78 -0
  59. deltacat/compute/converter/pyiceberg/overrides.py +263 -0
  60. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +299 -0
  61. deltacat/compute/converter/steps/convert.py +366 -0
  62. deltacat/compute/converter/steps/dedupe.py +94 -0
  63. deltacat/compute/converter/utils/__init__.py +0 -0
  64. deltacat/compute/converter/utils/convert_task_options.py +132 -0
  65. deltacat/compute/converter/utils/converter_session_utils.py +175 -0
  66. deltacat/compute/converter/utils/iceberg_columns.py +87 -0
  67. deltacat/compute/converter/utils/io.py +203 -0
  68. deltacat/compute/converter/utils/s3u.py +148 -0
  69. deltacat/compute/janitor.py +205 -0
  70. deltacat/compute/jobs/__init__.py +0 -0
  71. deltacat/compute/jobs/client.py +417 -0
  72. deltacat/compute/resource_estimation/delta.py +11 -1
  73. deltacat/constants.py +90 -1
  74. deltacat/docs/__init__.py +0 -0
  75. deltacat/docs/autogen/__init__.py +0 -0
  76. deltacat/docs/autogen/schema/__init__.py +0 -0
  77. deltacat/docs/autogen/schema/inference/__init__.py +0 -0
  78. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  79. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  80. deltacat/env.py +61 -0
  81. deltacat/examples/__init__.py +0 -0
  82. deltacat/examples/basic_logging.py +101 -0
  83. deltacat/examples/compactor/__init__.py +0 -0
  84. deltacat/examples/compactor/aws/__init__.py +1 -0
  85. deltacat/examples/compactor/bootstrap.py +863 -0
  86. deltacat/examples/compactor/compactor.py +373 -0
  87. deltacat/examples/compactor/explorer.py +473 -0
  88. deltacat/examples/compactor/gcp/__init__.py +1 -0
  89. deltacat/examples/compactor/job_runner.py +439 -0
  90. deltacat/examples/compactor/utils/__init__.py +1 -0
  91. deltacat/examples/compactor/utils/common.py +261 -0
  92. deltacat/examples/experimental/__init__.py +0 -0
  93. deltacat/examples/experimental/iceberg/__init__.py +0 -0
  94. deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
  95. deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
  96. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  97. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  98. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  99. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  100. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  101. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  102. deltacat/examples/experimental/iceberg/iceberg_bucket_writer.py +184 -0
  103. deltacat/examples/experimental/iceberg/iceberg_reader.py +147 -0
  104. deltacat/examples/hello_world.py +29 -0
  105. deltacat/examples/indexer/__init__.py +0 -0
  106. deltacat/examples/indexer/aws/__init__.py +0 -0
  107. deltacat/examples/indexer/gcp/__init__.py +0 -0
  108. deltacat/examples/indexer/indexer.py +163 -0
  109. deltacat/examples/indexer/job_runner.py +198 -0
  110. deltacat/exceptions.py +116 -12
  111. deltacat/experimental/__init__.py +0 -0
  112. deltacat/experimental/catalog/__init__.py +0 -0
  113. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  114. deltacat/experimental/catalog/iceberg/iceberg_catalog_config.py +26 -0
  115. deltacat/experimental/catalog/iceberg/impl.py +399 -0
  116. deltacat/experimental/catalog/iceberg/overrides.py +72 -0
  117. deltacat/experimental/compatibility/__init__.py +0 -0
  118. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  119. deltacat/experimental/converter_agent/__init__.py +0 -0
  120. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  121. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  122. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  123. deltacat/experimental/daft/__init__.py +4 -0
  124. deltacat/experimental/daft/daft_catalog.py +229 -0
  125. deltacat/experimental/storage/__init__.py +0 -0
  126. deltacat/experimental/storage/iceberg/__init__.py +0 -0
  127. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  128. deltacat/experimental/storage/iceberg/impl.py +739 -0
  129. deltacat/experimental/storage/iceberg/model.py +713 -0
  130. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  131. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  132. deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
  133. deltacat/experimental/storage/rivulet/arrow/serializer.py +78 -0
  134. deltacat/experimental/storage/rivulet/dataset.py +745 -0
  135. deltacat/experimental/storage/rivulet/dataset_executor.py +79 -0
  136. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  137. deltacat/experimental/storage/rivulet/feather/file_reader.py +138 -0
  138. deltacat/experimental/storage/rivulet/feather/serializer.py +35 -0
  139. deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
  140. deltacat/experimental/storage/rivulet/fs/file_provider.py +105 -0
  141. deltacat/experimental/storage/rivulet/fs/file_store.py +130 -0
  142. deltacat/experimental/storage/rivulet/fs/input_file.py +76 -0
  143. deltacat/experimental/storage/rivulet/fs/output_file.py +86 -0
  144. deltacat/experimental/storage/rivulet/logical_plan.py +105 -0
  145. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  146. deltacat/experimental/storage/rivulet/metastore/delta.py +188 -0
  147. deltacat/experimental/storage/rivulet/metastore/json_sst.py +105 -0
  148. deltacat/experimental/storage/rivulet/metastore/sst.py +82 -0
  149. deltacat/experimental/storage/rivulet/metastore/sst_interval_tree.py +260 -0
  150. deltacat/experimental/storage/rivulet/mvp/Table.py +101 -0
  151. deltacat/experimental/storage/rivulet/mvp/__init__.py +5 -0
  152. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  153. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  154. deltacat/experimental/storage/rivulet/parquet/file_reader.py +129 -0
  155. deltacat/experimental/storage/rivulet/parquet/serializer.py +37 -0
  156. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  157. deltacat/experimental/storage/rivulet/reader/block_scanner.py +389 -0
  158. deltacat/experimental/storage/rivulet/reader/data_reader.py +136 -0
  159. deltacat/experimental/storage/rivulet/reader/data_scan.py +65 -0
  160. deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +179 -0
  161. deltacat/experimental/storage/rivulet/reader/dataset_reader.py +158 -0
  162. deltacat/experimental/storage/rivulet/reader/pyarrow_data_reader.py +124 -0
  163. deltacat/experimental/storage/rivulet/reader/query_expression.py +99 -0
  164. deltacat/experimental/storage/rivulet/reader/reader_type_registrar.py +84 -0
  165. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  166. deltacat/experimental/storage/rivulet/schema/datatype.py +128 -0
  167. deltacat/experimental/storage/rivulet/schema/schema.py +251 -0
  168. deltacat/experimental/storage/rivulet/serializer.py +40 -0
  169. deltacat/experimental/storage/rivulet/serializer_factory.py +46 -0
  170. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  171. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  172. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  173. deltacat/experimental/storage/rivulet/writer/dataset_writer.py +29 -0
  174. deltacat/experimental/storage/rivulet/writer/memtable_dataset_writer.py +305 -0
  175. deltacat/io/__init__.py +13 -0
  176. deltacat/io/dataset/__init__.py +0 -0
  177. deltacat/io/dataset/deltacat_dataset.py +91 -0
  178. deltacat/io/datasink/__init__.py +0 -0
  179. deltacat/io/datasink/deltacat_datasink.py +207 -0
  180. deltacat/io/datasource/__init__.py +0 -0
  181. deltacat/io/datasource/deltacat_datasource.py +579 -0
  182. deltacat/io/reader/__init__.py +0 -0
  183. deltacat/io/reader/deltacat_read_api.py +172 -0
  184. deltacat/logs.py +4 -1
  185. deltacat/storage/__init__.py +138 -28
  186. deltacat/storage/interface.py +260 -155
  187. deltacat/storage/main/__init__.py +0 -0
  188. deltacat/storage/main/impl.py +3030 -0
  189. deltacat/storage/model/delta.py +142 -71
  190. deltacat/storage/model/expression/__init__.py +47 -0
  191. deltacat/storage/model/expression/expression.py +656 -0
  192. deltacat/storage/model/expression/visitor.py +248 -0
  193. deltacat/storage/model/interop.py +24 -0
  194. deltacat/storage/model/list_result.py +8 -0
  195. deltacat/storage/model/locator.py +93 -9
  196. deltacat/storage/model/manifest.py +643 -0
  197. deltacat/storage/model/metafile.py +1421 -0
  198. deltacat/storage/model/namespace.py +41 -18
  199. deltacat/storage/model/partition.py +443 -43
  200. deltacat/storage/model/scan/__init__.py +0 -0
  201. deltacat/storage/model/scan/push_down.py +46 -0
  202. deltacat/storage/model/scan/scan_plan.py +10 -0
  203. deltacat/storage/model/scan/scan_task.py +34 -0
  204. deltacat/storage/model/schema.py +3160 -0
  205. deltacat/storage/model/shard.py +51 -0
  206. deltacat/storage/model/sort_key.py +210 -13
  207. deltacat/storage/model/stream.py +215 -80
  208. deltacat/storage/model/table.py +134 -29
  209. deltacat/storage/model/table_version.py +333 -46
  210. deltacat/storage/model/transaction.py +1733 -0
  211. deltacat/storage/model/transform.py +274 -58
  212. deltacat/storage/model/types.py +138 -16
  213. deltacat/storage/util/__init__.py +0 -0
  214. deltacat/storage/util/scan_planner.py +26 -0
  215. deltacat/tests/_io/__init__.py +1 -0
  216. deltacat/tests/_io/reader/__init__.py +0 -0
  217. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  218. deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +8 -4
  219. deltacat/tests/aws/test_s3u.py +2 -31
  220. deltacat/tests/catalog/data/__init__.py +0 -0
  221. deltacat/tests/catalog/main/__init__.py +0 -0
  222. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  223. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  224. deltacat/tests/catalog/model/__init__.py +0 -0
  225. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  226. deltacat/tests/catalog/test_catalogs.py +321 -0
  227. deltacat/tests/catalog/test_default_catalog_impl.py +12154 -66
  228. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
  229. deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
  230. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
  231. deltacat/tests/compute/compact_partition_test_cases.py +23 -30
  232. deltacat/tests/compute/compactor/steps/test_repartition.py +14 -14
  233. deltacat/tests/compute/compactor/utils/test_io.py +125 -123
  234. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  235. deltacat/tests/compute/compactor_v2/test_compaction_session.py +387 -830
  236. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +70 -57
  237. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +0 -3
  238. deltacat/tests/compute/conftest.py +39 -0
  239. deltacat/tests/compute/converter/__init__.py +0 -0
  240. deltacat/tests/compute/converter/conftest.py +80 -0
  241. deltacat/tests/compute/converter/test_convert_session.py +826 -0
  242. deltacat/tests/compute/converter/utils.py +132 -0
  243. deltacat/tests/compute/resource_estimation/test_delta.py +88 -104
  244. deltacat/tests/compute/test_compact_partition_incremental.py +91 -98
  245. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +79 -97
  246. deltacat/tests/compute/test_compact_partition_params.py +16 -11
  247. deltacat/tests/compute/test_compact_partition_rebase.py +63 -93
  248. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +249 -220
  249. deltacat/tests/compute/test_janitor.py +236 -0
  250. deltacat/tests/compute/test_util_common.py +726 -46
  251. deltacat/tests/compute/test_util_constant.py +0 -1
  252. deltacat/tests/conftest.py +25 -0
  253. deltacat/tests/daft/__init__.py +0 -0
  254. deltacat/tests/daft/test_model.py +97 -0
  255. deltacat/tests/experimental/__init__.py +1 -0
  256. deltacat/tests/experimental/catalog/__init__.py +0 -0
  257. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  258. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  259. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  260. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  261. deltacat/tests/experimental/daft/__init__.py +0 -0
  262. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  263. deltacat/tests/experimental/storage/__init__.py +0 -0
  264. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  265. deltacat/tests/experimental/storage/rivulet/conftest.py +149 -0
  266. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  267. deltacat/tests/experimental/storage/rivulet/fs/test_file_location_provider.py +94 -0
  268. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  269. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  270. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  271. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  272. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  273. deltacat/tests/experimental/storage/rivulet/schema/test_schema.py +241 -0
  274. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  275. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  276. deltacat/tests/experimental/storage/rivulet/test_dataset.py +408 -0
  277. deltacat/tests/experimental/storage/rivulet/test_manifest.py +67 -0
  278. deltacat/tests/experimental/storage/rivulet/test_sst_interval_tree.py +232 -0
  279. deltacat/tests/experimental/storage/rivulet/test_utils.py +124 -0
  280. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  281. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_write_then_read.py +343 -0
  282. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_writer.py +79 -0
  283. deltacat/tests/experimental/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
  284. deltacat/tests/storage/__init__.py +0 -0
  285. deltacat/tests/storage/main/__init__.py +0 -0
  286. deltacat/tests/storage/main/test_main_storage.py +8204 -0
  287. deltacat/tests/storage/model/__init__.py +0 -0
  288. deltacat/tests/storage/model/test_delete_parameters.py +21 -0
  289. deltacat/tests/storage/model/test_expression.py +327 -0
  290. deltacat/tests/storage/model/test_manifest.py +129 -0
  291. deltacat/tests/storage/model/test_metafile_io.py +2440 -0
  292. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  293. deltacat/tests/storage/model/test_schema.py +479 -0
  294. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  295. deltacat/tests/storage/model/test_shard.py +24 -0
  296. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  297. deltacat/tests/storage/model/test_table_version.py +110 -0
  298. deltacat/tests/storage/model/test_transaction.py +653 -0
  299. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  300. deltacat/tests/test_deltacat_api.py +1064 -0
  301. deltacat/tests/test_exceptions.py +9 -5
  302. deltacat/tests/test_utils/filesystem.py +14 -0
  303. deltacat/tests/test_utils/message_pack_utils.py +54 -0
  304. deltacat/tests/test_utils/pyarrow.py +50 -26
  305. deltacat/tests/test_utils/storage.py +256 -4
  306. deltacat/tests/types/__init__.py +0 -0
  307. deltacat/tests/types/test_tables.py +104 -0
  308. deltacat/tests/utils/exceptions.py +22 -0
  309. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  310. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  311. deltacat/tests/utils/test_daft.py +124 -34
  312. deltacat/tests/utils/test_numpy.py +1193 -0
  313. deltacat/tests/utils/test_pandas.py +1106 -0
  314. deltacat/tests/utils/test_polars.py +1040 -0
  315. deltacat/tests/utils/test_pyarrow.py +1107 -258
  316. deltacat/types/media.py +345 -37
  317. deltacat/types/partial_download.py +1 -1
  318. deltacat/types/tables.py +2345 -47
  319. deltacat/utils/arguments.py +33 -1
  320. deltacat/utils/daft.py +824 -40
  321. deltacat/utils/export.py +61 -0
  322. deltacat/utils/filesystem.py +450 -0
  323. deltacat/utils/metafile_locator.py +74 -0
  324. deltacat/utils/numpy.py +118 -26
  325. deltacat/utils/pandas.py +577 -48
  326. deltacat/utils/polars.py +759 -0
  327. deltacat/utils/pyarrow.py +1212 -178
  328. deltacat/utils/ray_utils/concurrency.py +1 -1
  329. deltacat/utils/ray_utils/dataset.py +101 -10
  330. deltacat/utils/ray_utils/runtime.py +56 -4
  331. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  332. deltacat/utils/url.py +1325 -0
  333. deltacat-2.0.0.dist-info/METADATA +1163 -0
  334. deltacat-2.0.0.dist-info/RECORD +439 -0
  335. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
  336. deltacat/aws/redshift/__init__.py +0 -19
  337. deltacat/aws/redshift/model/manifest.py +0 -394
  338. deltacat/catalog/default_catalog_impl/__init__.py +0 -369
  339. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  340. deltacat/compute/merge_on_read/__init__.py +0 -4
  341. deltacat/compute/merge_on_read/daft.py +0 -40
  342. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  343. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  344. deltacat/io/dataset.py +0 -73
  345. deltacat/io/read_api.py +0 -143
  346. deltacat/storage/model/delete_parameters.py +0 -40
  347. deltacat/storage/model/partition_spec.py +0 -71
  348. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  349. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -397
  350. deltacat/tests/local_deltacat_storage/__init__.py +0 -1262
  351. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  352. deltacat/utils/s3fs.py +0 -21
  353. deltacat-1.1.38.dist-info/METADATA +0 -64
  354. deltacat-1.1.38.dist-info/RECORD +0 -219
  355. /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
  356. /deltacat/{compute/merge_on_read/model → catalog/main}/__init__.py +0 -0
  357. /deltacat/compute/{merge_on_read/utils → converter}/__init__.py +0 -0
  358. /deltacat/{io/aws → compute/converter/model}/__init__.py +0 -0
  359. /deltacat/{io/aws/redshift → compute/converter/pyiceberg}/__init__.py +0 -0
  360. /deltacat/{tests/io → compute/converter/steps}/__init__.py +0 -0
  361. /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
  362. /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
  363. /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
  364. /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
  365. /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
  366. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
  367. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,128 @@
+ # Similar to daft's datatype, this is a big ole enum of all possible types
+ # In the long term, this will have to be interoperable with pandas/daft/spark/parquet/iceberg/etc type systems
+ # Our Spec will need to publish data type mappings, such as Iceberg's data type mappings: https://iceberg.apache.org/spec/#file-system-operations
+ # It also has the unique responsibility of representing multi-modal (e.g. image) types
+ from dataclasses import dataclass
+ from typing import Optional
+
+ import pyarrow as pa
+
+
+ # OPEN QUESTIONS:
+ # Do we want to support the notion of logical vs physical type like parquet?
+
+ # TODO turn into an interface or otherwise allow pluggable datatypes
+ @dataclass(frozen=True)
+ class Datatype:
+     type_name: str
+
+     @property
+     def subtype(self) -> Optional[str]:
+         """
+         Higher-level formats like binary or image have a "subtype", such as image(jpg) or binary(np_array).
+         TODO - Note that we are replacing this schema system with the DeltaCat schema model, which supports extended/decorated pyarrow types.
+         For now this is a super minimal/hacky implementation of types like binary and image, where the subtype is encoded in the type name.
+         :return: Subtype if it exists, or None
+         """
+         if not self.type_name.endswith(")"):
+             return None
+         if self.type_name.startswith("binary(") or self.type_name.startswith("image("):
+             return self.type_name[self.type_name.find("(") + 1 : -1]
+         return None
+
+     @classmethod
+     def binary(cls, binary_format):
+         """
+         :param binary_format: format of the binary payload, e.g. np_array
+         :return: a binary Datatype with the given subtype
+         """
+         return cls(type_name=f"binary({binary_format})")
+
+     @classmethod
+     def image(cls, image_format):
+         return cls(type_name=f"image({image_format})")
+
+     @classmethod
+     def string(cls):
+         return cls(type_name="string")
+
+     @classmethod
+     def float(cls):
+         return cls(type_name="float")
+
+     @classmethod
+     def int16(cls):
+         return cls(type_name="int16")
+
+     @classmethod
+     def int32(cls):
+         return cls(type_name="int32")
+
+     @classmethod
+     def int64(cls):
+         return cls(type_name="int64")
+
+     @classmethod
+     def bool(cls):
+         return cls(type_name="bool")
+
+     @classmethod
+     def from_pyarrow(cls, pa_type: pa.DataType) -> "Datatype":
+         """
+         Convert a pyarrow type to a Rivulet Datatype.
+
+         Args:
+             pa_type: pyarrow DataType to convert
+
+         Returns:
+             Datatype: Corresponding Rivulet Datatype
+
+         Raises:
+             ValueError: If the pyarrow type is not supported
+         """
+         if pa.types.is_string(pa_type):
+             return cls.string()
+         elif pa.types.is_float64(pa_type):
+             return cls.float()
+         elif pa.types.is_int16(pa_type):
+             return cls.int16()
+         elif pa.types.is_int32(pa_type):
+             return cls.int32()
+         elif pa.types.is_int64(pa_type):
+             return cls.int64()
+         elif pa.types.is_boolean(pa_type):
+             return cls.bool()
+         elif pa.types.is_binary(pa_type):
+             # TODO: Use pyarrow metadata on the schema field to map correctly into image and other binary types
+             return cls.binary("binary")  # Default binary format
+         else:
+             raise ValueError(f"Unsupported pyarrow type: {pa_type}")
+
+     def to_pyarrow(self) -> pa.DataType:
+         """
+         In the future we want to be more thoughtful about how we do type conversions.
+
+         For now, just build a simple mapping of every type to pyarrow.
+         For what it's worth, Daft schema types have a giant if/else like this.
+
+         :return: pyarrow type
+         """
+         if self.type_name == "string":
+             return pa.string()
+         elif self.type_name == "float":
+             return pa.float64()
+         elif self.type_name == "int16":
+             return pa.int16()
+         elif self.type_name == "int32":
+             return pa.int32()
+         elif self.type_name == "int64":
+             return pa.int64()
+         elif self.type_name == "bool":
+             return pa.bool_()
+         elif self.type_name.startswith("image(") or self.type_name.startswith("binary("):
+             # TODO we will need to think about how custom types work with tabular libraries
+             return pa.binary()
+         else:
+             raise ValueError(f"Unsupported type conversion to pyarrow: {self.type_name}")
@@ -0,0 +1,251 @@
+ from __future__ import annotations
+
+ from dataclasses import dataclass, asdict
+ from typing import MutableMapping, Dict, Iterable, Tuple, Optional
+
+ import pyarrow as pa
+
+ from deltacat.experimental.storage.rivulet.schema.datatype import Datatype
+
+
+ @dataclass(frozen=True)
+ class Field:
+     name: str
+     datatype: Datatype
+     is_merge_key: bool = False
+
+
+ class Schema(MutableMapping[str, Field]):
+     """
+     A mutable mapping representing a schema for structured data, requiring at least one merge key field.
+
+     TODO FUTURE ITERATIONS
+     1. We may use DeltaCat for schema
+     2. We almost certainly want our schema system based on arrow types,
+        since many libraries we are integrating with (e.g. daft) are
+        interoperable with arrow schemas
+
+     Attributes:
+         name: The name of the schema (for storing in dict/map)
+         _fields (dict): Maps field names to Field objects.
+
+     Methods:
+         from_pyarrow(pyarrow_schema: pa.Schema, merge_keys: str | Iterable[str]) -> Schema:
+             Creates a Schema instance from a PyArrow schema.
+
+         __len__() -> int: Returns number of fields.
+         __getitem__(key: str) -> Field: Gets field by name.
+         __setitem__(key: str, value: Field | Datatype): Adds/updates field.
+         __delitem__(key: str): Deletes field if not a merge key.
+         __iter__(): Iterates over fields.
+
+         add_field(field: Field): Adds a Field using its name as the key.
+         to_pyarrow() -> pa.Schema:
+             Converts schema to PyArrow format.
+
+         keys(): Returns field names.
+         values(): Returns Field objects.
+         items(): Returns (name, Field) pairs.
+     """
+
+     def __init__(
+         self,
+         fields: Iterable[Tuple[str, Datatype] | Field] = None,
+         merge_keys: Optional[Iterable[str]] = None,
+     ):
+         self._fields: Dict[str, Field] = {}
+         merge_keys = merge_keys or {}
+         if len(fields or []) == 0:
+             if len(merge_keys) > 0:
+                 raise TypeError(
+                     "It is invalid to specify merge keys when no fields are specified. Add fields or remove the merge keys."
+                 )
+             return
+         # Convert all input tuples to Field objects and add to fields
+         for field in fields:
+             if isinstance(field, tuple):
+                 name, datatype = field
+                 processed_field = Field(
+                     name=name, datatype=datatype, is_merge_key=(name in merge_keys)
+                 )
+             elif isinstance(field, Field):
+                 processed_field = field
+                 name = field.name
+                 # Check if merge key status conflicts
+                 if len(merge_keys) > 0:
+                     expected_merge_key_status = name in merge_keys
+                     if processed_field.is_merge_key != expected_merge_key_status:
+                         raise TypeError(
+                             f"Merge key status conflict for field '{name}': "
+                             f"Provided as merge key: {expected_merge_key_status}, "
+                             f"Field's current status: {processed_field.is_merge_key}. "
+                             f"Merge keys should only be defined if raw (name, Datatype) tuples are used."
+                         )
+             else:
+                 raise TypeError(f"Unexpected field type: {type(field)}")
+             self.add_field(processed_field)
+
+     @classmethod
+     def from_dict(cls, data) -> Schema:
+         fields = [
+             Field(
+                 name=field_data["name"],
+                 datatype=Datatype(**field_data["datatype"])
+                 if isinstance(field_data["datatype"], dict)
+                 else field_data["datatype"],
+                 is_merge_key=field_data["is_merge_key"],
+             )
+             for field_data in data["fields"]
+         ]
+         return cls(fields)
+
+     @classmethod
+     def from_pyarrow(
+         cls, pyarrow_schema: pa.Schema, merge_keys: Optional[str | Iterable[str]] = None
+     ) -> Schema:
+         """
+         Create a Schema instance from a PyArrow schema.
+
+         Args:
+             pyarrow_schema: PyArrow Schema to convert
+             merge_keys: The optional set of merge keys to add to the schema as it's being translated.
+                 These keys must be present in the schema.
+
+         Returns:
+             Schema: New Schema instance
+
+         Raises:
+             ValueError: If a merge key is not found in the schema
+         """
+         merge_keys = [merge_keys] if isinstance(merge_keys, str) else (merge_keys or [])
+         fields = {}
+
+         for field in pyarrow_schema:
+             dtype = Datatype.from_pyarrow(field.type)
+             fields[field.name] = Field(
+                 field.name, dtype, is_merge_key=(field.name in merge_keys)
+             )
+
+         # Validate that the defined merge_keys are present in the fields being added
+         missing_keys = set(merge_keys) - fields.keys()
+         if missing_keys:
+             raise ValueError(
+                 f"The following merge keys were not found in the provided schema: {', '.join(missing_keys)}"
+             )
+
+         return cls(fields.values())
+
+     @classmethod
+     def merge_all(cls, schemas: Iterable[Schema]) -> Schema:
+         """Merges a list of schemas into a new schema"""
+         merged = cls({})
+         for schema in schemas:
+             merged.merge(schema)
+         return merged
+
+     def __getitem__(self, key: str) -> Field:
+         return self._fields[key]
+
+     def __setitem__(
+         self, key: str, value: Field | Datatype | Tuple[Datatype, bool]
+     ) -> None:
+         # Create field from [str, Datatype, bool] where bool is merge_key
+         if isinstance(value, Field):
+             processed_field = value
+         elif isinstance(value, Datatype):
+             processed_field = Field(key, value)  # is_merge_key is always false in this case
+         elif isinstance(value, tuple):
+             (datatype, merge_key) = value
+             processed_field = Field(key, datatype, merge_key)
+         else:
+             raise TypeError(
+                 "The field must be an instance of the Field class, Datatype, or Tuple[Datatype, bool], where bool is whether the field is a merge key."
+             )
+         processed_field: Field = processed_field
+         # if len(self._fields) == 0 and not processed_field.is_merge_key:
+         #     raise TypeError("The first field set on a Schema must be a merge key.")
+
+         self._fields[processed_field.name] = processed_field
+
+     def __delitem__(self, key: str) -> None:
+         field = self._fields[key]
+         if field.is_merge_key:
+             raise ValueError("Cannot delete a merge key field")
+         del self._fields[key]
+
+     def __len__(self) -> int:
+         return len(self._fields)
+
+     def __iter__(self) -> Iterable[str]:
+         return iter(self._fields.keys())
+
+     def __hash__(self) -> int:
+         return hash(frozenset(self._fields.items()))
+
+     def __eq__(self, other) -> bool:
+         if isinstance(other, Schema):
+             return self._fields == other._fields
+         return False
+
+     # Has a spurious type check problem with @dataclass + asdict(): https://youtrack.jetbrains.com/issue/PY-76059/Incorrect-Type-warning-with-asdict-and-Dataclass
+     def to_dict(self) -> dict[str, list[dict[str, Field]]]:
+         return {"fields": [asdict(field) for field in self._fields.values()]}
+
+     def add_field(self, field: Field) -> None:
+         """Adds a Field object using its name as the key; raises ValueError if it already exists"""
+         if field.name in self._fields:
+             raise ValueError(
+                 f"Attempting to add a field with the same name as an existing field: {field.name}"
+             )
+         self[field.name] = field
+
+     def get_merge_keys(self) -> Iterable[str]:
+         """Return a list of all merge keys."""
+         return [field.name for field in self._fields.values() if field.is_merge_key]
+
+     def get_merge_key(self) -> str:
+         """Returns the single merge key if there is exactly one, or raises if not. Used for simple schemas w/ a single key"""
+         merge_keys = list(self.get_merge_keys())
+         if len(merge_keys) != 1:
+             raise ValueError(
+                 f"Schema must have exactly one merge key, but found {merge_keys}"
+             )
+         return merge_keys[0]
+
+     def merge(self, other: Schema) -> None:
+         """Merges another schema's fields into the current schema."""
+         if not other:
+             return
+         for name, field in other._fields.items():
+             if name in self._fields:
+                 if self._fields[name] != field:
+                     raise ValueError(
+                         f"Field '{name}' already exists in the current schema with a different definition"
+                     )
+             else:
+                 self.add_field(field)
+
+     def to_pyarrow(self) -> pa.Schema:
+         """
+         Convert the Schema to a PyArrow schema.
+
+         Returns:
+             pyarrow.Schema: A PyArrow schema representation of this Schema.
+         """
+         # TODO: Should we track merge_keys as it goes to/from pyarrow?
+         fields = []
+         for name, field in self._fields.items():
+             fields.append(pa.field(name, field.datatype.to_pyarrow()))
+         return pa.schema(fields)
+
+     def keys(self) -> Iterable[str]:
+         return self._fields.keys()
+
+     def values(self) -> Iterable[Field]:
+         return self._fields.values()
+
+     def items(self) -> Iterable[tuple[str, Field]]:
+         return self._fields.items()
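
As a quick illustration (a sketch, not part of the diff): constructing a Schema with a merge key and round-tripping it through pyarrow, using only the constructors shown above:

import pyarrow as pa
from deltacat.experimental.storage.rivulet import Schema
from deltacat.experimental.storage.rivulet.schema.datatype import Datatype

schema = Schema(
    [("id", Datatype.int64()), ("name", Datatype.string())],
    merge_keys=["id"],
)
assert schema.get_merge_key() == "id"
assert schema.to_pyarrow() == pa.schema([("id", pa.int64()), ("name", pa.string())])

# The same schema derived from an existing pyarrow schema:
schema2 = Schema.from_pyarrow(schema.to_pyarrow(), merge_keys="id")
assert schema == schema2
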
@@ -0,0 +1,40 @@
+ from typing import Protocol, Iterable, List, Union, Any, Dict
+
+ from deltacat.experimental.storage.rivulet.metastore.sst import SSTableRow
+ import pyarrow as pa
+
+ MEMTABLE_DATA = Union[Iterable[Dict[str, Any]], pa.Table]
+
+
+ class DataSerializer(Protocol):
+     """
+     Interface for writing data only.
+
+     As data is written, it must emit sufficient metadata to build an SSTable.
+     Each format will have a specific data writer (e.g. ParquetDataWriter)
+
+     TODO future improvements:
+     1. How does the data writer control how it chooses to write to existing files vs new files?
+        For now, we will not expose this configuration and always write each batch to
+        a new file
+     2. Related to 1, how should we expose URI(s) to write to? Probably DataWriter can
+        use FileProvider and needs to know relevant ids like task ID.
+     """
+
+     def flush_batch(self, sorted_records: MEMTABLE_DATA) -> List[SSTableRow]:
+         """
+         Flushes rows to a file and returns the metadata needed to build an SSTable
+
+         TODO future improvements
+         1. Finalize type for input records (instead of MvpRow)
+
+            Options could be:
+            (a) Something like Iceberg "StructLike" which allows flexible integrations without memcopy for row-oriented formats, e.g. can make Spark InternalRow structlike
+            (b) use arrow. We will probably use arrow for writing parquet, although
+                probably it isn't ideal for row-oriented formats
+         2. Keep in mind, most implementations of DataWriter will be written in rust.
+
+         :param sorted_records: Records sorted by key
+         :return: metadata used to build the SSTable
+         """
+         ...
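
To make the protocol contract concrete, here is a minimal, hypothetical conforming implementation (NoopDataSerializer is not part of the release; real serializers in this diff, such as the parquet and feather serializers, write the batch to a file and return its SSTable metadata):

from typing import List

from deltacat.experimental.storage.rivulet.metastore.sst import SSTableRow
from deltacat.experimental.storage.rivulet.serializer import MEMTABLE_DATA

class NoopDataSerializer:
    """Hypothetical DataSerializer that discards records; shown only to
    illustrate the flush_batch contract."""

    def flush_batch(self, sorted_records: MEMTABLE_DATA) -> List[SSTableRow]:
        # A real serializer would write sorted_records to a data file here
        # and return one SSTableRow per written file/block, describing its key range.
        return []
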
@@ -0,0 +1,46 @@
+ from __future__ import annotations
+
+ from deltacat.experimental.storage.rivulet.parquet.serializer import (
+     ParquetDataSerializer,
+ )
+ from deltacat.experimental.storage.rivulet import Schema
+ from deltacat.experimental.storage.rivulet.serializer import DataSerializer
+ from deltacat.experimental.storage.rivulet.fs.file_provider import FileProvider
+
+ from deltacat.experimental.storage.rivulet.feather.serializer import (
+     FeatherDataSerializer,
+ )
+
+
+ class DataSerializerFactory:
+     """
+     Simple factory class for getting the appropriate serializer given a schema.
+     TODO make this more modular/pluggable like DatasetReaderRegistrar.
+     This will be more challenging to make pluggable, because we should not rely on a simple 1:1 mapping of type to serializer.
+     The actual logic for determining how to serialize a given schema may be complex,
+     e.g.: if the schema contains datatype X, you must use serializer Y; otherwise, default to serializer Z.
+     """
+
+     @classmethod
+     def get_serializer(
+         cls,
+         schema: Schema,
+         file_provider: FileProvider,
+         user_provided_format: str | None = None,
+     ) -> DataSerializer:
+         if user_provided_format == "parquet":
+             return ParquetDataSerializer(file_provider, schema)
+         elif user_provided_format == "feather":
+             return FeatherDataSerializer(file_provider, schema)
+         elif user_provided_format is not None:
+             raise ValueError("Unsupported format. Must be 'parquet' or 'feather'.")
+
+         # Default engine logic. For now, if there is an image or binary field, use feather
+         has_binary_or_image = any(
+             field.datatype.type_name.startswith(("binary", "image"))
+             for field in schema.values()
+         )
+         if has_binary_or_image:
+             return FeatherDataSerializer(file_provider, schema)
+         else:
+             return ParquetDataSerializer(file_provider, schema)
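
Usage sketch (the file_provider below is a placeholder; constructing a real FileProvider depends on the dataset's filesystem wiring, which is outside this diff):

from deltacat.experimental.storage.rivulet import Schema
from deltacat.experimental.storage.rivulet.schema.datatype import Datatype
from deltacat.experimental.storage.rivulet.serializer_factory import DataSerializerFactory

schema = Schema(
    [("id", Datatype.int64()), ("img", Datatype.image("jpg"))],
    merge_keys=["id"],
)
# file_provider = ...  # hypothetical: obtained from the dataset's FileStore
serializer = DataSerializerFactory.get_serializer(schema, file_provider)
# -> FeatherDataSerializer, because the schema contains an image(...) field;
#    an all-scalar schema would default to ParquetDataSerializer.
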
@@ -0,0 +1,129 @@
+ from __future__ import annotations
+ from typing import Generic, List, Union, Iterable
+ from deltacat.storage.model.shard import T, Shard, ShardingStrategy
+ from deltacat.experimental.storage.rivulet.reader.dataset_metastore import (
+     DatasetMetastore,
+ )
+
+
+ class RangeShard(Shard, Generic[T]):
+     """
+     Represents a range-based shard with minimum and maximum keys.
+
+     param: min_key: The minimum key for the shard.
+     param: max_key: The maximum key for the shard.
+     """
+
+     def __init__(self, min_key: T, max_key: T):
+         self.min_key = min_key
+         self.max_key = max_key
+
+     def __repr__(self) -> str:
+         return f"Shard(type=range, min_key={self.min_key}, max_key={self.max_key})"
+
+     @staticmethod
+     def split(
+         global_min: Union[int, str], global_max: Union[int, str], num_shards: int
+     ) -> List[RangeShard]:
+         """
+         Splits a range into `num_shards` shards.
+         Currently supports splitting ranges of integers and strings.
+
+         Note: If global_min == global_max or num_shards <= 1, a single shard is
+         returned and num_shards is ignored.
+
+         :param global_min: The minimum key for the entire range (int or str).
+         :param global_max: The maximum key for the entire range (int or str).
+         :param num_shards: The number of shards to create.
+         :return: A list of RangeShard objects.
+         """
+         if global_min == global_max or num_shards <= 1:
+             return [RangeShard(global_min, global_max)]
+
+         # Determine which interpolation function to use based on the type of min/max
+         if isinstance(global_min, int) and isinstance(global_max, int):
+             interpolate = RangeShard._interpolate_numeric
+         elif isinstance(global_min, str) and isinstance(global_max, str):
+             interpolate = RangeShard._interpolate_str
+         else:
+             raise ValueError(
+                 "Unsupported combination of types for global_min and global_max."
+             )
+
+         shards: List[RangeShard] = []
+         for i in range(num_shards):
+             start = interpolate(global_min, global_max, i, num_shards)
+             end = interpolate(global_min, global_max, i + 1, num_shards)
+
+             # Advance the start past the previous shard's max key so ranges don't overlap.
+             if i > 0:
+                 if isinstance(start, int):
+                     start = shards[-1].max_key + 1
+                 elif isinstance(start, str):
+                     char_list = list(start)
+                     char_list[-1] = chr(ord(char_list[-1]) + 1)
+                     start = "".join(char_list)
+
+             shards.append(RangeShard(start, end))
+
+         return shards
+
+     @staticmethod
+     def _interpolate_numeric(start: int, end: int, step: int, total_steps: int) -> int:
+         """
+         Integer interpolation using integer (floor) division.
+
+         param: start (int): The starting number.
+         param: end (int): The ending number.
+         param: step (int): The current step in the interpolation (0-based).
+         param: total_steps (int): The total number of interpolation steps.
+
+         returns: int: The interpolated integer.
+         """
+         return start + (end - start) * step // total_steps
+
+     @staticmethod
+     def _interpolate_str(start: str, end: str, step: int, total_steps: int) -> str:
+         """
+         Interpolates between two strings lexicographically.
+
+         param: start (str): The starting string.
+         param: end (str): The ending string.
+         param: step (int): The current step in the interpolation (0-based).
+         param: total_steps (int): The total number of interpolation steps.
+
+         returns: str: The interpolated string.
+         """
+         max_len = max(len(start), len(end))
+
+         # Pad strings to the same length with spaces (smallest lexicographical character).
+         start = start.ljust(max_len, " ")
+         end = end.ljust(max_len, " ")
+
+         # Interpolate character by character based on ordinal values.
+         interpolated_chars = [
+             chr(round(ord(s) + (ord(e) - ord(s)) * step / total_steps))
+             for s, e in zip(start, end)
+         ]
+
+         return "".join(interpolated_chars).rstrip()
+
+
+ class RangeShardingStrategy(ShardingStrategy, Generic[T]):
+     """
+     Implements a sharding strategy to divide a range of keys into shards.
+
+     method: shards: Generates a list of RangeShard objects based on the global range.
+     """
+
+     def shards(
+         self, num_shards: int, metastore: DatasetMetastore
+     ) -> Iterable[RangeShard[T]]:
+         """
+         Divides the global range of keys into evenly sized shards.
+
+         param: num_shards: The number of shards to divide the range into.
+         param: metastore: The dataset metastore providing access to manifests.
+         returns: A list of RangeShard objects representing the divided range.
+         """
+         min_key, max_key = metastore.get_min_max_keys()
+         return RangeShard.split(min_key, max_key, num_shards)
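
RangeShard.split is self-contained, so its behavior is easy to check directly (a sketch; the values follow from the floor-division interpolation above):

from deltacat.experimental.storage.rivulet.shard.range_shard import RangeShard

# Split the integer key space [0, 100] into 4 non-overlapping shards.
shards = RangeShard.split(0, 100, 4)
# [Shard(type=range, min_key=0,  max_key=25),
#  Shard(type=range, min_key=26, max_key=50),
#  Shard(type=range, min_key=51, max_key=75),
#  Shard(type=range, min_key=76, max_key=100)]

# String keys are interpolated lexicographically, character by character.
shards = RangeShard.split("a", "z", 2)  # -> [("a", "n"), ("o", "z")]
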
@@ -0,0 +1,29 @@
+ from typing import Protocol, Iterable, Union, Any, Dict
+ import pyarrow as pa
+
+ DATA = Union[Iterable[Dict[str, Any]], Iterable[pa.RecordBatch], pa.RecordBatch]
+
+
+ class DatasetWriter(Protocol):
+     """
+     Top-level interface for writing records to a rivulet dataset. This is used by dataset.py
+
+     This writes both data AND metadata (SSTs, manifests).
+
+     The general paradigm is that records are written iteratively through write or write_batch.
+     At configurable intervals (based on record count or size), data and metadata are flushed.
+
+     When the user either closes the dataset writer or calls commit(), all buffered data and
+     metadata are flushed.
+     """
+
+     def write(self, record: DATA) -> None:
+         ...
+
+     def flush(self) -> str:
+         """
+         Explicitly flush any data and metadata and commit to the dataset
+
+         This is a blocking operation
+
+         :return: URI of manifest written for commit
+         """
+         ...
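
A sketch of the intended write loop (the writer below is hypothetical; concrete implementations, such as the one in memtable_dataset_writer.py in this diff, implement the protocol and are normally obtained from a Dataset rather than constructed directly):

# `writer` is a hypothetical DatasetWriter instance.
writer.write({"id": 1, "name": "a"})        # buffer a single record dict
writer.write([{"id": 2, "name": "b"},       # or an iterable of records /
              {"id": 3, "name": "c"}])      # pyarrow RecordBatches
manifest_uri = writer.flush()               # blocks until data, SSTs, and the
                                            # manifest are durably committed
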