deltacat 1.1.38__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (367) hide show
  1. deltacat/__init__.py +150 -12
  2. deltacat/annotations.py +36 -0
  3. deltacat/api.py +578 -0
  4. deltacat/aws/constants.py +0 -23
  5. deltacat/aws/s3u.py +4 -631
  6. deltacat/benchmarking/benchmark_engine.py +84 -0
  7. deltacat/benchmarking/benchmark_report.py +86 -0
  8. deltacat/benchmarking/benchmark_suite.py +11 -0
  9. deltacat/benchmarking/conftest.py +22 -19
  10. deltacat/benchmarking/data/random_row_generator.py +94 -0
  11. deltacat/benchmarking/data/row_generator.py +10 -0
  12. deltacat/benchmarking/test_benchmark_pipeline.py +108 -0
  13. deltacat/catalog/__init__.py +73 -0
  14. deltacat/catalog/delegate.py +615 -140
  15. deltacat/catalog/interface.py +404 -81
  16. deltacat/catalog/main/impl.py +2882 -0
  17. deltacat/catalog/model/catalog.py +348 -46
  18. deltacat/catalog/model/properties.py +155 -0
  19. deltacat/catalog/model/table_definition.py +32 -1
  20. deltacat/compute/__init__.py +14 -0
  21. deltacat/compute/compactor/compaction_session.py +97 -75
  22. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  23. deltacat/compute/compactor/model/compaction_session_audit_info.py +23 -30
  24. deltacat/compute/compactor/model/delta_annotated.py +3 -3
  25. deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
  26. deltacat/compute/compactor/model/delta_file_locator.py +3 -1
  27. deltacat/compute/compactor/model/round_completion_info.py +19 -9
  28. deltacat/compute/compactor/model/table_object_store.py +3 -2
  29. deltacat/compute/compactor/repartition_session.py +9 -22
  30. deltacat/compute/compactor/steps/dedupe.py +11 -4
  31. deltacat/compute/compactor/steps/hash_bucket.py +6 -6
  32. deltacat/compute/compactor/steps/materialize.py +15 -9
  33. deltacat/compute/compactor/steps/repartition.py +12 -11
  34. deltacat/compute/compactor/utils/io.py +7 -6
  35. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  36. deltacat/compute/compactor/utils/sort_key.py +9 -2
  37. deltacat/compute/compactor/utils/system_columns.py +3 -1
  38. deltacat/compute/compactor_v2/compaction_session.py +13 -14
  39. deltacat/compute/compactor_v2/deletes/utils.py +3 -3
  40. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  41. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  42. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  43. deltacat/compute/compactor_v2/model/merge_input.py +28 -9
  44. deltacat/compute/compactor_v2/private/compaction_utils.py +171 -73
  45. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  46. deltacat/compute/compactor_v2/steps/merge.py +156 -53
  47. deltacat/compute/compactor_v2/utils/content_type_params.py +17 -6
  48. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  49. deltacat/compute/compactor_v2/utils/io.py +10 -3
  50. deltacat/compute/compactor_v2/utils/merge.py +14 -2
  51. deltacat/compute/compactor_v2/utils/task_options.py +2 -10
  52. deltacat/compute/converter/constants.py +9 -0
  53. deltacat/compute/converter/converter_session.py +298 -0
  54. deltacat/compute/converter/model/convert_input.py +96 -0
  55. deltacat/compute/converter/model/convert_input_files.py +78 -0
  56. deltacat/compute/converter/model/convert_result.py +80 -0
  57. deltacat/compute/converter/model/converter_session_params.py +144 -0
  58. deltacat/compute/converter/pyiceberg/catalog.py +78 -0
  59. deltacat/compute/converter/pyiceberg/overrides.py +263 -0
  60. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +299 -0
  61. deltacat/compute/converter/steps/convert.py +366 -0
  62. deltacat/compute/converter/steps/dedupe.py +94 -0
  63. deltacat/compute/converter/utils/__init__.py +0 -0
  64. deltacat/compute/converter/utils/convert_task_options.py +132 -0
  65. deltacat/compute/converter/utils/converter_session_utils.py +175 -0
  66. deltacat/compute/converter/utils/iceberg_columns.py +87 -0
  67. deltacat/compute/converter/utils/io.py +203 -0
  68. deltacat/compute/converter/utils/s3u.py +148 -0
  69. deltacat/compute/janitor.py +205 -0
  70. deltacat/compute/jobs/__init__.py +0 -0
  71. deltacat/compute/jobs/client.py +417 -0
  72. deltacat/compute/resource_estimation/delta.py +11 -1
  73. deltacat/constants.py +90 -1
  74. deltacat/docs/__init__.py +0 -0
  75. deltacat/docs/autogen/__init__.py +0 -0
  76. deltacat/docs/autogen/schema/__init__.py +0 -0
  77. deltacat/docs/autogen/schema/inference/__init__.py +0 -0
  78. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  79. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  80. deltacat/env.py +61 -0
  81. deltacat/examples/__init__.py +0 -0
  82. deltacat/examples/basic_logging.py +101 -0
  83. deltacat/examples/compactor/__init__.py +0 -0
  84. deltacat/examples/compactor/aws/__init__.py +1 -0
  85. deltacat/examples/compactor/bootstrap.py +863 -0
  86. deltacat/examples/compactor/compactor.py +373 -0
  87. deltacat/examples/compactor/explorer.py +473 -0
  88. deltacat/examples/compactor/gcp/__init__.py +1 -0
  89. deltacat/examples/compactor/job_runner.py +439 -0
  90. deltacat/examples/compactor/utils/__init__.py +1 -0
  91. deltacat/examples/compactor/utils/common.py +261 -0
  92. deltacat/examples/experimental/__init__.py +0 -0
  93. deltacat/examples/experimental/iceberg/__init__.py +0 -0
  94. deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
  95. deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
  96. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  97. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  98. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  99. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  100. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  101. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  102. deltacat/examples/experimental/iceberg/iceberg_bucket_writer.py +184 -0
  103. deltacat/examples/experimental/iceberg/iceberg_reader.py +147 -0
  104. deltacat/examples/hello_world.py +29 -0
  105. deltacat/examples/indexer/__init__.py +0 -0
  106. deltacat/examples/indexer/aws/__init__.py +0 -0
  107. deltacat/examples/indexer/gcp/__init__.py +0 -0
  108. deltacat/examples/indexer/indexer.py +163 -0
  109. deltacat/examples/indexer/job_runner.py +198 -0
  110. deltacat/exceptions.py +116 -12
  111. deltacat/experimental/__init__.py +0 -0
  112. deltacat/experimental/catalog/__init__.py +0 -0
  113. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  114. deltacat/experimental/catalog/iceberg/iceberg_catalog_config.py +26 -0
  115. deltacat/experimental/catalog/iceberg/impl.py +399 -0
  116. deltacat/experimental/catalog/iceberg/overrides.py +72 -0
  117. deltacat/experimental/compatibility/__init__.py +0 -0
  118. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  119. deltacat/experimental/converter_agent/__init__.py +0 -0
  120. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  121. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  122. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  123. deltacat/experimental/daft/__init__.py +4 -0
  124. deltacat/experimental/daft/daft_catalog.py +229 -0
  125. deltacat/experimental/storage/__init__.py +0 -0
  126. deltacat/experimental/storage/iceberg/__init__.py +0 -0
  127. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  128. deltacat/experimental/storage/iceberg/impl.py +739 -0
  129. deltacat/experimental/storage/iceberg/model.py +713 -0
  130. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  131. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  132. deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
  133. deltacat/experimental/storage/rivulet/arrow/serializer.py +78 -0
  134. deltacat/experimental/storage/rivulet/dataset.py +745 -0
  135. deltacat/experimental/storage/rivulet/dataset_executor.py +79 -0
  136. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  137. deltacat/experimental/storage/rivulet/feather/file_reader.py +138 -0
  138. deltacat/experimental/storage/rivulet/feather/serializer.py +35 -0
  139. deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
  140. deltacat/experimental/storage/rivulet/fs/file_provider.py +105 -0
  141. deltacat/experimental/storage/rivulet/fs/file_store.py +130 -0
  142. deltacat/experimental/storage/rivulet/fs/input_file.py +76 -0
  143. deltacat/experimental/storage/rivulet/fs/output_file.py +86 -0
  144. deltacat/experimental/storage/rivulet/logical_plan.py +105 -0
  145. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  146. deltacat/experimental/storage/rivulet/metastore/delta.py +188 -0
  147. deltacat/experimental/storage/rivulet/metastore/json_sst.py +105 -0
  148. deltacat/experimental/storage/rivulet/metastore/sst.py +82 -0
  149. deltacat/experimental/storage/rivulet/metastore/sst_interval_tree.py +260 -0
  150. deltacat/experimental/storage/rivulet/mvp/Table.py +101 -0
  151. deltacat/experimental/storage/rivulet/mvp/__init__.py +5 -0
  152. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  153. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  154. deltacat/experimental/storage/rivulet/parquet/file_reader.py +129 -0
  155. deltacat/experimental/storage/rivulet/parquet/serializer.py +37 -0
  156. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  157. deltacat/experimental/storage/rivulet/reader/block_scanner.py +389 -0
  158. deltacat/experimental/storage/rivulet/reader/data_reader.py +136 -0
  159. deltacat/experimental/storage/rivulet/reader/data_scan.py +65 -0
  160. deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +179 -0
  161. deltacat/experimental/storage/rivulet/reader/dataset_reader.py +158 -0
  162. deltacat/experimental/storage/rivulet/reader/pyarrow_data_reader.py +124 -0
  163. deltacat/experimental/storage/rivulet/reader/query_expression.py +99 -0
  164. deltacat/experimental/storage/rivulet/reader/reader_type_registrar.py +84 -0
  165. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  166. deltacat/experimental/storage/rivulet/schema/datatype.py +128 -0
  167. deltacat/experimental/storage/rivulet/schema/schema.py +251 -0
  168. deltacat/experimental/storage/rivulet/serializer.py +40 -0
  169. deltacat/experimental/storage/rivulet/serializer_factory.py +46 -0
  170. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  171. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  172. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  173. deltacat/experimental/storage/rivulet/writer/dataset_writer.py +29 -0
  174. deltacat/experimental/storage/rivulet/writer/memtable_dataset_writer.py +305 -0
  175. deltacat/io/__init__.py +13 -0
  176. deltacat/io/dataset/__init__.py +0 -0
  177. deltacat/io/dataset/deltacat_dataset.py +91 -0
  178. deltacat/io/datasink/__init__.py +0 -0
  179. deltacat/io/datasink/deltacat_datasink.py +207 -0
  180. deltacat/io/datasource/__init__.py +0 -0
  181. deltacat/io/datasource/deltacat_datasource.py +579 -0
  182. deltacat/io/reader/__init__.py +0 -0
  183. deltacat/io/reader/deltacat_read_api.py +172 -0
  184. deltacat/logs.py +4 -1
  185. deltacat/storage/__init__.py +138 -28
  186. deltacat/storage/interface.py +260 -155
  187. deltacat/storage/main/__init__.py +0 -0
  188. deltacat/storage/main/impl.py +3030 -0
  189. deltacat/storage/model/delta.py +142 -71
  190. deltacat/storage/model/expression/__init__.py +47 -0
  191. deltacat/storage/model/expression/expression.py +656 -0
  192. deltacat/storage/model/expression/visitor.py +248 -0
  193. deltacat/storage/model/interop.py +24 -0
  194. deltacat/storage/model/list_result.py +8 -0
  195. deltacat/storage/model/locator.py +93 -9
  196. deltacat/storage/model/manifest.py +643 -0
  197. deltacat/storage/model/metafile.py +1421 -0
  198. deltacat/storage/model/namespace.py +41 -18
  199. deltacat/storage/model/partition.py +443 -43
  200. deltacat/storage/model/scan/__init__.py +0 -0
  201. deltacat/storage/model/scan/push_down.py +46 -0
  202. deltacat/storage/model/scan/scan_plan.py +10 -0
  203. deltacat/storage/model/scan/scan_task.py +34 -0
  204. deltacat/storage/model/schema.py +3160 -0
  205. deltacat/storage/model/shard.py +51 -0
  206. deltacat/storage/model/sort_key.py +210 -13
  207. deltacat/storage/model/stream.py +215 -80
  208. deltacat/storage/model/table.py +134 -29
  209. deltacat/storage/model/table_version.py +333 -46
  210. deltacat/storage/model/transaction.py +1733 -0
  211. deltacat/storage/model/transform.py +274 -58
  212. deltacat/storage/model/types.py +138 -16
  213. deltacat/storage/util/__init__.py +0 -0
  214. deltacat/storage/util/scan_planner.py +26 -0
  215. deltacat/tests/_io/__init__.py +1 -0
  216. deltacat/tests/_io/reader/__init__.py +0 -0
  217. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  218. deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +8 -4
  219. deltacat/tests/aws/test_s3u.py +2 -31
  220. deltacat/tests/catalog/data/__init__.py +0 -0
  221. deltacat/tests/catalog/main/__init__.py +0 -0
  222. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  223. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  224. deltacat/tests/catalog/model/__init__.py +0 -0
  225. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  226. deltacat/tests/catalog/test_catalogs.py +321 -0
  227. deltacat/tests/catalog/test_default_catalog_impl.py +12154 -66
  228. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
  229. deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
  230. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
  231. deltacat/tests/compute/compact_partition_test_cases.py +23 -30
  232. deltacat/tests/compute/compactor/steps/test_repartition.py +14 -14
  233. deltacat/tests/compute/compactor/utils/test_io.py +125 -123
  234. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  235. deltacat/tests/compute/compactor_v2/test_compaction_session.py +387 -830
  236. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +70 -57
  237. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +0 -3
  238. deltacat/tests/compute/conftest.py +39 -0
  239. deltacat/tests/compute/converter/__init__.py +0 -0
  240. deltacat/tests/compute/converter/conftest.py +80 -0
  241. deltacat/tests/compute/converter/test_convert_session.py +826 -0
  242. deltacat/tests/compute/converter/utils.py +132 -0
  243. deltacat/tests/compute/resource_estimation/test_delta.py +88 -104
  244. deltacat/tests/compute/test_compact_partition_incremental.py +91 -98
  245. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +79 -97
  246. deltacat/tests/compute/test_compact_partition_params.py +16 -11
  247. deltacat/tests/compute/test_compact_partition_rebase.py +63 -93
  248. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +249 -220
  249. deltacat/tests/compute/test_janitor.py +236 -0
  250. deltacat/tests/compute/test_util_common.py +726 -46
  251. deltacat/tests/compute/test_util_constant.py +0 -1
  252. deltacat/tests/conftest.py +25 -0
  253. deltacat/tests/daft/__init__.py +0 -0
  254. deltacat/tests/daft/test_model.py +97 -0
  255. deltacat/tests/experimental/__init__.py +1 -0
  256. deltacat/tests/experimental/catalog/__init__.py +0 -0
  257. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  258. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  259. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  260. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  261. deltacat/tests/experimental/daft/__init__.py +0 -0
  262. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  263. deltacat/tests/experimental/storage/__init__.py +0 -0
  264. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  265. deltacat/tests/experimental/storage/rivulet/conftest.py +149 -0
  266. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  267. deltacat/tests/experimental/storage/rivulet/fs/test_file_location_provider.py +94 -0
  268. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  269. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  270. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  271. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  272. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  273. deltacat/tests/experimental/storage/rivulet/schema/test_schema.py +241 -0
  274. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  275. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  276. deltacat/tests/experimental/storage/rivulet/test_dataset.py +408 -0
  277. deltacat/tests/experimental/storage/rivulet/test_manifest.py +67 -0
  278. deltacat/tests/experimental/storage/rivulet/test_sst_interval_tree.py +232 -0
  279. deltacat/tests/experimental/storage/rivulet/test_utils.py +124 -0
  280. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  281. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_write_then_read.py +343 -0
  282. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_writer.py +79 -0
  283. deltacat/tests/experimental/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
  284. deltacat/tests/storage/__init__.py +0 -0
  285. deltacat/tests/storage/main/__init__.py +0 -0
  286. deltacat/tests/storage/main/test_main_storage.py +8204 -0
  287. deltacat/tests/storage/model/__init__.py +0 -0
  288. deltacat/tests/storage/model/test_delete_parameters.py +21 -0
  289. deltacat/tests/storage/model/test_expression.py +327 -0
  290. deltacat/tests/storage/model/test_manifest.py +129 -0
  291. deltacat/tests/storage/model/test_metafile_io.py +2440 -0
  292. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  293. deltacat/tests/storage/model/test_schema.py +479 -0
  294. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  295. deltacat/tests/storage/model/test_shard.py +24 -0
  296. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  297. deltacat/tests/storage/model/test_table_version.py +110 -0
  298. deltacat/tests/storage/model/test_transaction.py +653 -0
  299. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  300. deltacat/tests/test_deltacat_api.py +1064 -0
  301. deltacat/tests/test_exceptions.py +9 -5
  302. deltacat/tests/test_utils/filesystem.py +14 -0
  303. deltacat/tests/test_utils/message_pack_utils.py +54 -0
  304. deltacat/tests/test_utils/pyarrow.py +50 -26
  305. deltacat/tests/test_utils/storage.py +256 -4
  306. deltacat/tests/types/__init__.py +0 -0
  307. deltacat/tests/types/test_tables.py +104 -0
  308. deltacat/tests/utils/exceptions.py +22 -0
  309. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  310. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  311. deltacat/tests/utils/test_daft.py +124 -34
  312. deltacat/tests/utils/test_numpy.py +1193 -0
  313. deltacat/tests/utils/test_pandas.py +1106 -0
  314. deltacat/tests/utils/test_polars.py +1040 -0
  315. deltacat/tests/utils/test_pyarrow.py +1107 -258
  316. deltacat/types/media.py +345 -37
  317. deltacat/types/partial_download.py +1 -1
  318. deltacat/types/tables.py +2345 -47
  319. deltacat/utils/arguments.py +33 -1
  320. deltacat/utils/daft.py +824 -40
  321. deltacat/utils/export.py +61 -0
  322. deltacat/utils/filesystem.py +450 -0
  323. deltacat/utils/metafile_locator.py +74 -0
  324. deltacat/utils/numpy.py +118 -26
  325. deltacat/utils/pandas.py +577 -48
  326. deltacat/utils/polars.py +759 -0
  327. deltacat/utils/pyarrow.py +1212 -178
  328. deltacat/utils/ray_utils/concurrency.py +1 -1
  329. deltacat/utils/ray_utils/dataset.py +101 -10
  330. deltacat/utils/ray_utils/runtime.py +56 -4
  331. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  332. deltacat/utils/url.py +1325 -0
  333. deltacat-2.0.0.dist-info/METADATA +1163 -0
  334. deltacat-2.0.0.dist-info/RECORD +439 -0
  335. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
  336. deltacat/aws/redshift/__init__.py +0 -19
  337. deltacat/aws/redshift/model/manifest.py +0 -394
  338. deltacat/catalog/default_catalog_impl/__init__.py +0 -369
  339. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  340. deltacat/compute/merge_on_read/__init__.py +0 -4
  341. deltacat/compute/merge_on_read/daft.py +0 -40
  342. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  343. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  344. deltacat/io/dataset.py +0 -73
  345. deltacat/io/read_api.py +0 -143
  346. deltacat/storage/model/delete_parameters.py +0 -40
  347. deltacat/storage/model/partition_spec.py +0 -71
  348. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  349. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -397
  350. deltacat/tests/local_deltacat_storage/__init__.py +0 -1262
  351. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  352. deltacat/utils/s3fs.py +0 -21
  353. deltacat-1.1.38.dist-info/METADATA +0 -64
  354. deltacat-1.1.38.dist-info/RECORD +0 -219
  355. /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
  356. /deltacat/{compute/merge_on_read/model → catalog/main}/__init__.py +0 -0
  357. /deltacat/compute/{merge_on_read/utils → converter}/__init__.py +0 -0
  358. /deltacat/{io/aws → compute/converter/model}/__init__.py +0 -0
  359. /deltacat/{io/aws/redshift → compute/converter/pyiceberg}/__init__.py +0 -0
  360. /deltacat/{tests/io → compute/converter/steps}/__init__.py +0 -0
  361. /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
  362. /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
  363. /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
  364. /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
  365. /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
  366. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
  367. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,389 @@
1
+ import heapq
2
+ import logging
3
+
4
+ from collections import defaultdict
5
+ from typing import (
6
+ Generator,
7
+ Dict,
8
+ Set,
9
+ Type,
10
+ TypeVar,
11
+ NamedTuple,
12
+ Any,
13
+ List,
14
+ Generic,
15
+ AbstractSet,
16
+ )
17
+
18
+ from deltacat.experimental.storage.rivulet.metastore.delta import DeltaContext
19
+ from deltacat.experimental.storage.rivulet.metastore.sst import SSTableRow
20
+ from deltacat.experimental.storage.rivulet.metastore.sst_interval_tree import (
21
+ OrderedBlockGroups,
22
+ BlockGroup,
23
+ Block,
24
+ )
25
+ from deltacat.experimental.storage.rivulet.reader.data_reader import (
26
+ RowAndKey,
27
+ FileReader,
28
+ )
29
+ from deltacat.experimental.storage.rivulet.reader.dataset_metastore import (
30
+ DatasetMetastore,
31
+ )
32
+ from deltacat.experimental.storage.rivulet.reader.pyarrow_data_reader import (
33
+ ArrowDataReader,
34
+ )
35
+ from deltacat.experimental.storage.rivulet.reader.query_expression import (
36
+ QueryExpression,
37
+ )
38
+ from deltacat.experimental.storage.rivulet.reader.reader_type_registrar import (
39
+ FileReaderRegistrar,
40
+ )
41
+ from deltacat.experimental.storage.rivulet import Schema
42
+ from deltacat import logs
43
+
44
# Module-level logger, routed through deltacat's logger configuration helper.
logger = logs.configure_deltacat_logger(logging.getLogger(__name__))

# Type variable for the record representation produced by file readers.
FILE_FORMAT = TypeVar("FILE_FORMAT")

# Type variable for the in-memory representation yielded to scan() callers.
MEMORY_FORMAT = TypeVar("MEMORY_FORMAT")
48
+
49
+
50
class FileReaderWithContext(NamedTuple):
    """Pair an open file reader with the delta context of the file it reads."""

    # Reader positioned over one file's records.
    reader: FileReader[FILE_FORMAT]
    # Delta context for that file; deduplication in this module consults its
    # ``level``, ``stream_position`` and ``schema`` attributes.
    context: DeltaContext
53
+
54
+
55
class ZipperMergeHeapRecord(NamedTuple):
    """
    Entry placed on the heap while zipper merging.

    The ordering operators (``<``, ``<=``, ``>``, ``>=``) are overridden to
    compare ``key`` only, so entries can be ordered on a heap by key without
    ever comparing ``data`` or ``reader``.

    Equality is NOT overridden: ``==`` still performs the default tuple
    comparison over all three fields. Code that needs "same key" semantics
    must compare the ``key`` fields directly.
    """

    # Key of the row; the only field that participates in ordering.
    key: Any
    # Row payload as produced by the file reader.
    data: FILE_FORMAT
    # Reader (plus delta context) the row was read from.
    reader: FileReaderWithContext

    def __lt__(self, other: "ZipperMergeHeapRecord"):
        """Order strictly before ``other`` when this key is smaller."""
        return self.key < other.key

    def __le__(self, other: "ZipperMergeHeapRecord"):
        """Order before or level with ``other`` based on key alone."""
        return self.key <= other.key

    def __gt__(self, other: "ZipperMergeHeapRecord"):
        """Order strictly after ``other`` when this key is larger."""
        return self.key > other.key

    def __ge__(self, other: "ZipperMergeHeapRecord"):
        """Order after or level with ``other`` based on key alone."""
        return self.key >= other.key
78
+
79
+
80
+ class ZipperBlockScanExecutor(Generic[MEMORY_FORMAT]):
81
+ """
82
+ Class for managing a zipper scan across multiple field groups. This class is only ever called inside the higher level BlockScanner class
83
+
84
+ It is factored into a dedicated class because of the complexity and state management of
85
+ zipper merging
86
+ """
87
+
88
def __init__(
    self,
    result_schema: Schema,
    deserialize_to: Type[MEMORY_FORMAT],
    ordered_block_groups: OrderedBlockGroups,
    query: QueryExpression[Any],
    metastore: DatasetMetastore,
    file_readers: Dict[str, FileReader],
):
    """
    Store the inputs of a zipper scan and start with no open file readers.

    :param result_schema: schema of the scan result; its merge key is passed
        to the record join in scan()
    :param deserialize_to: in-memory type that joined records are
        deserialized to
    :param ordered_block_groups: block groups to traverse; scan() walks its
        ``block_groups`` in order
    :param query: query expression used to filter keys during the merge
    :param metastore: dataset metastore (stored as-is; presumably used when
        opening readers - TODO confirm)
    :param file_readers: file readers keyed by string (stored as-is;
        presumably keyed by file extension/format - TODO confirm)
    """
    # Keeps track of block file readers that are open, across block group
    # boundaries. E.g., if Block Group 1 has blocks [1,2,3] and BlockGroup2
    # has blocks [2,3], we will start reading blocks [2,3] and need to re-use
    # the open iterator while reading BlockGroup2.
    self._open_file_readers: Dict[SSTableRow, FileReaderWithContext] = {}

    # What to produce.
    self.result_schema = result_schema
    self.deserialize_to = deserialize_to

    # What to read, and how to filter it.
    self.ordered_block_groups = ordered_block_groups
    self.query = query

    # Collaborators retained for use by other methods of this class.
    self.metastore = metastore
    self.file_readers = file_readers
110
+
111
def scan(self) -> Generator[MEMORY_FORMAT, None, None]:
    """
    Run an N-way zipper merge across the field groups of every block group.

    Each field group contributes the set of blocks that fall inside the
    current block group's key range. A simplified example:

        FieldGroup1: [BlockA, BlockB]
        FieldGroup2: [BlockC]
        BlockA: keys 1,3,9,10
        BlockB: keys 2,4,5,6,7,8
        BlockC: keys 1-10

    For each block group the steps are:
        1. Open a reader per block, giving an iterator over sorted keys
        2. Seed a heap with the next eligible record of every open reader
        3. Pop records from the heap while their keys are equal and merge
           them column-wise (at most N records per key)
        4. Repeat until all blocks are read OR the query's key range is
           exceeded

    :return: generator of merged records deserialized to ``deserialize_to``
    """
    for block_group in self.ordered_block_groups.block_groups:

        logger.debug(f"Starting scan of block group {block_group}")

        # Union of the blocks of every field group in this block group.
        blocks_to_read: Set[Block] = set().union(
            *block_group.field_group_to_blocks.values()
        )

        # Open readers for those blocks. Presumably this registers them in
        # self._open_file_readers (the helper is not visible in this chunk).
        self.__open_file_readers(blocks_to_read)

        # Seed a fresh heap with one record per currently open reader. Only
        # the reader contexts are needed here, not the keys they are stored
        # under.
        heap: List[ZipperMergeHeapRecord] = []
        for open_reader in self._open_file_readers.values():
            self.__push_next_row_back_to_heap(block_group, open_reader, heap)

        # Every item produced by the merge is the list of heap records that
        # share one key; dedupe it, then hand the payloads to the deserializer.
        same_key_batches = self.__zipper_merge_sorted_records(heap, block_group)
        for same_key_records in same_key_batches:
            payloads = [
                record.data for record in self._dedupe_records(same_key_records)
            ]
            # TODO (multi format support) we need to handle joining across data readers in the future
            # For now, assume all data readers MUST read to Arrow intermediate format
            joined = ArrowDataReader().join_deserialize_records(
                payloads, self.deserialize_to, self.result_schema.get_merge_key()
            )
            for result in joined:
                yield result
161
+
162
def _dedupe_records(
    self, records: List[ZipperMergeHeapRecord]
) -> List[ZipperMergeHeapRecord]:
    """Deduplicate records that share a key, keeping one record per schema.

    Records are grouped by the schema of the delta they came from, and a
    single winner is kept per group using these rules of precedence:

    1. Levels with lower numbers take precedence over levels with higher
       numbers (L0 is preferred over L1)
    2. Newer stream positions take precedence over older stream positions

    The winners are returned sorted by ascending precedence, i.e. the most
    preferred record comes last (presumably so that it wins for overlapping
    fields when the records are joined - TODO confirm against the join).

    Undefined Behavior:

    - Duplicate records within files from the same manifest (either in the
      same or across data files)

    TODO: allow for the definition of a 'dedupe' column to break ties.

    :param records: heap records that all carry the same key
    :return: one record per distinct schema, lowest precedence first
    """

    def precedence(record):
        # Larger tuple == more preferred: negating the level makes lower
        # levels rank higher, then newer stream positions break ties.
        context = record.reader.context
        return (-context.level, context.stream_position)

    # Bucket the records by schema, keeping first-seen order of the schemas.
    records_by_schema: Dict[Schema, List[ZipperMergeHeapRecord]] = {}
    for record in records:
        records_by_schema.setdefault(record.reader.context.schema, []).append(
            record
        )

    # Keep the single most preferred record of every schema.
    winners = [
        max(candidates, key=precedence)
        for candidates in records_by_schema.values()
    ]
    # Sort one last time across schemas (in case there's overlapping fields)
    winners.sort(key=precedence)
    return winners
194
+
195
def __zipper_merge_sorted_records(
    self, record_heap: List[ZipperMergeHeapRecord], block_group: BlockGroup
) -> Generator[List[ZipperMergeHeapRecord], None, None]:
    """
    Continually pop from heap until heap empty OR block range exceeded. Generate "zipper merge" of records

    Algorithm is:
    (1) Pop lowest element from heap. Includes pointer to the iterator it came from.
        Push next largest element from that generator back onto heap
    (2) Buffer records of same key and peek/pop the heap as long as there is a key match
        For any record popped, push next largest element from generator back onto heap
    (3) Yield the buffered records; the caller merges them via the Data Reader

    This solution maintains the following invariants:
    (1) the heap will have at most N records, where N=total blocks in BlockGroup
    (2) the heap has the N smallest records globally
    (3) any data that needs to be merged for a given key exists in the heap

    :param record_heap: seeded heap of ZipperMergeHeapRecords. It is consumed
        (mutated in place) by this generator; an empty heap yields nothing.
    :param block_group: block group being traversed
    :return: generator of merged records. Note this is a list not a set to not require hash support
    :raises RuntimeError: if a key matching the query is found on the heap
        but lies outside the block group's key range
    """
    # Keep iterating until heap is empty or key range is exceeded
    while record_heap:
        curr_heap_record = heapq.heappop(record_heap)
        curr_pk = curr_heap_record.key

        # A record that does not match the query is dropped and its reader is
        # NOT advanced, so that reader contributes nothing further to this
        # block group. NOTE(review): this is only safe if a non-matching key
        # here implies all later keys of the same reader are non-matching too
        # (e.g. the key is past the end of the query range) - confirm against
        # QueryExpression.matches_query.
        if not self.query.matches_query(curr_pk):
            continue

        # Sanity check - assert that key we are looking at is in block group's range
        if not block_group.key_in_range(curr_pk):
            # The trailing space keeps the two adjacent f-strings from being
            # glued together as "...heapfor block group...".
            raise RuntimeError(
                f"Did not expect to find key {curr_pk} on zipper merge heap "
                f"for block group {block_group}"
            )

        # Find all records to be merged by continuing to pop heap
        merged_by_pk = [curr_heap_record]
        # For the current record itself - push next row back to heap
        self.__push_next_row_back_to_heap(
            block_group, curr_heap_record.reader, record_heap
        )
        # For the rest of the heap elements - peek/pop as long as they equal key.
        # record_heap[0] is the heap's peek operation. Keys are compared
        # directly because record equality (==) is the default whole-tuple
        # comparison rather than a key comparison.
        while record_heap and record_heap[0].key == curr_pk:
            merge_heap_record: ZipperMergeHeapRecord = heapq.heappop(record_heap)
            merged_by_pk.append(merge_heap_record)
            self.__push_next_row_back_to_heap(
                block_group, merge_heap_record.reader, record_heap
            )
        yield merged_by_pk
250
+
251
def __push_next_row_back_to_heap(
    self,
    block_group: BlockGroup,
    row_context: FileReaderWithContext,
    record_heap: List[ZipperMergeHeapRecord],
):
    """
    This is a helper function for __zipper_merge_sorted_records and for scan().

    Given a file reader, it will next() records until it finds the next record that is not below
    the block group's range or the current query's range. If that record matches the query and
    is within the block group's range, it is consumed and pushed onto the heap.

    Sometimes we end up needing to seek into the middle of a block because the key range of a query starts
    in the middle of the block. For example, if the block has keys range [0,100],
    and the query is for keys [50-100], we need to seek to the first key in the block that is >= 50

    TODO better support for seeking within block (rather than O(N) iteration)

    :param block_group: block group currently being traversed
    :param row_context: wrapper holding the file reader to advance; it is stored on the pushed heap record
    :param record_heap: heap to push the next eligible record onto (mutated in place)
    """
    file_reader = row_context.reader

    # Throw away rows whose key is below the range of the block group or of the query.
    while True:
        upcoming = file_reader.peek()
        if upcoming is None:
            # Nothing left to read from this reader, so there is nothing to push.
            return
        if not (
            block_group.key_below_range(upcoming.key)
            or self.query.below_query_range(upcoming.key)
        ):
            break
        try:
            next(file_reader)
        except StopIteration:
            # If we have exhausted iterator, this just means no keys from this block actually match the query.
            # Stop here rather than peeking at a reader that has just been closed.
            file_reader.close()
            # TODO how to remove file reader from _open_file_readers?
            return

    # Only rows matching the query and inside the block group's range belong on the heap.
    if not (
        self.query.matches_query(upcoming.key)
        and block_group.key_in_range(upcoming.key)
    ):
        return

    try:
        r: RowAndKey = next(file_reader)
    except StopIteration:
        # This means we have exhausted the open FileReader and should close it.
        # close() is the reader's explicit cleanup hook; __exit__ cannot be called without
        # its (exc_type, exc_value, traceback) arguments.
        file_reader.close()
        # TODO how to remove file reader from _open_file_readers?
        return

    heapq.heappush(
        record_heap,
        ZipperMergeHeapRecord(r.key, r.row, row_context),
    )
298
+
299
def __open_file_readers(self, blocks: AbstractSet[Block]):
    """
    Ensure every block in the given set has an entered file reader registered.

    Intended to be called once per block group. Readers are stored in
    self._open_file_readers keyed by the block's SSTableRow; a block that already
    has an entry (e.g. because it was part of a previous block group) is left untouched.

    :param blocks: blocks of the block group about to be scanned
    """
    for block in blocks:
        sst_row: SSTableRow = block.row
        if sst_row in self._open_file_readers:
            # Reader was opened for an earlier block group - reuse it.
            continue

        reader = FileReaderRegistrar.construct_reader_instance(
            sst_row,
            self.metastore.file_provider,
            self.result_schema.get_merge_key(),
            self.result_schema,
            self.file_readers,
        )
        # Enter the reader manually; it is closed elsewhere once exhausted.
        reader.__enter__()
        # TODO we need some way to compare the blocks. using serialized timestamp as proxy for now
        self._open_file_readers[sst_row] = FileReaderWithContext(
            reader, block.context
        )
319
+
320
+
321
class BlockScanner:
    """
    BlockScanner is a low level internal class which performs IO on Block Groups

    Note that we expect a block scanner to be initialized PER QUERY because it will keep state about ongoing execution,
    e.g. open iterators across block groups

    TODO efficiency improvements like parallelizing scanning.
    TODO handle "partial schema" use case, in which the query schema is a subset of full schema
    TODO in the future we will probably want to cache blocks read across queries
    """

    def __init__(self, metastore: DatasetMetastore):
        """
        :param metastore: dataset metastore; its file_provider is handed to constructed file readers
        """
        self.metastore = metastore
        # Persist initialized file readers
        self.file_readers: Dict[str, FileReader] = {}

    def scan(
        self,
        schema: Schema,
        deserialize_to: Type[MEMORY_FORMAT],
        blocks: Set[SSTableRow],
        query: QueryExpression[Any],
    ) -> Generator[MEMORY_FORMAT, None, None]:
        """
        Scan records given query and deserialize to desired memory output format
        Set of blocks can all be scanned and returned independently
        TODO handle "partial schema" use case, in which the query schema is a subset of full schema
        TODO parallelize scan with async io

        :param schema: schema used to construct readers; its merge key is passed to each reader
        :param deserialize_to: in-memory type each record is deserialized into
        :param blocks: SSTable rows identifying the blocks to read
        :param query: query whose key_range (inclusive start/end), when set, filters records by key
        :return: generator of deserialized records
        """
        data_reader = ArrowDataReader()
        for block in blocks:
            file_reader = FileReaderRegistrar.construct_reader_instance(
                block,
                self.metastore.file_provider,
                schema.get_merge_key(),
                schema,
                self.file_readers,
            )
            with file_reader:
                for generated_records in file_reader:
                    # Check whether row matches key in query before deserializing
                    if query.key_range:
                        start, end = query.key_range
                        if generated_records.key < start or generated_records.key > end:
                            continue

                    # Otherwise, key predicate matched and yield deserialized row
                    yield from data_reader.deserialize_records(
                        generated_records, deserialize_to
                    )

    def scan_with_zipper(
        self,
        schema: Schema,
        deserialize_to: Type[MEMORY_FORMAT],
        ordered_block_groups: OrderedBlockGroups,
        query: QueryExpression[Any],
    ) -> Generator[MEMORY_FORMAT, None, None]:
        """
        Scan the ordered block groups by delegating to a ZipperBlockScanExecutor.

        :param schema: schema handed to the executor
        :param deserialize_to: in-memory type each record is deserialized into
        :param ordered_block_groups: block groups to traverse
        :param query: query to evaluate
        :return: the generator returned by the executor's scan()
        """
        zipper_scan_executor = ZipperBlockScanExecutor(
            schema,
            deserialize_to,
            ordered_block_groups,
            query,
            self.metastore,
            self.file_readers,
        )
        return zipper_scan_executor.scan()
@@ -0,0 +1,136 @@
1
+ import typing
2
+ from abc import abstractmethod
3
+ from dataclasses import dataclass
4
+ from typing import (
5
+ Protocol,
6
+ Generator,
7
+ Any,
8
+ TypeVar,
9
+ Type,
10
+ Generic,
11
+ List,
12
+ Iterator,
13
+ Optional,
14
+ )
15
+
16
+ from deltacat.experimental.storage.rivulet.fs.file_provider import FileProvider
17
+ from deltacat.experimental.storage.rivulet.metastore.sst import SSTableRow
18
+ from deltacat.experimental.storage.rivulet.schema.schema import Schema
19
+
20
+ FILE_FORMAT = TypeVar("FILE_FORMAT")
21
+ MEMORY_FORMAT = TypeVar("MEMORY_FORMAT")
22
+
23
+ T = TypeVar("T")
24
+
25
+
26
+ @dataclass
27
+ class RowAndKey(Generic[FILE_FORMAT]):
28
+ """
29
+ Named tuple for a record batch with an index into a specific row
30
+ Note that record batches store data by column, so the row index should be
31
+ used to index into each column array
32
+ """
33
+
34
+ row: FILE_FORMAT
35
+ key: Any
36
+
37
+
38
+ class FileReader(
39
+ Protocol[FILE_FORMAT],
40
+ Iterator[RowAndKey[FILE_FORMAT]],
41
+ typing.ContextManager,
42
+ ):
43
+ """
44
+ Interface for reading specific file
45
+
46
+ TODO (IO abstraction) we will need to think about how various IO interfaces (S3, filesystem, memory)
47
+ plug into this.
48
+ """
49
+
50
+ @abstractmethod
51
+ def __init__(
52
+ self,
53
+ sst_row: SSTableRow,
54
+ file_provider: FileProvider,
55
+ primary_key: str,
56
+ schema: Schema,
57
+ ) -> None:
58
+ """
59
+ Required constructor (see: FileReaderRegistrar)
60
+
61
+ :param sst_row: SSTableRow containing file metadata
62
+ :param file_store: Object providing file access
63
+ """
64
+ ...
65
+
66
+ @abstractmethod
67
+ def peek(self) -> Optional[RowAndKey[FILE_FORMAT]]:
68
+ """
69
+ Peek at the next RowAndPrimaryKey without advancing the iterator
70
+ :return: Optional of RowAndPrimaryKey
71
+ """
72
+ ...
73
+
74
+ @abstractmethod
75
+ def __next__(self) -> RowAndKey[FILE_FORMAT]:
76
+ """
77
+ Fetch the next RowAndPrimaryKey and advance iterator
78
+ """
79
+ ...
80
+
81
+ @abstractmethod
82
+ def close(self):
83
+ """
84
+ Explicit add close so that resources can be cleaned up outside the ContextManager.
85
+
86
+ We expect that callers opening the reader can EITHER use a with statement or call __enter__()
87
+ Callers closing the reader can EITHER explicitly call close() or have with statement manage calling __exit__
88
+ """
89
+ ...
90
+
91
+
92
+ class DataReader(Protocol[FILE_FORMAT]):
93
+ """
94
+ Interface for reading specific file formats
95
+ A DatasetReader uses a different DataReader for each format
96
+
97
+ TODO (IO abstraction) we will need to think about how various IO interfaces (S3, filesystem, memory)
98
+ plug into this.
99
+ """
100
+
101
+ @abstractmethod
102
+ def deserialize_records(
103
+ self, records: FILE_FORMAT, output_type: Type[MEMORY_FORMAT]
104
+ ) -> Generator[MEMORY_FORMAT, None, None]:
105
+ """
106
+ Deserialize records into the specified format.
107
+
108
+ Note that output_type gets set based on what a DataScan converts results to,
109
+ e.g. to_arrow, to_dict
110
+
111
+ :param records: Input data (generated by generate_records method)
112
+ :param output_type: Type to deserialize into
113
+ :returns: A generator yielding records of the specified type.
114
+ """
115
+ ...
116
+
117
+ @abstractmethod
118
+ def join_deserialize_records(
119
+ self,
120
+ records: List[FILE_FORMAT],
121
+ output_type: Type[MEMORY_FORMAT],
122
+ join_key: str,
123
+ ) -> Generator[MEMORY_FORMAT, None, None]:
124
+ """
125
+ Deserialize records into the specified format.
126
+
127
+ Note that output_type gets set based on what a DataScan converts results to,
128
+ e.g. to_arrow, to_dict
129
+
130
+ :param records: Multiple records which should be merged into final output record
131
+ Note this is a list instead of a set to not enforce hashability
132
+ :param join_key name of field to join across record. This field must be present on all records
133
+ :param output_type: Type to deserialize into
134
+ :returns: A generator yielding records of the specified type.
135
+ """
136
+ ...
@@ -0,0 +1,65 @@
1
+ from typing import Generator, Dict, Optional
2
+
3
+ import pyarrow as pa
4
+
5
+ from deltacat.storage.model.shard import Shard
6
+ from deltacat.experimental.storage.rivulet.reader.dataset_reader import DatasetReader
7
+ from deltacat.experimental.storage.rivulet.reader.query_expression import (
8
+ QueryExpression,
9
+ )
10
+ from deltacat.experimental.storage.rivulet import Schema
11
+
12
+
13
class DataScan:
    """
    Top level class representing and executing a data scan, on both riv internal and external data.
    This class is lazy: nothing is read until the user calls one of the "to_{format}" methods,
    which deserialize data into the chosen in-memory format.

    Dataset.py scan() is the entrypoint to create and return a data scan. The user
    then has to chain a "to_{format}" method to read rows in their chosen in-memory format.

    Rivulet cannot simply return file URIs and allow the query engine to process files,
    because rivulet will internally manage details like indexes, custom file formats for bulk records,
    where data is physically laid out across row groups, etc.

    DataScan allows query engines to send push down predicates. Push down predicates are used to filter
    on dimensions natively indexed by riv (e.g. primary key).

    DataScan is coupled to internals of the rivulet format. If the rivulet format evolves, DataScan
    execution should be able to understand which rivulet spec version is used and be compatible with
    any valid rivulet.

    FUTURE IMPROVEMENTS
    1. Implement full spec for push down predicates
    2. Figure out how permissions/credential providers work.
    3. Figure out how extension libraries can plug in to_x deserialization support.
        One potential option is to override __getattr__ and check a static class-level Registry
        of to_x methods. Modules would have to import DataScan and call DataScan.register_deserializer(...)
    """

    def __init__(
        self,
        dataset_schema: Schema,
        query: QueryExpression,
        dataset_reader: DatasetReader,
        shard: Optional[Shard],
    ):
        """
        :param dataset_schema: schema passed to the dataset reader on scan
        :param query: query expression passed to the dataset reader on scan
        :param dataset_reader: reader that executes the scan
        :param shard: optional shard forwarded to the reader's scan
        """
        self.dataset_schema = dataset_schema
        self.query = query
        self.dataset_reader = dataset_reader
        self.shard = shard

    def _scan_as(self, output_type):
        """
        Run the reader's scan, asking for results in the given in-memory type.
        """
        reader = self.dataset_reader
        return reader.scan(
            self.dataset_schema, output_type, self.query, shard=self.shard
        )

    def to_arrow(self) -> Generator[pa.RecordBatch, None, None]:
        """
        Produce the scan results as arrow record batches

        TODO how to make the .to_x methods pluggable?
        """
        return self._scan_as(pa.RecordBatch)

    def to_pydict(self) -> Generator[Dict, None, None]:
        """
        Produce the scan results as one Dict per row
        """
        return self._scan_as(Dict)