deltacat 1.1.38__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (367) hide show
  1. deltacat/__init__.py +150 -12
  2. deltacat/annotations.py +36 -0
  3. deltacat/api.py +578 -0
  4. deltacat/aws/constants.py +0 -23
  5. deltacat/aws/s3u.py +4 -631
  6. deltacat/benchmarking/benchmark_engine.py +84 -0
  7. deltacat/benchmarking/benchmark_report.py +86 -0
  8. deltacat/benchmarking/benchmark_suite.py +11 -0
  9. deltacat/benchmarking/conftest.py +22 -19
  10. deltacat/benchmarking/data/random_row_generator.py +94 -0
  11. deltacat/benchmarking/data/row_generator.py +10 -0
  12. deltacat/benchmarking/test_benchmark_pipeline.py +108 -0
  13. deltacat/catalog/__init__.py +73 -0
  14. deltacat/catalog/delegate.py +615 -140
  15. deltacat/catalog/interface.py +404 -81
  16. deltacat/catalog/main/impl.py +2882 -0
  17. deltacat/catalog/model/catalog.py +348 -46
  18. deltacat/catalog/model/properties.py +155 -0
  19. deltacat/catalog/model/table_definition.py +32 -1
  20. deltacat/compute/__init__.py +14 -0
  21. deltacat/compute/compactor/compaction_session.py +97 -75
  22. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  23. deltacat/compute/compactor/model/compaction_session_audit_info.py +23 -30
  24. deltacat/compute/compactor/model/delta_annotated.py +3 -3
  25. deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
  26. deltacat/compute/compactor/model/delta_file_locator.py +3 -1
  27. deltacat/compute/compactor/model/round_completion_info.py +19 -9
  28. deltacat/compute/compactor/model/table_object_store.py +3 -2
  29. deltacat/compute/compactor/repartition_session.py +9 -22
  30. deltacat/compute/compactor/steps/dedupe.py +11 -4
  31. deltacat/compute/compactor/steps/hash_bucket.py +6 -6
  32. deltacat/compute/compactor/steps/materialize.py +15 -9
  33. deltacat/compute/compactor/steps/repartition.py +12 -11
  34. deltacat/compute/compactor/utils/io.py +7 -6
  35. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  36. deltacat/compute/compactor/utils/sort_key.py +9 -2
  37. deltacat/compute/compactor/utils/system_columns.py +3 -1
  38. deltacat/compute/compactor_v2/compaction_session.py +13 -14
  39. deltacat/compute/compactor_v2/deletes/utils.py +3 -3
  40. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  41. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  42. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  43. deltacat/compute/compactor_v2/model/merge_input.py +28 -9
  44. deltacat/compute/compactor_v2/private/compaction_utils.py +171 -73
  45. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  46. deltacat/compute/compactor_v2/steps/merge.py +156 -53
  47. deltacat/compute/compactor_v2/utils/content_type_params.py +17 -6
  48. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  49. deltacat/compute/compactor_v2/utils/io.py +10 -3
  50. deltacat/compute/compactor_v2/utils/merge.py +14 -2
  51. deltacat/compute/compactor_v2/utils/task_options.py +2 -10
  52. deltacat/compute/converter/constants.py +9 -0
  53. deltacat/compute/converter/converter_session.py +298 -0
  54. deltacat/compute/converter/model/convert_input.py +96 -0
  55. deltacat/compute/converter/model/convert_input_files.py +78 -0
  56. deltacat/compute/converter/model/convert_result.py +80 -0
  57. deltacat/compute/converter/model/converter_session_params.py +144 -0
  58. deltacat/compute/converter/pyiceberg/catalog.py +78 -0
  59. deltacat/compute/converter/pyiceberg/overrides.py +263 -0
  60. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +299 -0
  61. deltacat/compute/converter/steps/convert.py +366 -0
  62. deltacat/compute/converter/steps/dedupe.py +94 -0
  63. deltacat/compute/converter/utils/__init__.py +0 -0
  64. deltacat/compute/converter/utils/convert_task_options.py +132 -0
  65. deltacat/compute/converter/utils/converter_session_utils.py +175 -0
  66. deltacat/compute/converter/utils/iceberg_columns.py +87 -0
  67. deltacat/compute/converter/utils/io.py +203 -0
  68. deltacat/compute/converter/utils/s3u.py +148 -0
  69. deltacat/compute/janitor.py +205 -0
  70. deltacat/compute/jobs/__init__.py +0 -0
  71. deltacat/compute/jobs/client.py +417 -0
  72. deltacat/compute/resource_estimation/delta.py +11 -1
  73. deltacat/constants.py +90 -1
  74. deltacat/docs/__init__.py +0 -0
  75. deltacat/docs/autogen/__init__.py +0 -0
  76. deltacat/docs/autogen/schema/__init__.py +0 -0
  77. deltacat/docs/autogen/schema/inference/__init__.py +0 -0
  78. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  79. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  80. deltacat/env.py +61 -0
  81. deltacat/examples/__init__.py +0 -0
  82. deltacat/examples/basic_logging.py +101 -0
  83. deltacat/examples/compactor/__init__.py +0 -0
  84. deltacat/examples/compactor/aws/__init__.py +1 -0
  85. deltacat/examples/compactor/bootstrap.py +863 -0
  86. deltacat/examples/compactor/compactor.py +373 -0
  87. deltacat/examples/compactor/explorer.py +473 -0
  88. deltacat/examples/compactor/gcp/__init__.py +1 -0
  89. deltacat/examples/compactor/job_runner.py +439 -0
  90. deltacat/examples/compactor/utils/__init__.py +1 -0
  91. deltacat/examples/compactor/utils/common.py +261 -0
  92. deltacat/examples/experimental/__init__.py +0 -0
  93. deltacat/examples/experimental/iceberg/__init__.py +0 -0
  94. deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
  95. deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
  96. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  97. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  98. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  99. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  100. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  101. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  102. deltacat/examples/experimental/iceberg/iceberg_bucket_writer.py +184 -0
  103. deltacat/examples/experimental/iceberg/iceberg_reader.py +147 -0
  104. deltacat/examples/hello_world.py +29 -0
  105. deltacat/examples/indexer/__init__.py +0 -0
  106. deltacat/examples/indexer/aws/__init__.py +0 -0
  107. deltacat/examples/indexer/gcp/__init__.py +0 -0
  108. deltacat/examples/indexer/indexer.py +163 -0
  109. deltacat/examples/indexer/job_runner.py +198 -0
  110. deltacat/exceptions.py +116 -12
  111. deltacat/experimental/__init__.py +0 -0
  112. deltacat/experimental/catalog/__init__.py +0 -0
  113. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  114. deltacat/experimental/catalog/iceberg/iceberg_catalog_config.py +26 -0
  115. deltacat/experimental/catalog/iceberg/impl.py +399 -0
  116. deltacat/experimental/catalog/iceberg/overrides.py +72 -0
  117. deltacat/experimental/compatibility/__init__.py +0 -0
  118. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  119. deltacat/experimental/converter_agent/__init__.py +0 -0
  120. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  121. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  122. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  123. deltacat/experimental/daft/__init__.py +4 -0
  124. deltacat/experimental/daft/daft_catalog.py +229 -0
  125. deltacat/experimental/storage/__init__.py +0 -0
  126. deltacat/experimental/storage/iceberg/__init__.py +0 -0
  127. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  128. deltacat/experimental/storage/iceberg/impl.py +739 -0
  129. deltacat/experimental/storage/iceberg/model.py +713 -0
  130. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  131. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  132. deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
  133. deltacat/experimental/storage/rivulet/arrow/serializer.py +78 -0
  134. deltacat/experimental/storage/rivulet/dataset.py +745 -0
  135. deltacat/experimental/storage/rivulet/dataset_executor.py +79 -0
  136. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  137. deltacat/experimental/storage/rivulet/feather/file_reader.py +138 -0
  138. deltacat/experimental/storage/rivulet/feather/serializer.py +35 -0
  139. deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
  140. deltacat/experimental/storage/rivulet/fs/file_provider.py +105 -0
  141. deltacat/experimental/storage/rivulet/fs/file_store.py +130 -0
  142. deltacat/experimental/storage/rivulet/fs/input_file.py +76 -0
  143. deltacat/experimental/storage/rivulet/fs/output_file.py +86 -0
  144. deltacat/experimental/storage/rivulet/logical_plan.py +105 -0
  145. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  146. deltacat/experimental/storage/rivulet/metastore/delta.py +188 -0
  147. deltacat/experimental/storage/rivulet/metastore/json_sst.py +105 -0
  148. deltacat/experimental/storage/rivulet/metastore/sst.py +82 -0
  149. deltacat/experimental/storage/rivulet/metastore/sst_interval_tree.py +260 -0
  150. deltacat/experimental/storage/rivulet/mvp/Table.py +101 -0
  151. deltacat/experimental/storage/rivulet/mvp/__init__.py +5 -0
  152. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  153. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  154. deltacat/experimental/storage/rivulet/parquet/file_reader.py +129 -0
  155. deltacat/experimental/storage/rivulet/parquet/serializer.py +37 -0
  156. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  157. deltacat/experimental/storage/rivulet/reader/block_scanner.py +389 -0
  158. deltacat/experimental/storage/rivulet/reader/data_reader.py +136 -0
  159. deltacat/experimental/storage/rivulet/reader/data_scan.py +65 -0
  160. deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +179 -0
  161. deltacat/experimental/storage/rivulet/reader/dataset_reader.py +158 -0
  162. deltacat/experimental/storage/rivulet/reader/pyarrow_data_reader.py +124 -0
  163. deltacat/experimental/storage/rivulet/reader/query_expression.py +99 -0
  164. deltacat/experimental/storage/rivulet/reader/reader_type_registrar.py +84 -0
  165. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  166. deltacat/experimental/storage/rivulet/schema/datatype.py +128 -0
  167. deltacat/experimental/storage/rivulet/schema/schema.py +251 -0
  168. deltacat/experimental/storage/rivulet/serializer.py +40 -0
  169. deltacat/experimental/storage/rivulet/serializer_factory.py +46 -0
  170. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  171. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  172. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  173. deltacat/experimental/storage/rivulet/writer/dataset_writer.py +29 -0
  174. deltacat/experimental/storage/rivulet/writer/memtable_dataset_writer.py +305 -0
  175. deltacat/io/__init__.py +13 -0
  176. deltacat/io/dataset/__init__.py +0 -0
  177. deltacat/io/dataset/deltacat_dataset.py +91 -0
  178. deltacat/io/datasink/__init__.py +0 -0
  179. deltacat/io/datasink/deltacat_datasink.py +207 -0
  180. deltacat/io/datasource/__init__.py +0 -0
  181. deltacat/io/datasource/deltacat_datasource.py +579 -0
  182. deltacat/io/reader/__init__.py +0 -0
  183. deltacat/io/reader/deltacat_read_api.py +172 -0
  184. deltacat/logs.py +4 -1
  185. deltacat/storage/__init__.py +138 -28
  186. deltacat/storage/interface.py +260 -155
  187. deltacat/storage/main/__init__.py +0 -0
  188. deltacat/storage/main/impl.py +3030 -0
  189. deltacat/storage/model/delta.py +142 -71
  190. deltacat/storage/model/expression/__init__.py +47 -0
  191. deltacat/storage/model/expression/expression.py +656 -0
  192. deltacat/storage/model/expression/visitor.py +248 -0
  193. deltacat/storage/model/interop.py +24 -0
  194. deltacat/storage/model/list_result.py +8 -0
  195. deltacat/storage/model/locator.py +93 -9
  196. deltacat/storage/model/manifest.py +643 -0
  197. deltacat/storage/model/metafile.py +1421 -0
  198. deltacat/storage/model/namespace.py +41 -18
  199. deltacat/storage/model/partition.py +443 -43
  200. deltacat/storage/model/scan/__init__.py +0 -0
  201. deltacat/storage/model/scan/push_down.py +46 -0
  202. deltacat/storage/model/scan/scan_plan.py +10 -0
  203. deltacat/storage/model/scan/scan_task.py +34 -0
  204. deltacat/storage/model/schema.py +3160 -0
  205. deltacat/storage/model/shard.py +51 -0
  206. deltacat/storage/model/sort_key.py +210 -13
  207. deltacat/storage/model/stream.py +215 -80
  208. deltacat/storage/model/table.py +134 -29
  209. deltacat/storage/model/table_version.py +333 -46
  210. deltacat/storage/model/transaction.py +1733 -0
  211. deltacat/storage/model/transform.py +274 -58
  212. deltacat/storage/model/types.py +138 -16
  213. deltacat/storage/util/__init__.py +0 -0
  214. deltacat/storage/util/scan_planner.py +26 -0
  215. deltacat/tests/_io/__init__.py +1 -0
  216. deltacat/tests/_io/reader/__init__.py +0 -0
  217. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  218. deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +8 -4
  219. deltacat/tests/aws/test_s3u.py +2 -31
  220. deltacat/tests/catalog/data/__init__.py +0 -0
  221. deltacat/tests/catalog/main/__init__.py +0 -0
  222. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  223. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  224. deltacat/tests/catalog/model/__init__.py +0 -0
  225. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  226. deltacat/tests/catalog/test_catalogs.py +321 -0
  227. deltacat/tests/catalog/test_default_catalog_impl.py +12154 -66
  228. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
  229. deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
  230. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
  231. deltacat/tests/compute/compact_partition_test_cases.py +23 -30
  232. deltacat/tests/compute/compactor/steps/test_repartition.py +14 -14
  233. deltacat/tests/compute/compactor/utils/test_io.py +125 -123
  234. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  235. deltacat/tests/compute/compactor_v2/test_compaction_session.py +387 -830
  236. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +70 -57
  237. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +0 -3
  238. deltacat/tests/compute/conftest.py +39 -0
  239. deltacat/tests/compute/converter/__init__.py +0 -0
  240. deltacat/tests/compute/converter/conftest.py +80 -0
  241. deltacat/tests/compute/converter/test_convert_session.py +826 -0
  242. deltacat/tests/compute/converter/utils.py +132 -0
  243. deltacat/tests/compute/resource_estimation/test_delta.py +88 -104
  244. deltacat/tests/compute/test_compact_partition_incremental.py +91 -98
  245. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +79 -97
  246. deltacat/tests/compute/test_compact_partition_params.py +16 -11
  247. deltacat/tests/compute/test_compact_partition_rebase.py +63 -93
  248. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +249 -220
  249. deltacat/tests/compute/test_janitor.py +236 -0
  250. deltacat/tests/compute/test_util_common.py +726 -46
  251. deltacat/tests/compute/test_util_constant.py +0 -1
  252. deltacat/tests/conftest.py +25 -0
  253. deltacat/tests/daft/__init__.py +0 -0
  254. deltacat/tests/daft/test_model.py +97 -0
  255. deltacat/tests/experimental/__init__.py +1 -0
  256. deltacat/tests/experimental/catalog/__init__.py +0 -0
  257. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  258. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  259. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  260. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  261. deltacat/tests/experimental/daft/__init__.py +0 -0
  262. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  263. deltacat/tests/experimental/storage/__init__.py +0 -0
  264. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  265. deltacat/tests/experimental/storage/rivulet/conftest.py +149 -0
  266. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  267. deltacat/tests/experimental/storage/rivulet/fs/test_file_location_provider.py +94 -0
  268. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  269. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  270. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  271. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  272. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  273. deltacat/tests/experimental/storage/rivulet/schema/test_schema.py +241 -0
  274. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  275. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  276. deltacat/tests/experimental/storage/rivulet/test_dataset.py +408 -0
  277. deltacat/tests/experimental/storage/rivulet/test_manifest.py +67 -0
  278. deltacat/tests/experimental/storage/rivulet/test_sst_interval_tree.py +232 -0
  279. deltacat/tests/experimental/storage/rivulet/test_utils.py +124 -0
  280. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  281. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_write_then_read.py +343 -0
  282. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_writer.py +79 -0
  283. deltacat/tests/experimental/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
  284. deltacat/tests/storage/__init__.py +0 -0
  285. deltacat/tests/storage/main/__init__.py +0 -0
  286. deltacat/tests/storage/main/test_main_storage.py +8204 -0
  287. deltacat/tests/storage/model/__init__.py +0 -0
  288. deltacat/tests/storage/model/test_delete_parameters.py +21 -0
  289. deltacat/tests/storage/model/test_expression.py +327 -0
  290. deltacat/tests/storage/model/test_manifest.py +129 -0
  291. deltacat/tests/storage/model/test_metafile_io.py +2440 -0
  292. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  293. deltacat/tests/storage/model/test_schema.py +479 -0
  294. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  295. deltacat/tests/storage/model/test_shard.py +24 -0
  296. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  297. deltacat/tests/storage/model/test_table_version.py +110 -0
  298. deltacat/tests/storage/model/test_transaction.py +653 -0
  299. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  300. deltacat/tests/test_deltacat_api.py +1064 -0
  301. deltacat/tests/test_exceptions.py +9 -5
  302. deltacat/tests/test_utils/filesystem.py +14 -0
  303. deltacat/tests/test_utils/message_pack_utils.py +54 -0
  304. deltacat/tests/test_utils/pyarrow.py +50 -26
  305. deltacat/tests/test_utils/storage.py +256 -4
  306. deltacat/tests/types/__init__.py +0 -0
  307. deltacat/tests/types/test_tables.py +104 -0
  308. deltacat/tests/utils/exceptions.py +22 -0
  309. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  310. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  311. deltacat/tests/utils/test_daft.py +124 -34
  312. deltacat/tests/utils/test_numpy.py +1193 -0
  313. deltacat/tests/utils/test_pandas.py +1106 -0
  314. deltacat/tests/utils/test_polars.py +1040 -0
  315. deltacat/tests/utils/test_pyarrow.py +1107 -258
  316. deltacat/types/media.py +345 -37
  317. deltacat/types/partial_download.py +1 -1
  318. deltacat/types/tables.py +2345 -47
  319. deltacat/utils/arguments.py +33 -1
  320. deltacat/utils/daft.py +824 -40
  321. deltacat/utils/export.py +61 -0
  322. deltacat/utils/filesystem.py +450 -0
  323. deltacat/utils/metafile_locator.py +74 -0
  324. deltacat/utils/numpy.py +118 -26
  325. deltacat/utils/pandas.py +577 -48
  326. deltacat/utils/polars.py +759 -0
  327. deltacat/utils/pyarrow.py +1212 -178
  328. deltacat/utils/ray_utils/concurrency.py +1 -1
  329. deltacat/utils/ray_utils/dataset.py +101 -10
  330. deltacat/utils/ray_utils/runtime.py +56 -4
  331. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  332. deltacat/utils/url.py +1325 -0
  333. deltacat-2.0.0.dist-info/METADATA +1163 -0
  334. deltacat-2.0.0.dist-info/RECORD +439 -0
  335. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
  336. deltacat/aws/redshift/__init__.py +0 -19
  337. deltacat/aws/redshift/model/manifest.py +0 -394
  338. deltacat/catalog/default_catalog_impl/__init__.py +0 -369
  339. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  340. deltacat/compute/merge_on_read/__init__.py +0 -4
  341. deltacat/compute/merge_on_read/daft.py +0 -40
  342. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  343. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  344. deltacat/io/dataset.py +0 -73
  345. deltacat/io/read_api.py +0 -143
  346. deltacat/storage/model/delete_parameters.py +0 -40
  347. deltacat/storage/model/partition_spec.py +0 -71
  348. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  349. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -397
  350. deltacat/tests/local_deltacat_storage/__init__.py +0 -1262
  351. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  352. deltacat/utils/s3fs.py +0 -21
  353. deltacat-1.1.38.dist-info/METADATA +0 -64
  354. deltacat-1.1.38.dist-info/RECORD +0 -219
  355. /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
  356. /deltacat/{compute/merge_on_read/model → catalog/main}/__init__.py +0 -0
  357. /deltacat/compute/{merge_on_read/utils → converter}/__init__.py +0 -0
  358. /deltacat/{io/aws → compute/converter/model}/__init__.py +0 -0
  359. /deltacat/{io/aws/redshift → compute/converter/pyiceberg}/__init__.py +0 -0
  360. /deltacat/{tests/io → compute/converter/steps}/__init__.py +0 -0
  361. /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
  362. /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
  363. /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
  364. /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
  365. /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
  366. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
  367. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,179 @@
1
+ import logging
2
+ import posixpath
3
+ from typing import Generator, Optional
4
+
5
+ import pyarrow
6
+ import pyarrow.fs
7
+
8
+ from deltacat.constants import REV_DIR_NAME
9
+ from deltacat.storage import Delta
10
+ from deltacat.storage.model.partition import PartitionLocator
11
+ from deltacat.experimental.storage.rivulet.fs.file_provider import FileProvider
12
+ from deltacat.utils.filesystem import resolve_path_and_filesystem
13
+ from deltacat.experimental.storage.rivulet.metastore.json_sst import JsonSstReader
14
+ from deltacat.experimental.storage.rivulet.metastore.delta import (
15
+ ManifestIO,
16
+ DeltaContext,
17
+ RivuletDelta,
18
+ DeltacatManifestIO,
19
+ )
20
+ from deltacat.experimental.storage.rivulet.metastore.sst import SSTReader, SSTable
21
+ from deltacat.utils.metafile_locator import _find_table_path
22
+ from deltacat import logs
23
+
24
+ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
25
+
26
+
27
class ManifestAccessor:
    """Read-side handle that pairs one Rivulet delta with the collaborators needed to load its SSTables."""

    def __init__(
        self, delta: RivuletDelta, file_provider: FileProvider, sst_reader: SSTReader
    ):
        # The reader stays private; the delta (exposed as ``manifest``) and the
        # file provider are public attributes.
        self._sst_reader = sst_reader
        self.file_provider: FileProvider = file_provider
        self.manifest: RivuletDelta = delta

    @property
    def context(self) -> DeltaContext:
        """The context object carried by the wrapped delta."""
        return self.manifest.context

    def generate_sstables(self) -> Generator[SSTable, None, None]:
        """
        Lazily read the SortedString Tables referenced by this Manifest

        Each URI in the delta's ``sst_files`` is resolved to an input file by
        the file provider and then handed to the SST reader.

        :return a generator of SSTables for this manifest
        """
        for uri in self.manifest.sst_files:
            yield self._sst_reader.read(self.file_provider.provide_input_file(uri))
50
+
51
+
52
class DatasetMetastore:
    """
    Metastore implementation for manifests stored on a filesystem

    TODO this will be replaced with Deltacat Storage interface - https://github.com/ray-project/deltacat/issues/477
    """

    def __init__(
        self,
        # URI at which we expect to find deltas
        delta_root_uri: str,
        file_provider: FileProvider,
        locator: PartitionLocator,
        *,
        manifest_io: Optional[ManifestIO] = None,
        sst_reader: Optional[SSTReader] = None,
    ):
        """
        param: delta_root_uri: The URI at which we expect to find deltas.
        param: file_provider: Handed to every ManifestAccessor produced by
            generate_manifests.
        param: locator: Partition locator whose table_version, stream_id and
            partition_id select the partition directory that is scanned.
        param: manifest_io: Manifest IO to use. Falls back to
            DeltacatManifestIO(delta_root_uri, locator) when not provided.
        param: sst_reader: SST reader to use. Falls back to JsonSstReader()
            when not provided.
        """
        # Cached result of get_min_max_keys(); both stay None until keys are found.
        self._min_key = None
        self._max_key = None
        self.delta_root_uri = delta_root_uri
        self.file_provider = file_provider
        self.manifest_io = manifest_io or DeltacatManifestIO(delta_root_uri, locator)
        self.sst_reader = sst_reader or JsonSstReader()
        self.locator = locator

    def _get_delta(
        self, delta_dir: str, filesystem: pyarrow.fs.FileSystem
    ) -> Optional[RivuletDelta]:
        """
        Find the latest revision in a delta directory.

        param: delta_dir: The directory containing the revisions.
        param: filesystem: The filesystem to search for the revisions.
        returns: The latest revision as a RivuletDelta, or None when the
            revision directory is missing or contains no entries.
        """
        rev_directory = posixpath.join(delta_dir, REV_DIR_NAME)
        # allow_not_found=True turns a missing directory into an empty listing
        # instead of an error.
        revisions = filesystem.get_file_info(
            pyarrow.fs.FileSelector(rev_directory, allow_not_found=True)
        )

        if not revisions:
            logger.warning(f"No revision files found in {rev_directory}")
            return None

        # Take lexicographical max to find the latest revision
        latest_revision = max(revisions, key=lambda f: f.path)

        # NOTE(review): only the path is passed to Delta.read; `filesystem` is
        # not forwarded. Confirm the path alone resolves correctly for
        # non-local filesystems.
        return RivuletDelta.of(Delta.read(latest_revision.path))

    def generate_manifests(self) -> Generator[ManifestAccessor, None, None]:
        """
        Generate all manifests within the Metastore
        NOTE: this will be replaced by deltacat storage API.

        TODO: Generate partition path using Deltacat Storage interface.

        The partition directory is derived from ``self.delta_root_uri`` and
        ``self.locator``. Nothing is yielded when either the partition
        directory or its revision directory does not exist.

        returns: a generator of ManifestAccessors for all manifests in the dataset.
        """

        root_path, filesystem = resolve_path_and_filesystem(self.delta_root_uri)

        partition_path = posixpath.join(
            _find_table_path(root_path, filesystem),
            self.locator.table_version,
            self.locator.stream_id,
            self.locator.partition_id,
        )

        partition_info = filesystem.get_file_info(partition_path)

        if partition_info.type != pyarrow.fs.FileType.Directory:
            logger.debug(f"Partition directory {partition_path} not found. Skipping.")
            return

        # Locate "rev" directory inside the partition
        rev_directory = posixpath.join(partition_path, REV_DIR_NAME)
        rev_info = filesystem.get_file_info(rev_directory)

        if rev_info.type != pyarrow.fs.FileType.Directory:
            logger.debug(f"Revision directory {rev_directory} not found. Skipping.")
            return

        # Fetch all delta directories inside the partition
        partition_entries = filesystem.get_file_info(
            pyarrow.fs.FileSelector(
                partition_path, allow_not_found=True, recursive=False
            )
        )

        # Only sub-directories with an all-digit name are treated as deltas;
        # this also excludes the partition's own "rev" directory.
        delta_dirs = [
            entry
            for entry in partition_entries
            if entry.type == pyarrow.fs.FileType.Directory and entry.base_name.isdigit()
        ]

        for delta_dir in delta_dirs:
            rivulet_delta = self._get_delta(delta_dir.path, filesystem)
            # Deltas without any revision are skipped.
            if rivulet_delta:
                yield ManifestAccessor(
                    rivulet_delta, self.file_provider, self.sst_reader
                )

    def get_min_max_keys(self):
        """
        Compute and cache the minimum and maximum keys in the dataset.

        Every SSTable of every manifest is visited on a cache miss. The result
        is only served from the cache once both keys are non-None, so a
        dataset without any SSTables is re-scanned on each call.

        returns: a tuple of the minimum and maximum keys in the dataset, or
            (None, None) when no SSTables are found
        """
        if self._min_key is not None and self._max_key is not None:
            return (self._min_key, self._max_key)

        min_key = None
        max_key = None
        for manifest_accessor in self.generate_manifests():
            for sstable in manifest_accessor.generate_sstables():
                if min_key is None or sstable.min_key < min_key:
                    min_key = sstable.min_key
                if max_key is None or sstable.max_key > max_key:
                    max_key = sstable.max_key

        self._min_key = min_key
        self._max_key = max_key
        return (min_key, max_key)
@@ -0,0 +1,158 @@
1
+ import logging
2
+ from typing import Generator, Optional, Set, Type, TypeVar, Any
3
+
4
+ from deltacat.storage.model.shard import Shard
5
+ from deltacat.experimental.storage.rivulet.metastore.sst import SSTableRow, SSTable
6
+ from deltacat.experimental.storage.rivulet.metastore.sst_interval_tree import (
7
+ BlockIntervalTree,
8
+ OrderedBlockGroups,
9
+ )
10
+ from deltacat.experimental.storage.rivulet.reader.block_scanner import BlockScanner
11
+ from deltacat.experimental.storage.rivulet.reader.dataset_metastore import (
12
+ DatasetMetastore,
13
+ ManifestAccessor,
14
+ )
15
+ from deltacat.experimental.storage.rivulet.reader.query_expression import (
16
+ QueryExpression,
17
+ )
18
+ from deltacat.experimental.storage.rivulet import Schema
19
+
20
+ # The type of data returned to reader
21
+ T = TypeVar("T")
22
+
23
+
24
class DatasetReader:
    """
    DatasetReader is an internal class used to execute a scan.

    TODO - Currently, this reader is limited to reading a single field group.
        The next CR will fast follow to modify this to read and zipper multiple field groups.

    TODO currently this assumes all SST files are L0 files with overlapping key ranges.
        Future CR will support L1+ SSTs.
    """

    BLOCK_READER_POOL_SIZE = 8

    def __init__(self, metastore: DatasetMetastore):
        """
        :param metastore: metastore whose manifests are scanned; it is also
            handed to the BlockScanner created here.
        """
        self.metastore: DatasetMetastore = metastore
        self.block_scanner = BlockScanner(self.metastore)

    def scan(
        self,
        schema: Schema,
        deserialize_to: Type[T],
        query: QueryExpression[Any],
        shard: Optional[Shard] = None,
    ) -> Generator[T, None, None]:
        """
        Scan records given query and deserialize to desired memory output format.

        :param schema: schema forwarded to the block scanner
        :param deserialize_to: type each yielded record is deserialized into
        :param query: query expression restricting the primary key range
        :param shard: optional shard whose key bounds are combined with the query
        :returns: generator yielding the scanned records

        TODO handle "partial schema" use case, in which the query schema is a subset of full schema

        TODO this is where we will do the zipper merge when we support multiple field groups
            for each SST row which may overlap key range, read data chunk
            we will later improve and parallelize this when we do zipper merge work
        """
        # Read manifests and differentiate between "full schema" and "zipper merge" use case
        manifests = set(self.metastore.generate_manifests())
        schemas = {manifest.context.schema for manifest in manifests}
        levels = {manifest.context.level for manifest in manifests}

        # Must zipper if there are multiple schemas, if L0 is involved, or if
        # manifests span multiple levels (i.e. more than one distinct level).
        cannot_avoid_zipper = len(schemas) > 1 or 0 in levels or len(levels) > 1

        if cannot_avoid_zipper:
            logging.info("Done scanning manifests. Must perform zipper-merge")
            yield from self.__scan_with_zipper(
                schema, deserialize_to, manifests, query, shard=shard
            )
        else:
            logging.info("Done scanning manifests. Can avoid zipper-merge")
            yield from self.__scan_no_zipper(
                schema, deserialize_to, manifests, query, shard=shard
            )

    def __scan_no_zipper(
        self,
        schema: Schema,
        deserialize_to: Type[T],
        manifests: Set[ManifestAccessor],
        query: QueryExpression[Any],
        shard: Optional[Shard] = None,
    ) -> Generator[T, None, None]:
        """
        Scan every SST row matching the query directly, without a zipper merge.
        """
        # Build final query using user query and shard boundaries (ensures only blocks in shard and query range are read).
        # TODO: improve query expression implementation to have a builder of some sort.
        query = QueryExpression.with_shard(query, shard)
        # Map manifests to all SST rows which match query
        matching_sst_rows: Set[SSTableRow] = {
            row
            for manifest in manifests
            for table in manifest.generate_sstables()
            for row in self.__load_sst_rows(table, query)
        }

        yield from self.block_scanner.scan(
            schema, deserialize_to, matching_sst_rows, query
        )

    def __scan_with_zipper(
        self,
        schema: Schema,
        deserialize_to: Type[T],
        manifests: Set[ManifestAccessor],
        query: QueryExpression[Any],
        shard: Optional[Shard] = None,
    ) -> Generator[T, None, None]:
        """
        Scan matching SST rows through an interval tree so that overlapping
        blocks are grouped and handed to the scanner's zipper merge.
        """
        # Build final query using user query and shard boundaries (ensures only blocks in shard and query range are read).
        # TODO: improve query expression implementation to have a builder of some sort.
        query = QueryExpression.with_shard(query, shard)
        # Build interval tree from manifests and plan scan
        sst_interval_tree = BlockIntervalTree()
        for manifest in manifests:
            for table in manifest.generate_sstables():
                rows = self.__load_sst_rows(table, query)
                sst_interval_tree.add_sst_rows(rows, manifest.context)

        scan_block_groups: OrderedBlockGroups = (
            sst_interval_tree.get_sorted_block_groups(query.min_key, query.max_key)
        )
        yield from self.block_scanner.scan_with_zipper(
            schema, deserialize_to, scan_block_groups, query
        )

    def __load_sst_rows(
        self, table: SSTable, query: QueryExpression
    ) -> Set[SSTableRow]:
        """
        Return the rows of ``table`` whose key range overlaps the query.
        """
        # Short circuit table if there isn't any overlap with min and max
        if not self.__overlaps_primary_key_range(query, table.min_key, table.max_key):
            return set()
        return {
            r
            for r in table.rows
            if self.__overlaps_primary_key_range(query, r.key_min, r.key_max)
        }

    def __overlaps_primary_key_range(
        self, query: QueryExpression, min_key, max_key
    ) -> bool:
        """
        Helper method to check whether a query expression has overlap with a primary key range
        """
        # If no PK range set, the query is across all primary keys, so return true
        if not query.key_range:
            return True

        query_start, query_end = query.key_range
        # No overlap only when the query lies entirely below or entirely above the range
        return not (query_end < min_key or query_start > max_key)
@@ -0,0 +1,124 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Generator, Dict, Type, NamedTuple, List
4
+
5
+ from pyarrow import RecordBatch
6
+
7
+ from deltacat.experimental.storage.rivulet.reader.data_reader import (
8
+ DataReader,
9
+ MEMORY_FORMAT,
10
+ )
11
+ import pyarrow as pa
12
+
13
+
14
class RecordBatchRowIndex(NamedTuple):
    """
    Pairs a record batch with the position of a single row inside it.

    Record batches hold their data column by column, so ``row_index`` is
    meant to be applied to each column array of ``batch`` in turn.
    """

    # The record batch containing the row of interest
    batch: RecordBatch
    # Position of that row within every column array of the batch
    row_index: int
23
+
24
+
25
class ArrowDataReader(DataReader[RecordBatchRowIndex]):
    """
    Data reader that deserializes rows referenced by RecordBatchRowIndex
    entries into dicts or Arrow record batches.
    """

    def deserialize_records(
        self, record: RecordBatchRowIndex, output_type: Type[MEMORY_FORMAT]
    ) -> Generator[MEMORY_FORMAT, None, None]:
        """
        Deserialize records into the specified format.

        Note that output_type gets set based on what a DataScan converts results to,
        e.g. to_arrow, to_dict

        :param record: Input data (generated by generate_records method)
        :param output_type: Type to deserialize into
        :returns: A generator yielding records of the specified type.
        """
        # NOTE(review): despite the annotation, the value is indexed with [0]
        # before .batch/.row_index are read, so callers presumably pass a
        # sequence of RecordBatchRowIndex - confirm against the call sites.
        batch, row_idx = record[0].batch, record[0].row_index

        if output_type == Dict:
            yield {
                column: batch.column(column_idx)[row_idx].as_py()
                for column_idx, column in enumerate(batch.column_names)
            }

        elif output_type == RecordBatch:
            # only yield full record batch if row_idx is 0.
            # TODO this logic will need to change in zipper use case across data formats
            if row_idx == 0:
                yield batch

    def join_deserialize_records(
        self,
        records: List[RecordBatchRowIndex],
        output_type: Type[MEMORY_FORMAT],
        join_key: str,
    ) -> Generator[MEMORY_FORMAT, None, None]:
        """
        Join several records into one and deserialize it into the specified format.

        Note that output_type gets set based on what a DataScan converts results to,
        e.g. to_arrow, to_dict

        :param records: Input data (generated by generate_records method)
        :param output_type: Type to deserialize into
        :param join_key: Column name used as the join key for RecordBatch output
        :returns: A generator yielding records of the specified type.
        """

        if output_type == Dict:
            yield self.__join_records_as_dict(records)
        elif output_type == RecordBatch:
            yield self.__join_records_as_record_batch(records, join_key)

    @staticmethod
    def __join_records_as_dict(records: List[RecordBatchRowIndex]) -> Dict[str, object]:
        """
        Deserialize records into a PyDict

        :param records: input record data
        :returns: A PyDict that's joined the given records around the primary key.
        :raises IndexError: if a record's row index is outside one of its columns
        """
        batch: RecordBatch
        row_idx: int
        out = {}
        for r in records:
            batch, row_idx = r
            # Note this stomps over join key but that's OK
            for column_idx, column in enumerate(batch.schema.names):
                col = batch.column(column_idx)
                if len(col) <= row_idx:
                    raise IndexError(
                        f"row index {row_idx} out of bounds for column {column} with length {len(col)}"
                    )

                out[column] = col[row_idx].as_py()
        return out

    @staticmethod
    def __join_records_as_record_batch(
        records: List[RecordBatchRowIndex], join_key: str
    ) -> RecordBatch:
        """
        Deserialize records into a RecordBatch

        :param records: input record data
        :param join_key: name of the column the single-row tables are joined on
        :returns: RecordBatch that's inner-joined the given records around the primary key.
        :raises ValueError: if no records are given
        """
        if not records:
            raise ValueError("Cannot join an empty list of records")

        batch: RecordBatch
        row_idx: int
        out: pa.Table | None = None
        for record in records:
            batch, row_idx = record
            batch_slice: RecordBatch = batch.slice(row_idx, 1)
            # Compare against None explicitly: a Table with zero rows is falsy,
            # so a truthiness check would discard an (empty) intermediate join
            # result and restart from the current record.
            if out is None:
                out = pa.Table.from_batches([batch_slice])
            else:
                table2 = pa.Table.from_batches([batch_slice])
                out = out.join(table2, keys=join_key, join_type="inner")
        return out.to_batches()[0]
@@ -0,0 +1,99 @@
1
+ from __future__ import annotations
2
+
3
+ import typing
4
+ from typing import Optional
5
+
6
+ from deltacat.storage.model.shard import Shard
7
+
8
T = typing.TypeVar("T")  # Type of primary key in query expression. Must be comparable


class QueryExpression(typing.Generic[T]):
    """
    Top level class for representing queries on a riv dataset.

    For now, this is a minimal implementation which just allows for different predicates.

    FUTURE IMPROVEMENTS
    1. Support builder using operator overloading or fluent builder pattern,e.g.
        (operator overloading) query = Column("Foo") < 10 & Column("PK")==100
        (fluent interface) query = builder.column("colA").less_than(10)
                            .and_()
                            .column("PK").equals(100)
                            .build()

    2. Support better push down predicate integration end to end. Specifically,
        scan operation will need to return which query predicates were honored
    """

    def __init__(self):
        # Inclusive (min, max) primary key bounds; None means "all keys"
        self.key_range: Optional[typing.Tuple[T, T]] = None

    def with_key(self, val: T) -> QueryExpression:
        """
        Syntactic sugar for setting key range to a single value

        :raises ValueError: if a key range has already been set
        """
        if self.key_range is not None:
            raise ValueError(
                f"Query expression already has set key range to: {self.key_range}"
            )
        self.key_range = (val, val)
        return self

    def with_range(self, bound1: T, bound2: T) -> QueryExpression:
        """
        Set the key range to the two bounds, which may be given in either order.

        :raises ValueError: if a key range has already been set
        """
        if self.key_range is not None:
            raise ValueError(f"Key range already set to {self.key_range}")
        self.key_range = tuple(sorted([bound1, bound2]))
        return self

    @staticmethod
    def with_shard(query: Optional[QueryExpression], shard: Optional[Shard]):
        """
        Generate a query expression that accounts for the shard boundaries.
        Shard boundaries are inclusive and mark the outer bounds of the query.

        :param query: the user query; may be None or have no key range set
        :param shard: shard providing min_key/max_key; when None the query is
            returned unchanged
        :returns: the original query when shard is None, otherwise a new
            QueryExpression with a key range
        """
        if shard is None:
            return query

        # A missing query is treated like a query without a key range:
        # the shard bounds alone define the range.
        if query is None or query.key_range is None:
            return QueryExpression().with_range(shard.min_key, shard.max_key)

        # NOTE(review): this takes the smaller of the two minimums and the
        # larger of the two maximums, i.e. the range is widened to cover both
        # the shard and the query. The docstring reads as if the shard should
        # clamp the query instead - confirm the intended semantics.
        min_key = min(shard.min_key, query.min_key)
        max_key = max(shard.max_key, query.max_key)

        return QueryExpression().with_range(min_key, max_key)

    @property
    def min_key(self) -> T | None:
        """Lower bound of the key range, or None when no range is set."""
        if self.key_range is None:
            return None
        return self.key_range[0]

    @property
    def max_key(self) -> T | None:
        """Upper bound of the key range, or None when no range is set."""
        if self.key_range is None:
            return None
        return self.key_range[1]

    def matches_query(self, key: typing.Any) -> bool:
        """
        Returns true if the key is within the range of the query expression
        (always true when no key range is set)
        """
        if self.key_range is None:
            return True
        return self.min_key <= key <= self.max_key

    def below_query_range(self, key: typing.Any) -> bool:
        """
        Returns true if the key is below the range of the query expression
        will return false if key range is not set
        """
        if self.key_range is None:
            return False
        return self.min_key > key
@@ -0,0 +1,84 @@
1
+ from deltacat.experimental.storage.rivulet.fs.file_provider import FileProvider
2
+ from deltacat.experimental.storage.rivulet.metastore.sst import SSTableRow
3
+ from deltacat.experimental.storage.rivulet.reader.data_reader import FileReader
4
+ from typing import Type, Dict
5
+
6
+ from deltacat.experimental.storage.rivulet.schema.schema import Schema
7
+
8
+
9
class FileReaderRegistrar:
    """
    Registrar for readers of rivulet data

    Readers must adhere to the Protocol DataReader

    Packages with extension classes should call into this registrar in __init__.py
    """

    # Maps lower-cased file extension -> reader class; shared across the process
    _readers = {}

    @staticmethod
    def _extension_of(uri: str) -> str:
        """Return the lower-cased text after the last '.' in the URI."""
        return uri.split(".")[-1].lower()

    @classmethod
    def register_reader(
        cls,
        extension: str,
        reader_class: Type[FileReader],
        allow_overwrite: bool = False,
    ):
        """
        Register a file extension associated with a dataset reader

        Parameters:
        - extension: str, the file extension to register (matched case-insensitively)
        - reader_class: Type[DataReader], the reader class to associate with the extension
        - allow_overwrite: bool, if True, allows overwriting an existing reader for the extension

        Raises ValueError if the extension is already registered and
        allow_overwrite is False.
        """
        # Normalize before the duplicate check so "PARQUET" and "parquet"
        # are recognized as the same registration.
        normalized_extension = extension.lower()
        if normalized_extension in cls._readers and not allow_overwrite:
            raise ValueError(
                f"Reader for extension '{extension}' is already registered. "
                f"Set allow_overwrite=True to replace the existing reader."
            )
        cls._readers[normalized_extension] = reader_class

    @classmethod
    def get_reader_class(cls, uri: str) -> Type[FileReader]:
        """
        Gets the reader class given a URI

        :param uri: URI of file to be read. Note that we expect the URI to end in a file extension
        :returns: the registered reader class, or None if no reader is
            registered for the URI's extension
        """
        return cls._readers.get(cls._extension_of(uri))

    @classmethod
    def construct_reader_instance(
        cls,
        sst_row: SSTableRow,
        file_provider: FileProvider,
        primary_key: str,
        schema: Schema,
        reader_cache: Dict[str, FileReader] = None,
    ) -> FileReader:
        """
        Construct a data reader for the file referenced by an SST row

        :param sst_row: SST row whose uri is to be read. Note that we expect the URI to end in a file extension
        :param file_provider: passed through to the reader constructor
        :param primary_key: passed through to the reader constructor
        :param schema: passed through to the reader constructor
        :param reader_cache: Optional cache of readers keyed by extension; it is
            consulted first and updated with any newly constructed reader
        :raises ValueError: if no registered data reader is found for the URI's extension type
        """
        extension = cls._extension_of(sst_row.uri)

        if reader_cache is not None and extension in reader_cache:
            return reader_cache[extension]

        reader_class = cls.get_reader_class(sst_row.uri)
        if reader_class is None:
            raise ValueError(
                f"No reader registered for extension '{extension}' (uri: {sst_row.uri})"
            )
        reader_instance = reader_class(sst_row, file_provider, primary_key, schema)

        # Use an explicit None check: an empty cache dict is falsy but must
        # still be populated, otherwise the first reader is never cached.
        if reader_cache is not None:
            reader_cache[extension] = reader_instance

        return reader_instance