deltacat 1.1.38__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (367)
  1. deltacat/__init__.py +150 -12
  2. deltacat/annotations.py +36 -0
  3. deltacat/api.py +578 -0
  4. deltacat/aws/constants.py +0 -23
  5. deltacat/aws/s3u.py +4 -631
  6. deltacat/benchmarking/benchmark_engine.py +84 -0
  7. deltacat/benchmarking/benchmark_report.py +86 -0
  8. deltacat/benchmarking/benchmark_suite.py +11 -0
  9. deltacat/benchmarking/conftest.py +22 -19
  10. deltacat/benchmarking/data/random_row_generator.py +94 -0
  11. deltacat/benchmarking/data/row_generator.py +10 -0
  12. deltacat/benchmarking/test_benchmark_pipeline.py +108 -0
  13. deltacat/catalog/__init__.py +73 -0
  14. deltacat/catalog/delegate.py +615 -140
  15. deltacat/catalog/interface.py +404 -81
  16. deltacat/catalog/main/impl.py +2882 -0
  17. deltacat/catalog/model/catalog.py +348 -46
  18. deltacat/catalog/model/properties.py +155 -0
  19. deltacat/catalog/model/table_definition.py +32 -1
  20. deltacat/compute/__init__.py +14 -0
  21. deltacat/compute/compactor/compaction_session.py +97 -75
  22. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  23. deltacat/compute/compactor/model/compaction_session_audit_info.py +23 -30
  24. deltacat/compute/compactor/model/delta_annotated.py +3 -3
  25. deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
  26. deltacat/compute/compactor/model/delta_file_locator.py +3 -1
  27. deltacat/compute/compactor/model/round_completion_info.py +19 -9
  28. deltacat/compute/compactor/model/table_object_store.py +3 -2
  29. deltacat/compute/compactor/repartition_session.py +9 -22
  30. deltacat/compute/compactor/steps/dedupe.py +11 -4
  31. deltacat/compute/compactor/steps/hash_bucket.py +6 -6
  32. deltacat/compute/compactor/steps/materialize.py +15 -9
  33. deltacat/compute/compactor/steps/repartition.py +12 -11
  34. deltacat/compute/compactor/utils/io.py +7 -6
  35. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  36. deltacat/compute/compactor/utils/sort_key.py +9 -2
  37. deltacat/compute/compactor/utils/system_columns.py +3 -1
  38. deltacat/compute/compactor_v2/compaction_session.py +13 -14
  39. deltacat/compute/compactor_v2/deletes/utils.py +3 -3
  40. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  41. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  42. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  43. deltacat/compute/compactor_v2/model/merge_input.py +28 -9
  44. deltacat/compute/compactor_v2/private/compaction_utils.py +171 -73
  45. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  46. deltacat/compute/compactor_v2/steps/merge.py +156 -53
  47. deltacat/compute/compactor_v2/utils/content_type_params.py +17 -6
  48. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  49. deltacat/compute/compactor_v2/utils/io.py +10 -3
  50. deltacat/compute/compactor_v2/utils/merge.py +14 -2
  51. deltacat/compute/compactor_v2/utils/task_options.py +2 -10
  52. deltacat/compute/converter/constants.py +9 -0
  53. deltacat/compute/converter/converter_session.py +298 -0
  54. deltacat/compute/converter/model/convert_input.py +96 -0
  55. deltacat/compute/converter/model/convert_input_files.py +78 -0
  56. deltacat/compute/converter/model/convert_result.py +80 -0
  57. deltacat/compute/converter/model/converter_session_params.py +144 -0
  58. deltacat/compute/converter/pyiceberg/catalog.py +78 -0
  59. deltacat/compute/converter/pyiceberg/overrides.py +263 -0
  60. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +299 -0
  61. deltacat/compute/converter/steps/convert.py +366 -0
  62. deltacat/compute/converter/steps/dedupe.py +94 -0
  63. deltacat/compute/converter/utils/__init__.py +0 -0
  64. deltacat/compute/converter/utils/convert_task_options.py +132 -0
  65. deltacat/compute/converter/utils/converter_session_utils.py +175 -0
  66. deltacat/compute/converter/utils/iceberg_columns.py +87 -0
  67. deltacat/compute/converter/utils/io.py +203 -0
  68. deltacat/compute/converter/utils/s3u.py +148 -0
  69. deltacat/compute/janitor.py +205 -0
  70. deltacat/compute/jobs/__init__.py +0 -0
  71. deltacat/compute/jobs/client.py +417 -0
  72. deltacat/compute/resource_estimation/delta.py +11 -1
  73. deltacat/constants.py +90 -1
  74. deltacat/docs/__init__.py +0 -0
  75. deltacat/docs/autogen/__init__.py +0 -0
  76. deltacat/docs/autogen/schema/__init__.py +0 -0
  77. deltacat/docs/autogen/schema/inference/__init__.py +0 -0
  78. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  79. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  80. deltacat/env.py +61 -0
  81. deltacat/examples/__init__.py +0 -0
  82. deltacat/examples/basic_logging.py +101 -0
  83. deltacat/examples/compactor/__init__.py +0 -0
  84. deltacat/examples/compactor/aws/__init__.py +1 -0
  85. deltacat/examples/compactor/bootstrap.py +863 -0
  86. deltacat/examples/compactor/compactor.py +373 -0
  87. deltacat/examples/compactor/explorer.py +473 -0
  88. deltacat/examples/compactor/gcp/__init__.py +1 -0
  89. deltacat/examples/compactor/job_runner.py +439 -0
  90. deltacat/examples/compactor/utils/__init__.py +1 -0
  91. deltacat/examples/compactor/utils/common.py +261 -0
  92. deltacat/examples/experimental/__init__.py +0 -0
  93. deltacat/examples/experimental/iceberg/__init__.py +0 -0
  94. deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
  95. deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
  96. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  97. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  98. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  99. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  100. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  101. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  102. deltacat/examples/experimental/iceberg/iceberg_bucket_writer.py +184 -0
  103. deltacat/examples/experimental/iceberg/iceberg_reader.py +147 -0
  104. deltacat/examples/hello_world.py +29 -0
  105. deltacat/examples/indexer/__init__.py +0 -0
  106. deltacat/examples/indexer/aws/__init__.py +0 -0
  107. deltacat/examples/indexer/gcp/__init__.py +0 -0
  108. deltacat/examples/indexer/indexer.py +163 -0
  109. deltacat/examples/indexer/job_runner.py +198 -0
  110. deltacat/exceptions.py +116 -12
  111. deltacat/experimental/__init__.py +0 -0
  112. deltacat/experimental/catalog/__init__.py +0 -0
  113. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  114. deltacat/experimental/catalog/iceberg/iceberg_catalog_config.py +26 -0
  115. deltacat/experimental/catalog/iceberg/impl.py +399 -0
  116. deltacat/experimental/catalog/iceberg/overrides.py +72 -0
  117. deltacat/experimental/compatibility/__init__.py +0 -0
  118. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  119. deltacat/experimental/converter_agent/__init__.py +0 -0
  120. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  121. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  122. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  123. deltacat/experimental/daft/__init__.py +4 -0
  124. deltacat/experimental/daft/daft_catalog.py +229 -0
  125. deltacat/experimental/storage/__init__.py +0 -0
  126. deltacat/experimental/storage/iceberg/__init__.py +0 -0
  127. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  128. deltacat/experimental/storage/iceberg/impl.py +739 -0
  129. deltacat/experimental/storage/iceberg/model.py +713 -0
  130. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  131. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  132. deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
  133. deltacat/experimental/storage/rivulet/arrow/serializer.py +78 -0
  134. deltacat/experimental/storage/rivulet/dataset.py +745 -0
  135. deltacat/experimental/storage/rivulet/dataset_executor.py +79 -0
  136. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  137. deltacat/experimental/storage/rivulet/feather/file_reader.py +138 -0
  138. deltacat/experimental/storage/rivulet/feather/serializer.py +35 -0
  139. deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
  140. deltacat/experimental/storage/rivulet/fs/file_provider.py +105 -0
  141. deltacat/experimental/storage/rivulet/fs/file_store.py +130 -0
  142. deltacat/experimental/storage/rivulet/fs/input_file.py +76 -0
  143. deltacat/experimental/storage/rivulet/fs/output_file.py +86 -0
  144. deltacat/experimental/storage/rivulet/logical_plan.py +105 -0
  145. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  146. deltacat/experimental/storage/rivulet/metastore/delta.py +188 -0
  147. deltacat/experimental/storage/rivulet/metastore/json_sst.py +105 -0
  148. deltacat/experimental/storage/rivulet/metastore/sst.py +82 -0
  149. deltacat/experimental/storage/rivulet/metastore/sst_interval_tree.py +260 -0
  150. deltacat/experimental/storage/rivulet/mvp/Table.py +101 -0
  151. deltacat/experimental/storage/rivulet/mvp/__init__.py +5 -0
  152. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  153. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  154. deltacat/experimental/storage/rivulet/parquet/file_reader.py +129 -0
  155. deltacat/experimental/storage/rivulet/parquet/serializer.py +37 -0
  156. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  157. deltacat/experimental/storage/rivulet/reader/block_scanner.py +389 -0
  158. deltacat/experimental/storage/rivulet/reader/data_reader.py +136 -0
  159. deltacat/experimental/storage/rivulet/reader/data_scan.py +65 -0
  160. deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +179 -0
  161. deltacat/experimental/storage/rivulet/reader/dataset_reader.py +158 -0
  162. deltacat/experimental/storage/rivulet/reader/pyarrow_data_reader.py +124 -0
  163. deltacat/experimental/storage/rivulet/reader/query_expression.py +99 -0
  164. deltacat/experimental/storage/rivulet/reader/reader_type_registrar.py +84 -0
  165. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  166. deltacat/experimental/storage/rivulet/schema/datatype.py +128 -0
  167. deltacat/experimental/storage/rivulet/schema/schema.py +251 -0
  168. deltacat/experimental/storage/rivulet/serializer.py +40 -0
  169. deltacat/experimental/storage/rivulet/serializer_factory.py +46 -0
  170. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  171. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  172. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  173. deltacat/experimental/storage/rivulet/writer/dataset_writer.py +29 -0
  174. deltacat/experimental/storage/rivulet/writer/memtable_dataset_writer.py +305 -0
  175. deltacat/io/__init__.py +13 -0
  176. deltacat/io/dataset/__init__.py +0 -0
  177. deltacat/io/dataset/deltacat_dataset.py +91 -0
  178. deltacat/io/datasink/__init__.py +0 -0
  179. deltacat/io/datasink/deltacat_datasink.py +207 -0
  180. deltacat/io/datasource/__init__.py +0 -0
  181. deltacat/io/datasource/deltacat_datasource.py +579 -0
  182. deltacat/io/reader/__init__.py +0 -0
  183. deltacat/io/reader/deltacat_read_api.py +172 -0
  184. deltacat/logs.py +4 -1
  185. deltacat/storage/__init__.py +138 -28
  186. deltacat/storage/interface.py +260 -155
  187. deltacat/storage/main/__init__.py +0 -0
  188. deltacat/storage/main/impl.py +3030 -0
  189. deltacat/storage/model/delta.py +142 -71
  190. deltacat/storage/model/expression/__init__.py +47 -0
  191. deltacat/storage/model/expression/expression.py +656 -0
  192. deltacat/storage/model/expression/visitor.py +248 -0
  193. deltacat/storage/model/interop.py +24 -0
  194. deltacat/storage/model/list_result.py +8 -0
  195. deltacat/storage/model/locator.py +93 -9
  196. deltacat/storage/model/manifest.py +643 -0
  197. deltacat/storage/model/metafile.py +1421 -0
  198. deltacat/storage/model/namespace.py +41 -18
  199. deltacat/storage/model/partition.py +443 -43
  200. deltacat/storage/model/scan/__init__.py +0 -0
  201. deltacat/storage/model/scan/push_down.py +46 -0
  202. deltacat/storage/model/scan/scan_plan.py +10 -0
  203. deltacat/storage/model/scan/scan_task.py +34 -0
  204. deltacat/storage/model/schema.py +3160 -0
  205. deltacat/storage/model/shard.py +51 -0
  206. deltacat/storage/model/sort_key.py +210 -13
  207. deltacat/storage/model/stream.py +215 -80
  208. deltacat/storage/model/table.py +134 -29
  209. deltacat/storage/model/table_version.py +333 -46
  210. deltacat/storage/model/transaction.py +1733 -0
  211. deltacat/storage/model/transform.py +274 -58
  212. deltacat/storage/model/types.py +138 -16
  213. deltacat/storage/util/__init__.py +0 -0
  214. deltacat/storage/util/scan_planner.py +26 -0
  215. deltacat/tests/_io/__init__.py +1 -0
  216. deltacat/tests/_io/reader/__init__.py +0 -0
  217. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  218. deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +8 -4
  219. deltacat/tests/aws/test_s3u.py +2 -31
  220. deltacat/tests/catalog/data/__init__.py +0 -0
  221. deltacat/tests/catalog/main/__init__.py +0 -0
  222. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  223. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  224. deltacat/tests/catalog/model/__init__.py +0 -0
  225. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  226. deltacat/tests/catalog/test_catalogs.py +321 -0
  227. deltacat/tests/catalog/test_default_catalog_impl.py +12154 -66
  228. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
  229. deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
  230. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
  231. deltacat/tests/compute/compact_partition_test_cases.py +23 -30
  232. deltacat/tests/compute/compactor/steps/test_repartition.py +14 -14
  233. deltacat/tests/compute/compactor/utils/test_io.py +125 -123
  234. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  235. deltacat/tests/compute/compactor_v2/test_compaction_session.py +387 -830
  236. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +70 -57
  237. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +0 -3
  238. deltacat/tests/compute/conftest.py +39 -0
  239. deltacat/tests/compute/converter/__init__.py +0 -0
  240. deltacat/tests/compute/converter/conftest.py +80 -0
  241. deltacat/tests/compute/converter/test_convert_session.py +826 -0
  242. deltacat/tests/compute/converter/utils.py +132 -0
  243. deltacat/tests/compute/resource_estimation/test_delta.py +88 -104
  244. deltacat/tests/compute/test_compact_partition_incremental.py +91 -98
  245. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +79 -97
  246. deltacat/tests/compute/test_compact_partition_params.py +16 -11
  247. deltacat/tests/compute/test_compact_partition_rebase.py +63 -93
  248. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +249 -220
  249. deltacat/tests/compute/test_janitor.py +236 -0
  250. deltacat/tests/compute/test_util_common.py +726 -46
  251. deltacat/tests/compute/test_util_constant.py +0 -1
  252. deltacat/tests/conftest.py +25 -0
  253. deltacat/tests/daft/__init__.py +0 -0
  254. deltacat/tests/daft/test_model.py +97 -0
  255. deltacat/tests/experimental/__init__.py +1 -0
  256. deltacat/tests/experimental/catalog/__init__.py +0 -0
  257. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  258. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  259. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  260. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  261. deltacat/tests/experimental/daft/__init__.py +0 -0
  262. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  263. deltacat/tests/experimental/storage/__init__.py +0 -0
  264. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  265. deltacat/tests/experimental/storage/rivulet/conftest.py +149 -0
  266. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  267. deltacat/tests/experimental/storage/rivulet/fs/test_file_location_provider.py +94 -0
  268. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  269. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  270. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  271. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  272. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  273. deltacat/tests/experimental/storage/rivulet/schema/test_schema.py +241 -0
  274. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  275. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  276. deltacat/tests/experimental/storage/rivulet/test_dataset.py +408 -0
  277. deltacat/tests/experimental/storage/rivulet/test_manifest.py +67 -0
  278. deltacat/tests/experimental/storage/rivulet/test_sst_interval_tree.py +232 -0
  279. deltacat/tests/experimental/storage/rivulet/test_utils.py +124 -0
  280. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  281. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_write_then_read.py +343 -0
  282. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_writer.py +79 -0
  283. deltacat/tests/experimental/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
  284. deltacat/tests/storage/__init__.py +0 -0
  285. deltacat/tests/storage/main/__init__.py +0 -0
  286. deltacat/tests/storage/main/test_main_storage.py +8204 -0
  287. deltacat/tests/storage/model/__init__.py +0 -0
  288. deltacat/tests/storage/model/test_delete_parameters.py +21 -0
  289. deltacat/tests/storage/model/test_expression.py +327 -0
  290. deltacat/tests/storage/model/test_manifest.py +129 -0
  291. deltacat/tests/storage/model/test_metafile_io.py +2440 -0
  292. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  293. deltacat/tests/storage/model/test_schema.py +479 -0
  294. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  295. deltacat/tests/storage/model/test_shard.py +24 -0
  296. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  297. deltacat/tests/storage/model/test_table_version.py +110 -0
  298. deltacat/tests/storage/model/test_transaction.py +653 -0
  299. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  300. deltacat/tests/test_deltacat_api.py +1064 -0
  301. deltacat/tests/test_exceptions.py +9 -5
  302. deltacat/tests/test_utils/filesystem.py +14 -0
  303. deltacat/tests/test_utils/message_pack_utils.py +54 -0
  304. deltacat/tests/test_utils/pyarrow.py +50 -26
  305. deltacat/tests/test_utils/storage.py +256 -4
  306. deltacat/tests/types/__init__.py +0 -0
  307. deltacat/tests/types/test_tables.py +104 -0
  308. deltacat/tests/utils/exceptions.py +22 -0
  309. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  310. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  311. deltacat/tests/utils/test_daft.py +124 -34
  312. deltacat/tests/utils/test_numpy.py +1193 -0
  313. deltacat/tests/utils/test_pandas.py +1106 -0
  314. deltacat/tests/utils/test_polars.py +1040 -0
  315. deltacat/tests/utils/test_pyarrow.py +1107 -258
  316. deltacat/types/media.py +345 -37
  317. deltacat/types/partial_download.py +1 -1
  318. deltacat/types/tables.py +2345 -47
  319. deltacat/utils/arguments.py +33 -1
  320. deltacat/utils/daft.py +824 -40
  321. deltacat/utils/export.py +61 -0
  322. deltacat/utils/filesystem.py +450 -0
  323. deltacat/utils/metafile_locator.py +74 -0
  324. deltacat/utils/numpy.py +118 -26
  325. deltacat/utils/pandas.py +577 -48
  326. deltacat/utils/polars.py +759 -0
  327. deltacat/utils/pyarrow.py +1212 -178
  328. deltacat/utils/ray_utils/concurrency.py +1 -1
  329. deltacat/utils/ray_utils/dataset.py +101 -10
  330. deltacat/utils/ray_utils/runtime.py +56 -4
  331. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  332. deltacat/utils/url.py +1325 -0
  333. deltacat-2.0.0.dist-info/METADATA +1163 -0
  334. deltacat-2.0.0.dist-info/RECORD +439 -0
  335. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
  336. deltacat/aws/redshift/__init__.py +0 -19
  337. deltacat/aws/redshift/model/manifest.py +0 -394
  338. deltacat/catalog/default_catalog_impl/__init__.py +0 -369
  339. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  340. deltacat/compute/merge_on_read/__init__.py +0 -4
  341. deltacat/compute/merge_on_read/daft.py +0 -40
  342. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  343. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  344. deltacat/io/dataset.py +0 -73
  345. deltacat/io/read_api.py +0 -143
  346. deltacat/storage/model/delete_parameters.py +0 -40
  347. deltacat/storage/model/partition_spec.py +0 -71
  348. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  349. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -397
  350. deltacat/tests/local_deltacat_storage/__init__.py +0 -1262
  351. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  352. deltacat/utils/s3fs.py +0 -21
  353. deltacat-1.1.38.dist-info/METADATA +0 -64
  354. deltacat-1.1.38.dist-info/RECORD +0 -219
  355. /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
  356. /deltacat/{compute/merge_on_read/model → catalog/main}/__init__.py +0 -0
  357. /deltacat/compute/{merge_on_read/utils → converter}/__init__.py +0 -0
  358. /deltacat/{io/aws → compute/converter/model}/__init__.py +0 -0
  359. /deltacat/{io/aws/redshift → compute/converter/pyiceberg}/__init__.py +0 -0
  360. /deltacat/{tests/io → compute/converter/steps}/__init__.py +0 -0
  361. /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
  362. /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
  363. /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
  364. /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
  365. /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
  366. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
  367. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,408 @@
1
+ import posixpath
2
+ from deltacat.utils.metafile_locator import _find_partition_path
3
+ import pytest
4
+
5
+ import pyarrow as pa
6
+ from deltacat.experimental.storage.rivulet import Schema, Field, Datatype
7
+ from deltacat.experimental.storage.rivulet.dataset import Dataset
8
+ from deltacat.experimental.storage.rivulet.reader.query_expression import (
9
+ QueryExpression,
10
+ )
11
+
12
+
13
@pytest.fixture
def sample_schema():
    """Provide a three-field schema (id, name, age) with ``id`` as the merge key."""
    schema_fields = [
        Field("id", Datatype.int32(), is_merge_key=True),
        Field("name", Datatype.string()),
        Field("age", Datatype.int32()),
    ]
    return Schema(fields=schema_fields)
22
+
23
+
24
@pytest.fixture
def sample_pydict():
    """Provide three rows of column-oriented sample data keyed by column name."""
    ids = [1, 2, 3]
    names = ["Alice", "Bob", "Charlie"]
    ages = [25, 30, 35]
    return {"id": ids, "name": names, "age": ages}
27
+
28
+
29
@pytest.fixture
def sample_parquet_data(tmp_path, sample_pydict):
    """Write ``sample_pydict`` to ``test.parquet`` under ``tmp_path`` and return its path."""
    # Import the submodule explicitly: ``import pyarrow as pa`` alone does not
    # guarantee that ``pa.parquet`` is available as an attribute; it only works
    # when some other module has already imported ``pyarrow.parquet``.
    import pyarrow.parquet as pq

    parquet_path = tmp_path / "test.parquet"
    table = pa.Table.from_pydict(sample_pydict)
    pq.write_table(table, parquet_path)
    return parquet_path
35
+
36
+
37
+ # Updated Tests
38
+
39
+
40
def test_dataset_creation_with_schema(tmp_path, sample_schema):
    """A dataset built from a schema exposes its three fields and the ``id`` merge key."""
    ds = Dataset(schema=sample_schema, dataset_name="test_dataset")

    assert len(ds.fields) == 3
    assert "id" in ds.fields
    assert ds.fields["id"].is_merge_key
45
+
46
+
47
def test_dataset_initialization_with_metadata(tmp_path):
    """A dataset given a metadata URI keeps its name and a ``.riv-meta`` folder name."""
    metadata_root = str(tmp_path)
    ds = Dataset(dataset_name="test_dataset", metadata_uri=metadata_root)

    assert ds.dataset_name == "test_dataset"
    assert ds._metadata_folder.startswith(".riv-meta")
51
+
52
+
53
def test_invalid_dataset_initialization():
    """An empty dataset name is rejected with a ``ValueError``."""
    expected_message = "Name must be a non-empty string"
    with pytest.raises(ValueError, match=expected_message):
        Dataset(dataset_name="")
56
+
57
+
58
def test_dataset_creation_metadata_structure(tmp_path):
    """Creating a dataset sets its metadata attributes and creates the partition path."""
    ds = Dataset(dataset_name="test_dataset", metadata_uri=str(tmp_path))
    expected_metadata_path = str(tmp_path / ".riv-meta-test_dataset")

    assert ds._metadata_folder.startswith(".riv-meta")
    assert ds._namespace == "default"
    assert ds.dataset_name == "test_dataset"
    assert ds._metadata_path == expected_metadata_path

    dataset_locator = ds._locator
    metadata_root = ds._metadata_path
    resolved_partition_path = _find_partition_path(metadata_root, dataset_locator)

    # Ensures that directory structure for
    # namespace -> table -> table_version -> stream_id -> partition_id exists
    assert posixpath.exists(resolved_partition_path)
73
+
74
+
75
def test_fields_accessor_add_field(tmp_path, sample_schema):
    """Fields can be added via ``add`` or item assignment; non-``Field`` values raise."""
    ds = Dataset(dataset_name="test_dataset", schema=sample_schema)

    # Add through the accessor's ``add`` method.
    ds.fields.add("new_field", Datatype.float())
    assert "new_field" in ds.fields
    assert ds.fields["new_field"].datatype == Datatype.float()

    # Add through item assignment; the field must also show up in the "all" schema.
    second_field = Field("new_field2", Datatype.int32())
    ds.fields["new_field2"] = second_field
    assert "new_field2" in ds.fields
    assert "new_field2" in ds.schemas["all"]

    # Assigning something that is not a Field is a type error.
    with pytest.raises(TypeError):
        ds.fields["new_field3"] = 2
86
+
87
+
88
def test_field_removal(tmp_path, sample_schema):
    """Deleting a field removes it; deleting or reading it again raises."""
    ds = Dataset(dataset_name="test_dataset", schema=sample_schema)
    removed = "age"

    del ds.fields[removed]
    assert removed not in ds.fields

    # A second delete of the same field raises ValueError.
    with pytest.raises(ValueError):
        del ds.fields[removed]

    # Looking up the removed field raises KeyError.
    with pytest.raises(KeyError):
        _ = ds.fields[removed]
96
+
97
+
98
def test_fields_accessor_repr(tmp_path, sample_schema):
    """The repr of the fields accessor mentions every field name in the schema."""
    ds = Dataset(dataset_name="test_dataset", schema=sample_schema)
    repr_output = repr(ds.fields)

    for field_name in ("id", "name", "age"):
        assert field_name in repr_output, f"Field '{field_name}' missing in repr output"
103
+
104
+
105
def test_schemas_accessor_add_group(tmp_path, sample_schema):
    """Assigning a list of field names registers a new schema with those fields."""
    ds = Dataset(dataset_name="test_dataset", schema=sample_schema)
    group_fields = ["id", "name"]

    ds.schemas["analytics"] = group_fields

    assert "analytics" in ds.schemas
    assert len(ds.schemas["analytics"]) == 2
110
+
111
+
112
def test_schema_removal(tmp_path, sample_schema):
    """The "all" schema and unknown schemas cannot be deleted; user schemas can."""
    ds = Dataset(dataset_name="test_dataset", schema=sample_schema)

    # Deleting the "all" schema raises ValueError.
    with pytest.raises(ValueError):
        del ds.schemas["all"]

    # Deleting a schema that was never added also raises ValueError.
    with pytest.raises(ValueError):
        del ds.schemas["does_not_exist"]

    # A user-created schema can be added, removed, and is then gone.
    ds.schemas["new"] = ["id", "name"]
    del ds.schemas["new"]
    with pytest.raises(KeyError):
        _ = ds.schemas["new"]
122
+
123
+
124
def test_dataset_from_parquet(tmp_path, sample_parquet_data):
    """``Dataset.from_parquet`` picks up the file's three columns and the merge key."""
    source_uri = str(sample_parquet_data)
    metadata_root = str(tmp_path)

    ds = Dataset.from_parquet(
        name="test_dataset",
        file_uri=source_uri,
        metadata_uri=metadata_root,
        merge_keys="id",
    )

    assert len(ds.fields) == 3
    assert "id" in ds.fields
    assert ds.fields["id"].is_merge_key
134
+
135
+
136
def test_parquet_schema_modes(tmp_path, sample_pydict):
    """``from_parquet`` over a directory honors the ``union`` and ``intersect`` schema modes.

    Two parquet files are written that share only the ``id`` column. ``union``
    is expected to yield all three columns and ``intersect`` only ``id``.
    """
    # Import the submodule explicitly: ``import pyarrow as pa`` alone does not
    # guarantee that ``pa.parquet`` is available as an attribute; it only works
    # when some other module has already imported ``pyarrow.parquet``.
    import pyarrow.parquet as pq

    # Create two parquet files with overlapping and unique schemas
    data_1 = {"id": [1, 2, 3], "name": ["Alice", "Bob", "Charlie"]}
    data_2 = {"id": [4, 5, 6], "age": [25, 30, 35]}

    path_1 = tmp_path / "data1.parquet"
    path_2 = tmp_path / "data2.parquet"
    pq.write_table(pa.Table.from_pydict(data_1), path_1)
    pq.write_table(pa.Table.from_pydict(data_2), path_2)

    dataset_union = Dataset.from_parquet(
        name="test_dataset_union",
        file_uri=str(tmp_path),
        merge_keys="id",
        schema_mode="union",
    )
    assert len(dataset_union.fields) == 3  # id, name, age

    dataset_intersect = Dataset.from_parquet(
        name="test_dataset_intersect",
        file_uri=str(tmp_path),
        merge_keys="id",
        schema_mode="intersect",
    )
    assert len(dataset_intersect.fields) == 1  # Only id
161
+
162
+
163
def test_merge_all_schemas():
    """``Schema.merge_all`` combines two schemas sharing ``id`` into three fields."""
    with_name = Schema(
        fields=[
            Field("id", Datatype.int32(), is_merge_key=True),
            Field("name", Datatype.string()),
        ]
    )
    with_age = Schema(
        fields=[
            Field("id", Datatype.int32(), is_merge_key=True),
            Field("age", Datatype.int32()),
        ]
    )

    combined = Schema.merge_all([with_name, with_age])

    assert len(combined) == 3
    assert "id" in combined
    assert "name" in combined
    assert "age" in combined
181
+
182
+
183
def test_writer_creation_with_custom_format(tmp_path, sample_schema):
    """Requesting a writer with the "feather" file format returns a writer object."""
    ds = Dataset(dataset_name="test_dataset", schema=sample_schema)
    feather_writer = ds.writer(file_format="feather")
    assert feather_writer is not None
187
+
188
+
189
def test_scan_with_query(tmp_path, sample_schema):
    """Scanning with a default-constructed ``QueryExpression`` returns a scan object."""
    ds = Dataset(dataset_name="test_dataset", schema=sample_schema)
    placeholder_query = QueryExpression()  # Placeholder query
    result = ds.scan(placeholder_query)
    assert result is not None
194
+
195
+
196
def test_add_schema_to_new_schemas(tmp_path):
    """Adding a schema under a new name registers all of its fields with their datatypes."""
    base_uri = str(tmp_path / "test_dataset")
    ds = Dataset(dataset_name=base_uri)

    new_schema = Schema(
        [
            ("id", Datatype.int32()),
            ("name", Datatype.string()),
            ("age", Datatype.int32()),
        ],
        merge_keys=["id"],
    )
    ds.add_schema(new_schema, schema_name="new_group")

    # Verify the field group is added
    assert "new_group" in ds.schemas
    assert len(ds.schemas["new_group"]) == 3
    assert ds.schemas["new_group"]["id"].datatype == Datatype.int32()
    assert ds.schemas["new_group"]["name"].datatype == Datatype.string()
    assert ds.schemas["new_group"]["age"].datatype == Datatype.int32()
218
+
219
+
220
def test_add_schema_to_existing_schemas(tmp_path):
    """Adding a second schema under an existing name merges its fields into that schema."""
    base_uri = str(tmp_path / "test_dataset")
    ds = Dataset(dataset_name=base_uri)

    initial_schema = Schema(
        [
            ("id", Datatype.int32()),
            ("name", Datatype.string()),
        ],
        merge_keys=["id"],
    )
    ds.add_schema(initial_schema, schema_name="existing_group")

    additional_schema = Schema(
        [
            ("age", Datatype.int32()),
            ("email", Datatype.string()),
        ],
        merge_keys=["id"],
    )
    ds.add_schema(additional_schema, schema_name="existing_group")

    # Verify the merged schema
    assert "existing_group" in ds.schemas
    assert len(ds.schemas["existing_group"]) == 4
    assert ds.schemas["existing_group"]["id"].datatype == Datatype.int32()
    assert ds.schemas["existing_group"]["name"].datatype == Datatype.string()
    assert ds.schemas["existing_group"]["age"].datatype == Datatype.int32()
    assert ds.schemas["existing_group"]["email"].datatype == Datatype.string()
252
+
253
+
254
def test_add_schema_conflicting_fields(tmp_path):
    """Test adding schemas whose fields overlap with an existing field group.

    Covers two cases against the same group:
    1. An overlapping field with a different datatype is rejected with a
       ``ValueError`` matching "already exists".
    2. An overlapping field with the same datatype is accepted and the new
       fields are merged in.
    """
    base_uri = str(tmp_path / "test_dataset")
    dataset = Dataset(dataset_name=base_uri)

    schema_1 = Schema(
        [
            ("id", Datatype.int32()),
            ("name", Datatype.string()),
        ],
        merge_keys=["id"],
    )

    dataset.add_schema(schema_1, schema_name="conflicting_group")

    schema_2 = Schema(
        [
            ("id", Datatype.string()),  # Conflict: datatype mismatch
            ("age", Datatype.int32()),
        ],
        merge_keys=["id"],
    )

    with pytest.raises(ValueError, match="already exists"):
        dataset.add_schema(schema_2, schema_name="conflicting_group")

    schema_3 = Schema(
        [
            ("id", Datatype.int32()),  # No conflict: same datatype as the existing "id"
            ("age", Datatype.int32()),
        ],
        merge_keys=["id"],
    )

    # The compatible schema merges cleanly: id and name from schema_1 plus age.
    dataset.add_schema(schema_3, schema_name="conflicting_group")
    assert "conflicting_group" in dataset.schemas
    assert len(dataset.schemas["conflicting_group"]) == 3
    assert dataset.schemas["conflicting_group"]["id"].datatype == Datatype.int32()
    assert dataset.schemas["conflicting_group"]["name"].datatype == Datatype.string()
    assert dataset.schemas["conflicting_group"]["age"].datatype == Datatype.int32()
294
+
295
+
296
def test_add_fields_with_merge_key_field(tmp_path):
    """A field added with its third positional argument set to True becomes the default schema's merge key."""
    dataset = Dataset(dataset_name=str(tmp_path / "test_dataset"))

    merge_key_field = Field("my_merge_key", Datatype.string(), True)
    dataset.add_fields([merge_key_field])

    assert dataset.schemas["default"].get_merge_key() == "my_merge_key"
301
+
302
+
303
def test_add_schema_to_nonexistent_schemas(tmp_path):
    """Adding a schema under a name that does not exist yet creates that group."""
    dataset = Dataset(dataset_name=str(tmp_path / "test_dataset"))

    field_specs = [
        ("id", Datatype.int32()),
        ("name", Datatype.string()),
    ]
    new_schema = Schema(field_specs, merge_keys=["id"])

    # No group called "nonexistent_group" has been registered before this call.
    dataset.add_schema(new_schema, schema_name="nonexistent_group")

    # The group is created on demand and carries both fields.
    assert "nonexistent_group" in dataset.schemas
    assert len(dataset.schemas["nonexistent_group"]) == 2
322
+
323
+
324
def test_add_missing_field_to_schema_raises_error(tmp_path, sample_schema):
    """
    Assigning a field list that names an unknown field to the 'all' schema
    must raise a ValueError.
    """
    dataset = Dataset(dataset_name="test_dataset", schema=sample_schema)
    unknown_fields = ["missing_field"]

    # The assignment itself is the operation under test.
    expected_message = "Field 'missing_field' does not exist in the dataset."
    with pytest.raises(ValueError, match=expected_message):
        dataset.schemas["all"] = unknown_fields
337
+
338
+
339
def test_schemas_accessor_methods(tmp_path, sample_schema):
    """
    Exercise iteration, ``len()`` and ``repr()`` on ``dataset.schemas``
    (the SchemasAccessor, per the original test description).
    """
    # Per the original test, the default schema is defined automatically.
    dataset = Dataset(dataset_name="test_dataset", schema=sample_schema)
    dataset.schemas["schema_1"] = ["id", "name"]
    dataset.schemas["schema_2"] = ["age"]

    # __iter__: yields the two custom names plus "all" and "default".
    observed_names = {name for name in dataset.schemas}
    expected_names = {"schema_1", "schema_2", "all", "default"}
    assert observed_names == expected_names, "Schema names do not match expected values"

    # __len__
    assert len(dataset.schemas) == 4, "Length of schemas accessor is incorrect"

    # __repr__: each of these names must appear somewhere in the text.
    rendered = repr(dataset.schemas)
    for expected in ("schema_1", "schema_2", "all"):
        assert expected in rendered, f"Schema '{expected}' missing in repr output"
367
+
368
+
369
def test_get_merge_keys(tmp_path, sample_schema):
    """
    ``get_merge_keys`` should report the merge key of every schema in the dataset.
    """
    dataset = Dataset(dataset_name="test_dataset", schema=sample_schema)

    # Register a second schema that introduces its own merge key, "id2".
    extra_fields = [("id2", Datatype.int32()), ("zip", Datatype.string())]
    dataset.add_schema(Schema(extra_fields, merge_keys=["id2"]), "id2+zip")

    merge_keys = dataset.get_merge_keys()
    expected_keys = ["id", "id2"]
    assert (
        merge_keys == expected_keys
    ), f"Expected merge keys ['id', 'id2'], got {merge_keys}"
388
+
389
+
390
def test_add_fields_no_fields_raises_error(tmp_path, sample_schema):
    """Calling ``add_fields`` with an empty field list must raise a ValueError."""
    dataset = Dataset(dataset_name="test_dataset")
    no_fields = []

    with pytest.raises(ValueError):
        dataset.add_fields(fields=no_fields)
394
+
395
+
396
def test_add_fields_mismatched_merge_keys_raises_error(tmp_path, sample_schema):
    """``add_fields`` rejects merge keys that disagree with the supplied fields."""
    dataset = Dataset(dataset_name="test_dataset")

    # Case 1: the named merge key is not among the fields at all.
    missing_key_message = (
        "The following merge keys were not found in the provided fields: does_not_exist"
    )
    with pytest.raises(ValueError, match=missing_key_message):
        dataset.add_fields(fields=sample_schema.values(), merge_keys=["does_not_exist"])

    # Case 2: "id" is listed as a merge key, but its Field was built without that flag.
    with pytest.raises(TypeError, match="Merge key status conflict"):
        unflagged_fields = [
            Field("id", Datatype.int32()),
            Field("name", Datatype.string()),
        ]
        dataset.add_fields(fields=unflagged_fields, merge_keys=["id"])
@@ -0,0 +1,67 @@
1
+ import os
2
+
3
+ import pytest
4
+
5
+ from deltacat.experimental.storage.rivulet.dataset import Dataset
6
+ from deltacat.experimental.storage.rivulet.fs.file_store import FileStore
7
+ from deltacat.experimental.storage.rivulet.schema.datatype import Datatype
8
+ from deltacat.experimental.storage.rivulet.metastore.delta import DeltacatManifestIO
9
+ from deltacat.experimental.storage.rivulet import Schema, Field
10
+ import pyarrow as pa
11
+ import pyarrow.parquet
12
+
13
+
14
@pytest.fixture
def sample_schema():
    """Schema with an int32 ``id`` merge key plus ``name`` (string) and ``age`` (int32)."""
    id_field = Field("id", Datatype.int32(), is_merge_key=True)
    name_field = Field("name", Datatype.string())
    age_field = Field("age", Datatype.int32())
    return Schema(fields=[id_field, name_field, age_field])
23
+
24
+
25
@pytest.fixture
def sample_pydict():
    """Three rows of column-oriented sample data keyed by ``id``, ``name`` and ``age``."""
    columns = {}
    columns["id"] = [1, 2, 3]
    columns["name"] = ["Alice", "Bob", "Charlie"]
    columns["age"] = [25, 30, 35]
    return columns
28
+
29
+
30
@pytest.fixture
def path(tmp_path):
    """Expose pytest's ``tmp_path`` directory under the shorter name ``path``."""
    working_dir = tmp_path
    return working_dir
33
+
34
+
35
@pytest.fixture
def sample_parquet_data(path, sample_pydict):
    """Write ``sample_pydict`` to ``test.parquet`` under ``path`` and return the file's path."""
    destination = path / "test.parquet"
    pyarrow.parquet.write_table(pa.Table.from_pydict(sample_pydict), destination)
    return destination
41
+
42
+
43
def test_write_manifest_round_trip(sample_parquet_data, sample_schema):
    """A manifest written through DeltacatManifestIO reads back with the same schema, level and SST files."""
    dataset = Dataset.from_parquet(
        file_uri=sample_parquet_data, name="dataset", merge_keys="id"
    )

    # Resolve the dataset's metadata location to a (path, filesystem) pair.
    path, filesystem = FileStore.filesystem(dataset._metadata_path)
    file_store = FileStore(path, filesystem=filesystem)
    manifest_io = DeltacatManifestIO(path, dataset._locator)

    # Payload for the manifest.
    sst_files = ["sst1.sst", "sst2.sst"]
    schema = Schema(
        {("id", Datatype.int32()), ("name", Datatype.string())},
        "id",
    )
    level = 2

    manifest_uri = os.path.join(path, "manifest.json")
    file_store.create_output_file(manifest_uri)

    # Write the manifest, then read it back from wherever write() reports.
    written = manifest_io.write(sst_files, schema, level)
    manifest = manifest_io.read(written)

    context = manifest.context
    assert context.schema == schema
    assert context.level == level
    assert manifest.sst_files == sst_files
@@ -0,0 +1,232 @@
1
+ from typing import List, FrozenSet, Dict
2
+
3
+ import pytest
4
+
5
+ from deltacat.experimental.storage.rivulet.metastore.delta import DeltaContext
6
+ from deltacat.experimental.storage.rivulet.metastore.sst import SSTable, SSTableRow
7
+ from deltacat.experimental.storage.rivulet.metastore.sst_interval_tree import (
8
+ BlockIntervalTree,
9
+ BlockGroup,
10
+ OrderedBlockGroups,
11
+ Block,
12
+ )
13
+ from deltacat.experimental.storage.rivulet.schema.datatype import Datatype
14
+ from deltacat.experimental.storage.rivulet import Schema
15
+
16
+
17
@pytest.fixture
def schema1() -> Schema:
    """Schema over ``id``, ``name`` and ``age``, with ``"id"`` passed as the second argument."""
    field_specs = {
        ("id", Datatype.int32()),
        ("name", Datatype.string()),
        ("age", Datatype.int32()),
    }
    return Schema(field_specs, "id")
27
+
28
+
29
@pytest.fixture
def schema2() -> Schema:
    """Schema over ``id``, ``address`` and ``zip``, with ``"id"`` passed as the second argument."""
    field_specs = {
        ("id", Datatype.int32()),
        ("address", Datatype.string()),
        ("zip", Datatype.string()),
    }
    return Schema(field_specs, "id")
39
+
40
+
41
@pytest.fixture
def sst_row_list() -> List[SSTableRow]:
    """Five SSTableRow entries named ``block1`` .. ``block5`` with overlapping key ranges."""
    # (first, second) positional arguments for each row, in block order.
    key_ranges = [(0, 100), (3, 90), (10, 95), (0, 10), (0, 100)]
    return [
        SSTableRow(low, high, f"block{number}", 0, 1)
        for number, (low, high) in enumerate(key_ranges, start=1)
    ]
50
+
51
+
52
@pytest.fixture
def sst1(sst_row_list) -> SSTable:
    """SSTable built from the first three sample rows, with 0 and 100 as its trailing arguments."""
    first_three_rows = sst_row_list[:3]
    return SSTable(first_three_rows, 0, 100)
55
+
56
+
57
@pytest.fixture
def sst2(sst_row_list) -> SSTable:
    """SSTable built from the fourth and fifth sample rows, with 0 and 100 as its trailing arguments."""
    last_two_rows = sst_row_list[3:5]
    return SSTable(last_two_rows, 0, 100)
60
+
61
+
62
@pytest.fixture
def manifest_context1(schema1) -> DeltaContext:
    """DeltaContext wrapping ``schema1``, built with ``"manifest-001"`` and ``0``."""
    context = DeltaContext(schema1, "manifest-001", 0)
    return context
65
+
66
+
67
@pytest.fixture
def manifest_context2(schema2) -> DeltaContext:
    """DeltaContext wrapping ``schema2``, built with ``"manifest-002"`` and ``1``."""
    context = DeltaContext(schema2, "manifest-002", 1)
    return context
70
+
71
+
72
def with_field_group(
    context: DeltaContext, rows: List[SSTableRow], indexes: List[int]
) -> Dict[Schema, FrozenSet[Block]]:
    """Build a one-entry mapping from ``context.schema`` to the Blocks for the chosen rows.

    Each index in ``indexes`` selects a row from ``rows``; every selected row
    is wrapped in a ``Block`` together with ``context``.
    """
    selected_blocks = frozenset(Block(rows[index], context) for index in indexes)
    return {context.schema: selected_blocks}
78
+
79
+
80
@pytest.fixture
def expected_block_groups(
    manifest_context1, manifest_context2, sst_row_list
) -> List[BlockGroup]:
    """Block groups the tests expect for the combined ``sst1`` / ``sst2`` fixtures.

    Each entry covers one key interval and merges the rows selected for
    ``manifest_context1`` with the rows selected for ``manifest_context2``.
    """
    # (key_min, key_max, row indexes under context 1, row indexes under context 2)
    layout = [
        (0, 3, [0], [3, 4]),
        (3, 10, [0, 1], [3, 4]),
        (10, 90, [0, 1, 2], [3, 4]),
        (90, 95, [0, 1, 2], [4]),
        (95, 100, [0, 2], [4]),
    ]
    return [
        BlockGroup(
            key_min,
            key_max,
            with_field_group(manifest_context1, sst_row_list, first_indexes)
            | with_field_group(manifest_context2, sst_row_list, second_indexes),
        )
        for key_min, key_max, first_indexes, second_indexes in layout
    ]
116
+
117
+
118
def test_build_sst(
    sst1,
    sst2,
    manifest_context1,
    manifest_context2,
    sst_row_list,
    expected_block_groups,
):
    """With no bounds given, the tree returns every expected block group in order."""
    tree = BlockIntervalTree()
    for table, context in ((sst1, manifest_context1), (sst2, manifest_context2)):
        tree.add_sst_table(table, context)

    actual = tree.get_sorted_block_groups()
    expected = _build_ordered_block_groups(expected_block_groups)
    assert expected == actual
133
+
134
+
135
def test_build_sst_with_bounds(
    sst1,
    sst2,
    manifest_context1,
    manifest_context2,
    sst_row_list,
    expected_block_groups,
):
    """Bounded queries return only the expected block groups for the requested key range."""
    tree = BlockIntervalTree()
    tree.add_sst_table(sst1, manifest_context1)
    tree.add_sst_table(sst2, manifest_context2)

    # (lower bound, upper bound, block groups expected for that query)
    cases = [
        (20, 100, expected_block_groups[2:]),
        (96, 100, expected_block_groups[4:]),
        (0, 10, expected_block_groups[0:3]),
        # Max key of 95 is inclusive of last range so it is included
        (None, 95, expected_block_groups),
        (None, 94, expected_block_groups[0:4]),
        # Same bounds as the third case, queried a second time.
        (0, 10, expected_block_groups[0:3]),
        (0, 0, expected_block_groups[0:1]),
    ]

    for lower, upper, expected_groups in cases:
        block_groups_filtered = tree.get_sorted_block_groups(lower, upper)
        expected = _build_ordered_block_groups(expected_groups)
        assert expected == block_groups_filtered
175
+
176
+
177
def test_build_sst_with_non_zero_min_key_matching_global_min_key(manifest_context1):
    """A query whose lower bound equals the table's non-zero minimum key still finds the row."""
    # Using a non-0 value since 0 evaluates to False
    min_key, max_key = 1, 95

    sst_row = SSTableRow(min_key, max_key, "row-with-non-zero-min-key", 0, 1)
    single_row_table = SSTable([sst_row], min_key, max_key)

    tree = BlockIntervalTree()
    tree.add_sst_table(single_row_table, manifest_context1)

    block_groups_filtered = tree.get_sorted_block_groups(min_key, min_key + 1)

    # The only expected group spans the whole row and holds its single block.
    blocks_by_schema = {
        manifest_context1.schema: frozenset([Block(sst_row, manifest_context1)])
    }
    only_group = BlockGroup(min_key, max_key, blocks_by_schema)
    expected = _build_ordered_block_groups([only_group])

    assert expected == block_groups_filtered
201
+
202
+
203
def test_build_sst_invalid_bounds(
    sst1, sst2, schema1, schema2, sst_row_list, expected_block_groups
):
    """Querying with a lower bound above the upper bound must raise a ValueError."""
    empty_tree = BlockIntervalTree()
    lower, upper = 10, 0

    with pytest.raises(ValueError):
        empty_tree.get_sorted_block_groups(lower, upper)
210
+
211
+
212
def _build_ordered_block_groups(block_groups: List[BlockGroup]) -> OrderedBlockGroups:
    """
    Assemble an OrderedBlockGroups from block groups that are already sorted.

    The boundary table receives each group's ``key_min`` and, after the final
    group, that group's ``key_max``. The final group is rebuilt with ``True``
    as a fourth positional argument (presumably an "is last" marker; confirm
    against BlockGroup). An empty input raises IndexError when the first and
    last groups are looked up.
    """
    final_position = len(block_groups) - 1
    groups = []
    boundaries = []

    for position, group in enumerate(block_groups):
        boundaries.append(group.key_min)
        if position == final_position:
            # Re-create the final group with the extra flag, then close the table.
            group = BlockGroup(
                group.key_min, group.key_max, group.field_group_to_blocks, True
            )
            boundaries.append(group.key_max)
        groups.append(group)

    first_group, last_group = groups[0], groups[-1]
    return OrderedBlockGroups(
        first_group.key_min,
        last_group.key_max,
        groups,
        boundaries,
    )