deltacat 1.1.38__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (367) hide show
  1. deltacat/__init__.py +150 -12
  2. deltacat/annotations.py +36 -0
  3. deltacat/api.py +578 -0
  4. deltacat/aws/constants.py +0 -23
  5. deltacat/aws/s3u.py +4 -631
  6. deltacat/benchmarking/benchmark_engine.py +84 -0
  7. deltacat/benchmarking/benchmark_report.py +86 -0
  8. deltacat/benchmarking/benchmark_suite.py +11 -0
  9. deltacat/benchmarking/conftest.py +22 -19
  10. deltacat/benchmarking/data/random_row_generator.py +94 -0
  11. deltacat/benchmarking/data/row_generator.py +10 -0
  12. deltacat/benchmarking/test_benchmark_pipeline.py +108 -0
  13. deltacat/catalog/__init__.py +73 -0
  14. deltacat/catalog/delegate.py +615 -140
  15. deltacat/catalog/interface.py +404 -81
  16. deltacat/catalog/main/impl.py +2882 -0
  17. deltacat/catalog/model/catalog.py +348 -46
  18. deltacat/catalog/model/properties.py +155 -0
  19. deltacat/catalog/model/table_definition.py +32 -1
  20. deltacat/compute/__init__.py +14 -0
  21. deltacat/compute/compactor/compaction_session.py +97 -75
  22. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  23. deltacat/compute/compactor/model/compaction_session_audit_info.py +23 -30
  24. deltacat/compute/compactor/model/delta_annotated.py +3 -3
  25. deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
  26. deltacat/compute/compactor/model/delta_file_locator.py +3 -1
  27. deltacat/compute/compactor/model/round_completion_info.py +19 -9
  28. deltacat/compute/compactor/model/table_object_store.py +3 -2
  29. deltacat/compute/compactor/repartition_session.py +9 -22
  30. deltacat/compute/compactor/steps/dedupe.py +11 -4
  31. deltacat/compute/compactor/steps/hash_bucket.py +6 -6
  32. deltacat/compute/compactor/steps/materialize.py +15 -9
  33. deltacat/compute/compactor/steps/repartition.py +12 -11
  34. deltacat/compute/compactor/utils/io.py +7 -6
  35. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  36. deltacat/compute/compactor/utils/sort_key.py +9 -2
  37. deltacat/compute/compactor/utils/system_columns.py +3 -1
  38. deltacat/compute/compactor_v2/compaction_session.py +13 -14
  39. deltacat/compute/compactor_v2/deletes/utils.py +3 -3
  40. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  41. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  42. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  43. deltacat/compute/compactor_v2/model/merge_input.py +28 -9
  44. deltacat/compute/compactor_v2/private/compaction_utils.py +171 -73
  45. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  46. deltacat/compute/compactor_v2/steps/merge.py +156 -53
  47. deltacat/compute/compactor_v2/utils/content_type_params.py +17 -6
  48. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  49. deltacat/compute/compactor_v2/utils/io.py +10 -3
  50. deltacat/compute/compactor_v2/utils/merge.py +14 -2
  51. deltacat/compute/compactor_v2/utils/task_options.py +2 -10
  52. deltacat/compute/converter/constants.py +9 -0
  53. deltacat/compute/converter/converter_session.py +298 -0
  54. deltacat/compute/converter/model/convert_input.py +96 -0
  55. deltacat/compute/converter/model/convert_input_files.py +78 -0
  56. deltacat/compute/converter/model/convert_result.py +80 -0
  57. deltacat/compute/converter/model/converter_session_params.py +144 -0
  58. deltacat/compute/converter/pyiceberg/catalog.py +78 -0
  59. deltacat/compute/converter/pyiceberg/overrides.py +263 -0
  60. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +299 -0
  61. deltacat/compute/converter/steps/convert.py +366 -0
  62. deltacat/compute/converter/steps/dedupe.py +94 -0
  63. deltacat/compute/converter/utils/__init__.py +0 -0
  64. deltacat/compute/converter/utils/convert_task_options.py +132 -0
  65. deltacat/compute/converter/utils/converter_session_utils.py +175 -0
  66. deltacat/compute/converter/utils/iceberg_columns.py +87 -0
  67. deltacat/compute/converter/utils/io.py +203 -0
  68. deltacat/compute/converter/utils/s3u.py +148 -0
  69. deltacat/compute/janitor.py +205 -0
  70. deltacat/compute/jobs/__init__.py +0 -0
  71. deltacat/compute/jobs/client.py +417 -0
  72. deltacat/compute/resource_estimation/delta.py +11 -1
  73. deltacat/constants.py +90 -1
  74. deltacat/docs/__init__.py +0 -0
  75. deltacat/docs/autogen/__init__.py +0 -0
  76. deltacat/docs/autogen/schema/__init__.py +0 -0
  77. deltacat/docs/autogen/schema/inference/__init__.py +0 -0
  78. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  79. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  80. deltacat/env.py +61 -0
  81. deltacat/examples/__init__.py +0 -0
  82. deltacat/examples/basic_logging.py +101 -0
  83. deltacat/examples/compactor/__init__.py +0 -0
  84. deltacat/examples/compactor/aws/__init__.py +1 -0
  85. deltacat/examples/compactor/bootstrap.py +863 -0
  86. deltacat/examples/compactor/compactor.py +373 -0
  87. deltacat/examples/compactor/explorer.py +473 -0
  88. deltacat/examples/compactor/gcp/__init__.py +1 -0
  89. deltacat/examples/compactor/job_runner.py +439 -0
  90. deltacat/examples/compactor/utils/__init__.py +1 -0
  91. deltacat/examples/compactor/utils/common.py +261 -0
  92. deltacat/examples/experimental/__init__.py +0 -0
  93. deltacat/examples/experimental/iceberg/__init__.py +0 -0
  94. deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
  95. deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
  96. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  97. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  98. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  99. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  100. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  101. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  102. deltacat/examples/experimental/iceberg/iceberg_bucket_writer.py +184 -0
  103. deltacat/examples/experimental/iceberg/iceberg_reader.py +147 -0
  104. deltacat/examples/hello_world.py +29 -0
  105. deltacat/examples/indexer/__init__.py +0 -0
  106. deltacat/examples/indexer/aws/__init__.py +0 -0
  107. deltacat/examples/indexer/gcp/__init__.py +0 -0
  108. deltacat/examples/indexer/indexer.py +163 -0
  109. deltacat/examples/indexer/job_runner.py +198 -0
  110. deltacat/exceptions.py +116 -12
  111. deltacat/experimental/__init__.py +0 -0
  112. deltacat/experimental/catalog/__init__.py +0 -0
  113. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  114. deltacat/experimental/catalog/iceberg/iceberg_catalog_config.py +26 -0
  115. deltacat/experimental/catalog/iceberg/impl.py +399 -0
  116. deltacat/experimental/catalog/iceberg/overrides.py +72 -0
  117. deltacat/experimental/compatibility/__init__.py +0 -0
  118. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  119. deltacat/experimental/converter_agent/__init__.py +0 -0
  120. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  121. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  122. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  123. deltacat/experimental/daft/__init__.py +4 -0
  124. deltacat/experimental/daft/daft_catalog.py +229 -0
  125. deltacat/experimental/storage/__init__.py +0 -0
  126. deltacat/experimental/storage/iceberg/__init__.py +0 -0
  127. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  128. deltacat/experimental/storage/iceberg/impl.py +739 -0
  129. deltacat/experimental/storage/iceberg/model.py +713 -0
  130. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  131. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  132. deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
  133. deltacat/experimental/storage/rivulet/arrow/serializer.py +78 -0
  134. deltacat/experimental/storage/rivulet/dataset.py +745 -0
  135. deltacat/experimental/storage/rivulet/dataset_executor.py +79 -0
  136. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  137. deltacat/experimental/storage/rivulet/feather/file_reader.py +138 -0
  138. deltacat/experimental/storage/rivulet/feather/serializer.py +35 -0
  139. deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
  140. deltacat/experimental/storage/rivulet/fs/file_provider.py +105 -0
  141. deltacat/experimental/storage/rivulet/fs/file_store.py +130 -0
  142. deltacat/experimental/storage/rivulet/fs/input_file.py +76 -0
  143. deltacat/experimental/storage/rivulet/fs/output_file.py +86 -0
  144. deltacat/experimental/storage/rivulet/logical_plan.py +105 -0
  145. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  146. deltacat/experimental/storage/rivulet/metastore/delta.py +188 -0
  147. deltacat/experimental/storage/rivulet/metastore/json_sst.py +105 -0
  148. deltacat/experimental/storage/rivulet/metastore/sst.py +82 -0
  149. deltacat/experimental/storage/rivulet/metastore/sst_interval_tree.py +260 -0
  150. deltacat/experimental/storage/rivulet/mvp/Table.py +101 -0
  151. deltacat/experimental/storage/rivulet/mvp/__init__.py +5 -0
  152. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  153. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  154. deltacat/experimental/storage/rivulet/parquet/file_reader.py +129 -0
  155. deltacat/experimental/storage/rivulet/parquet/serializer.py +37 -0
  156. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  157. deltacat/experimental/storage/rivulet/reader/block_scanner.py +389 -0
  158. deltacat/experimental/storage/rivulet/reader/data_reader.py +136 -0
  159. deltacat/experimental/storage/rivulet/reader/data_scan.py +65 -0
  160. deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +179 -0
  161. deltacat/experimental/storage/rivulet/reader/dataset_reader.py +158 -0
  162. deltacat/experimental/storage/rivulet/reader/pyarrow_data_reader.py +124 -0
  163. deltacat/experimental/storage/rivulet/reader/query_expression.py +99 -0
  164. deltacat/experimental/storage/rivulet/reader/reader_type_registrar.py +84 -0
  165. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  166. deltacat/experimental/storage/rivulet/schema/datatype.py +128 -0
  167. deltacat/experimental/storage/rivulet/schema/schema.py +251 -0
  168. deltacat/experimental/storage/rivulet/serializer.py +40 -0
  169. deltacat/experimental/storage/rivulet/serializer_factory.py +46 -0
  170. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  171. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  172. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  173. deltacat/experimental/storage/rivulet/writer/dataset_writer.py +29 -0
  174. deltacat/experimental/storage/rivulet/writer/memtable_dataset_writer.py +305 -0
  175. deltacat/io/__init__.py +13 -0
  176. deltacat/io/dataset/__init__.py +0 -0
  177. deltacat/io/dataset/deltacat_dataset.py +91 -0
  178. deltacat/io/datasink/__init__.py +0 -0
  179. deltacat/io/datasink/deltacat_datasink.py +207 -0
  180. deltacat/io/datasource/__init__.py +0 -0
  181. deltacat/io/datasource/deltacat_datasource.py +579 -0
  182. deltacat/io/reader/__init__.py +0 -0
  183. deltacat/io/reader/deltacat_read_api.py +172 -0
  184. deltacat/logs.py +4 -1
  185. deltacat/storage/__init__.py +138 -28
  186. deltacat/storage/interface.py +260 -155
  187. deltacat/storage/main/__init__.py +0 -0
  188. deltacat/storage/main/impl.py +3030 -0
  189. deltacat/storage/model/delta.py +142 -71
  190. deltacat/storage/model/expression/__init__.py +47 -0
  191. deltacat/storage/model/expression/expression.py +656 -0
  192. deltacat/storage/model/expression/visitor.py +248 -0
  193. deltacat/storage/model/interop.py +24 -0
  194. deltacat/storage/model/list_result.py +8 -0
  195. deltacat/storage/model/locator.py +93 -9
  196. deltacat/storage/model/manifest.py +643 -0
  197. deltacat/storage/model/metafile.py +1421 -0
  198. deltacat/storage/model/namespace.py +41 -18
  199. deltacat/storage/model/partition.py +443 -43
  200. deltacat/storage/model/scan/__init__.py +0 -0
  201. deltacat/storage/model/scan/push_down.py +46 -0
  202. deltacat/storage/model/scan/scan_plan.py +10 -0
  203. deltacat/storage/model/scan/scan_task.py +34 -0
  204. deltacat/storage/model/schema.py +3160 -0
  205. deltacat/storage/model/shard.py +51 -0
  206. deltacat/storage/model/sort_key.py +210 -13
  207. deltacat/storage/model/stream.py +215 -80
  208. deltacat/storage/model/table.py +134 -29
  209. deltacat/storage/model/table_version.py +333 -46
  210. deltacat/storage/model/transaction.py +1733 -0
  211. deltacat/storage/model/transform.py +274 -58
  212. deltacat/storage/model/types.py +138 -16
  213. deltacat/storage/util/__init__.py +0 -0
  214. deltacat/storage/util/scan_planner.py +26 -0
  215. deltacat/tests/_io/__init__.py +1 -0
  216. deltacat/tests/_io/reader/__init__.py +0 -0
  217. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  218. deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +8 -4
  219. deltacat/tests/aws/test_s3u.py +2 -31
  220. deltacat/tests/catalog/data/__init__.py +0 -0
  221. deltacat/tests/catalog/main/__init__.py +0 -0
  222. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  223. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  224. deltacat/tests/catalog/model/__init__.py +0 -0
  225. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  226. deltacat/tests/catalog/test_catalogs.py +321 -0
  227. deltacat/tests/catalog/test_default_catalog_impl.py +12154 -66
  228. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
  229. deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
  230. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
  231. deltacat/tests/compute/compact_partition_test_cases.py +23 -30
  232. deltacat/tests/compute/compactor/steps/test_repartition.py +14 -14
  233. deltacat/tests/compute/compactor/utils/test_io.py +125 -123
  234. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  235. deltacat/tests/compute/compactor_v2/test_compaction_session.py +387 -830
  236. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +70 -57
  237. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +0 -3
  238. deltacat/tests/compute/conftest.py +39 -0
  239. deltacat/tests/compute/converter/__init__.py +0 -0
  240. deltacat/tests/compute/converter/conftest.py +80 -0
  241. deltacat/tests/compute/converter/test_convert_session.py +826 -0
  242. deltacat/tests/compute/converter/utils.py +132 -0
  243. deltacat/tests/compute/resource_estimation/test_delta.py +88 -104
  244. deltacat/tests/compute/test_compact_partition_incremental.py +91 -98
  245. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +79 -97
  246. deltacat/tests/compute/test_compact_partition_params.py +16 -11
  247. deltacat/tests/compute/test_compact_partition_rebase.py +63 -93
  248. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +249 -220
  249. deltacat/tests/compute/test_janitor.py +236 -0
  250. deltacat/tests/compute/test_util_common.py +726 -46
  251. deltacat/tests/compute/test_util_constant.py +0 -1
  252. deltacat/tests/conftest.py +25 -0
  253. deltacat/tests/daft/__init__.py +0 -0
  254. deltacat/tests/daft/test_model.py +97 -0
  255. deltacat/tests/experimental/__init__.py +1 -0
  256. deltacat/tests/experimental/catalog/__init__.py +0 -0
  257. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  258. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  259. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  260. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  261. deltacat/tests/experimental/daft/__init__.py +0 -0
  262. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  263. deltacat/tests/experimental/storage/__init__.py +0 -0
  264. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  265. deltacat/tests/experimental/storage/rivulet/conftest.py +149 -0
  266. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  267. deltacat/tests/experimental/storage/rivulet/fs/test_file_location_provider.py +94 -0
  268. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  269. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  270. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  271. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  272. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  273. deltacat/tests/experimental/storage/rivulet/schema/test_schema.py +241 -0
  274. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  275. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  276. deltacat/tests/experimental/storage/rivulet/test_dataset.py +408 -0
  277. deltacat/tests/experimental/storage/rivulet/test_manifest.py +67 -0
  278. deltacat/tests/experimental/storage/rivulet/test_sst_interval_tree.py +232 -0
  279. deltacat/tests/experimental/storage/rivulet/test_utils.py +124 -0
  280. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  281. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_write_then_read.py +343 -0
  282. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_writer.py +79 -0
  283. deltacat/tests/experimental/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
  284. deltacat/tests/storage/__init__.py +0 -0
  285. deltacat/tests/storage/main/__init__.py +0 -0
  286. deltacat/tests/storage/main/test_main_storage.py +8204 -0
  287. deltacat/tests/storage/model/__init__.py +0 -0
  288. deltacat/tests/storage/model/test_delete_parameters.py +21 -0
  289. deltacat/tests/storage/model/test_expression.py +327 -0
  290. deltacat/tests/storage/model/test_manifest.py +129 -0
  291. deltacat/tests/storage/model/test_metafile_io.py +2440 -0
  292. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  293. deltacat/tests/storage/model/test_schema.py +479 -0
  294. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  295. deltacat/tests/storage/model/test_shard.py +24 -0
  296. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  297. deltacat/tests/storage/model/test_table_version.py +110 -0
  298. deltacat/tests/storage/model/test_transaction.py +653 -0
  299. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  300. deltacat/tests/test_deltacat_api.py +1064 -0
  301. deltacat/tests/test_exceptions.py +9 -5
  302. deltacat/tests/test_utils/filesystem.py +14 -0
  303. deltacat/tests/test_utils/message_pack_utils.py +54 -0
  304. deltacat/tests/test_utils/pyarrow.py +50 -26
  305. deltacat/tests/test_utils/storage.py +256 -4
  306. deltacat/tests/types/__init__.py +0 -0
  307. deltacat/tests/types/test_tables.py +104 -0
  308. deltacat/tests/utils/exceptions.py +22 -0
  309. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  310. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  311. deltacat/tests/utils/test_daft.py +124 -34
  312. deltacat/tests/utils/test_numpy.py +1193 -0
  313. deltacat/tests/utils/test_pandas.py +1106 -0
  314. deltacat/tests/utils/test_polars.py +1040 -0
  315. deltacat/tests/utils/test_pyarrow.py +1107 -258
  316. deltacat/types/media.py +345 -37
  317. deltacat/types/partial_download.py +1 -1
  318. deltacat/types/tables.py +2345 -47
  319. deltacat/utils/arguments.py +33 -1
  320. deltacat/utils/daft.py +824 -40
  321. deltacat/utils/export.py +61 -0
  322. deltacat/utils/filesystem.py +450 -0
  323. deltacat/utils/metafile_locator.py +74 -0
  324. deltacat/utils/numpy.py +118 -26
  325. deltacat/utils/pandas.py +577 -48
  326. deltacat/utils/polars.py +759 -0
  327. deltacat/utils/pyarrow.py +1212 -178
  328. deltacat/utils/ray_utils/concurrency.py +1 -1
  329. deltacat/utils/ray_utils/dataset.py +101 -10
  330. deltacat/utils/ray_utils/runtime.py +56 -4
  331. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  332. deltacat/utils/url.py +1325 -0
  333. deltacat-2.0.0.dist-info/METADATA +1163 -0
  334. deltacat-2.0.0.dist-info/RECORD +439 -0
  335. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
  336. deltacat/aws/redshift/__init__.py +0 -19
  337. deltacat/aws/redshift/model/manifest.py +0 -394
  338. deltacat/catalog/default_catalog_impl/__init__.py +0 -369
  339. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  340. deltacat/compute/merge_on_read/__init__.py +0 -4
  341. deltacat/compute/merge_on_read/daft.py +0 -40
  342. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  343. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  344. deltacat/io/dataset.py +0 -73
  345. deltacat/io/read_api.py +0 -143
  346. deltacat/storage/model/delete_parameters.py +0 -40
  347. deltacat/storage/model/partition_spec.py +0 -71
  348. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  349. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -397
  350. deltacat/tests/local_deltacat_storage/__init__.py +0 -1262
  351. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  352. deltacat/utils/s3fs.py +0 -21
  353. deltacat-1.1.38.dist-info/METADATA +0 -64
  354. deltacat-1.1.38.dist-info/RECORD +0 -219
  355. /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
  356. /deltacat/{compute/merge_on_read/model → catalog/main}/__init__.py +0 -0
  357. /deltacat/compute/{merge_on_read/utils → converter}/__init__.py +0 -0
  358. /deltacat/{io/aws → compute/converter/model}/__init__.py +0 -0
  359. /deltacat/{io/aws/redshift → compute/converter/pyiceberg}/__init__.py +0 -0
  360. /deltacat/{tests/io → compute/converter/steps}/__init__.py +0 -0
  361. /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
  362. /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
  363. /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
  364. /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
  365. /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
  366. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
  367. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
@@ -6,6 +6,8 @@ from fsspec import AbstractFileSystem
6
6
  from ray.data.datasource import FilenameProvider
7
7
  from deltacat.types.media import ContentType
8
8
  import ray
9
+ import gzip
10
+ import json
9
11
 
10
12
 
11
13
  class TestDatasetToFile:
@@ -20,7 +22,13 @@ class TestDatasetToFile:
20
22
 
21
23
  @pytest.fixture(scope="module")
22
24
  def mock_dataset(self):
23
- return from_items([{"col1": i, "col2": i * 2} for i in range(1000)])
25
+ # Include data that would need escaping to test quoting behavior
26
+ return from_items([{"col1": "a,b\tc|d", "col2": 0} for _ in range(5)])
27
+
28
+ @pytest.fixture(scope="module")
29
+ def mock_unescaped_dataset(self):
30
+ # Use data without delimiters for unescaped TSV test
31
+ return from_items([{"col1": "abc", "col2": 0} for _ in range(5)])
24
32
 
25
33
  @pytest.fixture(scope="module")
26
34
  def mock_filename_provider(self):
@@ -35,12 +43,12 @@ class TestDatasetToFile:
35
43
  def test_parquet_sanity(self, mock_dataset, mock_filename_provider):
36
44
  from deltacat.utils.ray_utils.dataset import dataset_to_file
37
45
 
38
- fs: AbstractFileSystem = fsspec.filesystem("local")
46
+ fs: AbstractFileSystem = fsspec.filesystem("file")
39
47
 
40
48
  dataset_to_file(
41
49
  mock_dataset,
42
50
  self.BASE_PATH,
43
- file_system=fs,
51
+ filesystem=fs,
44
52
  block_path_provider=mock_filename_provider,
45
53
  )
46
54
 
@@ -51,16 +59,126 @@ class TestDatasetToFile:
51
59
  def test_csv_sanity(self, mock_dataset, mock_filename_provider):
52
60
  from deltacat.utils.ray_utils.dataset import dataset_to_file
53
61
 
54
- fs: AbstractFileSystem = fsspec.filesystem("local")
62
+ fs: AbstractFileSystem = fsspec.filesystem("file")
55
63
 
56
64
  dataset_to_file(
57
65
  mock_dataset,
58
66
  self.BASE_PATH,
59
- file_system=fs,
67
+ filesystem=fs,
60
68
  block_path_provider=mock_filename_provider,
61
69
  content_type=ContentType.CSV.value,
62
70
  )
63
71
 
64
72
  file_expected_at = f"{self.BASE_PATH}/{self.SUB_PATH}"
65
73
  assert fs.exists(file_expected_at), "file was not written"
74
+
75
+ # Verify CSV format and content
76
+ with fs.open(file_expected_at, "rb") as f:
77
+ with gzip.GzipFile(fileobj=f) as gz:
78
+ content = gz.read().decode("utf-8")
79
+ # Should be quoted due to commas in data
80
+ assert '"a,b\tc|d",0' in content
81
+
82
+ fs.delete(file_expected_at)
83
+
84
+ def test_tsv_sanity(self, mock_dataset, mock_filename_provider):
85
+ from deltacat.utils.ray_utils.dataset import dataset_to_file
86
+
87
+ fs: AbstractFileSystem = fsspec.filesystem("file")
88
+
89
+ dataset_to_file(
90
+ mock_dataset,
91
+ self.BASE_PATH,
92
+ filesystem=fs,
93
+ block_path_provider=mock_filename_provider,
94
+ content_type=ContentType.TSV.value,
95
+ )
96
+
97
+ file_expected_at = f"{self.BASE_PATH}/{self.SUB_PATH}"
98
+ assert fs.exists(file_expected_at), "file was not written"
99
+
100
+ # Verify TSV format and content
101
+ with fs.open(file_expected_at, "rb") as f:
102
+ with gzip.GzipFile(fileobj=f) as gz:
103
+ content = gz.read().decode("utf-8")
104
+ # Should be quoted due to tabs in data
105
+ assert '"a,b\tc|d"\t0' in content
106
+
107
+ fs.delete(file_expected_at)
108
+
109
+ def test_psv_sanity(self, mock_dataset, mock_filename_provider):
110
+ from deltacat.utils.ray_utils.dataset import dataset_to_file
111
+
112
+ fs: AbstractFileSystem = fsspec.filesystem("file")
113
+
114
+ dataset_to_file(
115
+ mock_dataset,
116
+ self.BASE_PATH,
117
+ filesystem=fs,
118
+ block_path_provider=mock_filename_provider,
119
+ content_type=ContentType.PSV.value,
120
+ )
121
+
122
+ file_expected_at = f"{self.BASE_PATH}/{self.SUB_PATH}"
123
+ assert fs.exists(file_expected_at), "file was not written"
124
+
125
+ # Verify PSV format and content
126
+ with fs.open(file_expected_at, "rb") as f:
127
+ with gzip.GzipFile(fileobj=f) as gz:
128
+ content = gz.read().decode("utf-8")
129
+ # Should be quoted due to pipes in data
130
+ assert '"a,b\tc|d"|0' in content
131
+
132
+ fs.delete(file_expected_at)
133
+
134
+ def test_unescaped_tsv_sanity(self, mock_unescaped_dataset, mock_filename_provider):
135
+ from deltacat.utils.ray_utils.dataset import dataset_to_file
136
+
137
+ fs: AbstractFileSystem = fsspec.filesystem("file")
138
+
139
+ dataset_to_file(
140
+ mock_unescaped_dataset,
141
+ self.BASE_PATH,
142
+ filesystem=fs,
143
+ block_path_provider=mock_filename_provider,
144
+ content_type=ContentType.UNESCAPED_TSV.value,
145
+ )
146
+
147
+ file_expected_at = f"{self.BASE_PATH}/{self.SUB_PATH}"
148
+ assert fs.exists(file_expected_at), "file was not written"
149
+
150
+ # Verify UNESCAPED_TSV format and content
151
+ with fs.open(file_expected_at, "rb") as f:
152
+ with gzip.GzipFile(fileobj=f) as gz:
153
+ content = gz.read().decode("utf-8")
154
+ # Should NOT be quoted since data has no delimiters
155
+ assert "abc\t0" in content
156
+
157
+ fs.delete(file_expected_at)
158
+
159
+ def test_json_sanity(self, mock_dataset, mock_filename_provider):
160
+ from deltacat.utils.ray_utils.dataset import dataset_to_file
161
+
162
+ fs: AbstractFileSystem = fsspec.filesystem("file")
163
+
164
+ dataset_to_file(
165
+ mock_dataset,
166
+ self.BASE_PATH,
167
+ filesystem=fs,
168
+ block_path_provider=mock_filename_provider,
169
+ content_type=ContentType.JSON.value,
170
+ )
171
+
172
+ file_expected_at = f"{self.BASE_PATH}/{self.SUB_PATH}"
173
+ assert fs.exists(file_expected_at), "file was not written"
174
+
175
+ # Verify JSON format and content
176
+ with fs.open(file_expected_at, "rb") as f:
177
+ with gzip.GzipFile(fileobj=f) as gz:
178
+ content = gz.read().decode("utf-8")
179
+ # Each line should be a valid JSON object
180
+ first_line = content.split("\n")[0]
181
+ record = json.loads(first_line)
182
+ assert record == {"col1": "a,b\tc|d", "col2": 0}
183
+
66
184
  fs.delete(file_expected_at)
@@ -1,6 +1,9 @@
1
1
  import unittest
2
2
  from deltacat.types.media import ContentEncoding, ContentType
3
- from deltacat.utils.daft import daft_s3_file_to_table, s3_files_to_dataframe
3
+ from deltacat.utils.daft import (
4
+ daft_file_to_pyarrow_table,
5
+ files_to_dataframe,
6
+ )
4
7
  from deltacat.utils.pyarrow import ReadKwargsProviderPyArrowSchemaOverride
5
8
  from deltacat.types.partial_download import PartialParquetParameters
6
9
  import pyarrow as pa
@@ -8,11 +11,11 @@ import pyarrow as pa
8
11
  from pyarrow import parquet as pq
9
12
 
10
13
 
11
- class TestDaftS3FileToTable(unittest.TestCase):
14
+ class TestDaftFileToPyarrowTable(unittest.TestCase):
12
15
  MVP_PATH = "deltacat/tests/utils/data/mvp.parquet"
13
16
 
14
- def test_read_from_s3_all_columns(self):
15
- table = daft_s3_file_to_table(
17
+ def test_read_from_local_all_columns(self):
18
+ table = daft_file_to_pyarrow_table(
16
19
  self.MVP_PATH,
17
20
  content_encoding=ContentEncoding.IDENTITY.value,
18
21
  content_type=ContentType.PARQUET.value,
@@ -20,8 +23,8 @@ class TestDaftS3FileToTable(unittest.TestCase):
20
23
  self.assertEqual(table.schema.names, ["a", "b"])
21
24
  self.assertEqual(table.num_rows, 100)
22
25
 
23
- def test_read_from_s3_single_column_via_include_columns(self):
24
- table = daft_s3_file_to_table(
26
+ def test_read_from_local_single_column_via_include_columns(self):
27
+ table = daft_file_to_pyarrow_table(
25
28
  self.MVP_PATH,
26
29
  content_encoding=ContentEncoding.IDENTITY.value,
27
30
  content_type=ContentType.PARQUET.value,
@@ -30,8 +33,8 @@ class TestDaftS3FileToTable(unittest.TestCase):
30
33
  self.assertEqual(table.schema.names, ["b"])
31
34
  self.assertEqual(table.num_rows, 100)
32
35
 
33
- def test_read_from_s3_single_column_via_column_names(self):
34
- table = daft_s3_file_to_table(
36
+ def test_read_from_local_single_column_via_column_names(self):
37
+ table = daft_file_to_pyarrow_table(
35
38
  self.MVP_PATH,
36
39
  content_encoding=ContentEncoding.IDENTITY.value,
37
40
  content_type=ContentType.PARQUET.value,
@@ -40,12 +43,12 @@ class TestDaftS3FileToTable(unittest.TestCase):
40
43
  self.assertEqual(table.schema.names, ["b"])
41
44
  self.assertEqual(table.num_rows, 100)
42
45
 
43
- def test_read_from_s3_single_column_with_schema(self):
46
+ def test_read_from_local_single_column_with_schema(self):
44
47
  schema = pa.schema([("a", pa.int8()), ("b", pa.string())])
45
48
  pa_read_func_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(
46
49
  schema=schema
47
50
  )
48
- table = daft_s3_file_to_table(
51
+ table = daft_file_to_pyarrow_table(
49
52
  self.MVP_PATH,
50
53
  content_encoding=ContentEncoding.IDENTITY.value,
51
54
  content_type=ContentType.PARQUET.value,
@@ -56,12 +59,12 @@ class TestDaftS3FileToTable(unittest.TestCase):
56
59
  self.assertEqual(table.schema.field("a").type, pa.int8())
57
60
  self.assertEqual(table.num_rows, 100)
58
61
 
59
- def test_read_from_s3_single_column_with_schema_reverse_order(self):
62
+ def test_read_from_local_single_column_with_schema_reverse_order(self):
60
63
  schema = pa.schema([("b", pa.string()), ("a", pa.int8())])
61
64
  pa_read_func_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(
62
65
  schema=schema
63
66
  )
64
- table = daft_s3_file_to_table(
67
+ table = daft_file_to_pyarrow_table(
65
68
  self.MVP_PATH,
66
69
  content_encoding=ContentEncoding.IDENTITY.value,
67
70
  content_type=ContentType.PARQUET.value,
@@ -71,12 +74,12 @@ class TestDaftS3FileToTable(unittest.TestCase):
71
74
  self.assertEqual(table.schema.field("a").type, pa.int8())
72
75
  self.assertEqual(table.num_rows, 100)
73
76
 
74
- def test_read_from_s3_single_column_with_schema_subset_cols(self):
77
+ def test_read_from_local_single_column_with_schema_subset_cols(self):
75
78
  schema = pa.schema([("a", pa.int8())])
76
79
  pa_read_func_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(
77
80
  schema=schema
78
81
  )
79
- table = daft_s3_file_to_table(
82
+ table = daft_file_to_pyarrow_table(
80
83
  self.MVP_PATH,
81
84
  content_encoding=ContentEncoding.IDENTITY.value,
82
85
  content_type=ContentType.PARQUET.value,
@@ -86,12 +89,12 @@ class TestDaftS3FileToTable(unittest.TestCase):
86
89
  self.assertEqual(table.schema.field("a").type, pa.int8())
87
90
  self.assertEqual(table.num_rows, 100)
88
91
 
89
- def test_read_from_s3_single_column_with_schema_extra_cols(self):
92
+ def test_read_from_local_single_column_with_schema_extra_cols(self):
90
93
  schema = pa.schema([("a", pa.int8()), ("MISSING", pa.string())])
91
94
  pa_read_func_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(
92
95
  schema=schema
93
96
  )
94
- table = daft_s3_file_to_table(
97
+ table = daft_file_to_pyarrow_table(
95
98
  self.MVP_PATH,
96
99
  content_encoding=ContentEncoding.IDENTITY.value,
97
100
  content_type=ContentType.PARQUET.value,
@@ -104,12 +107,12 @@ class TestDaftS3FileToTable(unittest.TestCase):
104
107
  self.assertEqual(table.schema.field("MISSING").type, pa.string())
105
108
  self.assertEqual(table.num_rows, 100)
106
109
 
107
- def test_read_from_s3_single_column_with_schema_extra_cols_column_names(self):
110
+ def test_read_from_local_single_column_with_schema_extra_cols_column_names(self):
108
111
  schema = pa.schema([("a", pa.int8()), ("MISSING", pa.string())])
109
112
  pa_read_func_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(
110
113
  schema=schema
111
114
  )
112
- table = daft_s3_file_to_table(
115
+ table = daft_file_to_pyarrow_table(
113
116
  self.MVP_PATH,
114
117
  content_encoding=ContentEncoding.IDENTITY.value,
115
118
  content_type=ContentType.PARQUET.value,
@@ -123,12 +126,12 @@ class TestDaftS3FileToTable(unittest.TestCase):
123
126
  self.assertEqual(table.schema.field("MISSING").type, pa.string())
124
127
  self.assertEqual(table.num_rows, 100)
125
128
 
126
- def test_read_from_s3_single_column_with_schema_only_missing_col(self):
129
+ def test_read_from_local_single_column_with_schema_only_missing_col(self):
127
130
  schema = pa.schema([("a", pa.int8()), ("MISSING", pa.string())])
128
131
  pa_read_func_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(
129
132
  schema=schema
130
133
  )
131
- table = daft_s3_file_to_table(
134
+ table = daft_file_to_pyarrow_table(
132
135
  self.MVP_PATH,
133
136
  content_encoding=ContentEncoding.IDENTITY.value,
134
137
  content_type=ContentType.PARQUET.value,
@@ -142,12 +145,12 @@ class TestDaftS3FileToTable(unittest.TestCase):
142
145
  self.assertEqual(table.schema.field("MISSING").type, pa.string())
143
146
  self.assertEqual(table.num_rows, 0)
144
147
 
145
- def test_read_from_s3_single_column_with_row_groups(self):
148
+ def test_read_from_local_single_column_with_row_groups(self):
146
149
 
147
150
  metadata = pq.read_metadata(self.MVP_PATH)
148
151
  ppp = PartialParquetParameters.of(pq_metadata=metadata)
149
152
  ppp["row_groups_to_download"] = ppp.row_groups_to_download[1:2]
150
- table = daft_s3_file_to_table(
153
+ table = daft_file_to_pyarrow_table(
151
154
  self.MVP_PATH,
152
155
  content_encoding=ContentEncoding.IDENTITY.value,
153
156
  content_type=ContentType.PARQUET.value,
@@ -158,45 +161,132 @@ class TestDaftS3FileToTable(unittest.TestCase):
158
161
  self.assertEqual(table.num_rows, 10)
159
162
 
160
163
 
161
- class TestDaftS3FilesToDataFrame(unittest.TestCase):
164
+ class TestFilesToDataFrame(unittest.TestCase):
162
165
  MVP_PATH = "deltacat/tests/utils/data/mvp.parquet"
163
166
 
164
- def test_read_from_s3_all_columns(self):
165
- df = s3_files_to_dataframe(
167
+ def test_read_local_files_all_columns(self):
168
+ df = files_to_dataframe(
166
169
  uris=[self.MVP_PATH],
167
170
  content_encoding=ContentEncoding.IDENTITY.value,
168
171
  content_type=ContentType.PARQUET.value,
169
- ray_init_options={"local_mode": True},
172
+ ray_init_options={"local_mode": True, "ignore_reinit_error": True},
170
173
  )
171
174
 
172
175
  table = df.to_arrow()
173
176
  self.assertEqual(table.schema.names, ["a", "b"])
174
177
  self.assertEqual(table.num_rows, 100)
175
178
 
176
- def test_does_not_read_from_s3_if_not_materialized(self):
177
- df = s3_files_to_dataframe(
179
+ def test_read_local_files_with_column_selection(self):
180
+ df = files_to_dataframe(
178
181
  uris=[self.MVP_PATH],
179
182
  content_encoding=ContentEncoding.IDENTITY.value,
180
183
  content_type=ContentType.PARQUET.value,
181
- ray_init_options={"local_mode": True},
184
+ include_columns=["b"],
185
+ ray_init_options={"local_mode": True, "ignore_reinit_error": True},
186
+ )
187
+
188
+ table = df.to_arrow()
189
+ self.assertEqual(table.schema.names, ["b"])
190
+ self.assertEqual(table.num_rows, 100)
191
+
192
+ def test_read_local_files_does_not_materialize_by_default(self):
193
+ df = files_to_dataframe(
194
+ uris=[self.MVP_PATH],
195
+ content_encoding=ContentEncoding.IDENTITY.value,
196
+ content_type=ContentType.PARQUET.value,
197
+ ray_init_options={"local_mode": True, "ignore_reinit_error": True},
182
198
  )
183
199
 
200
+ # Should raise RuntimeError because df is not materialized yet
184
201
  self.assertRaises(RuntimeError, lambda: len(df))
202
+
203
+ # After collecting, it should work
185
204
  df.collect()
186
205
  self.assertEqual(len(df), 100)
187
206
 
188
- def test_raises_error_if_not_supported_content_type(self):
207
+ def test_supports_unescaped_tsv_content_type(self):
208
+ # Test that UNESCAPED_TSV is now supported (was previously unsupported)
209
+ # Use a CSV file since we're testing TSV reader functionality
210
+ csv_path = "deltacat/tests/utils/data/non_empty_valid.csv"
211
+ df = files_to_dataframe(
212
+ uris=[csv_path],
213
+ content_encoding=ContentEncoding.IDENTITY.value,
214
+ content_type=ContentType.UNESCAPED_TSV.value,
215
+ ray_init_options={"local_mode": True, "ignore_reinit_error": True},
216
+ )
217
+ # Should succeed without raising an exception - this tests that UNESCAPED_TSV is supported
218
+ table = df.to_arrow()
219
+ # Just verify we got some data back, don't assert specific schema since we're reading CSV as TSV
220
+ self.assertGreater(table.num_rows, 0)
221
+ self.assertGreater(len(table.schema.names), 0)
189
222
 
223
+ def test_supports_gzip_content_encoding(self):
224
+ # Test that GZIP encoding is now supported (was previously unsupported)
225
+ df = files_to_dataframe(
226
+ uris=[self.MVP_PATH],
227
+ content_encoding=ContentEncoding.GZIP.value,
228
+ content_type=ContentType.PARQUET.value,
229
+ ray_init_options={"local_mode": True, "ignore_reinit_error": True},
230
+ )
231
+ # Should succeed without raising an exception
232
+ table = df.to_arrow()
233
+ self.assertEqual(table.schema.names, ["a", "b"])
234
+ self.assertEqual(table.num_rows, 100)
235
+
236
+ def test_raises_error_if_not_supported_content_type(self):
237
+ # Test that truly unsupported content types raise NotImplementedError
190
238
  self.assertRaises(
191
- AssertionError,
192
- lambda: s3_files_to_dataframe(
239
+ NotImplementedError,
240
+ lambda: files_to_dataframe(
193
241
  uris=[self.MVP_PATH],
194
242
  content_encoding=ContentEncoding.IDENTITY.value,
195
- content_type=ContentType.UNESCAPED_TSV.value,
196
- ray_init_options={"local_mode": True},
243
+ content_type=ContentType.AVRO.value, # AVRO is actually unsupported
244
+ ray_init_options={"local_mode": True, "ignore_reinit_error": True},
197
245
  ),
198
246
  )
199
247
 
248
+ def test_raises_error_if_not_supported_content_encoding(self):
249
+ # Test that truly unsupported content encodings raise NotImplementedError
250
+ self.assertRaises(
251
+ NotImplementedError,
252
+ lambda: files_to_dataframe(
253
+ uris=[self.MVP_PATH],
254
+ content_encoding=ContentEncoding.ZSTD.value, # ZSTD is actually unsupported
255
+ content_type=ContentType.PARQUET.value,
256
+ ray_init_options={"local_mode": True, "ignore_reinit_error": True},
257
+ ),
258
+ )
259
+
260
+ def test_accepts_custom_kwargs(self):
261
+ # Test that custom kwargs are passed through to daft.read_parquet
262
+ df = files_to_dataframe(
263
+ uris=[self.MVP_PATH],
264
+ content_encoding=ContentEncoding.IDENTITY.value,
265
+ content_type=ContentType.PARQUET.value,
266
+ ray_init_options={"local_mode": True, "ignore_reinit_error": True},
267
+ # Custom kwarg that should be passed to daft.read_parquet
268
+ coerce_int96_timestamp_unit="ns",
269
+ )
270
+
271
+ table = df.to_arrow()
272
+ self.assertEqual(table.schema.names, ["a", "b"])
273
+ self.assertEqual(table.num_rows, 100)
274
+
275
+ def test_accepts_io_config(self):
276
+ # Test that io_config parameter is accepted and passed correctly
277
+ df = files_to_dataframe(
278
+ uris=[self.MVP_PATH],
279
+ content_encoding=ContentEncoding.IDENTITY.value,
280
+ content_type=ContentType.PARQUET.value,
281
+ ray_init_options={"local_mode": True, "ignore_reinit_error": True},
282
+ # io_config=None should work fine for local files
283
+ io_config=None,
284
+ )
285
+
286
+ table = df.to_arrow()
287
+ self.assertEqual(table.schema.names, ["a", "b"])
288
+ self.assertEqual(table.num_rows, 100)
289
+
200
290
 
201
291
  if __name__ == "__main__":
202
292
  unittest.main()